Diffstat (limited to 'llvm/lib/Target/Hexagon')
43 files changed, 3671 insertions, 719 deletions
diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index b94b148..c18db98 100644 --- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -463,7 +463,7 @@ void HexagonOperand::print(raw_ostream &OS, const MCAsmInfo &MAI) const { break; case Register: OS << "<register R"; - OS << getReg() << ">"; + OS << getReg().id() << ">"; break; case Token: OS << "'" << getToken() << "'"; diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt index 1a5f096..eddab5a 100644 --- a/llvm/lib/Target/Hexagon/CMakeLists.txt +++ b/llvm/lib/Target/Hexagon/CMakeLists.txt @@ -37,6 +37,8 @@ add_llvm_target(HexagonCodeGen HexagonGenMemAbsolute.cpp HexagonGenMux.cpp HexagonGenPredicate.cpp + HexagonGenWideningVecFloatInstr.cpp + HexagonGenWideningVecInstr.cpp HexagonHardwareLoops.cpp HexagonHazardRecognizer.cpp HexagonInstrInfo.cpp @@ -53,6 +55,7 @@ add_llvm_target(HexagonCodeGen HexagonNewValueJump.cpp HexagonOptAddrMode.cpp HexagonOptimizeSZextends.cpp + HexagonOptShuffleVector.cpp HexagonPeephole.cpp HexagonQFPOptimizer.cpp HexagonRDFOpt.cpp diff --git a/llvm/lib/Target/Hexagon/Hexagon.h b/llvm/lib/Target/Hexagon/Hexagon.h index 422ab20..b98369d 100644 --- a/llvm/lib/Target/Hexagon/Hexagon.h +++ b/llvm/lib/Target/Hexagon/Hexagon.h @@ -92,6 +92,9 @@ FunctionPass *createHexagonGenInsert(); FunctionPass *createHexagonGenMemAbsolute(); FunctionPass *createHexagonGenMux(); FunctionPass *createHexagonGenPredicate(); +FunctionPass * +createHexagonGenWideningVecFloatInstr(const HexagonTargetMachine &); +FunctionPass *createHexagonGenWideningVecInstr(const HexagonTargetMachine &); FunctionPass *createHexagonHardwareLoops(); FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM, CodeGenOptLevel OptLevel); @@ -102,6 +105,7 @@ FunctionPass *createHexagonMergeActivateWeight(); FunctionPass *createHexagonNewValueJump(); FunctionPass *createHexagonOptAddrMode(); FunctionPass *createHexagonOptimizeSZextends(); +FunctionPass *createHexagonOptShuffleVector(const HexagonTargetMachine &); FunctionPass *createHexagonPacketizer(bool Minimal); FunctionPass *createHexagonPeephole(); FunctionPass *createHexagonRDFOpt(); diff --git a/llvm/lib/Target/Hexagon/Hexagon.td b/llvm/lib/Target/Hexagon/Hexagon.td index ede8463..17c72c3 100644 --- a/llvm/lib/Target/Hexagon/Hexagon.td +++ b/llvm/lib/Target/Hexagon/Hexagon.td @@ -413,6 +413,8 @@ include "HexagonPatternsV65.td" include "HexagonDepMappings.td" include "HexagonIntrinsics.td" +defm : RemapAllTargetPseudoPointerOperands<IntRegs>; + def HexagonInstrInfo : InstrInfo; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 68f5312..8483374 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -137,8 +137,7 @@ namespace { return !Bits.any(); } bool includes(const RegisterSet &Rs) const { - // A.test(B) <=> A-B != {} - return !Rs.Bits.test(Bits); + return Rs.Bits.subsetOf(Bits); } bool intersects(const RegisterSet &Rs) const { return Bits.anyCommon(Rs.Bits); @@ -1796,7 +1795,7 @@ namespace { const MachineDominatorTree &MDT; const HexagonInstrInfo &HII; - const HexagonRegisterInfo &HRI; + [[maybe_unused]] const HexagonRegisterInfo &HRI; MachineRegisterInfo &MRI; BitTracker &BT; }; @@ -1886,7 +1885,7 @@ bool 
BitSimplification::matchHalf(unsigned SelfR, bool BitSimplification::validateReg(BitTracker::RegisterRef R, unsigned Opc, unsigned OpNum) { - auto *OpRC = HII.getRegClass(HII.get(Opc), OpNum, &HRI); + auto *OpRC = HII.getRegClass(HII.get(Opc), OpNum); auto *RRC = HBS::getFinalVRegClass(R, MRI); return OpRC->hasSubClassEq(RRC); } diff --git a/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp b/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp index eca5ac1..bae3484 100644 --- a/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp @@ -24,7 +24,6 @@ #include <cstdint> #include <iterator> #include <map> -#include <utility> using namespace llvm; diff --git a/llvm/lib/Target/Hexagon/HexagonCallingConv.td b/llvm/lib/Target/Hexagon/HexagonCallingConv.td index dceb70c..80adde8 100644 --- a/llvm/lib/Target/Hexagon/HexagonCallingConv.td +++ b/llvm/lib/Target/Hexagon/HexagonCallingConv.td @@ -25,6 +25,8 @@ def CC_HexagonStack: CallingConv<[ def CC_Hexagon_Legacy: CallingConv<[ CCIfType<[i1,i8,i16], CCPromoteToType<i32>>, + CCIfType<[bf16], + CCBitConvertToType<i32>>, CCIfType<[f32], CCBitConvertToType<i32>>, CCIfType<[f64], @@ -55,6 +57,8 @@ def CC_Hexagon_Legacy: CallingConv<[ def CC_Hexagon: CallingConv<[ CCIfType<[i1,i8,i16], CCPromoteToType<i32>>, + CCIfType<[bf16], + CCBitConvertToType<i32>>, CCIfType<[f32], CCBitConvertToType<i32>>, CCIfType<[f64], @@ -88,6 +92,8 @@ def CC_Hexagon: CallingConv<[ def RetCC_Hexagon: CallingConv<[ CCIfType<[i1,i8,i16], CCPromoteToType<i32>>, + CCIfType<[bf16], + CCBitConvertToType<i32>>, CCIfType<[f32], CCBitConvertToType<i32>>, CCIfType<[f64], @@ -149,16 +155,16 @@ def CC_Hexagon_HVX: CallingConv<[ CCIfType<[v128i1], CCPromoteToType<v128i8>>>, CCIfHvx128< - CCIfType<[v32i32,v64i16,v128i8,v32f32,v64f16], + CCIfType<[v32i32,v64i16,v128i8,v32f32,v64f16,v64bf16], CCAssignToReg<[V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15]>>>, CCIfHvx128< - CCIfType<[v64i32,v128i16,v256i8,v64f32,v128f16], + CCIfType<[v64i32,v128i16,v256i8,v64f32,v128f16,v128bf16], CCAssignToReg<[W0,W1,W2,W3,W4,W5,W6,W7]>>>, CCIfHvx128< - CCIfType<[v32i32,v64i16,v128i8,v32f32,v64f16], + CCIfType<[v32i32,v64i16,v128i8,v32f32,v64f16,v64bf16], CCAssignToStack<128,128>>>, CCIfHvx128< - CCIfType<[v64i32,v128i16,v256i8,v64f32,v128f16], + CCIfType<[v64i32,v128i16,v256i8,v64f32,v128f16,v64bf16], CCAssignToStack<256,128>>>, CCDelegateTo<CC_Hexagon> @@ -175,10 +181,10 @@ def RetCC_Hexagon_HVX: CallingConv<[ // HVX 128-byte mode CCIfHvx128< - CCIfType<[v32i32,v64i16,v128i8,v32f32,v64f16], + CCIfType<[v32i32,v64i16,v128i8,v32f32,v64f16,v64bf16], CCAssignToReg<[V0]>>>, CCIfHvx128< - CCIfType<[v64i32,v128i16,v256i8,v64f32,v128f16], + CCIfType<[v64i32,v128i16,v256i8,v64f32,v128f16,v128bf16], CCAssignToReg<[W0]>>>, CCDelegateTo<RetCC_Hexagon> diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index dd343d9..df61226 100644 --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1405,7 +1405,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB, bool IsKill = !HRI.isEHReturnCalleeSaveReg(Reg); int FI = I.getFrameIdx(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); - HII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, &HRI, Register()); + HII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, Register()); if (IsKill) MBB.addLiveIn(Reg); } @@ -1470,7 +1470,7 @@ bool 
HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB, MCRegister Reg = I.getReg(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); int FI = I.getFrameIdx(); - HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI, Register()); + HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, Register()); } return true; @@ -1814,8 +1814,7 @@ bool HexagonFrameLowering::expandStoreVecPred(MachineBasicBlock &B, .addReg(SrcR, getKillRegState(IsKill)) .addReg(TmpR0, RegState::Kill); - auto *HRI = B.getParent()->getSubtarget<HexagonSubtarget>().getRegisterInfo(); - HII.storeRegToStackSlot(B, It, TmpR1, true, FI, RC, HRI, Register()); + HII.storeRegToStackSlot(B, It, TmpR1, true, FI, RC, Register()); expandStoreVec(B, std::prev(It), MRI, HII, NewRegs); NewRegs.push_back(TmpR0); @@ -1844,9 +1843,7 @@ bool HexagonFrameLowering::expandLoadVecPred(MachineBasicBlock &B, BuildMI(B, It, DL, HII.get(Hexagon::A2_tfrsi), TmpR0) .addImm(0x01010101); - MachineFunction &MF = *B.getParent(); - auto *HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); - HII.loadRegFromStackSlot(B, It, TmpR1, FI, RC, HRI, Register()); + HII.loadRegFromStackSlot(B, It, TmpR1, FI, RC, Register()); expandLoadVec(B, std::prev(It), MRI, HII, NewRegs); BuildMI(B, It, DL, HII.get(Hexagon::V6_vandvrt), DstR) @@ -2225,7 +2222,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF, if (!Bad) { // If the addressing mode is ok, check the register class. unsigned OpNum = Load ? 0 : 2; - auto *RC = HII.getRegClass(In.getDesc(), OpNum, &HRI); + auto *RC = HII.getRegClass(In.getDesc(), OpNum); RC = getCommonRC(SI.RC, RC); if (RC == nullptr) Bad = true; @@ -2395,7 +2392,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF, HexagonBlockRanges::RegisterRef SrcRR = { SrcOp.getReg(), SrcOp.getSubReg() }; - auto *RC = HII.getRegClass(SI.getDesc(), 2, &HRI); + auto *RC = HII.getRegClass(SI.getDesc(), 2); // The this-> is needed to unconfuse MSVC. 
Register FoundR = this->findPhysReg(MF, Range, IM, DM, RC); LLVM_DEBUG(dbgs() << "Replacement reg:" << printReg(FoundR, &HRI) diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index ff876f6..18fcd6a 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -153,8 +153,7 @@ namespace { return !BitVector::any(); } bool includes(const RegisterSet &Rs) const { - // A.BitVector::test(B) <=> A-B != {} - return !Rs.BitVector::test(*this); + return Rs.BitVector::subsetOf(*this); } bool intersects(const RegisterSet &Rs) const { return BitVector::anyCommon(Rs); diff --git a/llvm/lib/Target/Hexagon/HexagonGenMux.cpp b/llvm/lib/Target/Hexagon/HexagonGenMux.cpp index 74e5abe..c6fffde 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenMux.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenMux.cpp @@ -43,7 +43,6 @@ #include <cassert> #include <iterator> #include <limits> -#include <utility> #define DEBUG_TYPE "hexmux" diff --git a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp index 9c81e963..412d587 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -30,7 +30,6 @@ #include <cassert> #include <iterator> #include <queue> -#include <utility> #define DEBUG_TYPE "gen-pred" @@ -52,8 +51,7 @@ private: }; [[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, - const PrintRegister &PR); -raw_ostream &operator<<(raw_ostream &OS, const PrintRegister &PR) { + const PrintRegister &PR) { return OS << printReg(PR.Reg.Reg, &PR.TRI, PR.Reg.SubReg); } diff --git a/llvm/lib/Target/Hexagon/HexagonGenWideningVecFloatInstr.cpp b/llvm/lib/Target/Hexagon/HexagonGenWideningVecFloatInstr.cpp new file mode 100644 index 0000000..7271f1f --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonGenWideningVecFloatInstr.cpp @@ -0,0 +1,565 @@ +//===------------------- HexagonGenWideningVecFloatInstr.cpp --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Replace widening vector float operations with hexagon intrinsics. +// +//===----------------------------------------------------------------------===// +// +// Brief overview of working of GenWideningVecFloatInstr pass. +// This version of pass is replica of already existing pass(which will replace +// widen vector integer operations with it's respective intrinsics). In this +// pass we will generate hexagon intrinsics for widen vector float instructions. +// +// Example1(64 vector-width widening): +// %wide.load = load <64 x half>, <64 x half>* %0, align 2 +// %wide.load53 = load <64 x half>, <64 x half>* %2, align 2 +// %1 = fpext <64 x half> %wide.load to <64 x float> +// %3 = fpext <64 x half> %wide.load53 to <64 x float> +// %4 = fmul <64 x float> %1, %3 +// +// If we run this pass on the above example, it will first find fmul +// instruction, and then it will check whether the operands of fmul instruction +// (%1 and %3) belongs to either of these categories [%1 ->fpext, %3 ->fpext] +// or [%1 ->fpext, %3 ->constant_vector] or [%1 ->constant_vector, %3 ->fpext]. +// If it sees such pattern, then this pass will replace such pattern with +// appropriate hexagon intrinsics. 
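// For illustration (value names are hypothetical), the constant-vector case
// mentioned above looks like:
//   %1 = fpext <64 x half> %wide.load to <64 x float>
//   %4 = fmul <64 x float> %1, splat (float 2.0)
// Here %1 falls in the fpext category and the splat in the constant_vector
// category; the constant is accepted only if every lane can be narrowed back
// to half precision (see isExtendedConstant below).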
+// +// After replacement: +// %wide.load = load <64 x half>, <64 x half>* %0, align 2 +// %wide.load53 = load <64 x half>, <64 x half>* %2, align 2 +// %3 = bitcast <64 x half> %wide.load to <32 x i32> +// %4 = bitcast <64 x half> %wide.load53 to <32 x i32> +// %5 = call <64 x i32> @llvm.hexagon.V6.vmpy.qf32.hf.128B(%3, %4) +// %6 = shufflevector <64 x i32> %5, <64 x i32> poison, <64 x i32> ShuffMask1 +// %7 = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %6) +// %8 = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %6) +// %9 = call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %7) +// %10 = call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %8) +// %11 = bitcast <32 x i32> %9 to <32 x float> +// %12 = bitcast <32 x i32> %10 to <32 x float> +// %13 = shufflevector <32 x float> %12, <32 x float> %11, <64 x i32> ShuffMask2 +// +// +// +// Example2(128 vector-width widening): +// %0 = bitcast half* %a to <128 x half>* +// %wide.load = load <128 x half>, <128 x half>* %0, align 2 +// %1 = fpext <128 x half> %wide.load to <128 x float> +// %2 = bitcast half* %b to <128 x half>* +// %wide.load2 = load <128 x half>, <128 x half>* %2, align 2 +// %3 = fpext <128 x half> %wide.load2 to <128 x float> +// %4 = fmul <128 x float> %1, %3 +// +// After replacement: +// %0 = bitcast half* %a to <128 x half>* +// %wide.load = load <128 x half>, <128 x half>* %0, align 2 +// %1 = bitcast half* %b to <128 x half>* +// %wide.load2 = load <128 x half>, <128 x half>* %1, align 2 +// %2 = bitcast <128 x half> %wide.load to <64 x i32> +// %3 = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %2) +// %4 = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %2) +// %5 = bitcast <128 x half> %wide.load2 to <64 x i32> +// %6 = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %5) +// %7 = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %5) +// %8 = call <64 x i32> @llvm.hexagon.V6.vmpy.qf32.hf.128B(%3, %6) +// %9 = shufflevector <64 x i32> %8, <64 x i32> poison, <64 x i32> Mask1 +// %10 = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %9) +// %11 = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %9) +// %12 = call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %10) +// %13 = call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %11) +// %14 = bitcast <32 x i32> %12 to <32 x float> +// %15 = bitcast <32 x i32> %13 to <32 x float> +// %16 = shufflevector <32 x float> %15, <32 x float> %14, <64 x i32> Mask2 +// %17 = call <64 x i32> @llvm.hexagon.V6.vmpy.qf32.hf.128B(%4, %7) +// %18 = shufflevector <64 x i32> %17, <64 x i32> poison, <64 x i32> Mask1 +// %19 = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %18) +// %20 = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %18) +// %21 = call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %19) +// %22 = call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %20) +// %23 = bitcast <32 x i32> %21 to <32 x float> +// %24 = bitcast <32 x i32> %22 to <32 x float> +// %25 = shufflevector <32 x float> %24, <32 x float> %23, <64 x i32> Mask2 +// %26 = shufflevector <64 x float> %25, <64 x float> %16, <128 x i32> Mask3 +// +// +//===----------------------------------------------------------------------===// +#include "HexagonTargetMachine.h" +#include "llvm/ADT/APInt.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include 
"llvm/IR/IntrinsicsHexagon.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include <algorithm> +#include <utility> + +using namespace llvm; + +namespace llvm { +void initializeHexagonGenWideningVecFloatInstrPass(PassRegistry &); +FunctionPass * +createHexagonGenWideningVecFloatInstr(const HexagonTargetMachine &); +} // end namespace llvm + +namespace { + +class HexagonGenWideningVecFloatInstr : public FunctionPass { +public: + static char ID; + + HexagonGenWideningVecFloatInstr() : FunctionPass(ID) { + initializeHexagonGenWideningVecFloatInstrPass( + *PassRegistry::getPassRegistry()); + } + + HexagonGenWideningVecFloatInstr(const HexagonTargetMachine *TM) + : FunctionPass(ID), TM(TM) { + initializeHexagonGenWideningVecFloatInstrPass( + *PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Hexagon generate widening vector float instructions"; + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + FunctionPass::getAnalysisUsage(AU); + } + +private: + Module *M = nullptr; + const HexagonTargetMachine *TM = nullptr; + const HexagonSubtarget *HST = nullptr; + unsigned HwVLen; + unsigned NumHalfEltsInFullVec; + + struct OPInfo { + Value *OP; + Value *ExtInOP; + unsigned ExtInSize; + }; + + bool visitBlock(BasicBlock *B); + bool processInstruction(Instruction *Inst); + bool replaceWithIntrinsic(Instruction *Inst, OPInfo &OP1Info, + OPInfo &OP2Info); + + bool getOperandInfo(Value *V, OPInfo &OPI); + bool isExtendedConstant(Constant *C); + unsigned getElementSizeInBits(Value *V); + Type *getElementTy(unsigned size, IRBuilder<> &IRB); + + Value *adjustExtensionForOp(OPInfo &OPI, IRBuilder<> &IRB, + unsigned NewEltsize, unsigned NumElts); + + std::pair<Value *, Value *> opSplit(Value *OP, Instruction *Inst); + + Value *createIntrinsic(Intrinsic::ID IntId, Instruction *Inst, Value *NewOP1, + Value *NewOP2, FixedVectorType *ResType, + unsigned NumElts, bool BitCastOp); +}; + +} // end anonymous namespace + +char HexagonGenWideningVecFloatInstr::ID = 0; + +INITIALIZE_PASS_BEGIN(HexagonGenWideningVecFloatInstr, "widening-vec-float", + "Hexagon generate " + "widening vector float instructions", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(HexagonGenWideningVecFloatInstr, "widening-vec-float", + "Hexagon generate " + "widening vector float instructions", + false, false) + +bool HexagonGenWideningVecFloatInstr::isExtendedConstant(Constant *C) { + if (Value *SplatV = C->getSplatValue()) { + if (auto *CFP = dyn_cast<ConstantFP>(SplatV)) { + bool Ignored; + APFloat APF = CFP->getValueAPF(); + APFloat::opStatus sts = APF.convert( + APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &Ignored); + if (sts == APFloat::opStatus::opOK || sts == APFloat::opStatus::opInexact) + return true; + } + return false; + } + unsigned NumElts = cast<FixedVectorType>(C->getType())->getNumElements(); + for (unsigned i = 0, e = NumElts; i != e; ++i) { + if (auto *CFP = dyn_cast<ConstantFP>(C->getAggregateElement(i))) { + bool Ignored; + APFloat APF = CFP->getValueAPF(); + APFloat::opStatus sts = APF.convert( + APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &Ignored); + if (sts != APFloat::opStatus::opOK && sts != APFloat::opStatus::opInexact) + return false; + continue; + } + return false; + } + return true; +} + +unsigned 
HexagonGenWideningVecFloatInstr::getElementSizeInBits(Value *V) { + Type *ValTy = V->getType(); + Type *EltTy = ValTy; + if (dyn_cast<Constant>(V)) { + unsigned EltSize = + cast<VectorType>(EltTy)->getElementType()->getPrimitiveSizeInBits(); + unsigned ReducedSize = EltSize / 2; + + return ReducedSize; + } + + if (ValTy->isVectorTy()) + EltTy = cast<VectorType>(ValTy)->getElementType(); + return EltTy->getPrimitiveSizeInBits(); +} + +bool HexagonGenWideningVecFloatInstr::getOperandInfo(Value *V, OPInfo &OPI) { + using namespace PatternMatch; + OPI.OP = V; + Value *ExtV = nullptr; + Constant *C = nullptr; + + if (match(V, (m_FPExt(m_Value(ExtV)))) || + match(V, + m_Shuffle(m_InsertElt(m_Poison(), m_FPExt(m_Value(ExtV)), m_Zero()), + m_Poison(), m_ZeroMask()))) { + + if (auto *ExtVType = dyn_cast<VectorType>(ExtV->getType())) { + // Matches the first branch. + if (ExtVType->getElementType()->isBFloatTy()) + // do not confuse bf16 with ieee-fp16. + return false; + } else { + // Matches the second branch (insert element branch) + if (ExtV->getType()->isBFloatTy()) + return false; + } + + OPI.ExtInOP = ExtV; + OPI.ExtInSize = getElementSizeInBits(OPI.ExtInOP); + return true; + } + + if (match(V, m_Constant(C))) { + if (!isExtendedConstant(C)) + return false; + OPI.ExtInOP = C; + OPI.ExtInSize = getElementSizeInBits(OPI.ExtInOP); + return true; + } + + return false; +} + +Type *HexagonGenWideningVecFloatInstr::getElementTy(unsigned size, + IRBuilder<> &IRB) { + switch (size) { + case 16: + return IRB.getHalfTy(); + case 32: + return IRB.getFloatTy(); + default: + llvm_unreachable("Unhandled Element size"); + } +} + +Value *HexagonGenWideningVecFloatInstr::adjustExtensionForOp( + OPInfo &OPI, IRBuilder<> &IRB, unsigned NewExtSize, unsigned NumElts) { + Value *V = OPI.ExtInOP; + unsigned EltSize = getElementSizeInBits(OPI.ExtInOP); + assert(NewExtSize >= EltSize); + Type *EltType = getElementTy(NewExtSize, IRB); + auto *NewOpTy = FixedVectorType::get(EltType, NumElts); + + if (auto *C = dyn_cast<Constant>(V)) + return IRB.CreateFPTrunc(C, NewOpTy); + + if (V->getType()->isVectorTy()) + if (NewExtSize == EltSize) + return V; + + return nullptr; +} + +std::pair<Value *, Value *> +HexagonGenWideningVecFloatInstr::opSplit(Value *OP, Instruction *Inst) { + Type *InstTy = Inst->getType(); + unsigned NumElts = cast<FixedVectorType>(InstTy)->getNumElements(); + IRBuilder<> IRB(Inst); + Intrinsic::ID IntHi = Intrinsic::hexagon_V6_hi_128B; + Intrinsic::ID IntLo = Intrinsic::hexagon_V6_lo_128B; + Function *ExtFHi = Intrinsic::getOrInsertDeclaration(M, IntHi); + Function *ExtFLo = Intrinsic::getOrInsertDeclaration(M, IntLo); + if (NumElts == 128) { + auto *InType = FixedVectorType::get(IRB.getInt32Ty(), 64); + OP = IRB.CreateBitCast(OP, InType); + } + Value *OP1Hi = IRB.CreateCall(ExtFHi, {OP}); + Value *OP1Lo = IRB.CreateCall(ExtFLo, {OP}); + return std::pair<Value *, Value *>(OP1Hi, OP1Lo); +} + +Value *HexagonGenWideningVecFloatInstr::createIntrinsic( + Intrinsic::ID IntId, Instruction *Inst, Value *NewOP1, Value *NewOP2, + FixedVectorType *ResType, unsigned NumElts, bool BitCastOp) { + + IRBuilder<> IRB(Inst); + Function *ExtF = Intrinsic::getOrInsertDeclaration(M, IntId); + Function *ConvF = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::hexagon_V6_vconv_sf_qf32_128B); + auto *InType = FixedVectorType::get(IRB.getInt32Ty(), 32); + auto *RType = FixedVectorType::get(IRB.getFloatTy(), 32); + + // Make sure inputs to vmpy instrinsic are full vectors + if (NumElts == NumHalfEltsInFullVec / 2) { + 
SmallVector<Constant *, 16> ConcatMask1; + for (unsigned i = 0; i < NumHalfEltsInFullVec; ++i) + ConcatMask1.push_back(IRB.getInt32(i)); + NewOP1 = + IRB.CreateShuffleVector(NewOP1, PoisonValue::get(NewOP1->getType()), + ConstantVector::get(ConcatMask1)); + NewOP2 = + IRB.CreateShuffleVector(NewOP2, PoisonValue::get(NewOP2->getType()), + ConstantVector::get(ConcatMask1)); + } + + if (BitCastOp) { + NewOP1 = IRB.CreateBitCast(NewOP1, InType); + NewOP2 = IRB.CreateBitCast(NewOP2, InType); + } + + Value *NewIn = IRB.CreateCall(ExtF, {NewOP1, NewOP2}); + // Interleave the output elements to ensure correct order in Hi and Lo vectors + // Shuffled Mask: [0, 32, 1, 33, ..., 31, 63] + // Hi: [0, 1, ..., 31] and Lo: [32, 33, ..., 63] + SmallVector<Constant *, 16> Mask; + unsigned HalfVecPoint = NumHalfEltsInFullVec / 2; + for (unsigned i = 0; i < HalfVecPoint; ++i) { + Mask.push_back(IRB.getInt32(i)); + Mask.push_back(IRB.getInt32(HalfVecPoint + i)); + } + NewIn = IRB.CreateShuffleVector(NewIn, PoisonValue::get(NewIn->getType()), + ConstantVector::get(Mask)); + + std::pair<Value *, Value *> SplitOP = opSplit(NewIn, Inst); + Value *ConvHi = IRB.CreateCall(ConvF, {SplitOP.first}); + ConvHi = IRB.CreateBitCast(ConvHi, RType); + + if (ResType->getNumElements() == NumHalfEltsInFullVec / 2) { + return ConvHi; + } + + Value *ConvLo = IRB.CreateCall(ConvF, {SplitOP.second}); + ConvLo = IRB.CreateBitCast(ConvLo, RType); + + SmallVector<Constant *, 16> ShuffleMask; + for (unsigned i = 0; i < NumElts; ++i) + ShuffleMask.push_back(IRB.getInt32(i)); + // Concat Hi and Lo. + NewIn = + IRB.CreateShuffleVector(ConvLo, ConvHi, ConstantVector::get(ShuffleMask)); + return NewIn; +} + +bool HexagonGenWideningVecFloatInstr::replaceWithIntrinsic(Instruction *Inst, + OPInfo &OP1Info, + OPInfo &OP2Info) { + Type *InstTy = Inst->getType(); + Type *EltTy = cast<FixedVectorType>(InstTy)->getElementType(); + unsigned NumElts = cast<FixedVectorType>(InstTy)->getNumElements(); + [[maybe_unused]] unsigned InstEltSize = EltTy->getPrimitiveSizeInBits(); + + unsigned MaxEltSize = OP1Info.ExtInSize; + unsigned NewOpEltSize = MaxEltSize; + unsigned NewResEltSize = 2 * MaxEltSize; + + unsigned ResVLen = NewResEltSize * NumElts; + if (NewOpEltSize > 16 || ((ResVLen > HwVLen) && (ResVLen % HwVLen) != 0)) + return false; + + Intrinsic::ID IntId = Intrinsic::hexagon_V6_vmpy_qf32_hf_128B; + IRBuilder<> IRB(Inst); + Value *NewOP1 = adjustExtensionForOp(OP1Info, IRB, NewOpEltSize, NumElts); + Value *NewOP2 = adjustExtensionForOp(OP2Info, IRB, NewOpEltSize, NumElts); + + if (NewOP1 == nullptr || NewOP2 == nullptr) + return false; + + if (ResVLen > 2 * HwVLen) { + // The code written in this if block generates the widening code when + // vector-width is 128: + // + // Step 1: Bitcast <128 x half> type to <64 x i32> + // %wide.load = load <128 x half>, <128 x half>* %0 is bitcasted to, + // bitcast <128 x half> %wide.load to <64 x i32> + // + // Step 2: Generate Hi and Lo vectors + // call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %4) + // call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %4) + // + // Perform above 2 steps for both the operands of fmul instruction + // + // Step 3: Generate vmpy_qf32_hf multiply instruction to multiply two Hi + // vectors from both operands. 
+ // call <64 x i32> @llvm.hexagon.V6.vmpy.qf32.hf.128B(%5, %8) + // + // Step 4: Convert the resultant 'qf32' output to 'sf' format + // %11 = shufflevector <64 x i32> %10, <64 x i32> poison, <64 x i32> Mask1 + // %12 = call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %11) + // %13 = call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %11) + // call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %12) + // call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %13) + // + // Repeat steps 3 and 4 for mutiplication and conversion of Lo vectors. + // Finally merge the output values in correct sequence using shuffle + // vectors. + + assert(ResVLen == 4 * HwVLen); + // Split the operands + unsigned HalfElts = NumElts / 2; + std::pair<Value *, Value *> SplitOP1 = opSplit(NewOP1, Inst); + std::pair<Value *, Value *> SplitOP2 = opSplit(NewOP2, Inst); + auto *castResType = FixedVectorType::get(IRB.getInt32Ty(), HalfElts); + Value *NewInHi = + createIntrinsic(IntId, Inst, SplitOP1.first, SplitOP2.first, + castResType, HalfElts, false); + Value *NewInLo = + createIntrinsic(IntId, Inst, SplitOP1.second, SplitOP2.second, + castResType, HalfElts, false); + assert(InstEltSize == NewResEltSize); + SmallVector<Constant *, 8> ShuffleMask; + for (unsigned i = 0; i < NumElts; ++i) + ShuffleMask.push_back(IRB.getInt32(i)); + // Concat Hi and Lo. + Value *NewIn = IRB.CreateShuffleVector(NewInLo, NewInHi, + ConstantVector::get(ShuffleMask)); + + Inst->replaceAllUsesWith(NewIn); + return true; + } + + auto *ResType = + FixedVectorType::get(getElementTy(NewResEltSize, IRB), NumElts); + + // The following widening code can only be generated in cases where + // input vectors are 64xhalf/32xhalf and the results are 64xfloat/32xfloat + // respectively. + if (!(NumElts == NumHalfEltsInFullVec && + ResType->getNumElements() == NumHalfEltsInFullVec) && + !(NumElts == NumHalfEltsInFullVec / 2 && + ResType->getNumElements() == NumHalfEltsInFullVec / 2)) + return false; + Value *NewIn = + createIntrinsic(IntId, Inst, NewOP1, NewOP2, ResType, NumElts, true); + + Inst->replaceAllUsesWith(NewIn); + return true; +} + +// Process instruction and replace them with widening vector +// intrinsics if possible. +bool HexagonGenWideningVecFloatInstr::processInstruction(Instruction *Inst) { + Type *InstTy = Inst->getType(); + if (!InstTy->isVectorTy() || + cast<FixedVectorType>(InstTy)->getNumElements() > 128) + return false; + unsigned InstLen = InstTy->getPrimitiveSizeInBits(); + if (!HST->isTypeForHVX(cast<VectorType>(InstTy)) && InstLen != 4 * HwVLen) + return false; + if (InstLen < HwVLen) + return false; + + using namespace PatternMatch; + + Value *OP1 = nullptr, *OP2 = nullptr; + OPInfo OP1Info, OP2Info; + + // Handle the case when Inst = fpext(fmul<64xhalf>(op1, op2)). The Inst can + // be replaced with widening multiply. 
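  // For illustration (value names are hypothetical), this covers IR such as:
  //   %m = fmul <64 x half> %a, %b
  //   %e = fpext <64 x half> %m to <64 x float>
  // where the fmul and the fpext together are folded into one widening
  // qf32 multiply of the original half vectors.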
+ if (match(Inst, (m_FPExt((m_FMul(m_Value(OP1), m_Value(OP2))))))) { + OP1Info.ExtInOP = OP1; + OP1Info.ExtInSize = getElementSizeInBits(OP1); + OP2Info.ExtInOP = OP2; + OP2Info.ExtInSize = getElementSizeInBits(OP2); + + if (auto *Op1Vtype = dyn_cast<VectorType>(OP1->getType())) { + if (!Op1Vtype->getElementType()->isHalfTy()) { + return false; + } + } else { + return false; + } + + if (OP1Info.ExtInSize == OP2Info.ExtInSize && OP1Info.ExtInSize == 16 && + getElementSizeInBits(Inst) == 32) { + return replaceWithIntrinsic(Inst, OP1Info, OP2Info); + } + } + + if (!match(Inst, (m_FMul(m_Value(OP1), m_Value(OP2))))) + return false; + + if (!getOperandInfo(OP1, OP1Info) || !getOperandInfo(OP2, OP2Info)) + return false; + + if (!OP1Info.ExtInOP || !OP2Info.ExtInOP) + return false; + + if (OP1Info.ExtInSize == OP2Info.ExtInSize && OP1Info.ExtInSize == 16) { + return replaceWithIntrinsic(Inst, OP1Info, OP2Info); + } + + return false; +} + +bool HexagonGenWideningVecFloatInstr::visitBlock(BasicBlock *B) { + bool Changed = false; + for (auto &I : *B) + Changed |= processInstruction(&I); + return Changed; +} + +bool HexagonGenWideningVecFloatInstr::runOnFunction(Function &F) { + M = F.getParent(); + HST = TM->getSubtargetImpl(F); + + // Return if useHVX128BOps is not set. It can be enabled for 64B mode + // but wil require some changes. For example, bitcast for intrinsics + // assumes 128B mode. + if (skipFunction(F) || !HST->useHVX128BOps()) + return false; + + unsigned VecLength = HST->getVectorLength(); // Vector Length in Bytes + HwVLen = HST->getVectorLength() * 8; // Vector Length in bits + NumHalfEltsInFullVec = + VecLength / + 2; // Number of half (2B) elements that fit into a full HVX vector + bool Changed = false; + for (auto &B : F) + Changed |= visitBlock(&B); + + return Changed; +} + +FunctionPass * +llvm::createHexagonGenWideningVecFloatInstr(const HexagonTargetMachine &TM) { + return new HexagonGenWideningVecFloatInstr(&TM); +} diff --git a/llvm/lib/Target/Hexagon/HexagonGenWideningVecInstr.cpp b/llvm/lib/Target/Hexagon/HexagonGenWideningVecInstr.cpp new file mode 100644 index 0000000..297410b --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonGenWideningVecInstr.cpp @@ -0,0 +1,1181 @@ +//===--------------------- HexagonGenWideningVecInstr.cpp -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Replace widening vector operations with hexagon intrinsics. +// +//===----------------------------------------------------------------------===// + +#include "HexagonTargetMachine.h" +#include "llvm/ADT/APInt.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsHexagon.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include <algorithm> +#include <utility> + +using namespace llvm; + +// A command line argument to enable the generation of widening instructions +// for short-vectors. 
+static cl::opt<bool> WidenShortVector( + "hexagon-widen-short-vector", + cl::desc("Generate widening instructions for short vectors."), cl::Hidden); + +namespace llvm { +void initializeHexagonGenWideningVecInstrPass(PassRegistry &); +FunctionPass *createHexagonGenWideningVecInstr(const HexagonTargetMachine &); +} // end namespace llvm + +namespace { + +class HexagonGenWideningVecInstr : public FunctionPass { +public: + static char ID; + + HexagonGenWideningVecInstr() : FunctionPass(ID) { + initializeHexagonGenWideningVecInstrPass(*PassRegistry::getPassRegistry()); + } + + HexagonGenWideningVecInstr(const HexagonTargetMachine *TM) + : FunctionPass(ID), TM(TM) { + initializeHexagonGenWideningVecInstrPass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Hexagon generate widening vector instructions"; + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + FunctionPass::getAnalysisUsage(AU); + } + +private: + Module *M = nullptr; + const HexagonTargetMachine *TM = nullptr; + const HexagonSubtarget *HST = nullptr; + unsigned HwVLen; + enum OPKind { OP_None = 0, OP_Add, OP_Sub, OP_Mul, OP_Shl }; + + struct OPInfo { + Value *OP = nullptr; + Value *ExtInOP = nullptr; + bool IsZExt = false; + unsigned ExtInSize = 0; + bool IsScalar = false; + }; + + bool visitBlock(BasicBlock *B); + bool processInstruction(Instruction *Inst); + bool replaceWithIntrinsic(Instruction *Inst, OPKind OPK, OPInfo &OP1Info, + OPInfo &OP2Info); + bool getOperandInfo(Value *V, OPInfo &OPI); + bool isExtendedConstant(Constant *C, bool IsSigned); + unsigned getElementSizeInBits(Value *V, bool IsZExt); + Type *getElementTy(unsigned size, IRBuilder<> &IRB); + + Value *adjustExtensionForOp(OPInfo &OPI, IRBuilder<> &IRB, + unsigned NewEltsize, unsigned NumElts); + + Intrinsic::ID getIntrinsic(OPKind OPK, bool IsOP1ZExt, bool IsOP2ZExt, + unsigned NewOpEltSize, unsigned NewResEltSize, + bool IsConstScalar, int ConstOpNum); + + std::pair<Value *, Value *> opSplit(Value *OP, Instruction *Inst, + Type *NewOpType); + + Value *createIntrinsic(Intrinsic::ID IntId, Instruction *Inst, Value *NewOP1, + Value *NewOP2, Type *ResType, unsigned NumElts, + bool Interleave); + bool processInstructionForVMPA(Instruction *Inst); + bool getVmpaOperandInfo(Value *V, OPInfo &OPI); + void reorderVmpaOperands(OPInfo *OPI); + bool replaceWithVmpaIntrinsic(Instruction *Inst, OPInfo *OPI); + bool genSaturatingInst(Instruction *Inst); + bool getMinMax(Constant *MinC, Constant *MaxC, std::pair<int, int> &MinMax); + bool isSaturatingVAsr(Instruction *Inst, Value *S, int MinV, int MaxV, + bool &IsResSigned); + Value *extendShiftByVal(Value *ShiftByVal, IRBuilder<> &IRB); + Intrinsic::ID getVAsrIntrinsic(bool IsInSigned, bool IsResSigned); + Value *createVAsrIntrinsic(Instruction *Inst, Value *VecOP, Value *ShiftByVal, + bool IsResSigned); + bool genVAvg(Instruction *Inst); + bool checkConstantVector(Value *OP, int64_t &SplatVal, bool IsOPZExt); + void updateMPYConst(Intrinsic::ID IntId, int64_t SplatVal, bool IsOPZExt, + Value *&OP, IRBuilder<> &IRB); + void packConstant(Intrinsic::ID IntId, int64_t SplatVal, Value *&OP, + IRBuilder<> &IRB); +}; + +} // end anonymous namespace + +char HexagonGenWideningVecInstr::ID = 0; + +INITIALIZE_PASS_BEGIN(HexagonGenWideningVecInstr, "widening-vec", + "Hexagon generate " + "widening vector instructions", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 
+INITIALIZE_PASS_END(HexagonGenWideningVecInstr, "widening-vec", + "Hexagon generate " + "widening vector instructions", + false, false) + +static bool hasNegativeValues(Constant *C) { + if (Value *SplatV = C->getSplatValue()) { + auto *CI = dyn_cast<ConstantInt>(SplatV); + assert(CI); + return CI->getValue().isNegative(); + } + unsigned NumElts = cast<FixedVectorType>(C->getType())->getNumElements(); + for (unsigned i = 0, e = NumElts; i != e; ++i) { + auto *CI = dyn_cast<ConstantInt>(C->getAggregateElement(i)); + assert(CI); + if (CI->getValue().isNegative()) + return true; + continue; + } + return false; +} + +bool HexagonGenWideningVecInstr::getOperandInfo(Value *V, OPInfo &OPI) { + using namespace PatternMatch; + OPI.OP = V; + Value *ExtV = nullptr; + Constant *C = nullptr; + + bool Match = false; + if ((Match = (match(V, (m_ZExt(m_Value(ExtV)))) || + match(V, m_Shuffle(m_InsertElt(m_Poison(), + m_ZExt(m_Value(ExtV)), m_Zero()), + m_Poison(), m_ZeroMask()))))) { + OPI.ExtInOP = ExtV; + OPI.IsZExt = true; + } + + if (!Match && + (Match = (match(V, (m_SExt(m_Value(ExtV)))) || + match(V, m_Shuffle(m_InsertElt(m_Poison(), + m_SExt(m_Value(ExtV)), m_Zero()), + m_Poison(), m_ZeroMask()))))) { + OPI.ExtInOP = ExtV; + OPI.IsZExt = false; + } + if (!Match && + (Match = + (match(V, m_Shuffle(m_InsertElt(m_Poison(), m_Value(ExtV), m_Zero()), + m_Poison(), m_ZeroMask()))))) { + if (match(ExtV, m_And(m_Value(), m_SpecificInt(255)))) { + OPI.ExtInOP = ExtV; + OPI.IsZExt = true; + OPI.ExtInSize = 8; + return true; + } + if (match(ExtV, m_And(m_Value(), m_SpecificInt(65535)))) { + OPI.ExtInOP = ExtV; + OPI.IsZExt = true; + OPI.ExtInSize = 16; + return true; + } + return false; + } + + if (!Match && (Match = match(V, m_Constant(C)))) { + if (!isExtendedConstant(C, false) && !isExtendedConstant(C, true)) + return false; + OPI.ExtInOP = C; + OPI.IsZExt = !hasNegativeValues(C); + } + + if (!Match) + return false; + + // If the operand is extended, find the element size of its input. + if (OPI.ExtInOP) + OPI.ExtInSize = getElementSizeInBits(OPI.ExtInOP, OPI.IsZExt); + return true; +} + +bool HexagonGenWideningVecInstr::isExtendedConstant(Constant *C, + bool IsSigned) { + Type *CTy = cast<FixedVectorType>(C->getType())->getElementType(); + unsigned EltSize = CTy->getPrimitiveSizeInBits(); + unsigned HalfSize = EltSize / 2; + if (Value *SplatV = C->getSplatValue()) { + if (auto *CI = dyn_cast<ConstantInt>(SplatV)) + return IsSigned ? 
isIntN(HalfSize, CI->getSExtValue()) + : isUIntN(HalfSize, CI->getZExtValue()); + return false; + } + unsigned NumElts = cast<FixedVectorType>(C->getType())->getNumElements(); + for (unsigned i = 0, e = NumElts; i != e; ++i) { + if (auto *CI = dyn_cast<ConstantInt>(C->getAggregateElement(i))) { + if ((IsSigned && !isIntN(HalfSize, CI->getSExtValue())) || + (!IsSigned && !isUIntN(HalfSize, CI->getZExtValue()))) + return false; + continue; + } + return false; + } + return true; +} + +unsigned HexagonGenWideningVecInstr::getElementSizeInBits(Value *V, + bool IsZExt = false) { + using namespace PatternMatch; + Type *ValTy = V->getType(); + Type *EltTy = ValTy; + if (auto *C = dyn_cast<Constant>(V)) { + unsigned NumElts = cast<FixedVectorType>(EltTy)->getNumElements(); + unsigned EltSize = cast<FixedVectorType>(EltTy) + ->getElementType() + ->getPrimitiveSizeInBits() + .getKnownMinValue(); + unsigned ReducedSize = EltSize / 2; + + while (ReducedSize >= 8) { + for (unsigned i = 0, e = NumElts; i != e; ++i) { + if (auto *CI = dyn_cast<ConstantInt>(C->getAggregateElement(i))) { + if (IsZExt) { + if (!isUIntN(ReducedSize, CI->getZExtValue())) + return EltSize; + } else if (!isIntN(ReducedSize, CI->getSExtValue())) + return EltSize; + } + } + EltSize = ReducedSize; + ReducedSize = ReducedSize / 2; + } + return EltSize; + } + + if (ValTy->isVectorTy()) + EltTy = cast<FixedVectorType>(ValTy)->getElementType(); + return EltTy->getPrimitiveSizeInBits(); +} + +Value *HexagonGenWideningVecInstr::adjustExtensionForOp(OPInfo &OPI, + IRBuilder<> &IRB, + unsigned NewExtSize, + unsigned NumElts) { + Value *V = OPI.ExtInOP; + bool IsZExt = OPI.IsZExt; + unsigned EltSize = getElementSizeInBits(OPI.ExtInOP, OPI.IsZExt); + Type *EltType = getElementTy(NewExtSize, IRB); + auto *NewOpTy = FixedVectorType::get(EltType, NumElts); + + if (dyn_cast<Constant>(V)) + return IRB.CreateTrunc(V, NewOpTy); + + if (V->getType()->isVectorTy()) { + if (NewExtSize == EltSize) + return V; + assert(NewExtSize == 16); + auto *NewOpTy = FixedVectorType::get(IRB.getInt16Ty(), NumElts); + return (IsZExt) ? IRB.CreateZExt(V, NewOpTy) : IRB.CreateSExt(V, NewOpTy); + } + + // The operand must correspond to a shuffle vector which is used to construct + // a vector out of a scalar. Since the scalar value (V) is extended, + // replace it with a new shuffle vector with the smaller element size. + [[maybe_unused]] auto *I = dyn_cast<Instruction>(OPI.OP); + assert(I && I->getOpcode() == Instruction::ShuffleVector); + + if (NewExtSize > EltSize) + V = (IsZExt) ? IRB.CreateZExt(V, EltType) : IRB.CreateSExt(V, EltType); + else if (NewExtSize < EltSize) + V = IRB.CreateTrunc(V, EltType); + + Value *IE = + IRB.CreateInsertElement(PoisonValue::get(NewOpTy), V, IRB.getInt32(0)); + + SmallVector<Constant *, 8> ShuffleMask; + for (unsigned i = 0; i < NumElts; ++i) + ShuffleMask.push_back(IRB.getInt32(0)); + + return IRB.CreateShuffleVector(IE, PoisonValue::get(NewOpTy), + ConstantVector::get(ShuffleMask)); +} + +Intrinsic::ID HexagonGenWideningVecInstr::getIntrinsic( + OPKind OPK, bool IsOP1ZExt, bool IsOP2ZExt, unsigned InEltSize, + unsigned ResEltSize, bool IsConstScalar, int ConstOpNum) { + // Since the operands have been extended, the ResEltSize must be 16 or more. + switch (OPK) { + case OP_Add: + // Both operands should be either zero extended or sign extended. 
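  // Per lane, the unsigned 8 -> 16 widening add below amounts to
  //   uint16_t r = uint16_t(a) + uint16_t(b);
  // which hexagon_vadd_uu provides without materializing the zero-extended
  // input vectors (lane names a/b are illustrative).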
+ assert(IsOP1ZExt == IsOP2ZExt); + if (InEltSize == 8 && ResEltSize == 16) { + // Operands must be zero extended as we don't have a widening vector + // 'add' that can take signed exteded values. + assert(IsOP1ZExt && "Operands must be zero-extended"); + return Intrinsic::hexagon_vadd_uu; + } + if (InEltSize == 16 && ResEltSize == 32) + return (IsOP1ZExt) ? Intrinsic::hexagon_vadd_uu + : Intrinsic::hexagon_vadd_ss; + + llvm_unreachable("Incorrect input and output operand sizes"); + + case OP_Sub: + // Both operands should be either zero extended or sign extended. + assert(IsOP1ZExt == IsOP2ZExt); + if (InEltSize == 8 && ResEltSize == 16) { + // Operands must be zero extended as we don't have a widening vector + // 'sub' that can take signed exteded values. + assert(IsOP1ZExt && "Operands must be zero-extended"); + return Intrinsic::hexagon_vsub_uu; + } + if (InEltSize == 16 && ResEltSize == 32) + return (IsOP1ZExt) ? Intrinsic::hexagon_vsub_uu + : Intrinsic::hexagon_vsub_ss; + + llvm_unreachable("Incorrect input and output operand sizes"); + + case OP_Mul: + assert(ResEltSize == 2 * InEltSize); + // Enter inside 'if' block when one of the operand is constant vector + if (IsConstScalar) { + // When inputs are of 8bit type and output is 16bit type, enter 'if' block + if (InEltSize == 8 && ResEltSize == 16) { + // Enter the 'if' block, when 2nd operand of the mul instruction is + // constant vector, otherwise enter 'else' block + if (ConstOpNum == 2 && IsOP1ZExt) { + // If the value inside the constant vector is zero-extended, then + // return hexagon_vmpy_ub_ub, else return hexagon_vmpy_ub_b + return (IsOP2ZExt) ? Intrinsic::hexagon_vmpy_ub_ub + : Intrinsic::hexagon_vmpy_ub_b; + } else if (ConstOpNum == 1 && IsOP2ZExt) { + return (IsOP1ZExt) ? Intrinsic::hexagon_vmpy_ub_ub + : Intrinsic::hexagon_vmpy_ub_b; + } + } + // When inputs are of 16bit type and output is 32bit type, + // enter 'if' block + if (InEltSize == 16 && ResEltSize == 32) { + if (IsOP1ZExt && IsOP2ZExt) { + // If the value inside the constant vector and other operand is + // zero-extended, then return hexagon_vmpy_uh_uh + return Intrinsic::hexagon_vmpy_uh_uh; + } else if (!IsOP1ZExt && !IsOP2ZExt) { + // If the value inside the constant vector and other operand is + // sign-extended, then return hexagon_vmpy_h_h + return Intrinsic::hexagon_vmpy_h_h; + } + } + } + if (IsOP1ZExt) + return IsOP2ZExt ? Intrinsic::hexagon_vmpy_uu + : Intrinsic::hexagon_vmpy_us; + else + return IsOP2ZExt ? Intrinsic::hexagon_vmpy_su + : Intrinsic::hexagon_vmpy_ss; + default: + llvm_unreachable("Instruction not handled!"); + } +} + +Type *HexagonGenWideningVecInstr::getElementTy(unsigned size, + IRBuilder<> &IRB) { + switch (size) { + case 8: + return IRB.getInt8Ty(); + case 16: + return IRB.getInt16Ty(); + case 32: + return IRB.getInt32Ty(); + default: + llvm_unreachable("Unhandled Element size"); + } +} + +Value *HexagonGenWideningVecInstr::createIntrinsic( + Intrinsic::ID IntId, Instruction *Inst, Value *NewOP1, Value *NewOP2, + Type *ResType, unsigned NumElts, bool Interleave = true) { + IRBuilder<> IRB(Inst); + Function *ExtF = Intrinsic::getOrInsertDeclaration(M, IntId, ResType); + Value *NewIn = IRB.CreateCall(ExtF, {NewOP1, NewOP2}); + if (Interleave) { + // Interleave elements in the output vector. 
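    // The loop below builds the mask <0, H, 1, H+1, ...> with H = NumElts/2;
    // e.g. for NumElts == 8 the mask is <0, 4, 1, 5, 2, 6, 3, 7>.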
+ SmallVector<Constant *, 16> ShuffleMask; + unsigned HalfElts = NumElts / 2; + for (unsigned i = 0; i < HalfElts; ++i) { + ShuffleMask.push_back(IRB.getInt32(i)); + ShuffleMask.push_back(IRB.getInt32(HalfElts + i)); + } + NewIn = IRB.CreateShuffleVector(NewIn, PoisonValue::get(ResType), + ConstantVector::get(ShuffleMask)); + } + return NewIn; +} + +std::pair<Value *, Value *> +HexagonGenWideningVecInstr::opSplit(Value *OP, Instruction *Inst, + Type *NewOpType) { + Type *InstTy = Inst->getType(); + unsigned NumElts = cast<FixedVectorType>(InstTy)->getNumElements(); + IRBuilder<> IRB(Inst); + if (InstTy->getPrimitiveSizeInBits() < 2 * HwVLen) { + // The only time we need to split an OP even though it is not a + // vector-pair is while generating vasr instruction for the short vector. + // Since hi/lo intrinsics can't be used here as they expect the operands to + // be of 64xi32 type, the shuffle_vector pair with the appropriate masks is + // used instead. + assert(NumElts % 2 == 0 && "Unexpected Vector Type!!"); + unsigned HalfElts = NumElts / 2; + SmallVector<Constant *, 8> HiM; + SmallVector<Constant *, 8> LoM; + for (unsigned i = 0; i < HalfElts; ++i) + LoM.push_back(IRB.getInt32(i)); + for (unsigned i = 0; i < HalfElts; ++i) + HiM.push_back(IRB.getInt32(HalfElts + i)); + + Value *Hi = IRB.CreateShuffleVector(OP, PoisonValue::get(OP->getType()), + ConstantVector::get(HiM)); + Value *Lo = IRB.CreateShuffleVector(OP, PoisonValue::get(OP->getType()), + ConstantVector::get(LoM)); + return std::pair<Value *, Value *>(Hi, Lo); + } + + Intrinsic::ID IntHi = Intrinsic::hexagon_V6_hi_128B; + Intrinsic::ID IntLo = Intrinsic::hexagon_V6_lo_128B; + Function *ExtFHi = Intrinsic::getOrInsertDeclaration(M, IntHi); + Function *ExtFLo = Intrinsic::getOrInsertDeclaration(M, IntLo); + auto *InType = FixedVectorType::get(IRB.getInt32Ty(), 64); + OP = IRB.CreateBitCast(OP, InType); + Value *Hi = IRB.CreateCall(ExtFHi, {OP}); // 32xi32 + Value *Lo = IRB.CreateCall(ExtFLo, {OP}); + Hi = IRB.CreateBitCast(Hi, NewOpType); + Lo = IRB.CreateBitCast(Lo, NewOpType); + return std::pair<Value *, Value *>(Hi, Lo); +} + +bool HexagonGenWideningVecInstr::checkConstantVector(Value *OP, + int64_t &SplatVal, + bool IsOPZExt) { + if (auto *C1 = dyn_cast<Constant>(OP)) { + if (Value *SplatV = C1->getSplatValue()) { + auto *CI = dyn_cast<ConstantInt>(SplatV); + if (IsOPZExt) { + SplatVal = CI->getZExtValue(); + } else { + SplatVal = CI->getSExtValue(); + } + return true; + } + } + return false; +} + +void HexagonGenWideningVecInstr::updateMPYConst(Intrinsic::ID IntId, + int64_t SplatVal, bool IsOPZExt, + Value *&OP, IRBuilder<> &IRB) { + if ((IntId == Intrinsic::hexagon_vmpy_uu || + IntId == Intrinsic::hexagon_vmpy_us || + IntId == Intrinsic::hexagon_vmpy_su || + IntId == Intrinsic::hexagon_vmpy_ss) && + OP->getType()->isVectorTy()) { + // Create a vector with all elements equal to SplatVal + Type *VecTy = OP->getType(); + Value *splatVector = + ConstantInt::get(VecTy, static_cast<uint32_t>(SplatVal)); + OP = IsOPZExt ? 
IRB.CreateZExt(splatVector, VecTy) + : IRB.CreateSExt(splatVector, VecTy); + } else { + packConstant(IntId, SplatVal, OP, IRB); + } +} + +void HexagonGenWideningVecInstr::packConstant(Intrinsic::ID IntId, + int64_t SplatVal, Value *&OP, + IRBuilder<> &IRB) { + uint32_t Val32 = static_cast<uint32_t>(SplatVal); + if (IntId == Intrinsic::hexagon_vmpy_ub_ub) { + assert(SplatVal >= 0 && SplatVal <= UINT8_MAX); + uint32_t packed = (Val32 << 24) | (Val32 << 16) | (Val32 << 8) | Val32; + OP = IRB.getInt32(packed); + } else if (IntId == Intrinsic::hexagon_vmpy_ub_b) { + assert(SplatVal >= INT8_MIN && SplatVal <= INT8_MAX); + uint32_t packed = (Val32 << 24) | ((Val32 << 16) & ((1 << 24) - 1)) | + ((Val32 << 8) & ((1 << 16) - 1)) | + (Val32 & ((1 << 8) - 1)); + OP = IRB.getInt32(packed); + } else if (IntId == Intrinsic::hexagon_vmpy_uh_uh) { + assert(SplatVal >= 0 && SplatVal <= UINT16_MAX); + uint32_t packed = (Val32 << 16) | Val32; + OP = IRB.getInt32(packed); + } else if (IntId == Intrinsic::hexagon_vmpy_h_h) { + assert(SplatVal >= INT16_MIN && SplatVal <= INT16_MAX); + uint32_t packed = (Val32 << 16) | (Val32 & ((1 << 16) - 1)); + OP = IRB.getInt32(packed); + } +} + +bool HexagonGenWideningVecInstr::replaceWithIntrinsic(Instruction *Inst, + OPKind OPK, + OPInfo &OP1Info, + OPInfo &OP2Info) { + Type *InstTy = Inst->getType(); + Type *EltTy = cast<FixedVectorType>(InstTy)->getElementType(); + unsigned NumElts = cast<FixedVectorType>(InstTy)->getNumElements(); + unsigned InstEltSize = EltTy->getPrimitiveSizeInBits(); + + bool IsOP1ZExt = OP1Info.IsZExt; + bool IsOP2ZExt = OP2Info.IsZExt; + + // The resulting values of 'add' and 'sub' are always sign-extended. + bool IsResZExt = (OPK == OP_Mul || OPK == OP_Shl) + ? (OP1Info.IsZExt && OP2Info.IsZExt) + : false; + + unsigned MaxEltSize = std::max(OP1Info.ExtInSize, OP2Info.ExtInSize); + unsigned NewOpEltSize = MaxEltSize; + unsigned NewResEltSize = 2 * MaxEltSize; + + // For Add and Sub, both the operands should be either zero extended + // or sign extended. In case of a mismatch, they are extended to the + // next size (ex: 8 bits -> 16 bits) so that the sign-extended vadd/vsub + // instructions can be used. Also, we don't support 8-bits signed vadd/vsub + // instructions. They are extended to 16-bits and then signed 16-bits + // non-widening vadd/vsub is used to perform the operation. + if (OPK != OP_Mul && OPK != OP_Shl && + (IsOP1ZExt != IsOP2ZExt || (!IsOP1ZExt && NewOpEltSize == 8))) + NewOpEltSize = 2 * NewOpEltSize; + + unsigned ResVLen = NewResEltSize * NumElts; + if (ResVLen < HwVLen && !WidenShortVector) + return false; + if (NewOpEltSize > 16 || ((ResVLen > HwVLen) && (ResVLen % HwVLen) != 0)) + return false; + + IRBuilder<> IRB(Inst); + Value *NewOP1 = adjustExtensionForOp(OP1Info, IRB, NewOpEltSize, NumElts); + Value *NewOP2 = adjustExtensionForOp(OP2Info, IRB, NewOpEltSize, NumElts); + + if (NewOpEltSize == NewResEltSize) { + assert(OPK != OP_Mul && OPK != OP_Shl); + // Instead of intrinsics, use vector add/sub. 
+ Value *NewIn = IRB.CreateBinOp(cast<BinaryOperator>(Inst)->getOpcode(), + NewOP1, NewOP2); + if (InstEltSize > NewResEltSize) + NewIn = IRB.CreateSExt(NewIn, InstTy); + Inst->replaceAllUsesWith(NewIn); + return true; + } + + bool IsConstScalar = false; + int64_t SplatVal = 0; + int ConstOpNum = 1; + if (OPK == OP_Mul || OPK == OP_Shl) { + IsConstScalar = checkConstantVector(NewOP1, SplatVal, IsOP1ZExt); + if (!IsConstScalar) { + IsConstScalar = checkConstantVector(NewOP2, SplatVal, IsOP2ZExt); + ConstOpNum = 2; + } + } + + if (IsConstScalar && OPK == OP_Shl) { + if (((NewOpEltSize == 8) && (SplatVal > 0) && (SplatVal < 8)) || + ((NewOpEltSize == 16) && (SplatVal > 0) && (SplatVal < 16))) { + SplatVal = 1LL << SplatVal; + OPK = OP_Mul; + } else { + return false; + } + } else if (!IsConstScalar && OPK == OP_Shl) { + return false; + } + + Intrinsic::ID IntId = getIntrinsic(OPK, IsOP1ZExt, IsOP2ZExt, NewOpEltSize, + NewResEltSize, IsConstScalar, ConstOpNum); + + if (IsConstScalar) { + updateMPYConst(IntId, SplatVal, IsOP2ZExt, NewOP2, IRB); + } + + // Split the node if it needs more than a vector pair for the result. + if (ResVLen > 2 * HwVLen) { + assert(ResVLen == 4 * HwVLen); + // Split the operands + unsigned HalfElts = NumElts / 2; + auto *NewOpType = + FixedVectorType::get(getElementTy(NewOpEltSize, IRB), HalfElts); + auto *ResType = + FixedVectorType::get(getElementTy(NewResEltSize, IRB), HalfElts); + std::pair<Value *, Value *> SplitOP1 = opSplit(NewOP1, Inst, NewOpType); + std::pair<Value *, Value *> SplitOP2; + if (IsConstScalar && (IntId == Intrinsic::hexagon_vmpy_h_h || + IntId == Intrinsic::hexagon_vmpy_uh_uh)) { + SplitOP2 = std::pair<Value *, Value *>(NewOP2, NewOP2); + } else { + SplitOP2 = opSplit(NewOP2, Inst, NewOpType); + } + Value *NewInHi = createIntrinsic(IntId, Inst, SplitOP1.first, + SplitOP2.first, ResType, HalfElts, true); + Value *NewInLo = createIntrinsic(IntId, Inst, SplitOP1.second, + SplitOP2.second, ResType, HalfElts, true); + assert(InstEltSize == NewResEltSize); + SmallVector<Constant *, 8> ShuffleMask; + for (unsigned i = 0; i < NumElts; ++i) + ShuffleMask.push_back(IRB.getInt32(i)); + // Concat Hi and Lo. + Value *NewIn = IRB.CreateShuffleVector(NewInLo, NewInHi, + ConstantVector::get(ShuffleMask)); + + Inst->replaceAllUsesWith(NewIn); + return true; + } + + auto *ResType = + FixedVectorType::get(getElementTy(NewResEltSize, IRB), NumElts); + Value *NewIn = + createIntrinsic(IntId, Inst, NewOP1, NewOP2, ResType, NumElts, true); + if (InstEltSize > NewResEltSize) + NewIn = (IsResZExt) ? IRB.CreateZExt(NewIn, InstTy) + : IRB.CreateSExt(NewIn, InstTy); + + Inst->replaceAllUsesWith(NewIn); + + return true; +} + +// Process instruction and replace them with widening vector +// intrinsics if possible. 
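// For illustration (value names are hypothetical), a typical input in 128B
// HVX mode is:
//   %ea = zext <64 x i8> %a to <64 x i16>
//   %eb = zext <64 x i8> %b to <64 x i16>
//   %m  = mul <64 x i16> %ea, %eb
// Both operands are classified as zero-extended i8, so the mul becomes a
// hexagon_vmpy_uu call on the original i8 vectors, followed by the
// interleaving shuffle that restores lane order.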
+bool HexagonGenWideningVecInstr::processInstruction(Instruction *Inst) { + Type *InstTy = Inst->getType(); + if (!InstTy->isVectorTy() || + cast<FixedVectorType>(InstTy)->getNumElements() > 128) + return false; + unsigned InstLen = InstTy->getPrimitiveSizeInBits(); + if (!HST->isTypeForHVX(cast<VectorType>(InstTy)) && InstLen != 4 * HwVLen) + return false; + if (InstLen < HwVLen && !WidenShortVector) + return false; + + using namespace PatternMatch; + + OPKind OPK; + Value *OP1 = nullptr, *OP2 = nullptr; + if (match(Inst, (m_Sub(m_Value(OP1), m_Value(OP2))))) + OPK = OP_Sub; + else if (match(Inst, (m_Add(m_Value(OP1), m_Value(OP2))))) + OPK = OP_Add; + else if (match(Inst, (m_Mul(m_Value(OP1), m_Value(OP2))))) + OPK = OP_Mul; + else if (match(Inst, (m_Shl(m_Value(OP1), m_Value(OP2))))) + OPK = OP_Shl; + else + return false; + + OPInfo OP1Info, OP2Info; + + if (!getOperandInfo(OP1, OP1Info) || !getOperandInfo(OP2, OP2Info)) + return false; + + // Proceed only if both input operands are extended. + if (!OP1Info.ExtInOP || !OP2Info.ExtInOP) + return false; + + return replaceWithIntrinsic(Inst, OPK, OP1Info, OP2Info); +} + +bool HexagonGenWideningVecInstr::getVmpaOperandInfo(Value *V, OPInfo &OPI) { + using namespace PatternMatch; + OPI.OP = V; + Value *ExtV, *OP1 = nullptr; + + if (match(V, + m_ZExt(m_Shuffle(m_InsertElt(m_Poison(), m_Value(ExtV), m_Zero()), + m_Poison(), m_ZeroMask()))) || + match(V, + m_Shuffle(m_InsertElt(m_Poison(), m_ZExt(m_Value(ExtV)), m_Zero()), + m_Poison(), m_ZeroMask()))) { + OPI.ExtInOP = ExtV; + OPI.IsZExt = true; + OPI.IsScalar = true; + OPI.ExtInSize = ExtV->getType()->getPrimitiveSizeInBits(); + return true; + } + + ConstantInt *I = nullptr; + if ((match(V, m_Shuffle(m_InsertElt(m_Poison(), m_Value(ExtV), m_Zero()), + m_Poison(), m_ZeroMask())))) { + if (match(ExtV, m_And(m_Value(OP1), m_ConstantInt(I)))) { + uint32_t IValue = I->getZExtValue(); + if (IValue <= 255) { + OPI.ExtInOP = ExtV; + OPI.IsZExt = true; + OPI.ExtInSize = 8; + OPI.IsScalar = true; + return true; + } + } + } + + // Match for non-scalar operands + return getOperandInfo(V, OPI); +} + +// Process instruction and replace with the vmpa intrinsic if possible. +bool HexagonGenWideningVecInstr::processInstructionForVMPA(Instruction *Inst) { + using namespace PatternMatch; + Type *InstTy = Inst->getType(); + // TODO: Extend it to handle short vector instructions (< HwVLen). + // vmpa instructions produce a vector register pair. + if (!InstTy->isVectorTy() || InstTy->getPrimitiveSizeInBits() != 2 * HwVLen) + return false; + + Value *OP1 = nullptr, *OP2 = nullptr; + if (!match(Inst, (m_Add(m_Value(OP1), m_Value(OP2))))) + return false; + + Value *OP[4] = {nullptr, nullptr, nullptr, nullptr}; + if (!match(OP1, m_Mul(m_Value(OP[0]), m_Value(OP[1]))) || + !match(OP2, m_Mul(m_Value(OP[2]), m_Value(OP[3])))) + return false; + + OPInfo OP_Info[4]; + for (unsigned i = 0; i < 4; i++) + if (!getVmpaOperandInfo(OP[i], OP_Info[i]) || !OP_Info[i].ExtInOP) + return false; + + return replaceWithVmpaIntrinsic(Inst, OP_Info); +} + +// Reorder operand info in OPI so that the vector operands come before their +// scalar counterparts. +void HexagonGenWideningVecInstr::reorderVmpaOperands(OPInfo *OPI) { + for (unsigned i = 0; i < 2; i++) + if (!OPI[2 * i].ExtInOP->getType()->isVectorTy()) { + OPInfo Temp; + Temp = OPI[2 * i]; + OPI[2 * i] = OPI[2 * i + 1]; + OPI[2 * i + 1] = Temp; + } +} + +// Only handles the case where one input to vmpa has to be a scalar +// and another is a vector. 
It can be easily extended to cover +// other types of vmpa instructions. +bool HexagonGenWideningVecInstr::replaceWithVmpaIntrinsic(Instruction *Inst, + OPInfo *OPI) { + reorderVmpaOperands(OPI); + + // After reordering of the operands in OPI, the odd elements must have + // IsScalar flag set to true. Also, check the even elements for non-scalars. + if (!OPI[1].IsScalar || !OPI[3].IsScalar || OPI[0].IsScalar || + OPI[2].IsScalar) + return false; + + OPInfo SOPI1 = OPI[1]; + OPInfo SOPI2 = OPI[3]; + + // The scalar operand in the vmpa instructions needs to be an int8. + if (SOPI1.ExtInSize != SOPI2.ExtInSize || SOPI1.ExtInSize != 8) + return false; + + Type *InstTy = Inst->getType(); + Type *EltTy = cast<FixedVectorType>(InstTy)->getElementType(); + unsigned NumElts = cast<FixedVectorType>(InstTy)->getNumElements(); + unsigned InstEltSize = EltTy->getPrimitiveSizeInBits(); + + unsigned MaxVEltSize = std::max(OPI[0].ExtInSize, OPI[2].ExtInSize); + unsigned NewVOpEltSize = MaxVEltSize; + unsigned NewResEltSize = 2 * MaxVEltSize; + + if (NumElts * NewVOpEltSize < HwVLen) { + // Extend the operand so that we don't end up with an invalid vector size. + NewVOpEltSize = 2 * NewVOpEltSize; + NewResEltSize = 2 * NewResEltSize; + } + + IRBuilder<> IRB(Inst); + + // Construct scalar operand + Value *NewSOP1 = SOPI1.ExtInOP; + Value *NewSOP2 = SOPI2.ExtInOP; + + Type *S1Ty = NewSOP1->getType(); + Type *S2Ty = NewSOP2->getType(); + if (S1Ty->getPrimitiveSizeInBits() < 32) + NewSOP1 = IRB.CreateZExt(NewSOP1, IRB.getInt32Ty()); + if (S2Ty->getPrimitiveSizeInBits() < 32) + NewSOP2 = IRB.CreateZExt(NewSOP2, IRB.getInt32Ty()); + + Value *SHL = IRB.CreateShl(NewSOP1, IRB.getInt32(8)); + Value *OR = IRB.CreateOr(SHL, NewSOP2); + Intrinsic::ID CombineIntID = Intrinsic::hexagon_A2_combine_ll; + Function *ExtF = Intrinsic::getOrInsertDeclaration(M, CombineIntID); + Value *ScalarOP = IRB.CreateCall(ExtF, {OR, OR}); + + // Construct vector operand + Value *NewVOP1 = adjustExtensionForOp(OPI[0], IRB, NewVOpEltSize, NumElts); + Value *NewVOP2 = adjustExtensionForOp(OPI[2], IRB, NewVOpEltSize, NumElts); + + // Combine both vector operands to form the vector-pair for vmpa + Intrinsic::ID VCombineIntID = Intrinsic::hexagon_V6_vcombine_128B; + ExtF = Intrinsic::getOrInsertDeclaration(M, VCombineIntID); + Type *InType = FixedVectorType::get(IRB.getInt32Ty(), 32); + NewVOP1 = IRB.CreateBitCast(NewVOP1, InType); + NewVOP2 = IRB.CreateBitCast(NewVOP2, InType); + Value *VecOP = IRB.CreateCall(ExtF, {NewVOP1, NewVOP2}); + + Intrinsic::ID VmpaIntID = (NewResEltSize == 16) + ? Intrinsic::hexagon_V6_vmpabus_128B + : Intrinsic::hexagon_V6_vmpauhb_128B; + ExtF = Intrinsic::getOrInsertDeclaration(M, VmpaIntID); + auto *ResType = + FixedVectorType::get(getElementTy(NewResEltSize, IRB), NumElts); + Value *NewIn = IRB.CreateCall(ExtF, {VecOP, ScalarOP}); + NewIn = IRB.CreateBitCast(NewIn, ResType); + + if (InstEltSize > NewResEltSize) + // Extend the output to match the original instruction type. + NewIn = IRB.CreateSExt(NewIn, InstTy); + + // Interleave elements in the output vector. 
+ SmallVector<Constant *, 16> ShuffleMask; + unsigned HalfElts = NumElts / 2; + for (unsigned i = 0; i < HalfElts; ++i) { + ShuffleMask.push_back(IRB.getInt32(i)); + ShuffleMask.push_back(IRB.getInt32(HalfElts + i)); + } + NewIn = IRB.CreateShuffleVector(NewIn, PoisonValue::get(ResType), + ConstantVector::get(ShuffleMask)); + + Inst->replaceAllUsesWith(NewIn); + return true; +} + +bool HexagonGenWideningVecInstr::genSaturatingInst(Instruction *Inst) { + Type *InstTy = Inst->getType(); + assert(InstTy->isVectorTy()); + if (InstTy->getPrimitiveSizeInBits() > HwVLen) + return false; + + using namespace PatternMatch; + CmpPredicate P1, P2; + Value *L1 = nullptr, *T1 = nullptr, *L2 = nullptr, *T2 = nullptr, + *L3 = nullptr; + Constant *RC1 = nullptr, *FC1 = nullptr, *RC2 = nullptr, *FC2 = nullptr, + *RC3 = nullptr; + + // Pattern of interest: ashr -> llvm.smin -> llvm.smax -> trunc + // Match trunc instruction + if (match(Inst, m_Trunc(m_Intrinsic<Intrinsic::smax>(m_Value(L1), + m_Constant(RC1))))) { + // Match llvm.smin instruction + if (match(L1, m_Intrinsic<Intrinsic::smin>(m_Value(L2), m_Constant(RC2)))) { + // Match ashr instruction + if (match(L2, m_AShr(m_Value(L3), m_Constant(RC3)))) { + std::pair<int, int> MinMax; + // Get the min and max values from the operands of smin and smax + if (getMinMax(RC1, RC2, MinMax)) { + bool IsResSigned; + // Validate the saturating vasr pattern + if (isSaturatingVAsr(Inst, L2, MinMax.first, MinMax.second, + IsResSigned)) { + // Get the shift value from the ashr operand + ConstantInt *shift_val = + dyn_cast<ConstantInt>(RC3->getSplatValue()); + if (shift_val) { + Value *NewIn = + createVAsrIntrinsic(Inst, L3, shift_val, IsResSigned); + Inst->replaceAllUsesWith(NewIn); + return true; + } + } + } + } + } + } + + if (!match(Inst, (m_Trunc(m_Select(m_ICmp(P1, m_Value(L1), m_Constant(RC1)), + m_Value(T1), m_Constant(FC1))))) || + (T1 != L1 || FC1 != RC1)) + return false; + + if (!match(L1, m_Select(m_ICmp(P2, m_Value(L2), m_Constant(RC2)), m_Value(T2), + m_Constant(FC2))) || + (T2 != L2 || FC2 != RC2)) + return false; + + if (!((P1 == CmpInst::ICMP_SGT && P2 == CmpInst::ICMP_SLT) || + (P1 == CmpInst::ICMP_SLT && P2 == CmpInst::ICMP_SGT))) + return false; + + std::pair<int, int> MinMax; + if ((P1 == CmpInst::ICMP_SGT) && (P2 == CmpInst::ICMP_SLT)) { + if (!getMinMax(RC1, RC2, MinMax)) + return false; + } else if (!getMinMax(RC2, RC1, MinMax)) + return false; + + Value *S = L2; // Value being saturated + + // Only AShr instructions are handled. + // Also, the second operand to AShr must be a scalar. 
+ Value *OP1 = nullptr, *ShiftByVal = nullptr; + if (!match(S, m_AShr(m_Value(OP1), + m_Shuffle(m_InsertElt(m_Poison(), m_Value(ShiftByVal), + m_Zero()), + m_Poison(), m_ZeroMask())))) + return false; + + bool IsResSigned; + if (!isSaturatingVAsr(Inst, S, MinMax.first, MinMax.second, IsResSigned)) + return false; + + Value *NewIn = createVAsrIntrinsic(Inst, OP1, ShiftByVal, IsResSigned); + Inst->replaceAllUsesWith(NewIn); + return true; +} + +Value *HexagonGenWideningVecInstr::extendShiftByVal(Value *ShiftByVal, + IRBuilder<> &IRB) { + using namespace PatternMatch; + Value *A = nullptr; + if (match(ShiftByVal, m_Trunc(m_Value(A)))) + return A; + return IRB.CreateZExt(ShiftByVal, IRB.getInt32Ty()); +} + +bool HexagonGenWideningVecInstr::getMinMax(Constant *MinC, Constant *MaxC, + std::pair<int, int> &MinMax) { + Value *SplatV; + if (!(SplatV = MinC->getSplatValue()) || !(dyn_cast<ConstantInt>(SplatV))) + return false; + if (!(SplatV = MaxC->getSplatValue()) || !(dyn_cast<ConstantInt>(SplatV))) + return false; + + ConstantInt *MinI = dyn_cast<ConstantInt>(MinC->getSplatValue()); + ConstantInt *MaxI = dyn_cast<ConstantInt>(MaxC->getSplatValue()); + MinMax = std::pair<int, int>(MinI->getSExtValue(), MaxI->getSExtValue()); + return true; +} + +bool HexagonGenWideningVecInstr::isSaturatingVAsr(Instruction *Inst, Value *S, + int MinV, int MaxV, + bool &IsResSigned) { + if (MinV >= MaxV) + return false; + + IsResSigned = true; + Type *InstTy = Inst->getType(); + Type *EltTy = cast<VectorType>(InstTy)->getElementType(); + unsigned TruncSize = EltTy->getPrimitiveSizeInBits(); + + int MaxRange, MinRange; + if (MinV < 0) { // Saturate to a signed value + MaxRange = (1 << (TruncSize - 1)) - 1; + MinRange = -(1 << (TruncSize - 1)); + } else if (MinV == 0) { // Saturate to an unsigned value + MaxRange = (1 << (TruncSize)) - 1; + MinRange = 0; + IsResSigned = false; + } else + return false; + + if (MinV != MinRange || MaxV != MaxRange) + return false; + + auto *SInst = dyn_cast<Instruction>(S); + if (SInst->getOpcode() == Instruction::AShr) { + Type *SInstTy = SInst->getType(); + Type *SEltTy = cast<VectorType>(SInstTy)->getElementType(); + unsigned SInstEltSize = SEltTy->getPrimitiveSizeInBits(); + if (SInstEltSize != 2 * TruncSize || TruncSize > 16) + return false; + } + return true; +} + +Intrinsic::ID HexagonGenWideningVecInstr::getVAsrIntrinsic(bool IsInSigned, + bool IsResSigned) { + if (!IsResSigned) + return (IsInSigned) ? Intrinsic::hexagon_vasrsat_su + : Intrinsic::hexagon_vasrsat_uu; + return Intrinsic::hexagon_vasrsat_ss; +} + +Value *HexagonGenWideningVecInstr::createVAsrIntrinsic(Instruction *Inst, + Value *VecOP, + Value *ShiftByVal, + bool IsResSigned) { + IRBuilder<> IRB(Inst); + Type *ShiftByTy = ShiftByVal->getType(); + if (ShiftByTy->getPrimitiveSizeInBits() < 32) + ShiftByVal = extendShiftByVal(ShiftByVal, IRB); + + Type *InstTy = Inst->getType(); + Type *EltTy = cast<FixedVectorType>(InstTy)->getElementType(); + unsigned NumElts = cast<FixedVectorType>(InstTy)->getNumElements(); + unsigned InstEltSize = EltTy->getPrimitiveSizeInBits(); + + // Replace the instruction with saturating vasr intrinsic. + // Since vasr with saturation interleaves elements from both input vectors, + // they must be deinterleaved for output to end up in the right order. 
+ SmallVector<Constant *, 16> ShuffleMask; + unsigned HalfElts = NumElts / 2; + // Even elements + for (unsigned i = 0; i < HalfElts; ++i) + ShuffleMask.push_back(IRB.getInt32(i * 2)); + // Odd elements + for (unsigned i = 0; i < HalfElts; ++i) + ShuffleMask.push_back(IRB.getInt32(i * 2 + 1)); + + VecOP = IRB.CreateShuffleVector(VecOP, PoisonValue::get(VecOP->getType()), + ConstantVector::get(ShuffleMask)); + + auto *InVecOPTy = + FixedVectorType::get(getElementTy(InstEltSize * 2, IRB), HalfElts); + std::pair<Value *, Value *> HiLo = opSplit(VecOP, Inst, InVecOPTy); + Intrinsic::ID IntID = getVAsrIntrinsic(true, IsResSigned); + Function *F = Intrinsic::getOrInsertDeclaration(M, IntID, InVecOPTy); + Value *NewIn = IRB.CreateCall(F, {HiLo.first, HiLo.second, ShiftByVal}); + return IRB.CreateBitCast(NewIn, InstTy); +} + +// Generate vavg instruction. +bool HexagonGenWideningVecInstr::genVAvg(Instruction *Inst) { + using namespace PatternMatch; + Type *InstTy = Inst->getType(); + assert(InstTy->isVectorTy()); + + bool Match = false; + Value *OP1 = nullptr, *OP2 = nullptr; + bool IsSigned; + if ((Match = (match(Inst, m_Trunc(m_LShr(m_Add(m_ZExt(m_Value(OP1)), + m_ZExt(m_Value(OP2))), + m_SpecificInt(1))))))) + IsSigned = false; + if (!Match && + (Match = (match(Inst, m_Trunc(m_LShr(m_Add(m_SExt(m_Value(OP1)), + m_SExt(m_Value(OP2))), + m_SpecificInt(1))))) || + match(Inst, m_LShr(m_Add(m_Value(OP1), m_Value(OP2)), + m_SpecificInt(1))))) + IsSigned = true; + + if (!Match) + return false; + + unsigned OP1EltSize = getElementSizeInBits(OP1); + unsigned OP2EltSize = getElementSizeInBits(OP2); + unsigned NewEltSize = std::max(OP1EltSize, OP2EltSize); + + Type *EltTy = cast<FixedVectorType>(InstTy)->getElementType(); + unsigned InstEltSize = EltTy->getPrimitiveSizeInBits(); + unsigned InstLen = InstTy->getPrimitiveSizeInBits(); + + // Only vectors that are either smaller, same or twice of the hardware + // vector length are allowed. + if (InstEltSize < NewEltSize || (InstLen > 2 * HwVLen)) + return false; + + if ((InstLen > HwVLen) && (InstLen % HwVLen != 0)) + return false; + + IRBuilder<> IRB(Inst); + unsigned NumElts = cast<FixedVectorType>(InstTy)->getNumElements(); + auto *AvgInstTy = + FixedVectorType::get(getElementTy(NewEltSize, IRB), NumElts); + if (OP1EltSize < NewEltSize) + OP1 = (IsSigned) ? IRB.CreateSExt(OP1, AvgInstTy) + : IRB.CreateZExt(OP1, AvgInstTy); + if (OP2EltSize < NewEltSize) + OP2 = (IsSigned) ? IRB.CreateSExt(OP2, AvgInstTy) + : IRB.CreateZExt(OP2, AvgInstTy); + + Intrinsic::ID AvgIntID = + (IsSigned) ? Intrinsic::hexagon_vavgs : Intrinsic::hexagon_vavgu; + Value *NewIn = nullptr; + + // Split operands if they need more than a vector length. + if (NewEltSize * NumElts > HwVLen) { + unsigned HalfElts = NumElts / 2; + auto *ResType = + FixedVectorType::get(getElementTy(NewEltSize, IRB), HalfElts); + std::pair<Value *, Value *> SplitOP1 = opSplit(OP1, Inst, ResType); + std::pair<Value *, Value *> SplitOP2 = opSplit(OP2, Inst, ResType); + Value *NewHi = createIntrinsic(AvgIntID, Inst, SplitOP1.first, + SplitOP2.first, ResType, NumElts, false); + Value *NewLo = createIntrinsic(AvgIntID, Inst, SplitOP1.second, + SplitOP2.second, ResType, NumElts, false); + SmallVector<Constant *, 8> ShuffleMask; + for (unsigned i = 0; i < NumElts; ++i) + ShuffleMask.push_back(IRB.getInt32(i)); + // Concat Hi and Lo. 
+ NewIn = + IRB.CreateShuffleVector(NewLo, NewHi, ConstantVector::get(ShuffleMask)); + } else + NewIn = + createIntrinsic(AvgIntID, Inst, OP1, OP2, AvgInstTy, NumElts, false); + + if (InstEltSize > NewEltSize) + // Extend the output to match the original instruction type. + NewIn = (IsSigned) ? IRB.CreateSExt(NewIn, InstTy) + : IRB.CreateZExt(NewIn, InstTy); + Inst->replaceAllUsesWith(NewIn); + return true; +} + +bool HexagonGenWideningVecInstr::visitBlock(BasicBlock *B) { + bool Changed = false; + for (auto &I : *B) { + Type *InstTy = I.getType(); + if (!InstTy->isVectorTy() || !HST->isTypeForHVX(cast<VectorType>(InstTy))) + continue; + + unsigned InstLen = InstTy->getPrimitiveSizeInBits(); + if (InstLen < HwVLen && !WidenShortVector) + continue; + + Changed |= processInstructionForVMPA(&I); + Changed |= genSaturatingInst(&I); + Changed |= genVAvg(&I); + } + // Generate widening instructions. + for (auto &I : *B) + Changed |= processInstruction(&I); + return Changed; +} + +bool HexagonGenWideningVecInstr::runOnFunction(Function &F) { + M = F.getParent(); + HST = TM->getSubtargetImpl(F); + + // Return if useHVX128BOps is not set. It can be enabled for 64B mode + // but will require some changes. For example, the bitcasts for intrinsics + // assume 128B mode. + if (skipFunction(F) || !HST->useHVX128BOps()) + return false; + + HwVLen = HST->getVectorLength() * 8; // Vector Length in bits + bool Changed = false; + for (auto &B : F) + Changed |= visitBlock(&B); + + return Changed; +} + +FunctionPass * +llvm::createHexagonGenWideningVecInstr(const HexagonTargetMachine &TM) { + return new HexagonGenWideningVecInstr(&TM); +} diff --git a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h index 0528cbd..683feb1 100644 --- a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h +++ b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h @@ -50,10 +50,7 @@ public: const HexagonSubtarget &ST) : Resources(ST.createDFAPacketizer(II)), TII(HII) { } - ~HexagonHazardRecognizer() override { - if (Resources) - delete Resources; - } + ~HexagonHazardRecognizer() override { delete Resources; } /// This callback is invoked when a new block of instructions is about to be /// scheduled. The hazard state is set to an initialized state. 
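For readers following the pattern matching in this new pass, the per-lane arithmetic that genSaturatingInst and genVAvg recognize reduces to the scalar computations sketched below. This is an illustrative reference model only; the helper names are invented for this note and are not part of the patch.

#include <algorithm>
#include <cstdint>

// One lane of the saturating-shift pattern matched above:
//   trunc(smax(smin(ashr(x, n), MAX), MIN))
// with MIN/MAX equal to the signed 8-bit range, i.e. a 16-bit to 8-bit
// arithmetic shift right with signed saturation.
static int8_t vasr_sat_s16_to_s8(int16_t x, unsigned n) {
  int32_t v = int32_t(x) >> n;               // arithmetic shift right
  v = std::min(v, int32_t(INT8_MAX));        // clamp to the upper bound
  v = std::max(v, int32_t(INT8_MIN));        // clamp to the lower bound
  return int8_t(v);                          // truncation is now lossless
}

// One lane of the unsigned averaging pattern matched by genVAvg:
//   trunc((zext(a) + zext(b)) >> 1)
static uint8_t vavg_u8(uint8_t a, uint8_t b) {
  return uint8_t((uint16_t(a) + uint16_t(b)) >> 1);  // sum cannot overflow
}

The pass performs the same computation on whole HVX vectors through the hexagon_vasrsat_* and hexagon_vavgs/hexagon_vavgu intrinsics, splitting the operands when the result would need more than a vector pair.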
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index 3cc146b..728ffef 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -811,8 +811,8 @@ ArrayRef<int> hi(ArrayRef<int> Vuu) { return Vuu.take_back(Vuu.size() / 2); } MaskT vshuffvdd(ArrayRef<int> Vu, ArrayRef<int> Vv, unsigned Rt) { int Len = Vu.size(); MaskT Vdd(2 * Len); - std::copy(Vv.begin(), Vv.end(), Vdd.begin()); - std::copy(Vu.begin(), Vu.end(), Vdd.begin() + Len); + llvm::copy(Vv, Vdd.begin()); + llvm::copy(Vu, Vdd.begin() + Len); auto Vd0 = MutableArrayRef<int>(Vdd).take_front(Len); auto Vd1 = MutableArrayRef<int>(Vdd).take_back(Len); @@ -831,8 +831,8 @@ MaskT vshuffvdd(ArrayRef<int> Vu, ArrayRef<int> Vv, unsigned Rt) { MaskT vdealvdd(ArrayRef<int> Vu, ArrayRef<int> Vv, unsigned Rt) { int Len = Vu.size(); MaskT Vdd(2 * Len); - std::copy(Vv.begin(), Vv.end(), Vdd.begin()); - std::copy(Vu.begin(), Vu.end(), Vdd.begin() + Len); + llvm::copy(Vv, Vdd.begin()); + llvm::copy(Vu, Vdd.begin() + Len); auto Vd0 = MutableArrayRef<int>(Vdd).take_front(Len); auto Vd1 = MutableArrayRef<int>(Vdd).take_back(Len); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 526b4de..025e5b0 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1504,8 +1504,8 @@ HexagonTargetLowering::LowerGlobalTLSAddress(SDValue Op, HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, const HexagonSubtarget &ST) - : TargetLowering(TM), HTM(static_cast<const HexagonTargetMachine&>(TM)), - Subtarget(ST) { + : TargetLowering(TM, ST), + HTM(static_cast<const HexagonTargetMachine &>(TM)), Subtarget(ST) { auto &HRI = *Subtarget.getRegisterInfo(); setPrefLoopAlignment(Align(16)); @@ -1677,6 +1677,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, } // Turn FP truncstore into trunc + store. setTruncStoreAction(MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::f32, MVT::bf16, Expand); + setTruncStoreAction(MVT::f64, MVT::bf16, Expand); // Turn FP extload into load/fpextend. for (MVT VT : MVT::fp_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); @@ -1872,9 +1874,15 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); + setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand); + setOperationAction(ISD::BF16_TO_FP, MVT::f64, Expand); + setOperationAction(ISD::FP_TO_BF16, MVT::f64, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); @@ -2107,7 +2115,7 @@ static Value *getUnderLyingObjectForBrevLdIntr(Value *V) { /// true and store the intrinsic information into the IntrinsicInfo that was /// passed to the function. 
bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &I, + const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const { switch (Intrinsic) { @@ -2519,7 +2527,7 @@ HexagonTargetLowering::getBuildVectorConstInts(ArrayRef<SDValue> Values, // Make sure to always cast to IntTy. if (auto *CN = dyn_cast<ConstantSDNode>(V.getNode())) { const ConstantInt *CI = CN->getConstantIntValue(); - Consts[i] = ConstantInt::get(IntTy, CI->getValue().getSExtValue()); + Consts[i] = ConstantInt::getSigned(IntTy, CI->getValue().getSExtValue()); } else if (auto *CN = dyn_cast<ConstantFPSDNode>(V.getNode())) { const ConstantFP *CF = CN->getConstantFPValue(); APInt A = CF->getValueAPF().bitcastToAPInt(); @@ -3948,3 +3956,51 @@ HexagonTargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { return AtomicExpansionKind::LLSC; } + +bool HexagonTargetLowering::isMaskAndCmp0FoldingBeneficial( + const Instruction &AndI) const { + // Only sink 'and' mask to cmp use block if it is masking a single bit since + // this will fold the and/cmp/br into a single tstbit instruction. + ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1)); + if (!Mask) + return false; + return Mask->getValue().isPowerOf2(); +} + +// Check if the result of the node is only used as a return value, as +// otherwise we can't perform a tail-call. +bool HexagonTargetLowering::isUsedByReturnOnly(SDNode *N, + SDValue &Chain) const { + if (N->getNumValues() != 1) + return false; + if (!N->hasNUsesOfValue(1, 0)) + return false; + + SDNode *Copy = *N->user_begin(); + + if (Copy->getOpcode() == ISD::BITCAST) { + return isUsedByReturnOnly(Copy, Chain); + } + + if (Copy->getOpcode() != ISD::CopyToReg) { + return false; + } + + // If the ISD::CopyToReg has a glue operand, we conservatively assume it + // isn't safe to perform a tail call. + if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == MVT::Glue) + return false; + + // The copy must be used by a HexagonISD::RET_GLUE, and nothing else. + bool HasRet = false; + for (SDNode *Node : Copy->users()) { + if (Node->getOpcode() != HexagonISD::RET_GLUE) + return false; + HasRet = true; + } + if (!HasRet) + return false; + + Chain = Copy->getOperand(0); + return true; +} diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index 8d04edb..2d7e3c3 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -145,7 +145,7 @@ public: const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG& DAG) const; - bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override; @@ -160,6 +160,10 @@ public: bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override; + bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; + + bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; + /// Return true if an FMA operation is faster than a pair of mul and add /// instructions. 
fmuladd intrinsics will be expanded to FMAs when this /// method returns true (and FMAs are legal), otherwise fmuladd is @@ -588,6 +592,7 @@ private: SDValue WidenHvxLoad(SDValue Op, SelectionDAG &DAG) const; SDValue WidenHvxStore(SDValue Op, SelectionDAG &DAG) const; SDValue WidenHvxSetCC(SDValue Op, SelectionDAG &DAG) const; + SDValue WidenHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const; SDValue LegalizeHvxResize(SDValue Op, SelectionDAG &DAG) const; SDValue ExpandHvxResizeIntoSteps(SDValue Op, SelectionDAG &DAG) const; SDValue EqualizeFpIntConversion(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 0573f64..4bc8e74 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -31,6 +31,10 @@ static cl::opt<unsigned> HvxWidenThreshold("hexagon-hvx-widen", cl::Hidden, cl::init(16), cl::desc("Lower threshold (in bytes) for widening to HVX vectors")); +static cl::opt<bool> + EnableFpFastConvert("hexagon-fp-fast-convert", cl::Hidden, cl::init(false), + cl::desc("Enable FP fast conversion routine.")); + static const MVT LegalV64[] = { MVT::v64i8, MVT::v32i16, MVT::v16i32 }; static const MVT LegalW64[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 }; static const MVT LegalV128[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 }; @@ -88,6 +92,10 @@ HexagonTargetLowering::initializeHVXLowering() { addRegisterClass(MVT::v64f32, &Hexagon::HvxWRRegClass); addRegisterClass(MVT::v128f16, &Hexagon::HvxWRRegClass); } + if (Subtarget.useHVXV81Ops()) { + addRegisterClass(MVT::v64bf16, &Hexagon::HvxVRRegClass); + addRegisterClass(MVT::v128bf16, &Hexagon::HvxWRRegClass); + } } // Set up operation actions. 
@@ -162,6 +170,30 @@ HexagonTargetLowering::initializeHVXLowering() { setPromoteTo(ISD::VECTOR_SHUFFLE, MVT::v64f32, ByteW); setPromoteTo(ISD::VECTOR_SHUFFLE, MVT::v32f32, ByteV); + if (Subtarget.useHVXV81Ops()) { + setPromoteTo(ISD::VECTOR_SHUFFLE, MVT::v128bf16, ByteW); + setPromoteTo(ISD::VECTOR_SHUFFLE, MVT::v64bf16, ByteV); + setPromoteTo(ISD::SETCC, MVT::v64bf16, MVT::v64f32); + setPromoteTo(ISD::FADD, MVT::v64bf16, MVT::v64f32); + setPromoteTo(ISD::FSUB, MVT::v64bf16, MVT::v64f32); + setPromoteTo(ISD::FMUL, MVT::v64bf16, MVT::v64f32); + setPromoteTo(ISD::FMINNUM, MVT::v64bf16, MVT::v64f32); + setPromoteTo(ISD::FMAXNUM, MVT::v64bf16, MVT::v64f32); + + setOperationAction(ISD::SPLAT_VECTOR, MVT::v64bf16, Legal); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64bf16, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v64bf16, Custom); + + setOperationAction(ISD::MLOAD, MVT::v64bf16, Custom); + setOperationAction(ISD::MSTORE, MVT::v64bf16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v64bf16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64bf16, Custom); + + setOperationAction(ISD::SPLAT_VECTOR, MVT::bf16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::bf16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom); + } + for (MVT P : FloatW) { setOperationAction(ISD::LOAD, P, Custom); setOperationAction(ISD::STORE, P, Custom); @@ -438,6 +470,7 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::ANY_EXTEND, VecTy, Custom); setOperationAction(ISD::SIGN_EXTEND, VecTy, Custom); setOperationAction(ISD::ZERO_EXTEND, VecTy, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, VecTy, Custom); if (Subtarget.useHVXFloatingPoint()) { setOperationAction(ISD::FP_TO_SINT, VecTy, Custom); setOperationAction(ISD::FP_TO_UINT, VecTy, Custom); @@ -462,6 +495,10 @@ HexagonTargetLowering::initializeHVXLowering() { unsigned HexagonTargetLowering::getPreferredHvxVectorAction(MVT VecTy) const { + // Early exit for invalid input types + if (!VecTy.isVector()) + return ~0u; + MVT ElemTy = VecTy.getVectorElementType(); unsigned VecLen = VecTy.getVectorNumElements(); unsigned HwLen = Subtarget.getVectorLength(); @@ -1667,14 +1704,15 @@ HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) // In case of MVT::f16 BUILD_VECTOR, since MVT::f16 is // not a legal type, just bitcast the node to use i16 // types and bitcast the result back to f16 - if (VecTy.getVectorElementType() == MVT::f16) { - SmallVector<SDValue,64> NewOps; + if (VecTy.getVectorElementType() == MVT::f16 || + VecTy.getVectorElementType() == MVT::bf16) { + SmallVector<SDValue, 64> NewOps; for (unsigned i = 0; i != Size; i++) NewOps.push_back(DAG.getBitcast(MVT::i16, Ops[i])); - SDValue T0 = DAG.getNode(ISD::BUILD_VECTOR, dl, - tyVector(VecTy, MVT::i16), NewOps); - return DAG.getBitcast(tyVector(VecTy, MVT::f16), T0); + SDValue T0 = + DAG.getNode(ISD::BUILD_VECTOR, dl, tyVector(VecTy, MVT::i16), NewOps); + return DAG.getBitcast(tyVector(VecTy, VecTy.getVectorElementType()), T0); } // First, split the BUILD_VECTOR for vector pairs. 
We could generate @@ -1698,7 +1736,7 @@ HexagonTargetLowering::LowerHvxSplatVector(SDValue Op, SelectionDAG &DAG) MVT VecTy = ty(Op); MVT ArgTy = ty(Op.getOperand(0)); - if (ArgTy == MVT::f16) { + if (ArgTy == MVT::f16 || ArgTy == MVT::bf16) { MVT SplatTy = MVT::getVectorVT(MVT::i16, VecTy.getVectorNumElements()); SDValue ToInt16 = DAG.getBitcast(MVT::i16, Op.getOperand(0)); SDValue ToInt32 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, ToInt16); @@ -1831,12 +1869,12 @@ HexagonTargetLowering::LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG) if (ElemTy == MVT::i1) return insertHvxElementPred(VecV, IdxV, ValV, dl, DAG); - if (ElemTy == MVT::f16) { + if (ElemTy == MVT::f16 || ElemTy == MVT::bf16) { SDValue T0 = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, tyVector(VecTy, MVT::i16), DAG.getBitcast(tyVector(VecTy, MVT::i16), VecV), DAG.getBitcast(MVT::i16, ValV), IdxV); - return DAG.getBitcast(tyVector(VecTy, MVT::f16), T0); + return DAG.getBitcast(tyVector(VecTy, ElemTy), T0); } return insertHvxElementReg(VecV, IdxV, ValV, dl, DAG); @@ -2334,6 +2372,25 @@ SDValue HexagonTargetLowering::LowerHvxFpExtend(SDValue Op, MVT VecTy = ty(Op); MVT ArgTy = ty(Op.getOperand(0)); const SDLoc &dl(Op); + + if (ArgTy == MVT::v64bf16) { + MVT HalfTy = typeSplit(VecTy).first; + SDValue BF16Vec = Op.getOperand(0); + SDValue Zeroes = + getInstr(Hexagon::V6_vxor, dl, HalfTy, {BF16Vec, BF16Vec}, DAG); + // Interleave zero vector with the bf16 vector, with zeroes in the lower + // half of each 32 bit lane, effectively extending the bf16 values to fp32 + // values. + SDValue ShuffVec = + getInstr(Hexagon::V6_vshufoeh, dl, VecTy, {BF16Vec, Zeroes}, DAG); + VectorPair VecPair = opSplit(ShuffVec, dl, DAG); + SDValue Result = getInstr(Hexagon::V6_vshuffvdd, dl, VecTy, + {VecPair.second, VecPair.first, + DAG.getSignedConstant(-4, dl, MVT::i32)}, + DAG); + return Result; + } + assert(VecTy == MVT::v64f32 && ArgTy == MVT::v64f16); SDValue F16Vec = Op.getOperand(0); @@ -2918,6 +2975,32 @@ HexagonTargetLowering::ExpandHvxFpToInt(SDValue Op, SelectionDAG &DAG) const { MVT ResTy = ty(Op); assert(InpTy.changeTypeToInteger() == ResTy); + // At this point this is an experiment under a flag. + // In arch before V81 the rounding mode is towards nearest value. + // The C/C++ standard requires rounding towards zero: + // C (C99 and later): ISO/IEC 9899:2018 (C18), section 6.3.1.4 — "When a + // finite value of real floating type is converted to an integer type, the + // fractional part is discarded (i.e., the value is truncated toward zero)." + // C++: ISO/IEC 14882:2020 (C++20), section 7.3.7 — "A prvalue of a + // floating-point type can be converted to a prvalue of an integer type. The + // conversion truncates; that is, the fractional part is discarded." 
+ if (InpTy == MVT::v64f16) { + if (Subtarget.useHVXV81Ops()) { + // This is c/c++ compliant + SDValue ConvVec = + getInstr(Hexagon::V6_vconv_h_hf_rnd, dl, ResTy, {Op0}, DAG); + return ConvVec; + } else if (EnableFpFastConvert) { + // Vd32.h=Vu32.hf same as Q6_Vh_equals_Vhf + SDValue ConvVec = getInstr(Hexagon::V6_vconv_h_hf, dl, ResTy, {Op0}, DAG); + return ConvVec; + } + } else if (EnableFpFastConvert && InpTy == MVT::v32f32) { + // Vd32.w=Vu32.sf same as Q6_Vw_equals_Vsf + SDValue ConvVec = getInstr(Hexagon::V6_vconv_w_sf, dl, ResTy, {Op0}, DAG); + return ConvVec; + } + // int32_t conv_f32_to_i32(uint32_t inp) { // // s | exp8 | frac23 // @@ -3351,6 +3434,104 @@ HexagonTargetLowering::WidenHvxSetCC(SDValue Op, SelectionDAG &DAG) const { {SetCC, getZero(dl, MVT::i32, DAG)}); } +SDValue HexagonTargetLowering::WidenHvxIntrinsic(SDValue Op, + SelectionDAG &DAG) const { + const SDLoc &dl(Op); + unsigned HwWidth = 8 * Subtarget.getVectorLength(); + bool IsResInterleaved = false; + + SDValue WideRes = SDValue(); + SDValue Op1 = Op.getOperand(1); + MVT ResTy = ty(Op); + MVT OpTy = ty(Op1); + if (!Subtarget.isHVXElementType(OpTy) || !Subtarget.isHVXElementType(ResTy)) + return SDValue(); + + auto getFactor = [HwWidth](MVT Ty) { + unsigned Width = Ty.getSizeInBits(); + assert(HwWidth % Width == 0); + return HwWidth / Width; + }; + + auto getWideTy = [getFactor](MVT Ty) { + unsigned WideLen = Ty.getVectorNumElements() * getFactor(Ty); + return MVT::getVectorVT(Ty.getVectorElementType(), WideLen); + }; + + unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + SDValue Op2 = Op.getOperand(2); + SDValue WideOp1 = appendUndef(Op1, getWideTy(OpTy), DAG); + SDValue WideOp2; + if (dyn_cast<const ConstantSDNode>(Op2.getNode())) { + WideOp2 = Op2; + } else { + WideOp2 = appendUndef(Op2, getWideTy(OpTy), DAG); + } + unsigned WidenFactor = getFactor(OpTy); + unsigned WideLen = ResTy.getVectorNumElements() * WidenFactor; + MVT WideResTy = MVT::getVectorVT(ResTy.getVectorElementType(), WideLen); + + switch (IID) { + default: + return SDValue(); + case Intrinsic::hexagon_vasrsat_su: + case Intrinsic::hexagon_vasrsat_uu: + case Intrinsic::hexagon_vasrsat_ss: + WideRes = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WideResTy, + DAG.getConstant(IID, dl, MVT::i32), WideOp1, WideOp2, + Op.getOperand(3)); + break; + case Intrinsic::hexagon_vadd_su: + case Intrinsic::hexagon_vadd_uu: + case Intrinsic::hexagon_vadd_ss: + case Intrinsic::hexagon_vadd_us: + + case Intrinsic::hexagon_vsub_su: + case Intrinsic::hexagon_vsub_uu: + case Intrinsic::hexagon_vsub_ss: + case Intrinsic::hexagon_vsub_us: + + case Intrinsic::hexagon_vmpy_su: + case Intrinsic::hexagon_vmpy_uu: + case Intrinsic::hexagon_vmpy_ss: + case Intrinsic::hexagon_vmpy_us: + case Intrinsic::hexagon_vmpy_ub_ub: + case Intrinsic::hexagon_vmpy_ub_b: + case Intrinsic::hexagon_vmpy_uh_uh: + case Intrinsic::hexagon_vmpy_h_h: + IsResInterleaved = true; + WideRes = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WideResTy, + DAG.getConstant(IID, dl, MVT::i32), WideOp1, WideOp2); + break; + case Intrinsic::hexagon_vavgu: + case Intrinsic::hexagon_vavgs: + WideRes = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WideResTy, + DAG.getConstant(IID, dl, MVT::i32), WideOp1, WideOp2); + break; + } + unsigned OrigLen = ResTy.getVectorNumElements(); + assert(OrigLen % 2 == 0); + unsigned HalfOrigLen = OrigLen / 2; + unsigned SplitLen = WideLen / 2; + if (IsResInterleaved) { + // Get the valid odd and even elements from the widened vector-pair while + // maintaining their 
deinterleaved order. The following shuffle_vector will + // produce a vector-pair with all the valid elements (even followed by odd) + // accumulated together followed by undefs. + SmallVector<int, 128> ShuffV; + for (unsigned j = 0; j < WidenFactor; j++) { + for (unsigned i = 0; i < HalfOrigLen; i++) + ShuffV.push_back(j * HalfOrigLen + i); + for (unsigned i = 0; i < HalfOrigLen; i++) + ShuffV.push_back(SplitLen + j * HalfOrigLen + i); + } + WideRes = DAG.getVectorShuffle(WideResTy, dl, WideRes, + DAG.getUNDEF(WideResTy), ShuffV); + } + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResTy, + {WideRes, getZero(dl, MVT::i32, DAG)}); +} + SDValue HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); @@ -3617,6 +3798,12 @@ HexagonTargetLowering::LowerHvxOperationWrapper(SDNode *N, Results.push_back(S); } break; + case ISD::INTRINSIC_WO_CHAIN: + if (shouldWidenToHvx(ty(Op.getOperand(1)), DAG)) { + if (SDValue T = WidenHvxIntrinsic(Op, DAG)) + Results.push_back(T); + } + break; case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::FP_TO_SINT: @@ -3677,6 +3864,11 @@ HexagonTargetLowering::ReplaceHvxNodeResults(SDNode *N, Results.push_back(C); } break; + case ISD::INTRINSIC_WO_CHAIN: + assert(shouldWidenToHvx(ty(N->getOperand(1)), DAG) && "Not widening?"); + if (SDValue T = WidenHvxIntrinsic(Op, DAG)) + Results.push_back(T); + break; case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: if (ty(Op).getSizeInBits() != ty(Inp0).getSizeInBits()) { diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 47726d6..7682af4 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -118,9 +118,9 @@ const int Hexagon_ADDI_OFFSET_MIN = -32768; void HexagonInstrInfo::anchor() {} HexagonInstrInfo::HexagonInstrInfo(const HexagonSubtarget &ST) - : HexagonGenInstrInfo(ST, Hexagon::ADJCALLSTACKDOWN, + : HexagonGenInstrInfo(ST, RegInfo, Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP), - Subtarget(ST) {} + RegInfo(ST.getHwMode()), Subtarget(ST) {} namespace llvm { namespace HexagonFUnits { @@ -964,7 +964,6 @@ void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL = MBB.findDebugLoc(I); @@ -1009,10 +1008,12 @@ void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } } -void HexagonInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void HexagonInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL = MBB.findDebugLoc(I); MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -4753,6 +4754,19 @@ bool HexagonInstrInfo::getBundleNoShuf(const MachineInstr &MIB) const { return (Operand.isImm() && (Operand.getImm() & memShufDisabledMask) != 0); } +bool HexagonInstrInfo::isQFPMul(const MachineInstr *MI) const { + return (MI->getOpcode() == Hexagon::V6_vmpy_qf16_hf || + MI->getOpcode() == Hexagon::V6_vmpy_qf16_mix_hf || + MI->getOpcode() == Hexagon::V6_vmpy_qf32_hf || + MI->getOpcode() 
== Hexagon::V6_vmpy_qf32_mix_hf || + MI->getOpcode() == Hexagon::V6_vmpy_qf32_sf || + MI->getOpcode() == Hexagon::V6_vmpy_qf16_mix_hf || + MI->getOpcode() == Hexagon::V6_vmpy_qf16 || + MI->getOpcode() == Hexagon::V6_vmpy_qf32_mix_hf || + MI->getOpcode() == Hexagon::V6_vmpy_qf32_qf16 || + MI->getOpcode() == Hexagon::V6_vmpy_qf32); +} + // Addressing mode relations. short HexagonInstrInfo::changeAddrMode_abs_io(short Opc) const { return Opc >= 0 ? Hexagon::changeAddrMode_abs_io(Opc) : Opc; diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index c17e527..796b978 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -23,6 +23,8 @@ #include <cstdint> #include <vector> +#include "HexagonRegisterInfo.h" + #define GET_INSTRINFO_HEADER #include "HexagonGenInstrInfo.inc" @@ -36,6 +38,7 @@ class MachineOperand; class TargetRegisterInfo; class HexagonInstrInfo : public HexagonGenInstrInfo { + const HexagonRegisterInfo RegInfo; const HexagonSubtarget &Subtarget; enum BundleAttribute { @@ -47,6 +50,8 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { public: explicit HexagonInstrInfo(const HexagonSubtarget &ST); + const HexagonRegisterInfo &getRegisterInfo() const { return RegInfo; } + /// TargetInstrInfo overrides. /// If the specified machine instruction is a direct @@ -183,8 +188,7 @@ public: /// is true, the register operand is the last use and must be marked kill. void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; /// Load the specified register of the given register class from the specified @@ -193,7 +197,7 @@ public: void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; /// This function is called for all pseudo instructions @@ -532,6 +536,7 @@ public: } MCInst getNop() const override; + bool isQFPMul(const MachineInstr *MF) const; }; /// \brief Create RegSubRegPair from a register MachineOperand diff --git a/llvm/lib/Target/Hexagon/HexagonIntrinsics.td b/llvm/lib/Target/Hexagon/HexagonIntrinsics.td index 25b81d8..7f16c3e 100644 --- a/llvm/lib/Target/Hexagon/HexagonIntrinsics.td +++ b/llvm/lib/Target/Hexagon/HexagonIntrinsics.td @@ -355,6 +355,120 @@ defm : T_VVI_inv_pat <V6_valignbi, int_hexagon_V6_vlalignbi>; defm : T_VVI_inv_pat <V6_valignbi, int_hexagon_V6_vlalignb>; defm : T_VVR_pat <V6_vlalignb, int_hexagon_V6_vlalignbi>; + +class VAccGenIntrin_pat<InstHexagon MI, Intrinsic IntID, + ValueType ResType, PatFrag VPred, PatFrag WPred> + : Pat<(add WPred:$Vx, (ResType (IntID VPred:$Vs, VPred:$Vt))), + (MI WPred:$Vx, VPred:$Vs, VPred:$Vt)>, Requires<[UseHVX128B]>; + +let AddedComplexity = 100 in { + def : VAccGenIntrin_pat<V6_vmpybv_acc, + int_hexagon_vmpy_ss, VecPI16, HVI8, HWI16>; + def : VAccGenIntrin_pat<V6_vmpyubv_acc, + int_hexagon_vmpy_uu, VecPI16, HVI8, HWI16>; + def : VAccGenIntrin_pat<V6_vmpyhv_acc, + int_hexagon_vmpy_ss, VecPI32, HVI16, HWI32>; + def : VAccGenIntrin_pat<V6_vmpyuhv_acc, + int_hexagon_vmpy_uu, VecPI32, HVI16, HWI32>; + + // The second 
operand in V6_vmpybusv_acc is unsigned. + def : Pat<(add HWI16:$Vx, (VecPI16 (int_hexagon_vmpy_us HVI8:$Vs, + HVI8:$Vv))), + (V6_vmpybusv_acc HWI16:$Vx, HVI8:$Vs, HVI8:$Vv)>; + + def : Pat<(add HWI16:$Vx, (VecPI16 (int_hexagon_vmpy_su HVI8:$Vs, + HVI8:$Vv))), + (V6_vmpybusv_acc HWI16:$Vx, HVI8:$Vv, HVI8:$Vs)>; + + // The third operand in V6_vmpyhus_acc is unsigned. + def : Pat<(add HWI32:$Vx, (VecPI32 (int_hexagon_vmpy_us HVI16:$Vs, + HVI16:$Vv))), + (V6_vmpyhus_acc HWI32:$Vx, HVI16:$Vv, HVI16:$Vs)>; + + def : Pat<(add HWI32:$Vx, (VecPI32 (int_hexagon_vmpy_su HVI16:$Vs, + HVI16:$Vv))), + (V6_vmpyhus_acc HWI32:$Vx, HVI16:$Vs, HVI16:$Vv)>; +} + +class ExtIntrin_pat<InstHexagon MI, Intrinsic IntID, + ValueType ResType, PatFrag VPred> + : Pat<(ResType (IntID VPred:$Vs, VPred:$Vt)), + (MI VPred:$Vs, VPred:$Vt)>, Requires<[UseHVX128B]>; + +def : ExtIntrin_pat<V6_vaddubh, int_hexagon_vadd_uu, VecPI16, HVI8>; +def : ExtIntrin_pat<V6_vadduhw, int_hexagon_vadd_uu, VecPI32, HVI16>; +def : ExtIntrin_pat<V6_vaddhw, int_hexagon_vadd_ss, VecPI32, HVI16>; + +def : ExtIntrin_pat<V6_vsububh, int_hexagon_vsub_uu, VecPI16, HVI8>; +def : ExtIntrin_pat<V6_vsubuhw, int_hexagon_vsub_uu, VecPI32, HVI16>; +def : ExtIntrin_pat<V6_vsubhw, int_hexagon_vsub_ss, VecPI32, HVI16>; + +def : ExtIntrin_pat<V6_vmpybv, int_hexagon_vmpy_ss, VecPI16, HVI8>; +def : ExtIntrin_pat<V6_vmpyhv, int_hexagon_vmpy_ss, VecPI32, HVI16>; +def : ExtIntrin_pat<V6_vmpyubv, int_hexagon_vmpy_uu, VecPI16, HVI8>; +def : ExtIntrin_pat<V6_vmpyuhv, int_hexagon_vmpy_uu, VecPI32, HVI16>; + +// The first operand in V6_vmpybusv is unsigned. +def : Pat<(VecPI16 (int_hexagon_vmpy_us HVI8:$Vs, HVI8:$Vv)), + (V6_vmpybusv HVI8:$Vs, HVI8:$Vv)>; + +def : Pat<(VecPI16 (int_hexagon_vmpy_su HVI8:$Vs, HVI8:$Vv)), + (V6_vmpybusv HVI8:$Vv, HVI8:$Vs)>; + +// The second operand in V6_vmpyhus is unsigned. 
+def : Pat<(VecPI32 (int_hexagon_vmpy_us HVI16:$Vs, HVI16:$Vv)), + (V6_vmpyhus HVI16:$Vv, HVI16:$Vs)>; + +def : Pat<(VecPI32 (int_hexagon_vmpy_su HVI16:$Vs, HVI16:$Vv)), + (V6_vmpyhus HVI16:$Vs, HVI16:$Vv)>; + +class VAvgInstr_pat<InstHexagon MI, Intrinsic IntID, + ValueType ResType, PatFrag VPred> + : Pat<(ResType (IntID VPred:$Vs, VPred:$Vt)), + (MI VPred:$Vs, VPred:$Vt)>, Requires<[UseHVX128B]>; + +def : VAvgInstr_pat<V6_vavgub, int_hexagon_vavgu, VecI8, HVI8>; +def : VAvgInstr_pat<V6_vavgb, int_hexagon_vavgs, VecI8, HVI8>; +def : VAvgInstr_pat<V6_vavguh, int_hexagon_vavgu, VecI16, HVI16>; +def : VAvgInstr_pat<V6_vavgh, int_hexagon_vavgs, VecI16, HVI16>; +def : VAvgInstr_pat<V6_vavguw, int_hexagon_vavgu, VecI32, HVI32>; +def : VAvgInstr_pat<V6_vavgw, int_hexagon_vavgs, VecI32, HVI32>; + +class VAsrIntr_pat<InstHexagon MI, Intrinsic IntID, + ValueType ResType, PatFrag VPred> +: Pat<(ResType (IntID VPred:$Vs, VPred:$Vt, IntRegsLow8:$Rt)), + (MI VPred:$Vs, VPred:$Vt, IntRegsLow8:$Rt)>, Requires<[UseHVX128B]>; + +def : VAsrIntr_pat<V6_vasruhubsat, int_hexagon_vasrsat_uu, VecI8, HVI16>; +def : VAsrIntr_pat<V6_vasrhubsat, int_hexagon_vasrsat_su, VecI8, HVI16>; +def : VAsrIntr_pat<V6_vasrhbsat, int_hexagon_vasrsat_ss, VecI8, HVI16>; +def : VAsrIntr_pat<V6_vasruwuhsat, int_hexagon_vasrsat_uu, VecI16, HVI32>; +def : VAsrIntr_pat<V6_vasrwuhsat, int_hexagon_vasrsat_su, VecI16, HVI32>; +def : VAsrIntr_pat<V6_vasrwhsat, int_hexagon_vasrsat_ss, VecI16, HVI32>; + +class VMpyVSInstr_pat<InstHexagon MI, Intrinsic IntID, + ValueType ResType, PatFrag VPred> +: Pat<(ResType (IntID VPred:$Vs, IntRegs:$Rt)), + (MI VPred:$Vs, IntRegs:$Rt)>, Requires<[UseHVX128B]>; + +def : VMpyVSInstr_pat<V6_vmpyub, int_hexagon_vmpy_ub_ub, VecPI16, HVI8>; +def : VMpyVSInstr_pat<V6_vmpybus, int_hexagon_vmpy_ub_b, VecPI16, HVI8>; +def : VMpyVSInstr_pat<V6_vmpyuh, int_hexagon_vmpy_uh_uh, VecPI32, HVI16>; +def : VMpyVSInstr_pat<V6_vmpyh, int_hexagon_vmpy_h_h, VecPI32, HVI16>; + +class VAccIntrin_pat<InstHexagon MI, Intrinsic IntID> + : Pat<(add HvxWR:$Vx, (IntID HvxVR:$Vs, HvxVR:$Vt)), + (MI HvxWR:$Vx, HvxVR:$Vs, HvxVR:$Vt)>, Requires<[UseHVX128B]>; + +let AddedComplexity = 350 in { + def : VAccIntrin_pat<V6_vmpybv_acc, int_hexagon_V6_vmpybv_128B>; + def : VAccIntrin_pat<V6_vmpyubv_acc, int_hexagon_V6_vmpyubv_128B>; + def : VAccIntrin_pat<V6_vmpybusv_acc, int_hexagon_V6_vmpybusv_128B>; + def : VAccIntrin_pat<V6_vmpyhv_acc, int_hexagon_V6_vmpyhv_128B>; + def : VAccIntrin_pat<V6_vmpyuhv_acc, int_hexagon_V6_vmpyuhv_128B>; + def : VAccIntrin_pat<V6_vmpyhus_acc, int_hexagon_V6_vmpyhus_128B>; +} + def: Pat<(int_hexagon_V6_vd0), (V6_vd0)>, Requires<[UseHVXV60, UseHVX64B]>; def: Pat<(int_hexagon_V6_vd0_128B ), diff --git a/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp b/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp index 7cbd81f..54969b2 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp @@ -646,7 +646,7 @@ bool HexagonLoadStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG, MachineInstr *CombI; if (Acc != 0) { const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi); - const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI); + const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0); Register VReg = MF->getRegInfo().createVirtualRegister(RC); MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg).addImm(LowerAcc); NG.push_back(TfrI); @@ -677,7 +677,7 @@ bool HexagonLoadStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG, } else 
{ // Create vreg = A2_tfrsi #Acc; mem[hw] = vreg const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi); - const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI); + const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0); Register VReg = MF->getRegInfo().createVirtualRegister(RC); MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg).addImm(int(Acc)); NG.push_back(TfrI); diff --git a/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp b/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp index 5a1d5bc..c68b632 100644 --- a/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -138,7 +138,7 @@ static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII, return false; // Make sure that the (unique) def operand is a register from IntRegs. - bool HadDef = false; + [[maybe_unused]] bool HadDef = false; for (const MachineOperand &Op : II->operands()) { if (!Op.isReg() || !Op.isDef()) continue; diff --git a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp index 6dd83c1..53afbc4 100644 --- a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -107,7 +107,7 @@ private: bool canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN, MachineInstr &MI, const NodeList &UNodeList); bool isSafeToExtLR(NodeAddr<StmtNode *> SN, MachineInstr *MI, - unsigned LRExtReg, const NodeList &UNodeList); + Register LRExtReg, const NodeList &UNodeList); void getAllRealUses(NodeAddr<StmtNode *> SN, NodeList &UNodeList); bool allValidCandidates(NodeAddr<StmtNode *> SA, NodeList &UNodeList); short getBaseWithLongOffset(const MachineInstr &MI) const; @@ -177,7 +177,7 @@ bool HexagonOptAddrMode::canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN, NodeId OffsetRegRD = 0; for (NodeAddr<UseNode *> UA : AddAslSN.Addr->members_if(DFG->IsUse, *DFG)) { RegisterRef RR = UA.Addr->getRegRef(*DFG); - if (OffsetReg == RR.Reg) { + if (OffsetReg == RR.asMCReg()) { OffsetRR = RR; OffsetRegRD = UA.Addr->getReachingDef(); } @@ -198,7 +198,7 @@ bool HexagonOptAddrMode::canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN, // Reaching Def to an offset register can't be a phi. if ((OffsetRegDN.Addr->getFlags() & NodeAttrs::PhiRef) && MI.getParent() != UseMI.getParent()) - return false; + return false; const MCInstrDesc &UseMID = UseMI.getDesc(); if ((!UseMID.mayLoad() && !UseMID.mayStore()) || @@ -300,7 +300,7 @@ void HexagonOptAddrMode::getAllRealUses(NodeAddr<StmtNode *> SA, } bool HexagonOptAddrMode::isSafeToExtLR(NodeAddr<StmtNode *> SN, - MachineInstr *MI, unsigned LRExtReg, + MachineInstr *MI, Register LRExtReg, const NodeList &UNodeList) { RegisterRef LRExtRR; NodeId LRExtRegRD = 0; @@ -308,7 +308,7 @@ bool HexagonOptAddrMode::isSafeToExtLR(NodeAddr<StmtNode *> SN, // for the LRExtReg. for (NodeAddr<UseNode *> UA : SN.Addr->members_if(DFG->IsUse, *DFG)) { RegisterRef RR = UA.Addr->getRegRef(*DFG); - if (LRExtReg == RR.Reg) { + if (LRExtReg == RR.asMCReg()) { LRExtRR = RR; LRExtRegRD = UA.Addr->getReachingDef(); } @@ -552,7 +552,7 @@ bool HexagonOptAddrMode::processAddBases(NodeAddr<StmtNode *> AddSN, // Find the UseNode that contains the base register and it's reachingDef for (NodeAddr<UseNode *> UA : AddSN.Addr->members_if(DFG->IsUse, *DFG)) { RegisterRef URR = UA.Addr->getRegRef(*DFG); - if (BaseReg != URR.Reg) + if (BaseReg != URR.asMCReg()) continue; UAReachingDefID = UA.Addr->getReachingDef(); @@ -740,7 +740,7 @@ bool HexagonOptAddrMode::processAddUses(NodeAddr<StmtNode *> AddSN, // for the LRExtReg. 
for (NodeAddr<UseNode *> UA : AddSN.Addr->members_if(DFG->IsUse, *DFG)) { RegisterRef RR = UA.Addr->getRegRef(*DFG); - if (BaseReg == RR.Reg) + if (BaseReg == RR.asMCReg()) LRExtRegRD = UA.Addr->getReachingDef(); } diff --git a/llvm/lib/Target/Hexagon/HexagonOptShuffleVector.cpp b/llvm/lib/Target/Hexagon/HexagonOptShuffleVector.cpp new file mode 100644 index 0000000..fcfae17 --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonOptShuffleVector.cpp @@ -0,0 +1,713 @@ +//===---------------------- HexagonOptShuffleVector.cpp -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Optimize vector shuffles by postponing them as late as possible. The intent +// here is to remove unnecessary shuffles and also increase the opportunities +// for adjacent shuffles to be merged together. +// +//===----------------------------------------------------------------------===// + +#include "HexagonTargetMachine.h" +#include "llvm/ADT/APInt.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsHexagon.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "hex-shuff-vec" +/// A command line argument to limit the search space along the def chain. +static cl::opt<int> MaxDefSearchCount( + "shuffvec-max-search-count", + cl::desc("Maximum number of instructions traversed along def chain."), + cl::Hidden, cl::init(15)); + +#ifndef NDEBUG +static cl::opt<int> + ShuffVecLimit("shuff-vec-max", + cl::desc("Maximum number of shuffles to be relocated."), + cl::Hidden, cl::init(-1)); +#endif + +namespace llvm { +void initializeHexagonOptShuffleVectorPass(PassRegistry &); +FunctionPass *createHexagonOptShuffleVector(const HexagonTargetMachine &); +} // end namespace llvm + +namespace { + +class HexagonOptShuffleVector : public FunctionPass { +public: + static char ID; +#ifndef NDEBUG + static int NumRelocated; +#endif + HexagonOptShuffleVector() : FunctionPass(ID) { + initializeHexagonOptShuffleVectorPass(*PassRegistry::getPassRegistry()); + } + + HexagonOptShuffleVector(const HexagonTargetMachine *TM) + : FunctionPass(ID), TM(TM) { + initializeHexagonOptShuffleVectorPass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Hexagon Optimize Vector Shuffles"; + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + FunctionPass::getAnalysisUsage(AU); + } + +private: + using ValueVector = SmallVector<Value *, 8>; + const HexagonTargetMachine *TM = nullptr; + const HexagonSubtarget *HST = nullptr; + SmallPtrSet<Instruction *, 8> Visited; + using ShuffUseList = + SmallDenseMap<Instruction *, SmallVector<Instruction *, 2>>; + ShuffUseList ShuffUses; + int DefSearchCount; + + bool visitBlock(BasicBlock *B); + bool findNewShuffLoc(Instruction *I, ArrayRef<int> &ShuffMask, + Value *&NewLoc); + bool isValidIntrinsic(IntrinsicInst *I); + bool relocateShuffVec(Instruction *I, ArrayRef<int> &M, Value *NewLoc, + std::list<Instruction 
*> &WorkList); + bool getUseList(Instruction *I, ValueVector &UseList); + bool analyzeHiLoUse(Instruction *HI, Instruction *LO, + ArrayRef<int> &ShuffMask, Value *&NewLoc, + ShuffUseList &CurShuffUses); + bool isHILo(Value *V, bool IsHI); + bool hasDefWithSameShuffMask(Value *V, SmallVector<Instruction *, 2> &ImmUse, + ArrayRef<int> &ShuffMask, + ShuffUseList &CurShuffUses); + void FindHiLoUse(ValueVector &UseList, Instruction *&HI, Instruction *&LO); + bool isConcatMask(ArrayRef<int> &Mask, Instruction *ShuffInst); + bool isValidUseInstr(ValueVector &UseList, Instruction *&UI); + bool areAllOperandsValid(Instruction *I, Instruction *UI, + ArrayRef<int> &ShuffMask, + ShuffUseList &CurShuffUses); + Value *getOperand(Instruction *I, unsigned i); + static iterator_range<User::op_iterator> getArgOperands(User *U); + static std::pair<Value *, Value *> stripCasts(Value *V); + static bool isConstantVectorSplat(Value *V); +}; + +} // end anonymous namespace + +#ifndef NDEBUG +int HexagonOptShuffleVector::NumRelocated = 0; +#endif +char HexagonOptShuffleVector::ID = 0; + +INITIALIZE_PASS_BEGIN(HexagonOptShuffleVector, "shuff-vec", + "Hexagon Optimize Shuffle Vector", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(HexagonOptShuffleVector, "shuff-vec", + "Hexagon Optimize Shuffle Vector", false, false) + +bool HexagonOptShuffleVector::isConcatMask(ArrayRef<int> &Mask, + Instruction *ShuffInst) { + Type *ShuffTy = ShuffInst->getType(); + int NumElts = cast<FixedVectorType>(ShuffTy)->getNumElements(); + for (int i = 0; i < NumElts; i++) { + if (Mask[i] != i) + return false; + } + return true; +} + +bool HexagonOptShuffleVector::isValidIntrinsic(IntrinsicInst *I) { + switch (I->getIntrinsicID()) { + default: + return false; + case Intrinsic::hexagon_V6_vaddubh_128B: + case Intrinsic::hexagon_V6_vadduhw_128B: + case Intrinsic::hexagon_V6_vaddhw_128B: + case Intrinsic::hexagon_V6_vaddh_dv_128B: + case Intrinsic::hexagon_V6_vsububh_128B: + case Intrinsic::hexagon_V6_vsubuhw_128B: + case Intrinsic::hexagon_V6_vsubhw_128B: + case Intrinsic::hexagon_V6_vsubh_dv_128B: + case Intrinsic::hexagon_V6_vmpyubv_128B: + case Intrinsic::hexagon_V6_vmpybv_128B: + case Intrinsic::hexagon_V6_vmpyuhv_128B: + case Intrinsic::hexagon_V6_vmpyhv_128B: + case Intrinsic::hexagon_V6_vmpybusv_128B: + case Intrinsic::hexagon_V6_vmpyhus_128B: + case Intrinsic::hexagon_V6_vavgb_128B: + case Intrinsic::hexagon_V6_vavgub_128B: + case Intrinsic::hexagon_V6_vavgh_128B: + case Intrinsic::hexagon_V6_vavguh_128B: + case Intrinsic::hexagon_V6_vavgw_128B: + case Intrinsic::hexagon_V6_vavguw_128B: + case Intrinsic::hexagon_V6_hi_128B: + case Intrinsic::hexagon_V6_lo_128B: + case Intrinsic::sadd_sat: + case Intrinsic::uadd_sat: + // Generic hexagon vector intrinsics + case Intrinsic::hexagon_vadd_su: + case Intrinsic::hexagon_vadd_uu: + case Intrinsic::hexagon_vadd_ss: + case Intrinsic::hexagon_vadd_us: + case Intrinsic::hexagon_vsub_su: + case Intrinsic::hexagon_vsub_uu: + case Intrinsic::hexagon_vsub_ss: + case Intrinsic::hexagon_vsub_us: + case Intrinsic::hexagon_vmpy_su: + case Intrinsic::hexagon_vmpy_uu: + case Intrinsic::hexagon_vmpy_ss: + case Intrinsic::hexagon_vmpy_us: + case Intrinsic::hexagon_vavgu: + case Intrinsic::hexagon_vavgs: + case Intrinsic::hexagon_vmpy_ub_b: + case Intrinsic::hexagon_vmpy_ub_ub: + case Intrinsic::hexagon_vmpy_uh_uh: + case Intrinsic::hexagon_vmpy_h_h: + return true; + } + llvm_unreachable("Unsupported instruction!"); +} + +bool 
HexagonOptShuffleVector::getUseList(Instruction *I, ValueVector &UseList) { + for (auto UI = I->user_begin(), UE = I->user_end(); UI != UE;) { + Instruction *J = dyn_cast<Instruction>(*UI); + if (!J) + return false; + if (auto *C = dyn_cast<CastInst>(*UI)) { + if (!getUseList(C, UseList)) + return false; + } else + UseList.push_back(*UI); + ++UI; + } + return true; +} + +bool HexagonOptShuffleVector::isHILo(Value *V, bool IsHI) { + if (!(dyn_cast<Instruction>(V))) + return false; + Instruction *I = dyn_cast<Instruction>(V); + if (!isa<CallInst>(I)) + return false; + IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); + if (!II) + return false; + if ((II->getIntrinsicID() == Intrinsic::hexagon_V6_hi_128B && IsHI) || + (II->getIntrinsicID() == Intrinsic::hexagon_V6_lo_128B && !IsHI)) + return true; + return false; +} + +Value *HexagonOptShuffleVector::getOperand(Instruction *I, unsigned i) { + Value *V = I->getOperand(i); + if (auto *C = dyn_cast<CastInst>(V)) + return C->getOperand(0); + return V; +} + +iterator_range<User::op_iterator> +HexagonOptShuffleVector::getArgOperands(User *U) { + if (auto *CB = dyn_cast<CallBase>(U)) + return CB->args(); + return U->operands(); +} + +// Strip out all the cast operations to find the first non-cast definition of a +// value. The function also returns the last cast operation in the def-chain. +std::pair<Value *, Value *> HexagonOptShuffleVector::stripCasts(Value *V) { + Value *LastCast = nullptr; + while (auto *C = dyn_cast<CastInst>(V)) { + LastCast = V; + V = C->getOperand(0); + } + return std::make_pair(V, LastCast); +} + +bool HexagonOptShuffleVector::isConstantVectorSplat(Value *V) { + if (auto *CV = dyn_cast<ConstantVector>(V)) + return CV->getSplatValue(); + if (auto *CV = dyn_cast<ConstantDataVector>(V)) + return CV->isSplat(); + return false; +} + +// Make sure all the operations on HI and LO counterparts are identical +// until both halves are merged together. When a merge point (concat) +// is found, set it as 'NewLoc' and return. +bool HexagonOptShuffleVector::analyzeHiLoUse(Instruction *HI, Instruction *LO, + ArrayRef<int> &ShuffMask, + Value *&NewLoc, + ShuffUseList &CurShuffUses) { + ValueVector HiUseList, LoUseList; + getUseList(HI, HiUseList); + getUseList(LO, LoUseList); + + // To keep the analysis simple, only handle Hi and Lo with a single use. + // Also, it is not even clear at this point whether it would be profitable + // due to multiple merge points. + if (HiUseList.size() != 1 || LoUseList.size() != 1) + return false; + + Instruction *HiUse = dyn_cast<Instruction>(HiUseList[0]); + Instruction *LoUse = dyn_cast<Instruction>(LoUseList[0]); + if (!HiUse || !LoUse) + return false; + + bool IsUseIntrinsic = false; + if (isa<CallInst>(HiUse)) { + if (!isa<CallInst>(LoUse)) + return false; + // Continue only if both Hi and Lo uses are calls to the same intrinsic. + IntrinsicInst *HiUseII = dyn_cast<IntrinsicInst>(HiUse); + IntrinsicInst *LoUseII = dyn_cast<IntrinsicInst>(LoUse); + if (!HiUseII || !LoUseII || + HiUseII->getIntrinsicID() != LoUseII->getIntrinsicID() || + !isValidIntrinsic(HiUseII)) + return false; + IsUseIntrinsic = true; + HiUse = HiUseII; + LoUse = LoUseII; + } + if (HiUse->getOpcode() != LoUse->getOpcode()) + return false; + + // If both the Hi and Lo uses are the same and it is a concat operation, + // set it as 'NewLoc'. + if (HiUse == LoUse) { + // Return true if use is a concat of Hi and Lo. 
+ ArrayRef<int> M; + if (match(HiUse, (m_Shuffle(m_Value(), m_Value(), m_Mask(M))))) { + if (isConcatMask(M, HiUse)) { + NewLoc = HiUse; + return true; + } + } + return false; + } + + // Check if HiUse and LoUse are shuffles with the same mask. If so, it is safe to + // continue the search. + ArrayRef<int> M1, M2; + if (match(HiUse, (m_Shuffle(m_Value(), m_Poison(), m_Mask(M1)))) && + match(LoUse, (m_Shuffle(m_Value(), m_Poison(), m_Mask(M2)))) && + M1.equals(M2)) + return analyzeHiLoUse(HiUse, LoUse, ShuffMask, NewLoc, CurShuffUses); + + // For now, only handling binary ops and some of the intrinsics + // which appear to be safe (hardcoded in isValidIntrinsic()). + if (!HiUse->isBinaryOp() && !IsUseIntrinsic) + return false; + + ValueVector HiUseOperands, LoUseOperands; + int HiOpNum = -1, LoOpNum = -1; + for (unsigned i = 0; i < HiUse->getNumOperands(); i++) { + Value *V = getOperand(HiUse, i); + if (V == HI) + HiOpNum = i; + else + HiUseOperands.push_back(V); + } + for (unsigned i = 0; i < LoUse->getNumOperands(); i++) { + Value *V = getOperand(LoUse, i); + if (V == LO) + LoOpNum = i; + else + LoUseOperands.push_back(V); + } + + // Enforcing strict ordering which is not necessary in case of + // commutative operations and may be relaxed in the future if needed. + if (HiOpNum < 0 || HiOpNum != LoOpNum || + LoUseOperands.size() != HiUseOperands.size()) + return false; + + unsigned NumOperands = HiUseOperands.size(); + for (unsigned i = 0; i < NumOperands; i++) { + if (HiUseOperands[i] == LoUseOperands[i]) + continue; + // Only handle the case where other operands to Hi and Lo uses + // are coming from another Hi and Lo pair. + if (!isHILo(HiUseOperands[i], true) || !isHILo(LoUseOperands[i], false)) + return false; + + Value *DefHiUse = dyn_cast<Instruction>(HiUseOperands[i])->getOperand(0); + Value *DefLoUse = dyn_cast<Instruction>(LoUseOperands[i])->getOperand(0); + if (!DefHiUse || DefHiUse != DefLoUse) + return false; + SmallVector<Instruction *, 2> ImmUseList; + if (dyn_cast<CastInst>(DefHiUse)) + ImmUseList.push_back(dyn_cast<Instruction>(DefHiUse)); + else { + ImmUseList.push_back(HiUse); + ImmUseList.push_back(LoUse); + } + + // Make sure that the Hi/Lo def has the same shuffle mask. + if (!hasDefWithSameShuffMask(DefHiUse, ImmUseList, ShuffMask, CurShuffUses)) + return false; + } + + // Continue the search along the Hi/Lo use-chain. + return analyzeHiLoUse(HiUse, LoUse, ShuffMask, NewLoc, CurShuffUses); +} + +bool HexagonOptShuffleVector::hasDefWithSameShuffMask( + Value *V, SmallVector<Instruction *, 2> &ImmUses, ArrayRef<int> &ShuffMask, + ShuffUseList &CurShuffUses) { + // Follow the def-chain until we have found a shuffle_vector or have run out + // of the maximum number of attempts.
+ if (DefSearchCount >= MaxDefSearchCount) + return false; + + ++DefSearchCount; + V = stripCasts(V).first; + Instruction *I = dyn_cast<Instruction>(V); + if (!I) + return false; + bool Found = true; + ArrayRef<int> M; + if (match(V, (m_Shuffle(m_Value(), m_Value(), m_Mask(M)))) && + M.equals(ShuffMask)) { + CurShuffUses[I] = ImmUses; + return true; + } + if ((match(V, m_Shuffle(m_InsertElt(m_Poison(), m_Value(), m_Zero()), + m_Poison(), m_ZeroMask())))) + return true; // scalar converted to a vector + + auto *II = dyn_cast<IntrinsicInst>(I); + if (!I->isBinaryOp() && (!II || !isValidIntrinsic(II))) + return false; + + for (Value *OpV : getArgOperands(I)) { + std::pair<Value *, Value *> P = stripCasts(OpV); + OpV = P.first; + + SmallVector<Instruction *, 2> ImmUseList; + if (P.second) + ImmUseList.push_back(dyn_cast<Instruction>(P.second)); + else + ImmUseList.push_back(dyn_cast<Instruction>(I)); + + if (isa<PoisonValue>(OpV)) + continue; + if (isConstantVectorSplat(OpV)) + continue; + if (!dyn_cast<Instruction>(OpV)) + return false; + if ((match(OpV, m_Shuffle(m_InsertElt(m_Poison(), m_Value(), m_Zero()), + m_Poison(), m_ZeroMask())))) + continue; + Found &= hasDefWithSameShuffMask(OpV, ImmUseList, ShuffMask, CurShuffUses); + } + return Found; +} + +void HexagonOptShuffleVector::FindHiLoUse(ValueVector &UseList, + Instruction *&HI, Instruction *&LO) { + + for (unsigned i = 0; i < UseList.size(); i++) { + auto *J = dyn_cast<Instruction>(UseList[i]); + auto *CI = dyn_cast<CallInst>(J); + if (CI) { + auto *II = dyn_cast<IntrinsicInst>(CI); + if (II) { + Intrinsic::ID IntID = II->getIntrinsicID(); + if (IntID == Intrinsic::hexagon_V6_hi_128B) + HI = J; + if (IntID == Intrinsic::hexagon_V6_lo_128B) + LO = J; + } + } + } +} + +bool HexagonOptShuffleVector::isValidUseInstr(ValueVector &UseList, + Instruction *&UI) { + // Don't allow multiple uses. Only done in case of a Hi/Lo pair. + if (UseList.size() != 1) + return false; + UI = dyn_cast<Instruction>(UseList[0]); + if (!UI) + return false; + // Should be either a binary op or one of the supported instrinsics. + if (auto *CI = dyn_cast<CallInst>(UI)) { + auto *II = dyn_cast<IntrinsicInst>(CI); + if (!II || !isValidIntrinsic(II)) + return false; + UI = II; + } else if (!UI->isBinaryOp()) + return false; + return true; +} + +// Check all the operands of 'Use' to make sure that they are either: +// 1) a constant +// 2) a scalar +// 3) a constant vector +// 4) a vector using the same mask as I +bool HexagonOptShuffleVector::areAllOperandsValid(Instruction *I, + Instruction *Use, + ArrayRef<int> &ShuffMask, + ShuffUseList &CurShuffUses) { + bool AllOperandsOK = true; + for (Value *OpV : getArgOperands(Use)) { + bool HasOneUse = OpV->hasOneUse(); + std::pair<Value *, Value *> P = stripCasts(OpV); + OpV = P.first; + + SmallVector<Instruction *, 2> ImmUseList; + if (P.second) + ImmUseList.push_back(dyn_cast<Instruction>(P.second)); + else + ImmUseList.push_back(dyn_cast<Instruction>(Use)); + + if (OpV == I || isa<PoisonValue>(OpV)) + continue; + if (isConstantVectorSplat(OpV)) + continue; + if (!dyn_cast<Instruction>(OpV) || !HasOneUse) + return false; + + if ((match(OpV, m_Shuffle(m_InsertElt(m_Poison(), m_Value(), m_Zero()), + m_Poison(), m_ZeroMask())))) + continue; + AllOperandsOK &= + hasDefWithSameShuffMask(OpV, ImmUseList, ShuffMask, CurShuffUses); + } + return AllOperandsOK; +} + +// Find the new location where it's safe to relocate shuffle instruction 'I'. 
+bool HexagonOptShuffleVector::findNewShuffLoc(Instruction *I, + ArrayRef<int> &ShuffMask, + Value *&NewLoc) { + DefSearchCount = 0; + ValueVector UseList; + if (!getUseList(I, UseList)) + return false; + + using ShuffUseList = + SmallDenseMap<Instruction *, SmallVector<Instruction *, 2>>; + ShuffUseList CurShuffUses; + // Check for Hi and Lo pair. + Instruction *HI = nullptr, *LO = nullptr; + FindHiLoUse(UseList, HI, LO); + if (UseList.size() == 2 && HI && LO) { + // If 'I' has Hi and Lo use-pair, then it can be relocated only after Hi/Lo + // use-chain's merge point, i.e., after a concat vector provided it's safe + // to do so. + LLVM_DEBUG({ + dbgs() << "\tFollowing the Hi/LO pair :\n"; + dbgs() << "\t\tHI - "; + HI->dump(); + dbgs() << "\t\tLO - "; + LO->dump(); + }); + if (!analyzeHiLoUse(HI, LO, ShuffMask, NewLoc, CurShuffUses)) + return false; + for (auto &it : CurShuffUses) + ShuffUses[it.first] = it.second; + return true; + } else { // Single use case + Instruction *UI = nullptr; + if (!isValidUseInstr(UseList, UI)) + return false; + assert(UI && "Expected a valid use, but found none!!"); + + if (HI || LO) { + // If the single use case is either Hi or Lo, it is not safe to relocate + return false; + } + + LLVM_DEBUG(dbgs() << "\tChecking operands in 'use' : \n\t\t"; UI->dump()); + if (!areAllOperandsValid(I, UI, ShuffMask, CurShuffUses)) { + LLVM_DEBUG(dbgs() << "\t\tNOT SAFE -- Exiting!!\n"); + return false; + } + for (auto &it : CurShuffUses) + ShuffUses[it.first] = it.second; + NewLoc = UI; + // Keep looking for the new location until can't proceed any longer. + findNewShuffLoc(UI, ShuffMask, NewLoc); + } + return true; +} + +// Move shuffle instruction 'I' after 'NewLoc'. +bool HexagonOptShuffleVector::relocateShuffVec( + Instruction *I, ArrayRef<int> &M, Value *NewLoc, + std::list<Instruction *> &WorkList) { + // Remove original vector shuffles at the input operands. + // However, it can be done only if the replacements have the + // same number of vector elements as the original operands. + std::map<Instruction *, Value *> InstrMap; + bool CanReplace = true; + unsigned ShuffInstCount = ShuffUses.size(); + for (auto &it : ShuffUses) { + Instruction *J = it.first; + Visited.insert(J); + Value *ShuffleOP = nullptr; + match(J, (m_Shuffle(m_Value(ShuffleOP), m_Poison(), m_Mask(M)))); + VectorType *JTy = cast<FixedVectorType>(J->getType()); + VectorType *ShuffTy = cast<FixedVectorType>(ShuffleOP->getType()); + if (JTy->getElementCount() != ShuffTy->getElementCount()) + CanReplace = false; + + // Relocate shufflevector after a wider instruction only if there are + // at least two or more shufflevectors being relocated in order for the + // relocation to be profitable as otherwise it will require more shuffles. + VectorType *NewShuffTy = cast<FixedVectorType>(NewLoc->getType()); + if (ShuffInstCount == 1 && + NewShuffTy->getElementType() > ShuffTy->getElementType()) + CanReplace = false; + InstrMap[J] = ShuffleOP; + } + if (!CanReplace) { + LLVM_DEBUG(dbgs() << "\tRelocation FAILED!! \n"); + return false; + } + for (auto IM : InstrMap) { + Instruction *J = IM.first; + assert(ShuffUses.count(J)); + SmallVector<Instruction *, 2> Uses = ShuffUses[J]; + if (Uses.size() > 0) { + for (auto *U : Uses) + U->replaceUsesOfWith(IM.first, IM.second); + } else + // This is the shuffle we started with, and we have already made sure + // that it has either single use or a HI/LO use pair. So, it's okay + // to replace all its uses with the input to the shuffle instruction. 
+ IM.first->replaceAllUsesWith(IM.second); + } + // Shuffle the output of NewLoc based on the original mask. + Instruction *Pos = dyn_cast<Instruction>(NewLoc); + assert(Pos); + Pos = Pos->getNextNode(); + IRBuilder<> IRB(Pos); + Value *NewShuffV = + IRB.CreateShuffleVector(NewLoc, PoisonValue::get(NewLoc->getType()), M); + Instruction *NewInst = dyn_cast<Instruction>(NewShuffV); + if (!NewInst) { + LLVM_DEBUG(dbgs() << "\tRelocation FAILED!! \n"); + return false; + } + for (auto UI = NewLoc->user_begin(), UE = NewLoc->user_end(); UI != UE;) { + Use &TheUse = UI.getUse(); + ++UI; + Instruction *J = dyn_cast<Instruction>(TheUse.getUser()); + if (J && TheUse.getUser() != NewShuffV) + J->replaceUsesOfWith(NewLoc, NewShuffV); + } + WorkList.push_back(NewInst); + LLVM_DEBUG(dbgs() << "\tRelocation Successful!! \n"); + LLVM_DEBUG(dbgs() << "\tAdded to Worklist :\n"; NewInst->dump()); + return true; +} + +bool HexagonOptShuffleVector::visitBlock(BasicBlock *B) { + bool Changed = false; + ArrayRef<int> M; + std::list<Instruction *> WorkList; + LLVM_DEBUG(dbgs() << "Preparing worklist for BB:\n"); + LLVM_DEBUG(B->dump()); + for (auto &I : *B) { + if (match(&I, (m_Shuffle(m_Value(), m_Value(), m_ZeroMask())))) + continue; // Skip - building vector from a scalar + if (match(&I, (m_Shuffle(m_Value(), m_Poison(), m_Mask(M))))) { + WorkList.push_back(&I); + LLVM_DEBUG(dbgs() << "\tAdded instr - "; I.dump()); + } + } + + LLVM_DEBUG(dbgs() << "Processing worklist:\n"); + while (!WorkList.empty()) { +#ifndef NDEBUG + int Limit = ShuffVecLimit; + if (Limit >= 0) { + if (NumRelocated >= ShuffVecLimit) { + LLVM_DEBUG({ + dbgs() << "Reached maximum limit!! \n"; + dbgs() << "Can't process any more shuffles.... \n"; + }); + return Changed; + } + } +#endif + Instruction *I = WorkList.front(); + WorkList.pop_front(); + LLVM_DEBUG(dbgs() << "\tProcessing instr - "; I->dump()); + Value *NewLoc = nullptr; + + // 'ShuffUses' is used to keep track of the vector shuffles that need to + // be relocated along with their immediate uses that are known to satisfy + // all the safety requirements of the relocation. + // NOTE: The shuffle instr 'I', where the analysis starts, doesn't have + // its immediate uses set in 'ShuffUses'. This can be done but isn't + // necessary. At this point, only shuffles with a single use or a HI/LO pair + // are allowed. This is done mostly because those with multiple uses + // aren't expected to be very profitable and can be extended in the future + // if necessary. For now, all the uses in such cases can be safely updated + // when the corresponding vector shuffle is relocated. + + ShuffUses.clear(); + ShuffUses[I] = SmallVector<Instruction *, 2>(); + // Skip if node already visited. + if (!Visited.insert(I).second) { + LLVM_DEBUG(dbgs() << "\t\tSKIPPING - Already visited ...\n"); + continue; + } + if (!match(I, (m_Shuffle(m_Value(), m_Poison(), m_Mask(M))))) { + LLVM_DEBUG(dbgs() << "\t\tSKIPPING - Not a vector shuffle ...\n"); + continue; + } + if (!findNewShuffLoc(I, M, NewLoc) || !NewLoc) { + LLVM_DEBUG(dbgs() << "\t\tSKIPPING - NewLoc not found ...\n"); + continue; + } + LLVM_DEBUG(dbgs() << "\t\tRelocating after -- "; NewLoc->dump()); + Changed |= relocateShuffVec(I, M, NewLoc, WorkList); +#ifndef NDEBUG + NumRelocated++; +#endif + } + return Changed; +} + +bool HexagonOptShuffleVector::runOnFunction(Function &F) { + HST = TM->getSubtargetImpl(F); + // Works only for 128B mode but can be extended for 64B if needed.
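The core rewrite performed by relocateShuffVec above amounts to dropping the identical shufflevectors that feed an element-wise operation and re-emitting a single shufflevector with the same mask immediately after that operation. A minimal standalone sketch of that re-emission step using the standard IRBuilder API follows; the helper name sinkShuffleAfter is illustrative and not from the patch:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Op plays the role of 'NewLoc': the element-wise instruction whose
    // operands previously went through identical shufflevectors with mask M.
    // Assumes Op is not the last instruction in its block.
    static Value *sinkShuffleAfter(Instruction *Op, ArrayRef<int> M) {
      IRBuilder<> IRB(Op->getNextNode());
      Value *NewShuff =
          IRB.CreateShuffleVector(Op, PoisonValue::get(Op->getType()), M);
      // Rewire every other user of Op to the relocated shuffle; the new
      // shuffle itself keeps Op as its input operand.
      Op->replaceUsesWithIf(NewShuff,
                            [&](Use &U) { return U.getUser() != NewShuff; });
      return NewShuff;
    }
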
+ if (skipFunction(F) || !HST->useHVX128BOps()) + return false; + + bool Changed = false; + for (auto &B : F) + Changed |= visitBlock(&B); + + return Changed; +} + +FunctionPass * +llvm::createHexagonOptShuffleVector(const HexagonTargetMachine &TM) { + return new HexagonOptShuffleVector(&TM); +} diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index e40dbd2..e84070f 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -391,7 +391,6 @@ def Fptoui: pf1<fp_to_uint>; def Sitofp: pf1<sint_to_fp>; def Uitofp: pf1<uint_to_fp>; - // --(1) Immediate ------------------------------------------------------- // @@ -474,6 +473,18 @@ def: OpR_R_pat<F2_conv_df2uw_chop, pf1<fp_to_uint>, i32, F64>; def: OpR_R_pat<F2_conv_sf2ud_chop, pf1<fp_to_uint>, i64, F32>; def: OpR_R_pat<F2_conv_df2ud_chop, pf1<fp_to_uint>, i64, F64>; +def: Pat<(i32 (fp_to_bf16 F32:$v)), + (C2_mux (F2_sfclass F32:$v, 0x10), (A2_tfrsi(i32 0x7fff)), + (C2_mux + (C2_cmpeq + (A2_and F32:$v, (A2_tfrsi (i32 0x1FFFF))), + (A2_tfrsi (i32 0x08000))), + (A2_and (A2_asrh F32:$v), (A2_tfrsi (i32 65535))), + (A2_and + (A2_asrh + (A2_add F32:$v, (A2_and F32:$v, (A2_tfrsi (i32 0x8000))))), + (A2_tfrsi (i32 65535)))) + )>; // Bitcast is different than [fp|sint|uint]_to_[sint|uint|fp]. def: Pat<(i32 (bitconvert F32:$v)), (I32:$v)>; def: Pat<(f32 (bitconvert I32:$v)), (F32:$v)>; diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index d19920c..674d191 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -15,12 +15,14 @@ def HVI16: PatLeaf<(VecI16 HvxVR:$R)>; def HVI32: PatLeaf<(VecI32 HvxVR:$R)>; def HVF16: PatLeaf<(VecF16 HvxVR:$R)>; def HVF32: PatLeaf<(VecF32 HvxVR:$R)>; +def HVBF16: PatLeaf<(VecBF16 HvxVR:$R)>; def HWI8: PatLeaf<(VecPI8 HvxWR:$R)>; def HWI16: PatLeaf<(VecPI16 HvxWR:$R)>; def HWI32: PatLeaf<(VecPI32 HvxWR:$R)>; def HWF16: PatLeaf<(VecPF16 HvxWR:$R)>; def HWF32: PatLeaf<(VecPF32 HvxWR:$R)>; +def HWBF16: PatLeaf<(VecBF16 HvxWR:$R)>; def SDTVecUnaryOp: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; @@ -182,12 +184,15 @@ let Predicates = [UseHVX] in { } let Predicates = [UseHVXV68] in { - defm: HvxLda_pat<V6_vL32b_nt_ai, alignednontemporalload, VecF16, IsVecOff>; - defm: HvxLda_pat<V6_vL32b_nt_ai, alignednontemporalload, VecF32, IsVecOff>; - defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecF16, IsVecOff>; - defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecF32, IsVecOff>; - defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecF16, IsVecOff>; - defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecF32, IsVecOff>; + defm : HvxLda_pat<V6_vL32b_nt_ai, alignednontemporalload, VecBF16, IsVecOff>; + defm : HvxLda_pat<V6_vL32b_nt_ai, alignednontemporalload, VecF16, IsVecOff>; + defm : HvxLda_pat<V6_vL32b_nt_ai, alignednontemporalload, VecF32, IsVecOff>; + defm : HvxLda_pat<V6_vL32b_ai, alignedload, VecBF16, IsVecOff>; + defm : HvxLda_pat<V6_vL32b_ai, alignedload, VecF16, IsVecOff>; + defm : HvxLda_pat<V6_vL32b_ai, alignedload, VecF32, IsVecOff>; + defm : HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecBF16, IsVecOff>; + defm : HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecF16, IsVecOff>; + defm : HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecF32, IsVecOff>; } // HVX stores @@ -233,10 +238,13 @@ let Predicates = [UseHVX] in { } let Predicates = [UseHVXV68] in { + defm: HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, HVBF16, IsVecOff>; defm: 
HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, HVF16, IsVecOff>; defm: HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, HVF32, IsVecOff>; + defm: HvxSt_pat<V6_vS32b_ai, alignedstore, HVBF16, IsVecOff>; defm: HvxSt_pat<V6_vS32b_ai, alignedstore, HVF16, IsVecOff>; defm: HvxSt_pat<V6_vS32b_ai, alignedstore, HVF32, IsVecOff>; + defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, HVBF16, IsVecOff>; defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, HVF16, IsVecOff>; defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, HVF32, IsVecOff>; } @@ -253,20 +261,36 @@ let Predicates = [UseHVX] in { defm: NopCast_pat<VecPI16, VecPI32, HvxWR>; } +let Predicates = [UseHVXV68] in { + defm: NopCast_pat<VecI8, VecF16, HvxVR>; + defm: NopCast_pat<VecI16, VecF16, HvxVR>; + defm: NopCast_pat<VecI32, VecF16, HvxVR>; + defm: NopCast_pat<VecF32, VecF16, HvxVR>; + defm: NopCast_pat<VecPI8, VecPF32, HvxWR>; + defm: NopCast_pat<VecPI16, VecPF32, HvxWR>; + defm: NopCast_pat<VecPI32, VecPF32, HvxWR>; +} + let Predicates = [UseHVX, UseHVXFloatingPoint] in { defm: NopCast_pat<VecI8, VecF16, HvxVR>; + defm: NopCast_pat<VecI8, VecBF16, HvxVR>; defm: NopCast_pat<VecI8, VecF32, HvxVR>; defm: NopCast_pat<VecI16, VecF16, HvxVR>; + defm: NopCast_pat<VecI16, VecBF16, HvxVR>; defm: NopCast_pat<VecI16, VecF32, HvxVR>; defm: NopCast_pat<VecI32, VecF16, HvxVR>; + defm: NopCast_pat<VecI32, VecBF16, HvxVR>; defm: NopCast_pat<VecI32, VecF32, HvxVR>; defm: NopCast_pat<VecF16, VecF32, HvxVR>; defm: NopCast_pat<VecPI8, VecPF16, HvxWR>; + defm: NopCast_pat<VecPI8, VecPBF16, HvxWR>; defm: NopCast_pat<VecPI8, VecPF32, HvxWR>; defm: NopCast_pat<VecPI16, VecPF16, HvxWR>; + defm: NopCast_pat<VecPI16, VecPBF16, HvxWR>; defm: NopCast_pat<VecPI16, VecPF32, HvxWR>; defm: NopCast_pat<VecPI32, VecPF16, HvxWR>; + defm: NopCast_pat<VecPI32, VecPBF16, HvxWR>; defm: NopCast_pat<VecPI32, VecPF32, HvxWR>; defm: NopCast_pat<VecPF16, VecPF32, HvxWR>; } @@ -293,6 +317,8 @@ let Predicates = [UseHVX] in { (Combinev HvxVR:$Vt, HvxVR:$Vs)>; def: Pat<(VecPI32 (concat_vectors HVI32:$Vs, HVI32:$Vt)), (Combinev HvxVR:$Vt, HvxVR:$Vs)>; + def: Pat<(VecPF32 (concat_vectors HVF32:$Vs, HVF32:$Vt)), + (Combinev HvxVR:$Vt, HvxVR:$Vs)>; def: Pat<(VecQ8 (qcat HQ16:$Qs, HQ16:$Qt)), (Combineq $Qt, $Qs)>; def: Pat<(VecQ16 (qcat HQ32:$Qs, HQ32:$Qt)), (Combineq $Qt, $Qs)>; @@ -315,11 +341,14 @@ let Predicates = [UseHVX] in { let Predicates = [UseHVX, UseHVXFloatingPoint] in { let AddedComplexity = 100 in { def: Pat<(VecF16 vzero), (V6_vd0)>; + def: Pat<(VecBF16 vzero), (V6_vd0)>; def: Pat<(VecF32 vzero), (V6_vd0)>; def: Pat<(VecPF16 vzero), (PS_vdd0)>; + def: Pat<(VecPBF16 vzero), (PS_vdd0)>; def: Pat<(VecPF32 vzero), (PS_vdd0)>; def: Pat<(concat_vectors (VecF16 vzero), (VecF16 vzero)), (PS_vdd0)>; + def : Pat<(concat_vectors (VecBF16 vzero), (VecBF16 vzero)), (PS_vdd0)>; def: Pat<(concat_vectors (VecF32 vzero), (VecF32 vzero)), (PS_vdd0)>; } @@ -355,11 +384,13 @@ let Predicates = [UseHVX] in { let Predicates = [UseHVXV68, UseHVXFloatingPoint] in { let AddedComplexity = 30 in { def: Pat<(VecF16 (splat_vector u16_0ImmPred:$V)), (PS_vsplatih imm:$V)>; + def: Pat<(VecBF16 (splat_vector u16_0ImmPred:$V)), (PS_vsplatih imm:$V)>; def: Pat<(VecF32 (splat_vector anyint:$V)), (PS_vsplatiw imm:$V)>; def: Pat<(VecF32 (splat_vector f32ImmPred:$V)), (PS_vsplatiw (ftoi $V))>; } let AddedComplexity = 20 in { def: Pat<(VecF16 (splat_vector I32:$Rs)), (PS_vsplatrh $Rs)>; + def: Pat<(VecBF16 (splat_vector I32:$Rs)), (PS_vsplatrh $Rs)>; def: Pat<(VecF32 (splat_vector I32:$Rs)), (PS_vsplatrw $Rs)>; def: 
Pat<(VecF32 (splat_vector F32:$Rs)), (PS_vsplatrw $Rs)>; } @@ -519,6 +550,35 @@ let Predicates = [UseHVXV68, UseHVXIEEEFP] in { def: Pat<(VecPF16 (Uitofp HVI8:$Vu)), (V6_vcvt_hf_ub HvxVR:$Vu)>; } +let Predicates = [UseHVXV81] in { + def : Pat<(VecBF16 (pf1<fpround> HWF32:$Vuu)), + (V6_vpackwuh_sat (V6_vmux + (V6_veqsf (HiVec HvxWR:$Vuu), (HiVec HvxWR:$Vuu)), + (V6_vlsrw (V6_vmux (V6_veqw (V6_vand (HiVec HvxWR:$Vuu), + (PS_vsplatiw (i32 0x1FFFF))), + (PS_vsplatiw (i32 0x08000))), + (HiVec HvxWR:$Vuu), + (V6_vaddw (HiVec HvxWR:$Vuu), + (V6_vand (HiVec HvxWR:$Vuu), + (PS_vsplatiw (i32 0x8000))))), + (A2_tfrsi 16)), + (PS_vsplatih (i32 0x7fff))), + (V6_vmux (V6_veqsf (LoVec HvxWR:$Vuu), (LoVec HvxWR:$Vuu)), + (V6_vlsrw (V6_vmux (V6_veqw (V6_vand (LoVec HvxWR:$Vuu), + (PS_vsplatiw (i32 0x1FFFF))), + (PS_vsplatiw (i32 0x08000))), + (LoVec HvxWR:$Vuu), + (V6_vaddw (LoVec HvxWR:$Vuu), + (V6_vand (LoVec HvxWR:$Vuu), + (PS_vsplatiw (i32 0x8000))))), + (A2_tfrsi 16)), + (PS_vsplatih (i32 0x7fff))))>; +} + +let Predicates = [UseHVXV73, UseHVXQFloat] in { + def : Pat<(VecF32 (Sitofp HVI32:$Vu)), (V6_vconv_sf_w HvxVR:$Vu)>; +} + let Predicates = [UseHVXV68, UseHVXFloatingPoint] in { def: Pat<(vselect HQ16:$Qu, HVF16:$Vs, HVF16:$Vt), (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>; @@ -531,6 +591,13 @@ let Predicates = [UseHVXV68, UseHVXFloatingPoint] in { (V6_vmux HvxQR:$Qu, HvxVR:$Vt, HvxVR:$Vs)>; } +let Predicates = [UseHVXV81, UseHVXFloatingPoint] in { + def : Pat<(vselect HQ16:$Qu, HVBF16:$Vs, HVBF16:$Vt), + (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>; + def : Pat<(vselect (qnot HQ16:$Qu), HVBF16:$Vs, HVBF16:$Vt), + (V6_vmux HvxQR:$Qu, HvxVR:$Vt, HvxVR:$Vs)>; +} + let Predicates = [UseHVXV68, UseHVX128B, UseHVXQFloat] in { let AddedComplexity = 220 in { defm: MinMax_pats<V6_vmin_hf, V6_vmax_hf, vselect, setgt, VecQ16, HVF16>; diff --git a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp index f29a739..c9cb449 100644 --- a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp @@ -58,7 +58,7 @@ // are PHI inst. 
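The scalar fp_to_bf16 pattern added to HexagonPatterns.td and the HVX fpround-to-VecBF16 pattern above lower f32 to bf16 with round-to-nearest-even: lanes flagged by the class check (NaN, selected via V6_veqsf in the HVX pattern) become 0x7fff, an exact tie whose kept low bit is already even is simply truncated, and every other value is rounded by adding bit 15 before taking the upper 16 bits. A scalar C++ sketch of that computation (illustrative only; floatToBF16 is not a function from the patch):

    #include <cstdint>
    #include <cstring>

    static uint16_t floatToBF16(float F) {
      uint32_t Bits;
      std::memcpy(&Bits, &F, sizeof(Bits));   // bitcast f32 -> i32
      // NaN inputs (the sfclass/veqsf check in the patterns) map to 0x7fff.
      if ((Bits & 0x7f800000u) == 0x7f800000u && (Bits & 0x007fffffu) != 0)
        return 0x7fff;
      // Exact tie with an even kept LSB: truncate.
      if ((Bits & 0x1ffffu) == 0x08000u)
        return uint16_t(Bits >> 16);
      // Otherwise add the round bit (bit 15), let the carry propagate, and
      // keep the upper 16 bits.
      Bits += Bits & 0x8000u;
      return uint16_t(Bits >> 16);
    }
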
// //===----------------------------------------------------------------------===// -#include <unordered_set> + #define HEXAGON_QFP_OPTIMIZER "QFP optimizer pass" #include "Hexagon.h" @@ -77,7 +77,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include <map> -#include <vector> #define DEBUG_TYPE "hexagon-qfp-optimizer" @@ -86,6 +85,9 @@ using namespace llvm; cl::opt<bool> DisableQFOptimizer("disable-qfp-opt", cl::init(false), cl::desc("Disable optimization of Qfloat operations.")); +cl::opt<bool> DisableQFOptForMul( + "disable-qfp-opt-mul", cl::init(true), + cl::desc("Disable optimization of Qfloat operations for multiply.")); namespace { const std::map<unsigned short, unsigned short> QFPInstMap{ @@ -101,11 +103,16 @@ const std::map<unsigned short, unsigned short> QFPInstMap{ {Hexagon::V6_vmpy_qf16_mix_hf, Hexagon::V6_vmpy_qf16}, {Hexagon::V6_vmpy_qf32_hf, Hexagon::V6_vmpy_qf32_mix_hf}, {Hexagon::V6_vmpy_qf32_mix_hf, Hexagon::V6_vmpy_qf32_qf16}, - {Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32}}; + {Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32}, + {Hexagon::V6_vilog2_sf, Hexagon::V6_vilog2_qf32}, + {Hexagon::V6_vilog2_hf, Hexagon::V6_vilog2_qf16}, + {Hexagon::V6_vabs_qf32_sf, Hexagon::V6_vabs_qf32_qf32}, + {Hexagon::V6_vabs_qf16_hf, Hexagon::V6_vabs_qf16_qf16}, + {Hexagon::V6_vneg_qf32_sf, Hexagon::V6_vneg_qf32_qf32}, + {Hexagon::V6_vneg_qf16_hf, Hexagon::V6_vneg_qf16_qf16}}; } // namespace namespace { - struct HexagonQFPOptimizer : public MachineFunctionPass { public: static char ID; @@ -116,6 +123,10 @@ public: bool optimizeQfp(MachineInstr *MI, MachineBasicBlock *MBB); + bool optimizeQfpTwoOp(MachineInstr *MI, MachineBasicBlock *MBB); + + bool optimizeQfpOneOp(MachineInstr *MI, MachineBasicBlock *MBB); + StringRef getPassName() const override { return HEXAGON_QFP_OPTIMIZER; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -142,19 +153,69 @@ FunctionPass *llvm::createHexagonQFPOptimizer() { bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, MachineBasicBlock *MBB) { - // Early exit: - // - if instruction is invalid or has too few operands (QFP ops need 2 sources - // + 1 dest), - // - or does not have a transformation mapping. 
- if (MI->getNumOperands() < 3) + if (MI->getNumOperands() == 2) + return optimizeQfpOneOp(MI, MBB); + else if (MI->getNumOperands() == 3) + return optimizeQfpTwoOp(MI, MBB); + else return false; +} + +bool HexagonQFPOptimizer::optimizeQfpOneOp(MachineInstr *MI, + MachineBasicBlock *MBB) { + + unsigned Op0F = 0; auto It = QFPInstMap.find(MI->getOpcode()); if (It == QFPInstMap.end()) return false; + unsigned short InstTy = It->second; + // Get the reaching defs of MI + MachineInstr *DefMI = MRI->getVRegDef(MI->getOperand(1).getReg()); + MachineOperand &Res = MI->getOperand(0); + if (!Res.isReg()) + return false; + + LLVM_DEBUG(dbgs() << "\n[Reaching Defs of operands]: "; DefMI->dump()); + MachineInstr *ReachDefDef = nullptr; + + // Get the reaching def of the reaching def to check for W reg def + if (DefMI->getNumOperands() > 1 && DefMI->getOperand(1).isReg() && + DefMI->getOperand(1).getReg().isVirtual()) + ReachDefDef = MRI->getVRegDef(DefMI->getOperand(1).getReg()); + unsigned ReachDefOp = DefMI->getOpcode(); + MachineInstrBuilder MIB; + + // Check if the reaching def is a conversion + if (ReachDefOp == Hexagon::V6_vconv_sf_qf32 || + ReachDefOp == Hexagon::V6_vconv_hf_qf16) { + + // Return if the reaching def of the reaching def is W type + if (ReachDefDef && MRI->getRegClass(ReachDefDef->getOperand(0).getReg()) == + &Hexagon::HvxWRRegClass) + return false; + + // Analyze the use operands of the conversion to get their KILL status + MachineOperand &SrcOp = DefMI->getOperand(1); + Op0F = getKillRegState(SrcOp.isKill()); + SrcOp.setIsKill(false); + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) + .addReg(SrcOp.getReg(), Op0F, SrcOp.getSubReg()); + LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump()); + return true; + } + return false; +} + +bool HexagonQFPOptimizer::optimizeQfpTwoOp(MachineInstr *MI, + MachineBasicBlock *MBB) { unsigned Op0F = 0; unsigned Op1F = 0; + auto It = QFPInstMap.find(MI->getOpcode()); + if (It == QFPInstMap.end()) + return false; + unsigned short InstTy = It->second; // Get the reaching defs of MI, DefMI1 and DefMI2 MachineInstr *DefMI1 = nullptr; MachineInstr *DefMI2 = nullptr; @@ -167,6 +228,9 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, return false; MachineOperand &Res = MI->getOperand(0); + if (!Res.isReg()) + return false; + MachineInstr *Inst1 = nullptr; MachineInstr *Inst2 = nullptr; LLVM_DEBUG(dbgs() << "\n[Reaching Defs of operands]: "; DefMI1->dump(); @@ -185,7 +249,8 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, unsigned Def2OP = DefMI2->getOpcode(); MachineInstrBuilder MIB; - // Case 1: Both reaching defs of MI are qf to sf/hf conversions + + // Check if both the reaching defs of MI are qf to sf/hf conversions if ((Def1OP == Hexagon::V6_vconv_sf_qf32 && Def2OP == Hexagon::V6_vconv_sf_qf32) || (Def1OP == Hexagon::V6_vconv_hf_qf16 && @@ -226,7 +291,7 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump()); return true; - // Case 2: Left operand is conversion to sf/hf + // Check if left operand's reaching def is a conversion to sf/hf } else if (((Def1OP == Hexagon::V6_vconv_sf_qf32 && Def2OP != Hexagon::V6_vconv_sf_qf32) || (Def1OP == Hexagon::V6_vconv_hf_qf16 && @@ -250,7 +315,7 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump()); return true; - // Case 2: Left operand is conversion to sf/hf + // Check if right operand's reaching def is a conversion to
sf/hf } else if (((Def1OP != Hexagon::V6_vconv_sf_qf32 && Def2OP == Hexagon::V6_vconv_sf_qf32) || (Def1OP != Hexagon::V6_vconv_hf_qf16 && @@ -258,13 +323,6 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, !DefMI1->isPHI() && (MI->getOpcode() != Hexagon::V6_vmpy_qf32_sf)) { // The second operand of original instruction is converted. - // In "mix" instructions, "qf" operand is always the first operand. - - // Caveat: vsub is not commutative w.r.t operands. - if (InstTy == Hexagon::V6_vsub_qf16_mix || - InstTy == Hexagon::V6_vsub_qf32_mix) - return false; - if (Inst2 && MRI->getRegClass(Inst2->getOperand(0).getReg()) == &Hexagon::HvxWRRegClass) return false; @@ -275,10 +333,26 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI, Op1F = getKillRegState(Src2.isKill()); Src2.setIsKill(false); Op0F = getKillRegState(Src1.isKill()); - MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) - .addReg(Src2.getReg(), Op1F, - Src2.getSubReg()) // Notice the operands are flipped. - .addReg(Src1.getReg(), Op0F, Src1.getSubReg()); + if (InstTy == Hexagon::V6_vsub_qf16_mix || + InstTy == Hexagon::V6_vsub_qf32_mix) { + if (!HST->useHVXV81Ops()) + // vsub_(hf|sf)_mix instructions are only available on HVX V81+ + return false; + // vsub is not commutative w.r.t. operands -> treat it as a special case + // to choose the correct mix instruction. + if (Def2OP == Hexagon::V6_vconv_sf_qf32) + InstTy = Hexagon::V6_vsub_sf_mix; + else if (Def2OP == Hexagon::V6_vconv_hf_qf16) + InstTy = Hexagon::V6_vsub_hf_mix; + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) + .addReg(Src1.getReg(), Op0F, Src1.getSubReg()) + .addReg(Src2.getReg(), Op1F, Src2.getSubReg()); + } else { + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg()) + .addReg(Src2.getReg(), Op1F, + Src2.getSubReg()) // Notice the operands are flipped. + .addReg(Src1.getReg(), Op0F, Src1.getSubReg()); + } LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump()); return true; } @@ -309,15 +383,18 @@ bool HexagonQFPOptimizer::runOnMachineFunction(MachineFunction &MF) { while (MII != MBBI->instr_end()) { MachineInstr *MI = &*MII; ++MII; // As MI might be removed.
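Both optimizeQfpOneOp and optimizeQfpTwoOp above drive their rewrites off the SSA reaching definition of each virtual-register operand, queried through MachineRegisterInfo. A minimal sketch of that query follows; the helper name definedByQfConversion is illustrative, and the Hexagon::V6_vconv_* opcodes are assumed to come from the target's generated instruction enum:

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/MachineRegisterInfo.h"
    using namespace llvm;

    // Returns true if the operand is a virtual register whose defining
    // instruction is one of the qf-to-IEEE conversions the pass folds away.
    static bool definedByQfConversion(const MachineRegisterInfo &MRI,
                                      const MachineOperand &MO) {
      if (!MO.isReg() || !MO.getReg().isVirtual())
        return false;
      const MachineInstr *Def = MRI.getVRegDef(MO.getReg());
      return Def && (Def->getOpcode() == Hexagon::V6_vconv_sf_qf32 ||
                     Def->getOpcode() == Hexagon::V6_vconv_hf_qf16);
    }
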
- - if (QFPInstMap.count(MI->getOpcode()) && - MI->getOpcode() != Hexagon::V6_vconv_sf_qf32 && - MI->getOpcode() != Hexagon::V6_vconv_hf_qf16) { - LLVM_DEBUG(dbgs() << "\n###Analyzing for removal: "; MI->dump()); - if (optimizeQfp(MI, MBB)) { - MI->eraseFromParent(); - LLVM_DEBUG(dbgs() << "\t....Removing...."); - Changed = true; + if (QFPInstMap.count(MI->getOpcode())) { + auto OpC = MI->getOpcode(); + if (DisableQFOptForMul && HII->isQFPMul(MI)) + continue; + if (OpC != Hexagon::V6_vconv_sf_qf32 && + OpC != Hexagon::V6_vconv_hf_qf16) { + LLVM_DEBUG(dbgs() << "\n###Analyzing for removal: "; MI->dump()); + if (optimizeQfp(MI, MBB)) { + MI->eraseFromParent(); + LLVM_DEBUG(dbgs() << "\t....Removing...."); + Changed = true; + } } } } diff --git a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp index 54f5608..f375b25 100644 --- a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp +++ b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp @@ -34,7 +34,6 @@ #include "llvm/Support/raw_ostream.h" #include <cassert> #include <limits> -#include <utility> using namespace llvm; using namespace rdf; diff --git a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td index 3a77fcd..1f1aebd 100644 --- a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td +++ b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td @@ -15,141 +15,126 @@ let Namespace = "Hexagon" in { class HexagonReg<bits<5> num, string n, list<string> alt = [], list<Register> alias = []> : Register<n, alt> { let Aliases = alias; - let HWEncoding{4-0} = num; + let HWEncoding{4 -0} = num; } // These registers are used to preserve a distinction between // vector register pairs of differing order. - class HexagonFakeReg<string n> : Register<n> { - let isArtificial = 1; - } + class HexagonFakeReg<string n> : Register<n> { let isArtificial = 1; } class HexagonDoubleReg<bits<5> num, string n, list<Register> subregs, - list<string> alt = []> : - RegisterWithSubRegs<n, subregs> { + list<string> alt = []> + : RegisterWithSubRegs<n, subregs> { let AltNames = alt; - let HWEncoding{4-0} = num; + let HWEncoding{4 -0} = num; } class HexagonSys<bits<7> num, string n, list<string> alt = [], list<Register> alias = []> : Register<n, alt> { let Aliases = alias; - let HWEncoding{6-0} = num; + let HWEncoding{6 -0} = num; } class HexagonDoubleSys<bits<7> num, string n, list<Register> subregs, - list<string> alt = []> : - RegisterWithSubRegs<n, subregs> { + list<string> alt = []> + : RegisterWithSubRegs<n, subregs> { let AltNames = alt; - let HWEncoding{6-0} = num; + let HWEncoding{6 -0} = num; } // Registers are identified with 5-bit ID numbers. // Ri - 32-bit integer registers. - class Ri<bits<5> num, string n, list<string> alt = []> : - HexagonReg<num, n, alt>; + class Ri<bits<5> num, string n, list<string> alt = []> + : HexagonReg<num, n, alt>; // Rp - false/pseudo registers. These registers are used // to provide a distinct set of aliases for both styles of vector // register pairs without encountering subregister indexing constraints. - class R_fake<string n> : - HexagonFakeReg<n>; - + class R_fake<string n> : HexagonFakeReg<n>; // Rf - 32-bit floating-point registers. class Rf<bits<5> num, string n> : HexagonReg<num, n>; // Rd - 64-bit registers. 
- class Rd<bits<5> num, string n, list<Register> subregs, - list<string> alt = []> : - HexagonDoubleReg<num, n, subregs, alt> { + class Rd<bits<5> num, string n, list<Register> subregs, list<string> alt = []> + : HexagonDoubleReg<num, n, subregs, alt> { let SubRegs = subregs; } // Rp - predicate registers class Rp<bits<5> num, string n> : HexagonReg<num, n>; - // Rq - vector predicate registers class Rq<bits<3> num, string n> : Register<n, []> { - let HWEncoding{2-0} = num; + let HWEncoding{2 -0} = num; } // Rc - control registers - class Rc<bits<5> num, string n, - list<string> alt = [], list<Register> alias = []> : - HexagonReg<num, n, alt, alias>; + class Rc<bits<5> num, string n, list<string> alt = [], + list<Register> alias = []> : HexagonReg<num, n, alt, alias>; // Rcc - 64-bit control registers. class Rcc<bits<5> num, string n, list<Register> subregs, - list<string> alt = []> : - HexagonDoubleReg<num, n, subregs, alt> { + list<string> alt = []> : HexagonDoubleReg<num, n, subregs, alt> { let SubRegs = subregs; } // Rs - system registers - class Rs<bits<7> num, string n, - list<string> alt = [], list<Register> alias = []> : - HexagonSys<num, n, alt, alias>; + class Rs<bits<7> num, string n, list<string> alt = [], + list<Register> alias = []> : HexagonSys<num, n, alt, alias>; // Rss - 64-bit system registers. class Rss<bits<7> num, string n, list<Register> subregs, - list<string> alt = []> : - HexagonDoubleSys<num, n, subregs, alt> { + list<string> alt = []> : HexagonDoubleSys<num, n, subregs, alt> { let SubRegs = subregs; } // Mx - address modifier registers - class Mx<bits<1> num, string n> : Register<n, []> { - let HWEncoding{0} = num; - } + class Mx<bits<1> num, string n> : Register<n, []> { let HWEncoding{0} = num; } // Rg - Guest/Hypervisor registers - class Rg<bits<5> num, string n, - list<string> alt = [], list<Register> alias = []> : - HexagonReg<num, n, alt, alias>; + class Rg<bits<5> num, string n, list<string> alt = [], + list<Register> alias = []> : HexagonReg<num, n, alt, alias>; // Rgg - 64-bit Guest/Hypervisor registers - class Rgg<bits<5> num, string n, list<Register> subregs> : - HexagonDoubleReg<num, n, subregs> { + class Rgg<bits<5> num, string n, list<Register> subregs> + : HexagonDoubleReg<num, n, subregs> { let SubRegs = subregs; } - def isub_lo : SubRegIndex<32>; - def isub_hi : SubRegIndex<32, 32>; - def vsub_lo : SubRegIndex<-1, -1>; - def vsub_hi : SubRegIndex<-1, -1>; - def vsub_fake: SubRegIndex<-1, -1>; - def wsub_lo : SubRegIndex<-1, -1>; - def wsub_hi : SubRegIndex<-1, -1>; + def isub_lo : SubRegIndex<32>; + def isub_hi : SubRegIndex<32, 32>; + def vsub_lo : SubRegIndex<-1, -1>; + def vsub_hi : SubRegIndex<-1, -1>; + def vsub_fake : SubRegIndex<-1, -1>; + def wsub_lo : SubRegIndex<-1, -1>; + def wsub_hi : SubRegIndex<-1, -1>; def subreg_overflow : SubRegIndex<1, 0>; // Integer registers. - foreach i = 0-28 in { - def R#i : Ri<i, "r"#i>, DwarfRegNum<[i]>; - } + foreach i = 0 -28 in { def R#i : Ri<i, "r"#i>, DwarfRegNum<[i]>; } def R29 : Ri<29, "r29", ["sp"]>, DwarfRegNum<[29]>; def R30 : Ri<30, "r30", ["fp"]>, DwarfRegNum<[30]>; def R31 : Ri<31, "r31", ["lr"]>, DwarfRegNum<[31]>; // Aliases of the R* registers used to hold 64-bit int values (doubles). 
let SubRegIndices = [isub_lo, isub_hi], CoveredBySubRegs = 1 in { - def D0 : Rd< 0, "r1:0", [R0, R1]>, DwarfRegNum<[32]>; - def D1 : Rd< 2, "r3:2", [R2, R3]>, DwarfRegNum<[34]>; - def D2 : Rd< 4, "r5:4", [R4, R5]>, DwarfRegNum<[36]>; - def D3 : Rd< 6, "r7:6", [R6, R7]>, DwarfRegNum<[38]>; - def D4 : Rd< 8, "r9:8", [R8, R9]>, DwarfRegNum<[40]>; - def D5 : Rd<10, "r11:10", [R10, R11]>, DwarfRegNum<[42]>; - def D6 : Rd<12, "r13:12", [R12, R13]>, DwarfRegNum<[44]>; - def D7 : Rd<14, "r15:14", [R14, R15]>, DwarfRegNum<[46]>; - def D8 : Rd<16, "r17:16", [R16, R17]>, DwarfRegNum<[48]>; - def D9 : Rd<18, "r19:18", [R18, R19]>, DwarfRegNum<[50]>; - def D10 : Rd<20, "r21:20", [R20, R21]>, DwarfRegNum<[52]>; - def D11 : Rd<22, "r23:22", [R22, R23]>, DwarfRegNum<[54]>; - def D12 : Rd<24, "r25:24", [R24, R25]>, DwarfRegNum<[56]>; - def D13 : Rd<26, "r27:26", [R26, R27]>, DwarfRegNum<[58]>; - def D14 : Rd<28, "r29:28", [R28, R29]>, DwarfRegNum<[60]>; - def D15 : Rd<30, "r31:30", [R30, R31], ["lr:fp"]>, DwarfRegNum<[62]>; + def D0 : Rd<0, "r1:0", [R0, R1]>, DwarfRegNum<[32]>; + def D1 : Rd<2, "r3:2", [R2, R3]>, DwarfRegNum<[34]>; + def D2 : Rd<4, "r5:4", [R4, R5]>, DwarfRegNum<[36]>; + def D3 : Rd<6, "r7:6", [R6, R7]>, DwarfRegNum<[38]>; + def D4 : Rd<8, "r9:8", [R8, R9]>, DwarfRegNum<[40]>; + def D5 : Rd<10, "r11:10", [R10, R11]>, DwarfRegNum<[42]>; + def D6 : Rd<12, "r13:12", [R12, R13]>, DwarfRegNum<[44]>; + def D7 : Rd<14, "r15:14", [R14, R15]>, DwarfRegNum<[46]>; + def D8 : Rd<16, "r17:16", [R16, R17]>, DwarfRegNum<[48]>; + def D9 : Rd<18, "r19:18", [R18, R19]>, DwarfRegNum<[50]>; + def D10 : Rd<20, "r21:20", [R20, R21]>, DwarfRegNum<[52]>; + def D11 : Rd<22, "r23:22", [R22, R23]>, DwarfRegNum<[54]>; + def D12 : Rd<24, "r25:24", [R24, R25]>, DwarfRegNum<[56]>; + def D13 : Rd<26, "r27:26", [R26, R27]>, DwarfRegNum<[58]>; + def D14 : Rd<28, "r29:28", [R28, R29]>, DwarfRegNum<[60]>; + def D15 : Rd<30, "r31:30", [R30, R31], ["lr:fp"]>, DwarfRegNum<[62]>; } // Predicate registers. @@ -164,119 +149,118 @@ let Namespace = "Hexagon" in { // on the entire USR. def USR_OVF : Rc<?, "usr.ovf">; - def USR : Rc<8, "usr", ["c8"]>, DwarfRegNum<[75]> { + def USR : Rc<8, "usr", ["c8"]>, DwarfRegNum<[75]> { let SubRegIndices = [subreg_overflow]; let SubRegs = [USR_OVF]; } // Control registers. - def SA0: Rc<0, "sa0", ["c0"]>, DwarfRegNum<[67]>; - def LC0: Rc<1, "lc0", ["c1"]>, DwarfRegNum<[68]>; - def SA1: Rc<2, "sa1", ["c2"]>, DwarfRegNum<[69]>; - def LC1: Rc<3, "lc1", ["c3"]>, DwarfRegNum<[70]>; - def P3_0: Rc<4, "p3:0", ["c4"], [P0, P1, P2, P3]>, - DwarfRegNum<[71]>; + def SA0 : Rc<0, "sa0", ["c0"]>, DwarfRegNum<[67]>; + def LC0 : Rc<1, "lc0", ["c1"]>, DwarfRegNum<[68]>; + def SA1 : Rc<2, "sa1", ["c2"]>, DwarfRegNum<[69]>; + def LC1 : Rc<3, "lc1", ["c3"]>, DwarfRegNum<[70]>; + def P3_0 : Rc<4, "p3:0", ["c4"], [P0, P1, P2, P3]>, DwarfRegNum<[71]>; // When defining more Cn registers, make sure to explicitly mark them // as reserved in HexagonRegisterInfo.cpp. - def C5: Rc<5, "c5", ["c5"]>, DwarfRegNum<[72]>; - def M0: Rc<6, "m0", ["c6"]>, DwarfRegNum<[73]>; - def M1: Rc<7, "m1", ["c7"]>, DwarfRegNum<[74]>; + def C5 : Rc<5, "c5", ["c5"]>, DwarfRegNum<[72]>; + def M0 : Rc<6, "m0", ["c6"]>, DwarfRegNum<[73]>; + def M1 : Rc<7, "m1", ["c7"]>, DwarfRegNum<[74]>; // Define C8 separately and make it aliased with USR. // The problem is that USR has subregisters (e.g. overflow). 
If USR was // specified as a subregister of C9_8, it would imply that subreg_overflow // and isub_lo can be composed, which leads to all kinds of issues // with lane masks. - def C8: Rc<8, "c8", [], [USR]>, DwarfRegNum<[75]>; - def PC: Rc<9, "pc", ["c9"]>, DwarfRegNum<[76]>; - def UGP: Rc<10, "ugp", ["c10"]>, DwarfRegNum<[77]>; - def GP: Rc<11, "gp", ["c11"]>, DwarfRegNum<[78]>; - def CS0: Rc<12, "cs0", ["c12"]>, DwarfRegNum<[79]>; - def CS1: Rc<13, "cs1", ["c13"]>, DwarfRegNum<[80]>; - def UPCYCLELO: Rc<14, "upcyclelo", ["c14"]>, DwarfRegNum<[81]>; - def UPCYCLEHI: Rc<15, "upcyclehi", ["c15"]>, DwarfRegNum<[82]>; - def FRAMELIMIT: Rc<16, "framelimit", ["c16"]>, DwarfRegNum<[83]>; - def FRAMEKEY: Rc<17, "framekey", ["c17"]>, DwarfRegNum<[84]>; - def PKTCOUNTLO: Rc<18, "pktcountlo", ["c18"]>, DwarfRegNum<[85]>; - def PKTCOUNTHI: Rc<19, "pktcounthi", ["c19"]>, DwarfRegNum<[86]>; - def UTIMERLO: Rc<30, "utimerlo", ["c30"]>, DwarfRegNum<[97]>; - def UTIMERHI: Rc<31, "utimerhi", ["c31"]>, DwarfRegNum<[98]>; + def C8 : Rc<8, "c8", [], [USR]>, DwarfRegNum<[75]>; + def PC : Rc<9, "pc", ["c9"]>, DwarfRegNum<[76]>; + def UGP : Rc<10, "ugp", ["c10"]>, DwarfRegNum<[77]>; + def GP : Rc<11, "gp", ["c11"]>, DwarfRegNum<[78]>; + def CS0 : Rc<12, "cs0", ["c12"]>, DwarfRegNum<[79]>; + def CS1 : Rc<13, "cs1", ["c13"]>, DwarfRegNum<[80]>; + def UPCYCLELO : Rc<14, "upcyclelo", ["c14"]>, DwarfRegNum<[81]>; + def UPCYCLEHI : Rc<15, "upcyclehi", ["c15"]>, DwarfRegNum<[82]>; + def FRAMELIMIT : Rc<16, "framelimit", ["c16"]>, DwarfRegNum<[83]>; + def FRAMEKEY : Rc<17, "framekey", ["c17"]>, DwarfRegNum<[84]>; + def PKTCOUNTLO : Rc<18, "pktcountlo", ["c18"]>, DwarfRegNum<[85]>; + def PKTCOUNTHI : Rc<19, "pktcounthi", ["c19"]>, DwarfRegNum<[86]>; + def UTIMERLO : Rc<30, "utimerlo", ["c30"]>, DwarfRegNum<[97]>; + def UTIMERHI : Rc<31, "utimerhi", ["c31"]>, DwarfRegNum<[98]>; // Control registers pairs. let SubRegIndices = [isub_lo, isub_hi], CoveredBySubRegs = 1 in { - def C1_0 : Rcc<0, "c1:0", [SA0, LC0], ["lc0:sa0"]>, DwarfRegNum<[67]>; - def C3_2 : Rcc<2, "c3:2", [SA1, LC1], ["lc1:sa1"]>, DwarfRegNum<[69]>; - def C5_4 : Rcc<4, "c5:4", [P3_0, C5]>, DwarfRegNum<[71]>; - def C7_6 : Rcc<6, "c7:6", [M0, M1], ["m1:0"]>, DwarfRegNum<[72]>; + def C1_0 : Rcc<0, "c1:0", [SA0, LC0], ["lc0:sa0"]>, DwarfRegNum<[67]>; + def C3_2 : Rcc<2, "c3:2", [SA1, LC1], ["lc1:sa1"]>, DwarfRegNum<[69]>; + def C5_4 : Rcc<4, "c5:4", [P3_0, C5]>, DwarfRegNum<[71]>; + def C7_6 : Rcc<6, "c7:6", [M0, M1], ["m1:0"]>, DwarfRegNum<[72]>; // Use C8 instead of USR as a subregister of C9_8. 
- def C9_8 : Rcc<8, "c9:8", [C8, PC]>, DwarfRegNum<[74]>; - def C11_10 : Rcc<10, "c11:10", [UGP, GP]>, DwarfRegNum<[76]>; - def CS : Rcc<12, "c13:12", [CS0, CS1], ["cs1:0"]>, DwarfRegNum<[78]>; - def UPCYCLE: Rcc<14, "c15:14", [UPCYCLELO, UPCYCLEHI], ["upcycle"]>, - DwarfRegNum<[80]>; - def C17_16 : Rcc<16, "c17:16", [FRAMELIMIT, FRAMEKEY]>, DwarfRegNum<[83]>; + def C9_8 : Rcc<8, "c9:8", [C8, PC]>, DwarfRegNum<[74]>; + def C11_10 : Rcc<10, "c11:10", [UGP, GP]>, DwarfRegNum<[76]>; + def CS : Rcc<12, "c13:12", [CS0, CS1], ["cs1:0"]>, DwarfRegNum<[78]>; + def UPCYCLE : Rcc<14, "c15:14", [UPCYCLELO, UPCYCLEHI], ["upcycle"]>, + DwarfRegNum<[80]>; + def C17_16 : Rcc<16, "c17:16", [FRAMELIMIT, FRAMEKEY]>, DwarfRegNum<[83]>; def PKTCOUNT : Rcc<18, "c19:18", [PKTCOUNTLO, PKTCOUNTHI], ["pktcount"]>, - DwarfRegNum<[85]>; - def UTIMER : Rcc<30, "c31:30", [UTIMERLO, UTIMERHI], ["utimer"]>, - DwarfRegNum<[97]>; + DwarfRegNum<[85]>; + def UTIMER : Rcc<30, "c31:30", [UTIMERLO, UTIMERHI], ["utimer"]>, + DwarfRegNum<[97]>; } - foreach i = 0-31 in { - def V#i : Ri<i, "v"#i>, DwarfRegNum<[!add(i, 99)]>; - def VF#i : R_fake<"__"#!add(i,999999)>, DwarfRegNum<[!add(i, 999999)]>; - def VFR#i : R_fake<"__"#!add(i,9999999)>, DwarfRegNum<[!add(i, 9999999)]>; + foreach i = 0 -31 in { + def V#i : Ri<i, "v"#i>, DwarfRegNum<[!add(i, 99)]>; + def VF#i : R_fake<"__"#!add(i, 999999)>, DwarfRegNum<[!add(i, 999999)]>; + def VFR#i : R_fake<"__"#!add(i, 9999999)>, DwarfRegNum<[!add(i, 9999999)]>; } def VTMP : Ri<0, "vtmp">, DwarfRegNum<[131]>; // Aliases of the V* registers used to hold double vec values. let SubRegIndices = [vsub_lo, vsub_hi, vsub_fake], CoveredBySubRegs = 1 in { - def W0 : Rd< 0, "v1:0", [V0, V1, VF0]>, DwarfRegNum<[99]>; - def W1 : Rd< 2, "v3:2", [V2, V3, VF1]>, DwarfRegNum<[101]>; - def W2 : Rd< 4, "v5:4", [V4, V5, VF2]>, DwarfRegNum<[103]>; - def W3 : Rd< 6, "v7:6", [V6, V7, VF3]>, DwarfRegNum<[105]>; - def W4 : Rd< 8, "v9:8", [V8, V9, VF4]>, DwarfRegNum<[107]>; - def W5 : Rd<10, "v11:10", [V10, V11, VF5]>, DwarfRegNum<[109]>; - def W6 : Rd<12, "v13:12", [V12, V13, VF6]>, DwarfRegNum<[111]>; - def W7 : Rd<14, "v15:14", [V14, V15, VF7]>, DwarfRegNum<[113]>; - def W8 : Rd<16, "v17:16", [V16, V17, VF8]>, DwarfRegNum<[115]>; - def W9 : Rd<18, "v19:18", [V18, V19, VF9]>, DwarfRegNum<[117]>; - def W10 : Rd<20, "v21:20", [V20, V21, VF10]>, DwarfRegNum<[119]>; - def W11 : Rd<22, "v23:22", [V22, V23, VF11]>, DwarfRegNum<[121]>; - def W12 : Rd<24, "v25:24", [V24, V25, VF12]>, DwarfRegNum<[123]>; - def W13 : Rd<26, "v27:26", [V26, V27, VF13]>, DwarfRegNum<[125]>; - def W14 : Rd<28, "v29:28", [V28, V29, VF14]>, DwarfRegNum<[127]>; - def W15 : Rd<30, "v31:30", [V30, V31, VF15]>, DwarfRegNum<[129]>; + def W0 : Rd<0, "v1:0", [V0, V1, VF0]>, DwarfRegNum<[99]>; + def W1 : Rd<2, "v3:2", [V2, V3, VF1]>, DwarfRegNum<[101]>; + def W2 : Rd<4, "v5:4", [V4, V5, VF2]>, DwarfRegNum<[103]>; + def W3 : Rd<6, "v7:6", [V6, V7, VF3]>, DwarfRegNum<[105]>; + def W4 : Rd<8, "v9:8", [V8, V9, VF4]>, DwarfRegNum<[107]>; + def W5 : Rd<10, "v11:10", [V10, V11, VF5]>, DwarfRegNum<[109]>; + def W6 : Rd<12, "v13:12", [V12, V13, VF6]>, DwarfRegNum<[111]>; + def W7 : Rd<14, "v15:14", [V14, V15, VF7]>, DwarfRegNum<[113]>; + def W8 : Rd<16, "v17:16", [V16, V17, VF8]>, DwarfRegNum<[115]>; + def W9 : Rd<18, "v19:18", [V18, V19, VF9]>, DwarfRegNum<[117]>; + def W10 : Rd<20, "v21:20", [V20, V21, VF10]>, DwarfRegNum<[119]>; + def W11 : Rd<22, "v23:22", [V22, V23, VF11]>, DwarfRegNum<[121]>; + def W12 : Rd<24, "v25:24", [V24, V25, VF12]>, 
DwarfRegNum<[123]>; + def W13 : Rd<26, "v27:26", [V26, V27, VF13]>, DwarfRegNum<[125]>; + def W14 : Rd<28, "v29:28", [V28, V29, VF14]>, DwarfRegNum<[127]>; + def W15 : Rd<30, "v31:30", [V30, V31, VF15]>, DwarfRegNum<[129]>; } // Reverse Aliases of the V* registers used to hold double vec values. let SubRegIndices = [vsub_lo, vsub_hi, vsub_fake], CoveredBySubRegs = 1 in { - def WR0 : Rd< 1, "v0:1", [V0, V1, VFR0]>, DwarfRegNum<[161]>; - def WR1 : Rd< 3, "v2:3", [V2, V3, VFR1]>, DwarfRegNum<[162]>; - def WR2 : Rd< 5, "v4:5", [V4, V5, VFR2]>, DwarfRegNum<[163]>; - def WR3 : Rd< 7, "v6:7", [V6, V7, VFR3]>, DwarfRegNum<[164]>; - def WR4 : Rd< 9, "v8:9", [V8, V9, VFR4]>, DwarfRegNum<[165]>; - def WR5 : Rd<11, "v10:11", [V10, V11, VFR5]>, DwarfRegNum<[166]>; - def WR6 : Rd<13, "v12:13", [V12, V13, VFR6]>, DwarfRegNum<[167]>; - def WR7 : Rd<15, "v14:15", [V14, V15, VFR7]>, DwarfRegNum<[168]>; - def WR8 : Rd<17, "v16:17", [V16, V17, VFR8]>, DwarfRegNum<[169]>; - def WR9 : Rd<19, "v18:19", [V18, V19, VFR9]>, DwarfRegNum<[170]>; - def WR10: Rd<21, "v20:21", [V20, V21, VFR10]>, DwarfRegNum<[171]>; - def WR11: Rd<23, "v22:23", [V22, V23, VFR11]>, DwarfRegNum<[172]>; - def WR12: Rd<25, "v24:25", [V24, V25, VFR12]>, DwarfRegNum<[173]>; - def WR13: Rd<27, "v26:27", [V26, V27, VFR13]>, DwarfRegNum<[174]>; - def WR14: Rd<29, "v28:29", [V28, V29, VFR14]>, DwarfRegNum<[175]>; - def WR15: Rd<31, "v30:31", [V30, V31, VFR15]>, DwarfRegNum<[176]>; + def WR0 : Rd<1, "v0:1", [V0, V1, VFR0]>, DwarfRegNum<[161]>; + def WR1 : Rd<3, "v2:3", [V2, V3, VFR1]>, DwarfRegNum<[162]>; + def WR2 : Rd<5, "v4:5", [V4, V5, VFR2]>, DwarfRegNum<[163]>; + def WR3 : Rd<7, "v6:7", [V6, V7, VFR3]>, DwarfRegNum<[164]>; + def WR4 : Rd<9, "v8:9", [V8, V9, VFR4]>, DwarfRegNum<[165]>; + def WR5 : Rd<11, "v10:11", [V10, V11, VFR5]>, DwarfRegNum<[166]>; + def WR6 : Rd<13, "v12:13", [V12, V13, VFR6]>, DwarfRegNum<[167]>; + def WR7 : Rd<15, "v14:15", [V14, V15, VFR7]>, DwarfRegNum<[168]>; + def WR8 : Rd<17, "v16:17", [V16, V17, VFR8]>, DwarfRegNum<[169]>; + def WR9 : Rd<19, "v18:19", [V18, V19, VFR9]>, DwarfRegNum<[170]>; + def WR10 : Rd<21, "v20:21", [V20, V21, VFR10]>, DwarfRegNum<[171]>; + def WR11 : Rd<23, "v22:23", [V22, V23, VFR11]>, DwarfRegNum<[172]>; + def WR12 : Rd<25, "v24:25", [V24, V25, VFR12]>, DwarfRegNum<[173]>; + def WR13 : Rd<27, "v26:27", [V26, V27, VFR13]>, DwarfRegNum<[174]>; + def WR14 : Rd<29, "v28:29", [V28, V29, VFR14]>, DwarfRegNum<[175]>; + def WR15 : Rd<31, "v30:31", [V30, V31, VFR15]>, DwarfRegNum<[176]>; } // Aliases of the V* registers used to hold quad vec values. 
let SubRegIndices = [wsub_lo, wsub_hi], CoveredBySubRegs = 1 in { - def VQ0 : Rd< 0, "v3:0", [W0, W1]>, DwarfRegNum<[252]>; - def VQ1 : Rd< 4, "v7:4", [W2, W3]>, DwarfRegNum<[253]>; - def VQ2 : Rd< 8, "v11:8", [W4, W5]>, DwarfRegNum<[254]>; - def VQ3 : Rd<12, "v15:12", [W6, W7]>, DwarfRegNum<[255]>; - def VQ4 : Rd<16, "v19:16", [W8, W9]>, DwarfRegNum<[256]>; - def VQ5 : Rd<20, "v23:20", [W10, W11]>, DwarfRegNum<[257]>; - def VQ6 : Rd<24, "v27:24", [W12, W13]>, DwarfRegNum<[258]>; - def VQ7 : Rd<28, "v31:28", [W14, W15]>, DwarfRegNum<[259]>; + def VQ0 : Rd<0, "v3:0", [W0, W1]>, DwarfRegNum<[252]>; + def VQ1 : Rd<4, "v7:4", [W2, W3]>, DwarfRegNum<[253]>; + def VQ2 : Rd<8, "v11:8", [W4, W5]>, DwarfRegNum<[254]>; + def VQ3 : Rd<12, "v15:12", [W6, W7]>, DwarfRegNum<[255]>; + def VQ4 : Rd<16, "v19:16", [W8, W9]>, DwarfRegNum<[256]>; + def VQ5 : Rd<20, "v23:20", [W10, W11]>, DwarfRegNum<[257]>; + def VQ6 : Rd<24, "v27:24", [W12, W13]>, DwarfRegNum<[258]>; + def VQ7 : Rd<28, "v31:28", [W14, W15]>, DwarfRegNum<[259]>; } // Vector Predicate registers. @@ -286,359 +270,357 @@ let Namespace = "Hexagon" in { def Q3 : Rq<3, "q3">, DwarfRegNum<[134]>; // System registers. - def SGP0 : Rs<0, "sgp0", ["s0"]>, DwarfRegNum<[144]>; - def SGP1 : Rs<1, "sgp1", ["s1"]>, DwarfRegNum<[145]>; - def STID : Rs<2, "stid", ["s2"]>, DwarfRegNum<[146]>; - def ELR : Rs<3, "elr", ["s3"]>, DwarfRegNum<[147]>; - def BADVA0 : Rs<4, "badva0", ["s4"]>, DwarfRegNum<[148]>; - def BADVA1 : Rs<5, "badva1", ["s5"]>, DwarfRegNum<[149]>; - def SSR : Rs<6, "ssr", ["s6"]>, DwarfRegNum<[150]>; - def CCR : Rs<7, "ccr", ["s7"]>, DwarfRegNum<[151]>; - def HTID : Rs<8, "htid", ["s8"]>, DwarfRegNum<[152]>; - def BADVA : Rs<9, "badva", ["s9"]>, DwarfRegNum<[153]>; - def IMASK : Rs<10, "imask", ["s10"]>, DwarfRegNum<[154]>; - def S11 : Rs<11, "s11">, DwarfRegNum<[155]>; - def S12 : Rs<12, "s12">, DwarfRegNum<[156]>; - def S13 : Rs<13, "s13">, DwarfRegNum<[157]>; - def S14 : Rs<14, "s14">, DwarfRegNum<[158]>; - def S15 : Rs<15, "s15">, DwarfRegNum<[159]>; - def EVB : Rs<16, "evb", ["s16"]>, DwarfRegNum<[160]>; - def MODECTL : Rs<17, "modectl", ["s17"]>, DwarfRegNum<[161]>; - def SYSCFG : Rs<18, "syscfg", ["s18"]>, DwarfRegNum<[162]>; - def S19 : Rs<19, "s19", ["s19"]>, DwarfRegNum<[163]>; - def S20 : Rs<20, "s20", ["s20"]>, DwarfRegNum<[164]>; - def VID : Rs<21, "vid", ["s21"]>, DwarfRegNum<[165]>; - def S22 : Rs<22, "s22", ["s22"]>, DwarfRegNum<[166]>; - def S23 : Rs<23, "s23">, DwarfRegNum<[167]>; - def S24 : Rs<24, "s24">, DwarfRegNum<[168]>; - def S25 : Rs<25, "s25">, DwarfRegNum<[169]>; - def S26 : Rs<26, "s26">, DwarfRegNum<[170]>; - def CFGBASE : Rs<27, "cfgbase", ["s27"]>, DwarfRegNum<[171]>; - def DIAG : Rs<28, "diag", ["s28"]>, DwarfRegNum<[172]>; - def REV : Rs<29, "rev", ["s29"]>, DwarfRegNum<[173]>; - def PCYCLELO : Rs<30, "pcyclelo", ["s30"]>, DwarfRegNum<[174]>; - def PCYCLEHI : Rs<31, "pcyclehi", ["s31"]>, DwarfRegNum<[175]>; - def ISDBST : Rs<32, "isdbst", ["s32"]>, DwarfRegNum<[176]>; - def ISDBCFG0 : Rs<33, "isdbcfg0", ["s33"]>, DwarfRegNum<[177]>; - def ISDBCFG1 : Rs<34, "isdbcfg1", ["s34"]>, DwarfRegNum<[178]>; - def S35 : Rs<35, "s35">, DwarfRegNum<[179]>; - def BRKPTPC0 : Rs<36, "brkptpc0", ["s36"]>, DwarfRegNum<[180]>; - def BRKPTCFG0: Rs<37, "brkptcfg0", ["s37"]>, DwarfRegNum<[181]>; - def BRKPTPC1 : Rs<38, "brkptpc1", ["s38"]>, DwarfRegNum<[182]>; - def BRKPTCFG1: Rs<39, "brkptcfg1", ["s39"]>, DwarfRegNum<[183]>; - def ISDBMBXIN: Rs<40, "isdbmbxin", ["s40"]>, DwarfRegNum<[184]>; - def ISDBMBXOUT: Rs<41, "isdbmbxout", 
["s41"]>, DwarfRegNum<[185]>; - def ISDBEN: Rs<42, "isdben", ["s42"]>, DwarfRegNum<[186]>; - def ISDBGPR: Rs<43, "isdbgpr", ["s43"]>, DwarfRegNum<[187]>; - def S44: Rs<44, "s44">, DwarfRegNum<[188]>; - def S45: Rs<45, "s45">, DwarfRegNum<[189]>; - def S46: Rs<46, "s46">, DwarfRegNum<[190]>; - def S47: Rs<47, "s47">, DwarfRegNum<[191]>; - def PMUCNT0: Rs<48, "pmucnt0", ["s48"]>, DwarfRegNum<[192]>; - def PMUCNT1: Rs<49, "pmucnt1", ["s49"]>, DwarfRegNum<[193]>; - def PMUCNT2: Rs<50, "pmucnt2", ["s50"]>, DwarfRegNum<[194]>; - def PMUCNT3: Rs<51, "pmucnt3", ["s51"]>, DwarfRegNum<[195]>; - def PMUEVTCFG: Rs<52, "pmuevtcfg", ["s52"]>, DwarfRegNum<[196]>; - def PMUCFG: Rs<53, "pmucfg", ["s53"]>, DwarfRegNum<[197]>; - def S54: Rs<54, "s54">, DwarfRegNum<[198]>; - def S55: Rs<55, "s55">, DwarfRegNum<[199]>; - def S56: Rs<56, "s56">, DwarfRegNum<[200]>; - def S57: Rs<57, "s57">, DwarfRegNum<[201]>; - def S58: Rs<58, "s58">, DwarfRegNum<[202]>; - def S59: Rs<59, "s59">, DwarfRegNum<[203]>; - def S60: Rs<60, "s60">, DwarfRegNum<[204]>; - def S61: Rs<61, "s61">, DwarfRegNum<[205]>; - def S62: Rs<62, "s62">, DwarfRegNum<[206]>; - def S63: Rs<63, "s63">, DwarfRegNum<[207]>; - def S64: Rs<64, "s64">, DwarfRegNum<[208]>; - def S65: Rs<65, "s65">, DwarfRegNum<[209]>; - def S66: Rs<66, "s66">, DwarfRegNum<[210]>; - def S67: Rs<67, "s67">, DwarfRegNum<[211]>; - def S68: Rs<68, "s68">, DwarfRegNum<[212]>; - def S69: Rs<69, "s69">, DwarfRegNum<[213]>; - def S70: Rs<70, "s70">, DwarfRegNum<[214]>; - def S71: Rs<71, "s71">, DwarfRegNum<[215]>; - def S72: Rs<72, "s72">, DwarfRegNum<[216]>; - def S73: Rs<73, "s73">, DwarfRegNum<[217]>; - def S74: Rs<74, "s74">, DwarfRegNum<[218]>; - def S75: Rs<75, "s75">, DwarfRegNum<[219]>; - def S76: Rs<76, "s76">, DwarfRegNum<[220]>; - def S77: Rs<77, "s77">, DwarfRegNum<[221]>; - def S78: Rs<78, "s78">, DwarfRegNum<[222]>; - def S79: Rs<79, "s79">, DwarfRegNum<[223]>; - def S80: Rs<80, "s80">, DwarfRegNum<[224]>; + def SGP0 : Rs<0, "sgp0", ["s0"]>, DwarfRegNum<[144]>; + def SGP1 : Rs<1, "sgp1", ["s1"]>, DwarfRegNum<[145]>; + def STID : Rs<2, "stid", ["s2"]>, DwarfRegNum<[146]>; + def ELR : Rs<3, "elr", ["s3"]>, DwarfRegNum<[147]>; + def BADVA0 : Rs<4, "badva0", ["s4"]>, DwarfRegNum<[148]>; + def BADVA1 : Rs<5, "badva1", ["s5"]>, DwarfRegNum<[149]>; + def SSR : Rs<6, "ssr", ["s6"]>, DwarfRegNum<[150]>; + def CCR : Rs<7, "ccr", ["s7"]>, DwarfRegNum<[151]>; + def HTID : Rs<8, "htid", ["s8"]>, DwarfRegNum<[152]>; + def BADVA : Rs<9, "badva", ["s9"]>, DwarfRegNum<[153]>; + def IMASK : Rs<10, "imask", ["s10"]>, DwarfRegNum<[154]>; + def S11 : Rs<11, "s11">, DwarfRegNum<[155]>; + def S12 : Rs<12, "s12">, DwarfRegNum<[156]>; + def S13 : Rs<13, "s13">, DwarfRegNum<[157]>; + def S14 : Rs<14, "s14">, DwarfRegNum<[158]>; + def S15 : Rs<15, "s15">, DwarfRegNum<[159]>; + def EVB : Rs<16, "evb", ["s16"]>, DwarfRegNum<[160]>; + def MODECTL : Rs<17, "modectl", ["s17"]>, DwarfRegNum<[161]>; + def SYSCFG : Rs<18, "syscfg", ["s18"]>, DwarfRegNum<[162]>; + def S19 : Rs<19, "s19", ["s19"]>, DwarfRegNum<[163]>; + def S20 : Rs<20, "s20", ["s20"]>, DwarfRegNum<[164]>; + def VID : Rs<21, "vid", ["s21"]>, DwarfRegNum<[165]>; + def S22 : Rs<22, "s22", ["s22"]>, DwarfRegNum<[166]>; + def S23 : Rs<23, "s23">, DwarfRegNum<[167]>; + def S24 : Rs<24, "s24">, DwarfRegNum<[168]>; + def S25 : Rs<25, "s25">, DwarfRegNum<[169]>; + def S26 : Rs<26, "s26">, DwarfRegNum<[170]>; + def CFGBASE : Rs<27, "cfgbase", ["s27"]>, DwarfRegNum<[171]>; + def DIAG : Rs<28, "diag", ["s28"]>, DwarfRegNum<[172]>; + def REV : Rs<29, 
"rev", ["s29"]>, DwarfRegNum<[173]>; + def PCYCLELO : Rs<30, "pcyclelo", ["s30"]>, DwarfRegNum<[174]>; + def PCYCLEHI : Rs<31, "pcyclehi", ["s31"]>, DwarfRegNum<[175]>; + def ISDBST : Rs<32, "isdbst", ["s32"]>, DwarfRegNum<[176]>; + def ISDBCFG0 : Rs<33, "isdbcfg0", ["s33"]>, DwarfRegNum<[177]>; + def ISDBCFG1 : Rs<34, "isdbcfg1", ["s34"]>, DwarfRegNum<[178]>; + def S35 : Rs<35, "s35">, DwarfRegNum<[179]>; + def BRKPTPC0 : Rs<36, "brkptpc0", ["s36"]>, DwarfRegNum<[180]>; + def BRKPTCFG0 : Rs<37, "brkptcfg0", ["s37"]>, DwarfRegNum<[181]>; + def BRKPTPC1 : Rs<38, "brkptpc1", ["s38"]>, DwarfRegNum<[182]>; + def BRKPTCFG1 : Rs<39, "brkptcfg1", ["s39"]>, DwarfRegNum<[183]>; + def ISDBMBXIN : Rs<40, "isdbmbxin", ["s40"]>, DwarfRegNum<[184]>; + def ISDBMBXOUT : Rs<41, "isdbmbxout", ["s41"]>, DwarfRegNum<[185]>; + def ISDBEN : Rs<42, "isdben", ["s42"]>, DwarfRegNum<[186]>; + def ISDBGPR : Rs<43, "isdbgpr", ["s43"]>, DwarfRegNum<[187]>; + def S44 : Rs<44, "s44">, DwarfRegNum<[188]>; + def S45 : Rs<45, "s45">, DwarfRegNum<[189]>; + def S46 : Rs<46, "s46">, DwarfRegNum<[190]>; + def S47 : Rs<47, "s47">, DwarfRegNum<[191]>; + def PMUCNT0 : Rs<48, "pmucnt0", ["s48"]>, DwarfRegNum<[192]>; + def PMUCNT1 : Rs<49, "pmucnt1", ["s49"]>, DwarfRegNum<[193]>; + def PMUCNT2 : Rs<50, "pmucnt2", ["s50"]>, DwarfRegNum<[194]>; + def PMUCNT3 : Rs<51, "pmucnt3", ["s51"]>, DwarfRegNum<[195]>; + def PMUEVTCFG : Rs<52, "pmuevtcfg", ["s52"]>, DwarfRegNum<[196]>; + def PMUCFG : Rs<53, "pmucfg", ["s53"]>, DwarfRegNum<[197]>; + def S54 : Rs<54, "s54">, DwarfRegNum<[198]>; + def S55 : Rs<55, "s55">, DwarfRegNum<[199]>; + def S56 : Rs<56, "s56">, DwarfRegNum<[200]>; + def S57 : Rs<57, "s57">, DwarfRegNum<[201]>; + def S58 : Rs<58, "s58">, DwarfRegNum<[202]>; + def S59 : Rs<59, "s59">, DwarfRegNum<[203]>; + def S60 : Rs<60, "s60">, DwarfRegNum<[204]>; + def S61 : Rs<61, "s61">, DwarfRegNum<[205]>; + def S62 : Rs<62, "s62">, DwarfRegNum<[206]>; + def S63 : Rs<63, "s63">, DwarfRegNum<[207]>; + def S64 : Rs<64, "s64">, DwarfRegNum<[208]>; + def S65 : Rs<65, "s65">, DwarfRegNum<[209]>; + def S66 : Rs<66, "s66">, DwarfRegNum<[210]>; + def S67 : Rs<67, "s67">, DwarfRegNum<[211]>; + def S68 : Rs<68, "s68">, DwarfRegNum<[212]>; + def S69 : Rs<69, "s69">, DwarfRegNum<[213]>; + def S70 : Rs<70, "s70">, DwarfRegNum<[214]>; + def S71 : Rs<71, "s71">, DwarfRegNum<[215]>; + def S72 : Rs<72, "s72">, DwarfRegNum<[216]>; + def S73 : Rs<73, "s73">, DwarfRegNum<[217]>; + def S74 : Rs<74, "s74">, DwarfRegNum<[218]>; + def S75 : Rs<75, "s75">, DwarfRegNum<[219]>; + def S76 : Rs<76, "s76">, DwarfRegNum<[220]>; + def S77 : Rs<77, "s77">, DwarfRegNum<[221]>; + def S78 : Rs<78, "s78">, DwarfRegNum<[222]>; + def S79 : Rs<79, "s79">, DwarfRegNum<[223]>; + def S80 : Rs<80, "s80">, DwarfRegNum<[224]>; // System Register Pair let SubRegIndices = [isub_lo, isub_hi], CoveredBySubRegs = 1 in { - def SGP1_0 : Rss<0, "s1:0", [SGP0, SGP1], ["sgp1:0"]>, DwarfRegNum<[144]>; - def S3_2 : Rss<2, "s3:2", [STID, ELR]>, DwarfRegNum<[146]>; - def S5_4 : Rss<4, "s5:4", [BADVA0, BADVA1], ["badva1:0"]>, - DwarfRegNum<[148]>; - def S7_6 : Rss<6, "s7:6", [SSR, CCR], ["ccr:ssr"]>, DwarfRegNum<[150]>; - def S9_8 : Rss<8, "s9:8", [HTID, BADVA]>, DwarfRegNum<[152]>; - def S11_10 : Rss<10, "s11:10", [IMASK, S11]>, DwarfRegNum<[154]>; - def S13_12 : Rss<12, "s13:12", [S12, S13]>, DwarfRegNum<[156]>; - def S15_14 : Rss<14, "s15:14", [S14, S15]>, DwarfRegNum<[158]>; - def S17_16 : Rss<16, "s17:16", [EVB, MODECTL]>, DwarfRegNum<[160]>; - def S19_18 : Rss<18, "s19:18", [SYSCFG, S19]>, 
DwarfRegNum<[162]>; - def S21_20 : Rss<20, "s21:20", [S20, VID]>, DwarfRegNum<[164]>; - def S23_22 : Rss<22, "s23:22", [S22, S23]>, DwarfRegNum<[166]>; - def S25_24 : Rss<24, "s25:24", [S24, S25]>, DwarfRegNum<[168]>; - def S27_26 : Rss<26, "s27:26", [S26, CFGBASE]>, DwarfRegNum<[170]>; - def S29_28 : Rss<28, "s29:28", [DIAG, REV]>, DwarfRegNum<[172]>; - def S31_30 : Rss<30, "s31:30", [PCYCLELO, PCYCLEHI], ["pcycle"]>, DwarfRegNum<[174]>; - def S33_32 : Rss<32, "s33:32", [ISDBST, ISDBCFG0]>, DwarfRegNum<[176]>; - def S35_34 : Rss<34, "s35:34", [ISDBCFG1, S35]>, DwarfRegNum<[178]>; - def S37_36 : Rss<36, "s37:36", [BRKPTPC0, BRKPTCFG0]>, DwarfRegNum<[180]>; - def S39_38 : Rss<38, "s39:38", [BRKPTPC1, BRKPTCFG1]>, DwarfRegNum<[182]>; + def SGP1_0 : Rss<0, "s1:0", [SGP0, SGP1], ["sgp1:0"]>, DwarfRegNum<[144]>; + def S3_2 : Rss<2, "s3:2", [STID, ELR]>, DwarfRegNum<[146]>; + def S5_4 : Rss<4, "s5:4", [BADVA0, BADVA1], ["badva1:0"]>, + DwarfRegNum<[148]>; + def S7_6 : Rss<6, "s7:6", [SSR, CCR], ["ccr:ssr"]>, DwarfRegNum<[150]>; + def S9_8 : Rss<8, "s9:8", [HTID, BADVA]>, DwarfRegNum<[152]>; + def S11_10 : Rss<10, "s11:10", [IMASK, S11]>, DwarfRegNum<[154]>; + def S13_12 : Rss<12, "s13:12", [S12, S13]>, DwarfRegNum<[156]>; + def S15_14 : Rss<14, "s15:14", [S14, S15]>, DwarfRegNum<[158]>; + def S17_16 : Rss<16, "s17:16", [EVB, MODECTL]>, DwarfRegNum<[160]>; + def S19_18 : Rss<18, "s19:18", [SYSCFG, S19]>, DwarfRegNum<[162]>; + def S21_20 : Rss<20, "s21:20", [S20, VID]>, DwarfRegNum<[164]>; + def S23_22 : Rss<22, "s23:22", [S22, S23]>, DwarfRegNum<[166]>; + def S25_24 : Rss<24, "s25:24", [S24, S25]>, DwarfRegNum<[168]>; + def S27_26 : Rss<26, "s27:26", [S26, CFGBASE]>, DwarfRegNum<[170]>; + def S29_28 : Rss<28, "s29:28", [DIAG, REV]>, DwarfRegNum<[172]>; + def S31_30 : Rss<30, "s31:30", [PCYCLELO, PCYCLEHI], ["pcycle"]>, + DwarfRegNum<[174]>; + def S33_32 : Rss<32, "s33:32", [ISDBST, ISDBCFG0]>, DwarfRegNum<[176]>; + def S35_34 : Rss<34, "s35:34", [ISDBCFG1, S35]>, DwarfRegNum<[178]>; + def S37_36 : Rss<36, "s37:36", [BRKPTPC0, BRKPTCFG0]>, DwarfRegNum<[180]>; + def S39_38 : Rss<38, "s39:38", [BRKPTPC1, BRKPTCFG1]>, DwarfRegNum<[182]>; def S41_40 : Rss<40, "s41:40", [ISDBMBXIN, ISDBMBXOUT]>, DwarfRegNum<[184]>; - def S43_42 : Rss<42, "s43:42", [ISDBEN, ISDBGPR]>, DwarfRegNum<[186]>; - def S45_44 : Rss<44, "s45:44", [S44, S45]>, DwarfRegNum<[188]>; - def S47_46 : Rss<46, "s47:46", [S46, S47]>, DwarfRegNum<[190]>; - def S49_48 : Rss<48, "s49:48", [PMUCNT0, PMUCNT1]>, DwarfRegNum<[192]>; - def S51_50 : Rss<50, "s51:50", [PMUCNT2, PMUCNT3]>, DwarfRegNum<[194]>; - def S53_52 : Rss<52, "s53:52", [PMUEVTCFG, PMUCFG]>, DwarfRegNum<[196]>; - def S55_54 : Rss<54, "s55:54", [S54, S55]>, DwarfRegNum<[198]>; - def S57_56 : Rss<56, "s57:56", [S56, S57]>, DwarfRegNum<[200]>; - def S59_58 : Rss<58, "s59:58", [S58, S59]>, DwarfRegNum<[202]>; - def S61_60 : Rss<60, "s61:60", [S60, S61]>, DwarfRegNum<[204]>; - def S63_62 : Rss<62, "s63:62", [S62, S63]>, DwarfRegNum<[206]>; - def S65_64 : Rss<64, "s65:64", [S64, S65]>, DwarfRegNum<[208]>; - def S67_66 : Rss<66, "s67:66", [S66, S67]>, DwarfRegNum<[210]>; - def S69_68 : Rss<68, "s69:68", [S68, S69]>, DwarfRegNum<[212]>; - def S71_70 : Rss<70, "s71:70", [S70, S71]>, DwarfRegNum<[214]>; - def S73_72 : Rss<72, "s73:72", [S72, S73]>, DwarfRegNum<[216]>; - def S75_74 : Rss<74, "s75:74", [S74, S75]>, DwarfRegNum<[218]>; - def S77_76 : Rss<76, "s77:76", [S77, S76]>, DwarfRegNum<[219]>; - def S79_78 : Rss<78, "s79:78", [S79, S78]>, DwarfRegNum<[220]>; + def S43_42 : Rss<42, 
"s43:42", [ISDBEN, ISDBGPR]>, DwarfRegNum<[186]>; + def S45_44 : Rss<44, "s45:44", [S44, S45]>, DwarfRegNum<[188]>; + def S47_46 : Rss<46, "s47:46", [S46, S47]>, DwarfRegNum<[190]>; + def S49_48 : Rss<48, "s49:48", [PMUCNT0, PMUCNT1]>, DwarfRegNum<[192]>; + def S51_50 : Rss<50, "s51:50", [PMUCNT2, PMUCNT3]>, DwarfRegNum<[194]>; + def S53_52 : Rss<52, "s53:52", [PMUEVTCFG, PMUCFG]>, DwarfRegNum<[196]>; + def S55_54 : Rss<54, "s55:54", [S54, S55]>, DwarfRegNum<[198]>; + def S57_56 : Rss<56, "s57:56", [S56, S57]>, DwarfRegNum<[200]>; + def S59_58 : Rss<58, "s59:58", [S58, S59]>, DwarfRegNum<[202]>; + def S61_60 : Rss<60, "s61:60", [S60, S61]>, DwarfRegNum<[204]>; + def S63_62 : Rss<62, "s63:62", [S62, S63]>, DwarfRegNum<[206]>; + def S65_64 : Rss<64, "s65:64", [S64, S65]>, DwarfRegNum<[208]>; + def S67_66 : Rss<66, "s67:66", [S66, S67]>, DwarfRegNum<[210]>; + def S69_68 : Rss<68, "s69:68", [S68, S69]>, DwarfRegNum<[212]>; + def S71_70 : Rss<70, "s71:70", [S70, S71]>, DwarfRegNum<[214]>; + def S73_72 : Rss<72, "s73:72", [S72, S73]>, DwarfRegNum<[216]>; + def S75_74 : Rss<74, "s75:74", [S74, S75]>, DwarfRegNum<[218]>; + def S77_76 : Rss<76, "s77:76", [S77, S76]>, DwarfRegNum<[219]>; + def S79_78 : Rss<78, "s79:78", [S79, S78]>, DwarfRegNum<[220]>; } // Guest Registers - def GELR: Rg<0, "gelr", ["g0"]>, DwarfRegNum<[220]>; - def GSR: Rg<1, "gsr", ["g1"]>, DwarfRegNum<[221]>; - def GOSP: Rg<2, "gosp", ["g2"]>, DwarfRegNum<[222]>; - def G3: Rg<3, "gbadva", ["g3"]>, DwarfRegNum<[223]>; - def G4: Rg<4, "g4">, DwarfRegNum<[224]>; - def G5: Rg<5, "g5">, DwarfRegNum<[225]>; - def G6: Rg<6, "g6">, DwarfRegNum<[226]>; - def G7: Rg<7, "g7">, DwarfRegNum<[227]>; - def G8: Rg<8, "g8">, DwarfRegNum<[228]>; - def G9: Rg<9, "g9">, DwarfRegNum<[229]>; - def G10: Rg<10, "g10">, DwarfRegNum<[230]>; - def G11: Rg<11, "g11">, DwarfRegNum<[231]>; - def G12: Rg<12, "g12">, DwarfRegNum<[232]>; - def G13: Rg<13, "g13">, DwarfRegNum<[233]>; - def G14: Rg<14, "g14">, DwarfRegNum<[234]>; - def G15: Rg<15, "g15">, DwarfRegNum<[235]>; - def GPMUCNT4: Rg<16, "gpmucnt4", ["g16"]>, DwarfRegNum<[236]>; - def GPMUCNT5: Rg<17, "gpmucnt5", ["g17"]>, DwarfRegNum<[237]>; - def GPMUCNT6: Rg<18, "gpmucnt6", ["g18"]>, DwarfRegNum<[238]>; - def GPMUCNT7: Rg<19, "gpmucnt7", ["g19"]>, DwarfRegNum<[239]>; - def G20: Rg<20, "g20">, DwarfRegNum<[240]>; - def G21: Rg<21, "g21">, DwarfRegNum<[241]>; - def G22: Rg<22, "g22">, DwarfRegNum<[242]>; - def G23: Rg<23, "g23">, DwarfRegNum<[243]>; - def GPCYCLELO: Rg<24, "gpcyclelo", ["g24"]>, DwarfRegNum<[244]>; - def GPCYCLEHI: Rg<25, "gpcyclehi", ["g25"]>, DwarfRegNum<[245]>; - def GPMUCNT0: Rg<26, "gpmucnt0", ["g26"]>, DwarfRegNum<[246]>; - def GPMUCNT1: Rg<27, "gpmucnt1", ["g27"]>, DwarfRegNum<[247]>; - def GPMUCNT2: Rg<28, "gpmucnt2", ["g28"]>, DwarfRegNum<[248]>; - def GPMUCNT3: Rg<29, "gpmucnt3", ["g29"]>, DwarfRegNum<[249]>; - def G30: Rg<30, "g30">, DwarfRegNum<[250]>; - def G31: Rg<31, "g31">, DwarfRegNum<[251]>; + def GELR : Rg<0, "gelr", ["g0"]>, DwarfRegNum<[220]>; + def GSR : Rg<1, "gsr", ["g1"]>, DwarfRegNum<[221]>; + def GOSP : Rg<2, "gosp", ["g2"]>, DwarfRegNum<[222]>; + def G3 : Rg<3, "gbadva", ["g3"]>, DwarfRegNum<[223]>; + def G4 : Rg<4, "g4">, DwarfRegNum<[224]>; + def G5 : Rg<5, "g5">, DwarfRegNum<[225]>; + def G6 : Rg<6, "g6">, DwarfRegNum<[226]>; + def G7 : Rg<7, "g7">, DwarfRegNum<[227]>; + def G8 : Rg<8, "g8">, DwarfRegNum<[228]>; + def G9 : Rg<9, "g9">, DwarfRegNum<[229]>; + def G10 : Rg<10, "g10">, DwarfRegNum<[230]>; + def G11 : Rg<11, "g11">, DwarfRegNum<[231]>; + def 
G12 : Rg<12, "g12">, DwarfRegNum<[232]>; + def G13 : Rg<13, "g13">, DwarfRegNum<[233]>; + def G14 : Rg<14, "g14">, DwarfRegNum<[234]>; + def G15 : Rg<15, "g15">, DwarfRegNum<[235]>; + def GPMUCNT4 : Rg<16, "gpmucnt4", ["g16"]>, DwarfRegNum<[236]>; + def GPMUCNT5 : Rg<17, "gpmucnt5", ["g17"]>, DwarfRegNum<[237]>; + def GPMUCNT6 : Rg<18, "gpmucnt6", ["g18"]>, DwarfRegNum<[238]>; + def GPMUCNT7 : Rg<19, "gpmucnt7", ["g19"]>, DwarfRegNum<[239]>; + def G20 : Rg<20, "g20">, DwarfRegNum<[240]>; + def G21 : Rg<21, "g21">, DwarfRegNum<[241]>; + def G22 : Rg<22, "g22">, DwarfRegNum<[242]>; + def G23 : Rg<23, "g23">, DwarfRegNum<[243]>; + def GPCYCLELO : Rg<24, "gpcyclelo", ["g24"]>, DwarfRegNum<[244]>; + def GPCYCLEHI : Rg<25, "gpcyclehi", ["g25"]>, DwarfRegNum<[245]>; + def GPMUCNT0 : Rg<26, "gpmucnt0", ["g26"]>, DwarfRegNum<[246]>; + def GPMUCNT1 : Rg<27, "gpmucnt1", ["g27"]>, DwarfRegNum<[247]>; + def GPMUCNT2 : Rg<28, "gpmucnt2", ["g28"]>, DwarfRegNum<[248]>; + def GPMUCNT3 : Rg<29, "gpmucnt3", ["g29"]>, DwarfRegNum<[249]>; + def G30 : Rg<30, "g30">, DwarfRegNum<[250]>; + def G31 : Rg<31, "g31">, DwarfRegNum<[251]>; // Guest Register Pairs let SubRegIndices = [isub_lo, isub_hi], CoveredBySubRegs = 1 in { - def G1_0 : Rgg<0, "g1:0", [GELR, GSR]>, DwarfRegNum<[220]>; - def G3_2 : Rgg<2, "g3:2", [GOSP, G3]>, DwarfRegNum<[222]>; - def G5_4 : Rgg<4, "g5:4", [G4, G5]>, DwarfRegNum<[224]>; - def G7_6 : Rgg<6, "g7:6", [G6, G7]>, DwarfRegNum<[226]>; - def G9_8 : Rgg<8, "g9:8", [G8, G9]>, DwarfRegNum<[228]>; - def G11_10 : Rgg<10, "g11:10", [G10, G11]>, DwarfRegNum<[230]>; - def G13_12 : Rgg<12, "g13:12", [G12, G13]>, DwarfRegNum<[232]>; - def G15_14 : Rgg<14, "g15:14", [G14, G15]>, DwarfRegNum<[234]>; - def G17_16 : Rgg<16, "g17:16", [GPMUCNT4, GPMUCNT5]>, DwarfRegNum<[236]>; - def G19_18 : Rgg<18, "g19:18", [GPMUCNT6, GPMUCNT7]>, DwarfRegNum<[238]>; - def G21_20 : Rgg<20, "g21:20", [G20, G21]>, DwarfRegNum<[240]>; - def G23_22 : Rgg<22, "g23:22", [G22, G23]>, DwarfRegNum<[242]>; + def G1_0 : Rgg<0, "g1:0", [GELR, GSR]>, DwarfRegNum<[220]>; + def G3_2 : Rgg<2, "g3:2", [GOSP, G3]>, DwarfRegNum<[222]>; + def G5_4 : Rgg<4, "g5:4", [G4, G5]>, DwarfRegNum<[224]>; + def G7_6 : Rgg<6, "g7:6", [G6, G7]>, DwarfRegNum<[226]>; + def G9_8 : Rgg<8, "g9:8", [G8, G9]>, DwarfRegNum<[228]>; + def G11_10 : Rgg<10, "g11:10", [G10, G11]>, DwarfRegNum<[230]>; + def G13_12 : Rgg<12, "g13:12", [G12, G13]>, DwarfRegNum<[232]>; + def G15_14 : Rgg<14, "g15:14", [G14, G15]>, DwarfRegNum<[234]>; + def G17_16 : Rgg<16, "g17:16", [GPMUCNT4, GPMUCNT5]>, DwarfRegNum<[236]>; + def G19_18 : Rgg<18, "g19:18", [GPMUCNT6, GPMUCNT7]>, DwarfRegNum<[238]>; + def G21_20 : Rgg<20, "g21:20", [G20, G21]>, DwarfRegNum<[240]>; + def G23_22 : Rgg<22, "g23:22", [G22, G23]>, DwarfRegNum<[242]>; def G25_24 : Rgg<24, "g25:24", [GPCYCLELO, GPCYCLEHI]>, DwarfRegNum<[244]>; - def G27_26 : Rgg<26, "g27:26", [GPMUCNT0, GPMUCNT1]>, DwarfRegNum<[246]>; - def G29_28 : Rgg<28, "g29:28", [GPMUCNT2, GPMUCNT3]>, DwarfRegNum<[248]>; - def G31_30 : Rgg<30, "g31:30", [G30, G31]>, DwarfRegNum<[250]>; + def G27_26 : Rgg<26, "g27:26", [GPMUCNT0, GPMUCNT1]>, DwarfRegNum<[246]>; + def G29_28 : Rgg<28, "g29:28", [GPMUCNT2, GPMUCNT3]>, DwarfRegNum<[248]>; + def G31_30 : Rgg<30, "g31:30", [G30, G31]>, DwarfRegNum<[250]>; } - } // HVX types -def VecI1: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], - [v64i1, v128i1, v64i1]>; -def VecI8: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], - [v64i8, v128i8, v64i8]>; -def VecI16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], - 
[v32i16, v64i16, v32i16]>; -def VecI32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], - [v16i32, v32i32, v16i32]>; -def VecF16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], - [v32f16, v64f16, v32f16]>; -def VecF32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], - [v16f32, v32f32, v16f32]>; - -def VecPI8: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], - [v128i8, v256i8, v128i8]>; -def VecPI16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], - [v64i16, v128i16, v64i16]>; -def VecPI32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], - [v32i32, v64i32, v32i32]>; -def VecPF16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], - [v64f16, v128f16, v64f16]>; -def VecPF32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], - [v32f32, v64f32, v32f32]>; - -def VecQ8: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], - [v64i1, v128i1, v64i1]>; -def VecQ16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], - [v32i1, v64i1, v32i1]>; -def VecQ32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], - [v16i1, v32i1, v16i1]>; +def VecI1 + : ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v64i1, v128i1, v64i1]>; +def VecI8 + : ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v64i8, v128i8, v64i8]>; +def VecI16 + : ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v32i16, v64i16, v32i16]>; +def VecI32 + : ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v16i32, v32i32, v16i32]>; +def VecF16 + : ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v32f16, v64f16, v32f16]>; +def VecF32 + : ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v16f32, v32f32, v16f32]>; +def VecBF16 : ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v32bf16, v64bf16, + v32bf16]>; + +def VecPI8 + : ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v128i8, v256i8, v128i8]>; +def VecPI16 : ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v64i16, v128i16, + v64i16]>; +def VecPI32 + : ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v32i32, v64i32, v32i32]>; +def VecPF16 : ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v64f16, v128f16, + v64f16]>; +def VecPF32 + : ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v32f32, v64f32, v32f32]>; +def VecPBF16 + : ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v64bf16, v128bf16, + v64bf16]>; + +def VecQ8 + : ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v64i1, v128i1, v64i1]>; +def VecQ16 + : ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v32i1, v64i1, v32i1]>; +def VecQ32 + : ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode], [v16i1, v32i1, v16i1]>; // HVX register classes -def HvxVR : RegisterClass<"Hexagon", [VecI8, VecI16, VecI32, VecF16, VecF32], 512, - (add (sequence "V%u", 0, 31), VTMP)> { - let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode], - [RegInfo<512,512,512>, RegInfo<1024,1024,1024>, RegInfo<512,512,512>]>; +def HvxVR + : RegisterClass<"Hexagon", [VecI8, VecI16, VecI32, VecF16, VecBF16, VecF32], + 512, (add (sequence "V%u", 0, 31), VTMP)> { + let RegInfos = + RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode], [RegInfo<512, 512, 512>, + RegInfo<1024, 1024, 1024>, + RegInfo<512, 512, 512>]>; } -def HvxWR : RegisterClass<"Hexagon", [VecPI8, VecPI16, VecPI32, VecPF16, VecPF32], 1024, - (add (sequence "W%u", 0, 15), (sequence "WR%u", 0, 15))> { - let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode], - [RegInfo<1024,1024,512>, RegInfo<2048,2048,1024>, RegInfo<1024,1024,512>]>; +def HvxWR + : RegisterClass< + "Hexagon", [VecPI8, VecPI16, VecPI32, VecPF16, VecPBF16, VecPF32], + 1024, (add (sequence "W%u", 0, 15), (sequence "WR%u", 0, 15))> { + let RegInfos = + RegInfoByHwMode<[Hvx64, 
Hvx128, DefaultMode], [RegInfo<1024, 1024, 512>, + RegInfo<2048, 2048, 1024>, + RegInfo<1024, 1024, 512>]>; } def HvxQR : RegisterClass<"Hexagon", [VecI1, VecQ8, VecQ16, VecQ32], 128, - (add Q0, Q1, Q2, Q3)> { - let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode], - [RegInfo<64,512,512>, RegInfo<128,1024,1024>, RegInfo<64,512,512>]>; + (add Q0, Q1, Q2, Q3)> { + let RegInfos = + RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode], [RegInfo<64, 512, 512>, + RegInfo<128, 1024, 1024>, + RegInfo<64, 512, 512>]>; } -def HvxVQR : RegisterClass<"Hexagon", [untyped], 2048, - (add (sequence "VQ%u", 0, 7))> { - let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode], - [RegInfo<2048,2048,512>, RegInfo<4096,4096,1024>, RegInfo<2048,2048,512>]>; +def HvxVQR + : RegisterClass<"Hexagon", [untyped], 2048, (add (sequence "VQ%u", 0, 7))> { + let RegInfos = + RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode], [RegInfo<2048, 2048, 512>, + RegInfo<4096, 4096, 1024>, + RegInfo<2048, 2048, 512>]>; } // Core register classes -def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32, - (add (sequence "R%u", 0, 9), (sequence "R%u", 12, 28), - R10, R11, R29, R30, R31)>; +def IntRegs + : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32, + (add (sequence "R%u", 0, 9), (sequence "R%u", 12, 28), R10, + R11, R29, R30, R31)>; // Registers are listed in reverse order for allocation preference reasons. def GeneralSubRegs : RegisterClass<"Hexagon", [i32], 32, - (add R23, R22, R21, R20, R19, R18, R17, R16, - R7, R6, R5, R4, R3, R2, R1, R0)>; + (add R23, R22, R21, R20, R19, R18, R17, R16, + R7, R6, R5, R4, R3, R2, R1, R0)>; -def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32, - (add R7, R6, R5, R4, R3, R2, R1, R0)> ; +def IntRegsLow8 + : RegisterClass<"Hexagon", [i32], 32, (add R7, R6, R5, R4, R3, R2, R1, R0)>; def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64, - (add (sequence "D%u", 0, 4), (sequence "D%u", 6, 13), D5, D14, D15)>; - -def GeneralDoubleLow8Regs : RegisterClass<"Hexagon", [i64], 64, - (add D11, D10, D9, D8, D3, D2, D1, D0)>; - -let Size = 32 in -def PredRegs : RegisterClass<"Hexagon", - [i1, v2i1, v4i1, v8i1, v4i8, v2i16, i32], 32, (add P0, P1, P2, P3)>; - -let Size = 32 in -def ModRegs : RegisterClass<"Hexagon", [i32], 32, (add M0, M1)>; - -let Size = 32, isAllocatable = 0 in -def CtrRegs : RegisterClass<"Hexagon", [i32], 32, - (add LC0, SA0, LC1, SA1, P3_0, C5, C8, PC, UGP, GP, CS0, CS1, - UPCYCLELO, UPCYCLEHI, - FRAMELIMIT, FRAMEKEY, PKTCOUNTLO, PKTCOUNTHI, UTIMERLO, UTIMERHI, - M0, M1, USR)>; - -let Size = 64 in -def VectRegRev : RegisterClass<"Hexagon", [i64], 64, - (add (sequence "WR%u", 0, 15))>; - -let isAllocatable = 0 in -def UsrBits : RegisterClass<"Hexagon", [i1], 0, (add USR_OVF)>; - -let Size = 64, isAllocatable = 0 in -def CtrRegs64 : RegisterClass<"Hexagon", [i64], 64, - (add C1_0, C3_2, C5_4, C7_6, C9_8, C11_10, CS, UPCYCLE, C17_16, - PKTCOUNT, UTIMER)>; - -let Size = 32, isAllocatable = 0 in -def GuestRegs : RegisterClass<"Hexagon", [i32], 32, - (add GELR, GSR, GOSP, - (sequence "G%u", 3, 15), - GPMUCNT4, GPMUCNT5, GPMUCNT6, GPMUCNT7, - G20, G21, G22, G23, - GPCYCLELO, GPCYCLEHI, GPMUCNT0, GPMUCNT1, - GPMUCNT2, GPMUCNT3, - G30, G31)>; - -let Size = 64, isAllocatable = 0 in -def GuestRegs64 : RegisterClass<"Hexagon", [i64], 64, - (add G1_0, G3_2, - G5_4, G7_6, G9_8, G11_10, G13_12, G15_14, - G17_16, G19_18, - G21_20, G23_22, - G25_24, G27_26, G29_28, - G31_30)>; - -let Size = 32, isAllocatable = 0 in -def SysRegs : RegisterClass<"Hexagon", [i32], 32, - 
(add SGP0, SGP1, STID, ELR, BADVA0, BADVA1, - SSR, CCR, HTID, BADVA, IMASK, - S11, S12, S13, S14, S15, - S19, S23, S25, - EVB, MODECTL, SYSCFG, S20, VID, S22, S24, - S26, CFGBASE, DIAG, REV, PCYCLEHI, - PCYCLELO, ISDBST, ISDBCFG0, ISDBCFG1, S35, - BRKPTPC0, BRKPTCFG0, BRKPTPC1, BRKPTCFG1, - ISDBMBXIN, ISDBMBXOUT, ISDBEN, ISDBGPR, - S44, S45, S46, S47, - PMUCNT0, PMUCNT1, PMUCNT2, PMUCNT3, - PMUEVTCFG, PMUCFG, S54, S55, S56, S57, - S58, S59, S60, S61, S62, S63, S64, S65, S66, S67, - S68, S69, S70, S71, S72, S73, S74, S75, S76, S77, - S78, S79, S80 - )>; - -let Size = 64, isAllocatable = 0 in -def SysRegs64 : RegisterClass<"Hexagon", [i64], 64, - (add SGP1_0, - S3_2, S5_4, S7_6, S9_8, - S11_10, S13_12, S15_14, - S17_16, S19_18, S21_20, - S23_22, S25_24, - S27_26, S29_28, S31_30, S33_32, S35_34, - S37_36, S39_38, S41_40, S43_42, S45_44, - S47_46, S49_48, S51_50, S53_52, - S55_54, S57_56, S59_58, - S61_60, S63_62, S65_64, S67_66, S69_68, - S71_70, S73_72, S75_74, S77_76, S79_78 - )>; + (add (sequence "D%u", 0, 4), + (sequence "D%u", 6, 13), D5, D14, D15)>; + +def GeneralDoubleLow8Regs + : RegisterClass<"Hexagon", [i64], 64, + (add D11, D10, D9, D8, D3, D2, D1, D0)>; + +let Size = 32 in def PredRegs + : RegisterClass<"Hexagon", [i1, v2i1, v4i1, v8i1, v4i8, v2i16, i32], 32, + (add P0, P1, P2, P3)>; + +let Size = + 32 in def ModRegs : RegisterClass<"Hexagon", [i32], 32, (add M0, M1)>; + +let Size = 32, isAllocatable = 0 in def CtrRegs + : RegisterClass<"Hexagon", [i32], 32, + (add LC0, SA0, LC1, SA1, P3_0, C5, C8, PC, UGP, GP, CS0, + CS1, UPCYCLELO, UPCYCLEHI, FRAMELIMIT, FRAMEKEY, + PKTCOUNTLO, PKTCOUNTHI, UTIMERLO, UTIMERHI, M0, M1, + USR)>; + +let Size = 64 in def VectRegRev + : RegisterClass<"Hexagon", [i64], 64, (add (sequence "WR%u", 0, 15))>; + +let isAllocatable = + 0 in def UsrBits : RegisterClass<"Hexagon", [i1], 0, (add USR_OVF)>; + +let Size = 64, isAllocatable = 0 in def CtrRegs64 + : RegisterClass<"Hexagon", [i64], 64, + (add C1_0, C3_2, C5_4, C7_6, C9_8, C11_10, CS, UPCYCLE, + C17_16, PKTCOUNT, UTIMER)>; + +let Size = 32, isAllocatable = 0 in def GuestRegs + : RegisterClass<"Hexagon", [i32], 32, + (add GELR, GSR, GOSP, (sequence "G%u", 3, 15), GPMUCNT4, + GPMUCNT5, GPMUCNT6, GPMUCNT7, G20, G21, G22, G23, + GPCYCLELO, GPCYCLEHI, GPMUCNT0, GPMUCNT1, GPMUCNT2, + GPMUCNT3, G30, G31)>; + +let Size = 64, isAllocatable = 0 in def GuestRegs64 + : RegisterClass<"Hexagon", [i64], 64, + (add G1_0, G3_2, G5_4, G7_6, G9_8, G11_10, G13_12, G15_14, + G17_16, G19_18, G21_20, G23_22, G25_24, G27_26, G29_28, + G31_30)>; + +let Size = 32, isAllocatable = 0 in def SysRegs + : RegisterClass<"Hexagon", [i32], 32, + (add SGP0, SGP1, STID, ELR, BADVA0, BADVA1, SSR, CCR, HTID, + BADVA, IMASK, S11, S12, S13, S14, S15, S19, S23, S25, + EVB, MODECTL, SYSCFG, S20, VID, S22, S24, S26, CFGBASE, + DIAG, REV, PCYCLEHI, PCYCLELO, ISDBST, ISDBCFG0, + ISDBCFG1, S35, BRKPTPC0, BRKPTCFG0, BRKPTPC1, BRKPTCFG1, + ISDBMBXIN, ISDBMBXOUT, ISDBEN, ISDBGPR, S44, S45, S46, + S47, PMUCNT0, PMUCNT1, PMUCNT2, PMUCNT3, PMUEVTCFG, + PMUCFG, S54, S55, S56, S57, S58, S59, S60, S61, S62, + S63, S64, S65, S66, S67, S68, S69, S70, S71, S72, S73, + S74, S75, S76, S77, S78, S79, S80)>; + +let Size = 64, isAllocatable = 0 in def SysRegs64 + : RegisterClass<"Hexagon", [i64], 64, + (add SGP1_0, S3_2, S5_4, S7_6, S9_8, S11_10, S13_12, S15_14, + S17_16, S19_18, S21_20, S23_22, S25_24, S27_26, S29_28, + S31_30, S33_32, S35_34, S37_36, S39_38, S41_40, S43_42, + S45_44, S47_46, S49_48, S51_50, S53_52, S55_54, S57_56, + S59_58, S61_60, S63_62, 
S65_64, S67_66, S69_68, S71_70, + S73_72, S75_74, S77_76, S79_78)>; // These registers are new for v62 and onward. // The function RegisterMatchesArch() uses this list for validation. -let isAllocatable = 0 in -def V62Regs : RegisterClass<"Hexagon", [i32], 32, - (add FRAMELIMIT, FRAMEKEY, C17_16, PKTCOUNTLO, PKTCOUNTHI, PKTCOUNT, - UTIMERLO, UTIMERHI, UTIMER)>; +let isAllocatable = 0 in def V62Regs + : RegisterClass<"Hexagon", [i32], 32, + (add FRAMELIMIT, FRAMEKEY, C17_16, PKTCOUNTLO, PKTCOUNTHI, + PKTCOUNT, UTIMERLO, UTIMERHI, UTIMER)>; // These registers are new for v65 and onward. -let Size = 32, isAllocatable = 0 in -def V65Regs : RegisterClass<"Hexagon", [i32], 32, (add VTMP)>; - +let Size = 32, isAllocatable = 0 in def V65Regs + : RegisterClass<"Hexagon", [i32], 32, (add VTMP)>; -def HexagonCSR - : CalleeSavedRegs<(add R16, R17, R18, R19, R20, R21, R22, R23, - R24, R25, R26, R27)>; +def HexagonCSR : CalleeSavedRegs<(add R16, R17, R18, R19, R20, R21, R22, R23, + R24, R25, R26, R27)>; diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index ce2de75..66c8b0a 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -28,7 +28,6 @@ #include "llvm/Target/TargetMachine.h" #include <algorithm> #include <cassert> -#include <map> #include <optional> using namespace llvm; @@ -77,8 +76,7 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, OptLevel(TM.getOptLevel()), CPUString(std::string(Hexagon_MC::selectHexagonCPU(CPU))), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), - RegInfo(getHwMode()), TLInfo(TM, *this), - InstrItins(getInstrItineraryForCPU(CPUString)) { + TLInfo(TM, *this), InstrItins(getInstrItineraryForCPU(CPUString)) { Hexagon_MC::addArchSubtarget(this, FS); // Beware of the default constructor of InstrItineraryData: it will // reset all members to 0. diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index 995f66d..dde3229 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -100,7 +100,6 @@ private: // The following objects can use the TargetTriple, so they must be // declared after it. 
HexagonInstrInfo InstrInfo; - HexagonRegisterInfo RegInfo; HexagonTargetLowering TLInfo; HexagonSelectionDAGInfo TSInfo; HexagonFrameLowering FrameLowering; @@ -122,7 +121,7 @@ public: } const HexagonInstrInfo *getInstrInfo() const override { return &InstrInfo; } const HexagonRegisterInfo *getRegisterInfo() const override { - return &RegInfo; + return &InstrInfo.getRegisterInfo(); } const HexagonTargetLowering *getTargetLowering() const override { return &TLInfo; @@ -345,7 +344,11 @@ public: ArrayRef<MVT> getHVXElementTypes() const { static MVT Types[] = {MVT::i8, MVT::i16, MVT::i32}; static MVT TypesV68[] = {MVT::i8, MVT::i16, MVT::i32, MVT::f16, MVT::f32}; + static MVT TypesV81[] = {MVT::i8, MVT::i16, MVT::i32, + MVT::f16, MVT::bf16, MVT::f32}; + if (useHVXV81Ops() && useHVXFloatingPoint()) + return ArrayRef(TypesV81); if (useHVXV68Ops() && useHVXFloatingPoint()) return ArrayRef(TypesV68); return ArrayRef(Types); diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index d9824a31..d98fe80 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -48,6 +48,14 @@ static cl::opt<bool> cl::desc("Disable Hardware Loops for Hexagon target")); static cl::opt<bool> + EnableGenWideningVec("hexagon-widening-vectors", cl::init(true), cl::Hidden, + cl::desc("Generate widening vector instructions")); + +static cl::opt<bool> + EnableOptShuffleVec("hexagon-opt-shuffvec", cl::init(true), cl::Hidden, + cl::desc("Enable optimization of shuffle vectors")); + +static cl::opt<bool> DisableAModeOpt("disable-hexagon-amodeopt", cl::Hidden, cl::desc("Disable Hexagon Addressing Mode Optimization")); @@ -321,6 +329,8 @@ TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) { } void HexagonPassConfig::addIRPasses() { + HexagonTargetMachine &HTM = getHexagonTargetMachine(); + TargetPassConfig::addIRPasses(); bool NoOpt = (getOptLevel() == CodeGenOptLevel::None); @@ -350,6 +360,13 @@ void HexagonPassConfig::addIRPasses() { // Replace certain combinations of shifts and ands with extracts. 
if (EnableGenExtract) addPass(createHexagonGenExtract()); + if (EnableGenWideningVec) { + addPass(createHexagonGenWideningVecInstr(HTM)); + addPass(createHexagonGenWideningVecFloatInstr(HTM)); + addPass(createDeadCodeEliminationPass()); + } + if (EnableOptShuffleVec) + addPass(createHexagonOptShuffleVector(HTM)); } } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index e925e04..59c6201 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -224,14 +224,6 @@ InstructionCost HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } InstructionCost -HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, - Align Alignment, unsigned AddressSpace, - TTI::TargetCostKind CostKind) const { - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); -} - -InstructionCost HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index, @@ -240,13 +232,6 @@ HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, return 1; } -InstructionCost HexagonTTIImpl::getGatherScatterOpCost( - unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, - Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const { - return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, - Alignment, CostKind, I); -} - InstructionCost HexagonTTIImpl::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, @@ -345,14 +330,16 @@ InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, } bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/, - unsigned /*AddressSpace*/) const { + unsigned /*AddressSpace*/, + TTI::MaskKind /*MaskKind*/) const { // This function is called from scalarize-masked-mem-intrin, which runs // in pre-isel. Use ST directly instead of calling isHVXVectorType. return HexagonMaskedVMem && ST.isTypeForHVX(DataType); } bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/, - unsigned /*AddressSpace*/) const { + unsigned /*AddressSpace*/, + TTI::MaskKind /*MaskKind*/) const { // This function is called from scalarize-masked-mem-intrin, which runs // in pre-isel. Use ST directly instead of calling isHVXVectorType. 
return HexagonMaskedVMem && ST.isTypeForHVX(DataType); diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index cec2bf9..edf88cf 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -120,19 +120,10 @@ public: TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I = nullptr) const override; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, - TTI::TargetCostKind CostKind) const override; - InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr) const override; - InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, - const Value *Ptr, bool VariableMask, - Align Alignment, - TTI::TargetCostKind CostKind, - const Instruction *I) const override; InstructionCost getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, @@ -166,9 +157,10 @@ public: } bool isLegalMaskedStore(Type *DataType, Align Alignment, - unsigned AddressSpace) const override; - bool isLegalMaskedLoad(Type *DataType, Align Alignment, - unsigned AddressSpace) const override; + unsigned AddressSpace, + TTI::MaskKind MaskKind) const override; + bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace, + TTI::MaskKind MaskKind) const override; bool isLegalMaskedGather(Type *Ty, Align Alignment) const override; bool isLegalMaskedScatter(Type *Ty, Align Alignment) const override; bool forceScalarizeMaskedGather(VectorType *VTy, diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index cb88d1a..d39b79a 100644 --- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -653,7 +653,7 @@ bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr &MI, const MCInstrDesc& MCID = PacketMI.getDesc(); // First operand is always the result. - const TargetRegisterClass *PacketRC = HII->getRegClass(MCID, 0, HRI); + const TargetRegisterClass *PacketRC = HII->getRegClass(MCID, 0); // Double regs can not feed into new value store: PRM section: 5.4.2.2. 
if (PacketRC == &Hexagon::DoubleRegsRegClass) return false; @@ -866,7 +866,7 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI, return false; const MCInstrDesc& MCID = PI.getDesc(); - const TargetRegisterClass *VecRC = HII->getRegClass(MCID, 0, HRI); + const TargetRegisterClass *VecRC = HII->getRegClass(MCID, 0); if (DisableVecDblNVStores && VecRC == &Hexagon::HvxWRRegClass) return false; diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index 5c50ec2..2813b1d 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -120,10 +120,6 @@ public: size_t length(Value *Val) const; size_t length(Type *Ty) const; - Constant *getNullValue(Type *Ty) const; - Constant *getFullValue(Type *Ty) const; - Constant *getConstSplat(Type *Ty, int Val) const; - Value *simplify(Value *Val) const; Value *insertb(IRBuilderBase &Builder, Value *Dest, Value *Src, int Start, @@ -368,8 +364,8 @@ private: const HexagonVectorCombine &HVC; }; -[[maybe_unused]] -raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::AddrInfo &AI) { +[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, + const AlignVectors::AddrInfo &AI) { OS << "Inst: " << AI.Inst << " " << *AI.Inst << '\n'; OS << "Addr: " << *AI.Addr << '\n'; OS << "Type: " << *AI.ValTy << '\n'; @@ -379,8 +375,8 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::AddrInfo &AI) { return OS; } -[[maybe_unused]] -raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::MoveGroup &MG) { +[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, + const AlignVectors::MoveGroup &MG) { OS << "IsLoad:" << (MG.IsLoad ? "yes" : "no"); OS << ", IsHvx:" << (MG.IsHvx ? "yes" : "no") << '\n'; OS << "Main\n"; @@ -398,9 +394,8 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::MoveGroup &MG) { return OS; } -[[maybe_unused]] -raw_ostream &operator<<(raw_ostream &OS, - const AlignVectors::ByteSpan::Block &B) { +[[maybe_unused]] raw_ostream & +operator<<(raw_ostream &OS, const AlignVectors::ByteSpan::Block &B) { OS << " @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] "; if (B.Seg.Val == reinterpret_cast<const Value *>(&B)) { OS << "(self:" << B.Seg.Val << ')'; @@ -412,8 +407,8 @@ raw_ostream &operator<<(raw_ostream &OS, return OS; } -[[maybe_unused]] -raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan &BS) { +[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS, + const AlignVectors::ByteSpan &BS) { OS << "ByteSpan[size=" << BS.size() << ", extent=" << BS.extent() << '\n'; for (const AlignVectors::ByteSpan::Block &B : BS) OS << B << '\n'; @@ -683,8 +678,8 @@ auto AlignVectors::getMask(Value *Val) const -> Value * { Type *ValTy = getPayload(Val)->getType(); if (auto *VecTy = dyn_cast<VectorType>(ValTy)) - return HVC.getFullValue(HVC.getBoolTy(HVC.length(VecTy))); - return HVC.getFullValue(HVC.getBoolTy()); + return Constant::getAllOnesValue(HVC.getBoolTy(HVC.length(VecTy))); + return Constant::getAllOnesValue(HVC.getBoolTy()); } auto AlignVectors::getPassThrough(Value *Val) const -> Value * { @@ -1123,7 +1118,7 @@ auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder, BasicBlock *BaseBlock = Builder.GetInsertBlock(); ByteSpan ASpan; - auto *True = HVC.getFullValue(HVC.getBoolTy(ScLen)); + auto *True = Constant::getAllOnesValue(HVC.getBoolTy(ScLen)); auto *Undef = UndefValue::get(SecTy); // Created load does not have to be "Instruction" (e.g. "undef"). 
@@ -1350,7 +1345,7 @@ auto AlignVectors::realignStoreGroup(IRBuilderBase &Builder, ByteSpan VSection = VSpan.section(Index * ScLen, ScLen).shift(-Index * ScLen); Value *Undef = UndefValue::get(SecTy); - Value *Zero = HVC.getNullValue(SecTy); + Value *Zero = Constant::getNullValue(SecTy); Value *AccumV = Undef; Value *AccumM = Zero; for (ByteSpan::Block &S : VSection) { @@ -2475,19 +2470,19 @@ Value *HvxIdioms::processVGather(Instruction &In) const { Dst->eraseFromParent(); } else if (Qual == HvxIdioms::LLVM_Scatter) { // Gather feeds directly into scatter. - LLVM_DEBUG({ - auto *DstInpTy = cast<VectorType>(Dst->getOperand(1)->getType()); - assert(DstInpTy && "Cannot handle no vector type for llvm.scatter"); - unsigned DstInpSize = HVC.getSizeOf(DstInpTy); - unsigned DstElements = HVC.length(DstInpTy); - auto *DstElemTy = cast<PointerType>(DstInpTy->getElementType()); - assert(DstElemTy && "llvm.scatter needs vector of ptr argument"); - dbgs() << " Gather feeds into scatter\n Values to scatter : " - << *Dst->getOperand(0) << "\n"; - dbgs() << " Dst type(" << *DstInpTy << ") elements(" << DstElements - << ") VecLen(" << DstInpSize << ") type(" << *DstElemTy - << ") Access alignment(" << *Dst->getOperand(2) << ")\n"; - }); + auto *DstInpTy = cast<VectorType>(Dst->getOperand(1)->getType()); + assert(DstInpTy && "Cannot handle no vector type for llvm.scatter"); + [[maybe_unused]] unsigned DstInpSize = HVC.getSizeOf(DstInpTy); + [[maybe_unused]] unsigned DstElements = HVC.length(DstInpTy); + [[maybe_unused]] auto *DstElemTy = + cast<PointerType>(DstInpTy->getElementType()); + assert(DstElemTy && "llvm.scatter needs vector of ptr argument"); + LLVM_DEBUG(dbgs() << " Gather feeds into scatter\n Values to scatter : " + << *Dst->getOperand(0) << "\n"); + LLVM_DEBUG(dbgs() << " Dst type(" << *DstInpTy << ") elements(" + << DstElements << ") VecLen(" << DstInpSize << ") type(" + << *DstElemTy << ") Access alignment(" + << *Dst->getOperand(2) << ")\n"); // Address of source auto *Src = getPointer(IndexLoad); if (!Src) @@ -2700,11 +2695,11 @@ auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In, // Do full-precision multiply and shift. Value *Prod32 = createMul16(Builder, Op.X, Op.Y); if (Rounding) { - Value *RoundVal = HVC.getConstSplat(Prod32->getType(), 1 << *Op.RoundAt); + Value *RoundVal = ConstantInt::get(Prod32->getType(), 1 << *Op.RoundAt); Prod32 = Builder.CreateAdd(Prod32, RoundVal, "add"); } - Value *ShiftAmt = HVC.getConstSplat(Prod32->getType(), Op.Frac); + Value *ShiftAmt = ConstantInt::get(Prod32->getType(), Op.Frac); Value *Shifted = Op.X.Sgn == Signed || Op.Y.Sgn == Signed ? Builder.CreateAShr(Prod32, ShiftAmt, "asr") : Builder.CreateLShr(Prod32, ShiftAmt, "lsr"); @@ -2723,10 +2718,10 @@ auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In, // Add the optional rounding to the proper word. if (Op.RoundAt.has_value()) { - Value *Zero = HVC.getNullValue(WordX[0]->getType()); + Value *Zero = Constant::getNullValue(WordX[0]->getType()); SmallVector<Value *> RoundV(WordP.size(), Zero); RoundV[*Op.RoundAt / 32] = - HVC.getConstSplat(HvxWordTy, 1 << (*Op.RoundAt % 32)); + ConstantInt::get(HvxWordTy, 1 << (*Op.RoundAt % 32)); WordP = createAddLong(Builder, WordP, RoundV); } @@ -2734,7 +2729,7 @@ auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In, // Shift all products right by Op.Frac. 
unsigned SkipWords = Op.Frac / 32; - Constant *ShiftAmt = HVC.getConstSplat(HvxWordTy, Op.Frac % 32); + Constant *ShiftAmt = ConstantInt::get(HvxWordTy, Op.Frac % 32); for (int Dst = 0, End = WordP.size() - SkipWords; Dst != End; ++Dst) { int Src = Dst + SkipWords; @@ -2803,7 +2798,7 @@ auto HvxIdioms::createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y, } else { AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarry); if (CarryIn == nullptr) - CarryIn = HVC.getNullValue(HVC.getBoolTy(HVC.length(VecTy))); + CarryIn = Constant::getNullValue(HVC.getBoolTy(HVC.length(VecTy))); Args.push_back(CarryIn); } Value *Ret = HVC.createHvxIntrinsic(Builder, AddCarry, @@ -2951,7 +2946,7 @@ auto HvxIdioms::createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX, } } - Value *Zero = HVC.getNullValue(WordX[0]->getType()); + Value *Zero = Constant::getNullValue(WordX[0]->getType()); auto pop_back_or_zero = [Zero](auto &Vector) -> Value * { if (Vector.empty()) @@ -3147,33 +3142,6 @@ auto HexagonVectorCombine::length(Type *Ty) const -> size_t { return VecTy->getElementCount().getFixedValue(); } -auto HexagonVectorCombine::getNullValue(Type *Ty) const -> Constant * { - assert(Ty->isIntOrIntVectorTy()); - auto Zero = ConstantInt::get(Ty->getScalarType(), 0); - if (auto *VecTy = dyn_cast<VectorType>(Ty)) - return ConstantVector::getSplat(VecTy->getElementCount(), Zero); - return Zero; -} - -auto HexagonVectorCombine::getFullValue(Type *Ty) const -> Constant * { - assert(Ty->isIntOrIntVectorTy()); - auto Minus1 = ConstantInt::get(Ty->getScalarType(), -1); - if (auto *VecTy = dyn_cast<VectorType>(Ty)) - return ConstantVector::getSplat(VecTy->getElementCount(), Minus1); - return Minus1; -} - -auto HexagonVectorCombine::getConstSplat(Type *Ty, int Val) const - -> Constant * { - assert(Ty->isVectorTy()); - auto VecTy = cast<VectorType>(Ty); - Type *ElemTy = VecTy->getElementType(); - // Add support for floats if needed. - auto *Splat = ConstantVector::getSplat(VecTy->getElementCount(), - ConstantInt::get(ElemTy, Val)); - return Splat; -} - auto HexagonVectorCombine::simplify(Value *V) const -> Value * { if (auto *In = dyn_cast<Instruction>(V)) { SimplifyQuery Q(DL, &TLI, &DT, &AC, In); @@ -3581,7 +3549,7 @@ auto HexagonVectorCombine::joinVectorElements(IRBuilderBase &Builder, // If there are too few, fill them with the sign bit. 
Value *Last = Inputs.back(); Value *Sign = Builder.CreateAShr( - Last, getConstSplat(Last->getType(), Width - 1), "asr"); + Last, ConstantInt::get(Last->getType(), Width - 1), "asr"); Inputs.resize(NeedInputs, Sign); } diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 1a0f1ab..5a187d2 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -569,9 +569,9 @@ public: return true; } - bool finishLayout(const MCAssembler &Asm) const override { + bool finishLayout() const override { SmallVector<MCFragment *> Frags; - for (MCSection &Sec : Asm) { + for (MCSection &Sec : *Asm) { Frags.clear(); for (MCFragment &F : Sec) Frags.push_back(&F); @@ -580,7 +580,7 @@ public: default: break; case MCFragment::FT_Align: { - auto Size = Asm.computeFragmentSize(*Frags[J]); + auto Size = Asm->computeFragmentSize(*Frags[J]); for (auto K = J; K != 0 && Size >= HEXAGON_PACKET_SIZE;) { --K; switch (Frags[K]->getKind()) { @@ -597,14 +597,14 @@ public: MCInst Inst = RF.getInst(); const bool WouldTraverseLabel = llvm::any_of( - Asm.symbols(), [&Asm, &RF, &Inst](MCSymbol const &sym) { + Asm->symbols(), [&RF, &Inst, Asm = Asm](MCSymbol const &sym) { uint64_t Offset = 0; - const bool HasOffset = Asm.getSymbolOffset(sym, Offset); + const bool HasOffset = Asm->getSymbolOffset(sym, Offset); const unsigned PacketSizeBytes = HexagonMCInstrInfo::bundleSize(Inst) * HEXAGON_INSTR_SIZE; const bool OffsetPastSym = - Offset <= (Asm.getFragmentOffset(RF) + PacketSizeBytes); + Offset <= Asm->getFragmentOffset(RF) + PacketSizeBytes; return !sym.isVariable() && Offset != 0 && HasOffset && OffsetPastSym; }); @@ -631,7 +631,7 @@ public: *RF.getSubtargetInfo(), Inst); //assert(!Error); (void)Error; - ReplaceInstruction(Asm.getEmitter(), RF, Inst); + ReplaceInstruction(Asm->getEmitter(), RF, Inst); Size = 0; // Only look back one instruction break; } diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp index 9b6bc5a..0b2279b 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -385,7 +385,7 @@ bool HexagonMCChecker::checkSlots() { bool HexagonMCChecker::checkPredicates() { // Check for proper use of new predicate registers. for (const auto &I : NewPreds) { - unsigned P = I; + MCRegister P = I; if (!Defs.count(P) || LatePreds.count(P) || Defs.count(Hexagon::P3_0)) { // Error out if the new predicate register is not defined, @@ -398,7 +398,7 @@ bool HexagonMCChecker::checkPredicates() { // Check for proper use of auto-anded of predicate registers. for (const auto &I : LatePreds) { - unsigned P = I; + MCRegister P = I; if (LatePreds.count(P) > 1 || Defs.count(P)) { // Error out if predicate register defined "late" multiple times or @@ -607,7 +607,7 @@ void HexagonMCChecker::checkRegisterCurDefs() { bool HexagonMCChecker::checkRegisters() { // Check for proper register definitions. for (const auto &I : Defs) { - unsigned R = I.first; + MCRegister R = I.first; if (isLoopRegister(R) && Defs.count(R) > 1 && (HexagonMCInstrInfo::isInnerLoop(MCB) || @@ -620,8 +620,8 @@ bool HexagonMCChecker::checkRegisters() { if (SoftDefs.count(R)) { // Error out for explicit changes to registers also weakly defined // (e.g., "{ usr = r0; r0 = sfadd(...) }"). - unsigned UsrR = Hexagon::USR; // Silence warning about mixed types in ?:. 
- unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; + MCRegister UsrR = Hexagon::USR; + MCRegister BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; reportErrorRegisters(BadR); return false; } @@ -633,8 +633,8 @@ bool HexagonMCChecker::checkRegisters() { if (PM.count(Unconditional)) { // Error out on an unconditional change when there are any other // changes, conditional or not. - unsigned UsrR = Hexagon::USR; - unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; + MCRegister UsrR = Hexagon::USR; + MCRegister BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; reportErrorRegisters(BadR); return false; } @@ -664,7 +664,7 @@ bool HexagonMCChecker::checkRegisters() { // Check for use of temporary definitions. for (const auto &I : TmpDefs) { - unsigned R = I; + MCRegister R = I; if (!Uses.count(R)) { // special case for vhist @@ -765,12 +765,12 @@ void HexagonMCChecker::compoundRegisterMap(unsigned &Register) { } } -void HexagonMCChecker::reportErrorRegisters(unsigned Register) { +void HexagonMCChecker::reportErrorRegisters(MCRegister Register) { reportError("register `" + Twine(RI.getName(Register)) + "' modified more than once"); } -void HexagonMCChecker::reportErrorNewValue(unsigned Register) { +void HexagonMCChecker::reportErrorNewValue(MCRegister Register) { reportError("register `" + Twine(RI.getName(Register)) + "' used with `.new' " "but not validly modified in the same packet"); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h index e9b87c5..8beee8d 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h @@ -39,41 +39,41 @@ class HexagonMCChecker { bool ReportErrors; /// Set of definitions: register #, if predicated, if predicated true. - using PredSense = std::pair<unsigned, bool>; + using PredSense = std::pair<MCRegister, bool>; static const PredSense Unconditional; using PredSet = std::multiset<PredSense>; using PredSetIterator = std::multiset<PredSense>::iterator; - using DefsIterator = DenseMap<unsigned, PredSet>::iterator; - DenseMap<unsigned, PredSet> Defs; + using DefsIterator = DenseMap<MCRegister, PredSet>::iterator; + DenseMap<MCRegister, PredSet> Defs; /// Set of weak definitions whose clashes should be enforced selectively. - using SoftDefsIterator = std::set<unsigned>::iterator; - std::set<unsigned> SoftDefs; + using SoftDefsIterator = std::set<MCRegister>::iterator; + std::set<MCRegister> SoftDefs; /// Set of temporary definitions not committed to the register file. - using TmpDefsIterator = std::set<unsigned>::iterator; - std::set<unsigned> TmpDefs; + using TmpDefsIterator = std::set<MCRegister>::iterator; + std::set<MCRegister> TmpDefs; /// Set of new predicates used. - using NewPredsIterator = std::set<unsigned>::iterator; - std::set<unsigned> NewPreds; + using NewPredsIterator = std::set<MCRegister>::iterator; + std::set<MCRegister> NewPreds; /// Set of predicates defined late. - using LatePredsIterator = std::multiset<unsigned>::iterator; - std::multiset<unsigned> LatePreds; + using LatePredsIterator = std::multiset<MCRegister>::iterator; + std::multiset<MCRegister> LatePreds; /// Set of uses. - using UsesIterator = std::set<unsigned>::iterator; - std::set<unsigned> Uses; + using UsesIterator = std::set<MCRegister>::iterator; + std::set<MCRegister> Uses; /// Pre-defined set of read-only registers. 
- using ReadOnlyIterator = std::set<unsigned>::iterator; - std::set<unsigned> ReadOnly; + using ReadOnlyIterator = std::set<MCRegister>::iterator; + std::set<MCRegister> ReadOnly; // Contains the vector-pair-registers with the even number // first ("v0:1", e.g.) used/def'd in this packet. - std::set<unsigned> ReversePairs; + std::set<MCRegister> ReversePairs; void init(); void init(MCInst const &); @@ -107,7 +107,7 @@ class HexagonMCChecker { static void compoundRegisterMap(unsigned &); - bool isLoopRegister(unsigned R) const { + bool isLoopRegister(MCRegister R) const { return (Hexagon::SA0 == R || Hexagon::LC0 == R || Hexagon::SA1 == R || Hexagon::LC1 == R); } @@ -120,8 +120,8 @@ public: MCSubtargetInfo const &STI, bool CopyReportErrors); bool check(bool FullCheck = true); - void reportErrorRegisters(unsigned Register); - void reportErrorNewValue(unsigned Register); + void reportErrorRegisters(MCRegister Register); + void reportErrorNewValue(MCRegister Register); void reportError(SMLoc Loc, Twine const &Msg); void reportNote(SMLoc Loc, Twine const &Msg); void reportError(Twine const &Msg); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h index c5e57d0..712bdbe 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h @@ -21,7 +21,6 @@ #include "llvm/TargetParser/SubtargetFeature.h" #include <cstddef> #include <cstdint> -#include <memory> namespace llvm { diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp index 2f59b7c..10c350e 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -67,6 +67,11 @@ void HexagonMCELFStreamer::emitInstruction(const MCInst &MCB, assert(MCB.getOpcode() == Hexagon::BUNDLE); assert(HexagonMCInstrInfo::bundleSize(MCB) <= HEXAGON_PACKET_SIZE); assert(HexagonMCInstrInfo::bundleSize(MCB) > 0); + const MCRegisterInfo *RI = getContext().getRegisterInfo(); + HexagonMCChecker Check(getContext(), *MCII, STI, const_cast<MCInst &>(MCB), + *RI); + [[maybe_unused]] bool CheckOk = Check.check(false); + assert(CheckOk); // At this point, MCB is a bundle // Iterate through the bundle and assign addends for the instructions diff --git a/llvm/lib/Target/Hexagon/RDFCopy.cpp b/llvm/lib/Target/Hexagon/RDFCopy.cpp index 3b1d3bd..4d0df66 100644 --- a/llvm/lib/Target/Hexagon/RDFCopy.cpp +++ b/llvm/lib/Target/Hexagon/RDFCopy.cpp @@ -26,7 +26,6 @@ #include "llvm/Support/raw_ostream.h" #include <cassert> #include <cstdint> -#include <utility> using namespace llvm; using namespace rdf; @@ -44,11 +43,11 @@ bool CopyPropagation::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) { const MachineOperand &Src = MI->getOperand(1); RegisterRef DstR = DFG.makeRegRef(Dst.getReg(), Dst.getSubReg()); RegisterRef SrcR = DFG.makeRegRef(Src.getReg(), Src.getSubReg()); - assert(Register::isPhysicalRegister(DstR.Reg)); - assert(Register::isPhysicalRegister(SrcR.Reg)); + assert(DstR.asMCReg().isPhysical()); + assert(SrcR.asMCReg().isPhysical()); const TargetRegisterInfo &TRI = DFG.getTRI(); - if (TRI.getMinimalPhysRegClass(DstR.Reg) != - TRI.getMinimalPhysRegClass(SrcR.Reg)) + if (TRI.getMinimalPhysRegClass(DstR.asMCReg()) != + TRI.getMinimalPhysRegClass(SrcR.asMCReg())) return false; if (!DFG.isTracked(SrcR) || !DFG.isTracked(DstR)) return false; @@ -66,7 
+65,7 @@ void CopyPropagation::recordCopy(NodeAddr<StmtNode*> SA, EqualityMap &EM) { Copies.push_back(SA.Id); for (auto I : EM) { - auto FS = DefM.find(I.second.Reg); + auto FS = DefM.find(I.second.Id); if (FS == DefM.end() || FS->second.empty()) continue; // Undefined source RDefMap[I.second][SA.Id] = FS->second.top()->Id; @@ -93,7 +92,7 @@ void CopyPropagation::updateMap(NodeAddr<InstrNode*> IA) { for (auto &R : RDefMap) { if (!RRs.count(R.first)) continue; - auto F = DefM.find(R.first.Reg); + auto F = DefM.find(R.first.Id); if (F == DefM.end() || F->second.empty()) continue; R.second[IA.Id] = F->second.top()->Id; @@ -155,16 +154,16 @@ bool CopyPropagation::run() { bool HasLimit = CpLimit.getNumOccurrences() > 0; #endif - auto MinPhysReg = [this] (RegisterRef RR) -> unsigned { + auto MinPhysReg = [this](RegisterRef RR) -> MCRegister { const TargetRegisterInfo &TRI = DFG.getTRI(); - const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(RR.Reg); + const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(RR.asMCReg()); if ((RC.LaneMask & RR.Mask) == RC.LaneMask) - return RR.Reg; - for (MCSubRegIndexIterator S(RR.Reg, &TRI); S.isValid(); ++S) + return RR.asMCReg(); + for (MCSubRegIndexIterator S(RR.asMCReg(), &TRI); S.isValid(); ++S) if (RR.Mask == TRI.getSubRegIndexLaneMask(S.getSubRegIndex())) return S.getSubReg(); llvm_unreachable("Should have found a register"); - return 0; + return MCRegister(); }; const PhysicalRegisterInfo &PRI = DFG.getPRI(); @@ -215,7 +214,7 @@ bool CopyPropagation::run() { << *NodeAddr<StmtNode*>(IA).Addr->getCode(); } - unsigned NewReg = MinPhysReg(SR); + MCRegister NewReg = MinPhysReg(SR); Op.setReg(NewReg); Op.setSubReg(0); DFG.unlinkUse(UA, false); |
