17 files changed, 359 insertions, 371 deletions
diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
index 8ace2d2..95577dd 100644
--- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
+++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
@@ -29,7 +29,6 @@
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 #include <cstdint>
-#include <optional>
 
 using namespace llvm;
 using namespace llvm::dxil;
@@ -194,9 +193,10 @@ void DXContainerGlobals::addResourcesForPSV(Module &M, PSVRuntimeInfo &PSV) {
         dxbc::PSV::v2::ResourceBindInfo BindInfo;
         BindInfo.Type = Type;
         BindInfo.LowerBound = Binding.LowerBound;
-        assert(Binding.Size == UINT32_MAX ||
-               (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX &&
-                   "Resource range is too large");
+        assert(
+            (Binding.Size == UINT32_MAX ||
+             (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX) &&
+            "Resource range is too large");
         BindInfo.UpperBound = (Binding.Size == UINT32_MAX)
                                   ? UINT32_MAX
                                   : Binding.LowerBound + Binding.Size - 1;
@@ -284,6 +284,13 @@ void DXContainerGlobals::addPipelineStateValidationInfo(
     PSV.BaseData.NumThreadsX = MMI.EntryPropertyVec[0].NumThreadsX;
     PSV.BaseData.NumThreadsY = MMI.EntryPropertyVec[0].NumThreadsY;
     PSV.BaseData.NumThreadsZ = MMI.EntryPropertyVec[0].NumThreadsZ;
+    if (MMI.EntryPropertyVec[0].WaveSizeMin) {
+      PSV.BaseData.MinimumWaveLaneCount = MMI.EntryPropertyVec[0].WaveSizeMin;
+      PSV.BaseData.MaximumWaveLaneCount =
+          MMI.EntryPropertyVec[0].WaveSizeMax
+              ? MMI.EntryPropertyVec[0].WaveSizeMax
+              : MMI.EntryPropertyVec[0].WaveSizeMin;
+    }
     break;
   default:
     break;
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 7ae500a..8b286626 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -930,6 +930,24 @@ def Discard : DXILOp<82, discard> {
   let stages = [Stages<DXIL1_0, [pixel]>];
 }
 
+def DerivCoarseX : DXILOp<83, unary> {
+  let Doc = "computes the rate of change per stamp in x direction";
+  let intrinsics = [IntrinSelect<int_dx_ddx_coarse>];
+  let arguments = [OverloadTy];
+  let result = OverloadTy;
+  let overloads = [Overloads<DXIL1_0, [HalfTy, FloatTy]>];
+  let stages = [Stages<DXIL1_0, [library, pixel]>];
+}
+
+def DerivCoarseY : DXILOp<84, unary> {
+  let Doc = "computes the rate of change per stamp in y direction";
+  let intrinsics = [IntrinSelect<int_dx_ddy_coarse>];
+  let arguments = [OverloadTy];
+  let result = OverloadTy;
+  let overloads = [Overloads<DXIL1_0, [HalfTy, FloatTy]>];
+  let stages = [Stages<DXIL1_0, [library, pixel]>];
+}
+
 def ThreadId : DXILOp<93, threadId> {
   let Doc = "Reads the thread ID";
   let intrinsics = [IntrinSelect<int_dx_thread_id>];
@@ -1079,6 +1097,15 @@ def WaveActiveOp : DXILOp<119, waveActiveOp> {
   let attributes = [Attributes<DXIL1_0, []>];
 }
 
+def LegacyF16ToF32 : DXILOp<131, legacyF16ToF32> {
+  let Doc = "returns the float16 stored in the low-half of the uint converted "
+            "to a float";
+  let intrinsics = [IntrinSelect<int_dx_legacyf16tof32>];
+  let arguments = [Int32Ty];
+  let result = FloatTy;
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+}
+
 def WaveAllBitCount : DXILOp<135, waveAllOp> {
   let Doc = "returns the count of bits set to 1 across the wave";
   let intrinsics = [IntrinSelect<int_dx_wave_active_countbits>];
diff --git a/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp b/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp
index 4427797..5624532 100644
--- a/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp
+++ b/llvm/lib/Target/DirectX/DXILCBufferAccess.cpp
@@ -8,11 +8,13 @@
 
 #include "DXILCBufferAccess.h"
 #include "DirectX.h"
+#include "llvm/Analysis/DXILResource.h"
 #include "llvm/Frontend/HLSL/CBuffer.h"
 #include "llvm/Frontend/HLSL/HLSLResource.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsDirectX.h"
+#include "llvm/IR/ReplaceConstant.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/FormatVariadic.h"
@@ -21,297 +23,41 @@
 #define DEBUG_TYPE "dxil-cbuffer-access"
 using namespace llvm;
 
-namespace {
-/// Helper for building a `load.cbufferrow` intrinsic given a simple type.
-struct CBufferRowIntrin {
-  Intrinsic::ID IID;
-  Type *RetTy;
-  unsigned int EltSize;
-  unsigned int NumElts;
-
-  CBufferRowIntrin(const DataLayout &DL, Type *Ty) {
-    assert(Ty == Ty->getScalarType() && "Expected scalar type");
-
-    switch (DL.getTypeSizeInBits(Ty)) {
-    case 16:
-      IID = Intrinsic::dx_resource_load_cbufferrow_8;
-      RetTy = StructType::get(Ty, Ty, Ty, Ty, Ty, Ty, Ty, Ty);
-      EltSize = 2;
-      NumElts = 8;
-      break;
-    case 32:
-      IID = Intrinsic::dx_resource_load_cbufferrow_4;
-      RetTy = StructType::get(Ty, Ty, Ty, Ty);
-      EltSize = 4;
-      NumElts = 4;
-      break;
-    case 64:
-      IID = Intrinsic::dx_resource_load_cbufferrow_2;
-      RetTy = StructType::get(Ty, Ty);
-      EltSize = 8;
-      NumElts = 2;
-      break;
-    default:
-      llvm_unreachable("Only 16, 32, and 64 bit types supported");
-    }
-  }
-};
-
-// Helper for creating CBuffer handles and loading data from them
-struct CBufferResource {
-  GlobalVariable *GVHandle;
-  GlobalVariable *Member;
-  size_t MemberOffset;
-
-  LoadInst *Handle;
-
-  CBufferResource(GlobalVariable *GVHandle, GlobalVariable *Member,
-                  size_t MemberOffset)
-      : GVHandle(GVHandle), Member(Member), MemberOffset(MemberOffset) {}
-
-  const DataLayout &getDataLayout() { return GVHandle->getDataLayout(); }
-  Type *getValueType() { return Member->getValueType(); }
-  iterator_range<ConstantDataSequential::user_iterator> users() {
-    return Member->users();
-  }
-
-  /// Get the byte offset of a Pointer-typed Value * `Val` relative to Member.
-  /// `Val` can either be Member itself, or a GEP of a constant offset from
-  /// Member
-  size_t getOffsetForCBufferGEP(Value *Val) {
-    assert(isa<PointerType>(Val->getType()) &&
-           "Expected a pointer-typed value");
-
-    if (Val == Member)
-      return 0;
-
-    if (auto *GEP = dyn_cast<GEPOperator>(Val)) {
-      // Since we should always have a constant offset, we should only ever have
-      // a single GEP of indirection from the Global.
-      assert(GEP->getPointerOperand() == Member &&
-             "Indirect access to resource handle");
-
-      const DataLayout &DL = getDataLayout();
-      APInt ConstantOffset(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
-      bool Success = GEP->accumulateConstantOffset(DL, ConstantOffset);
-      (void)Success;
-      assert(Success && "Offsets into cbuffer globals must be constant");
-
-      if (auto *ATy = dyn_cast<ArrayType>(Member->getValueType()))
-        ConstantOffset =
-            hlsl::translateCBufArrayOffset(DL, ConstantOffset, ATy);
-
-      return ConstantOffset.getZExtValue();
-    }
-
-    llvm_unreachable("Expected Val to be a GlobalVariable or GEP");
-  }
-
-  /// Create a handle for this cbuffer resource using the IRBuilder `Builder`
-  /// and sets the handle as the current one to use for subsequent calls to
-  /// `loadValue`
-  void createAndSetCurrentHandle(IRBuilder<> &Builder) {
-    Handle = Builder.CreateLoad(GVHandle->getValueType(), GVHandle,
-                                GVHandle->getName());
+static void replaceUsersOfGlobal(GlobalVariable *Global,
+                                 GlobalVariable *HandleGV, size_t Offset) {
+  for (Use &U : make_early_inc_range(Global->uses())) {
+    auto UseInst = dyn_cast<Instruction>(U.getUser());
+    // TODO: Constants? Metadata?
+    assert(UseInst && "Non-instruction use of cbuffer");
+
+    IRBuilder<> Builder(UseInst);
+    LoadInst *Handle = Builder.CreateLoad(HandleGV->getValueType(), HandleGV,
+                                          HandleGV->getName());
+    Value *Ptr = Builder.CreateIntrinsic(
+        Global->getType(), Intrinsic::dx_resource_getpointer,
+        ArrayRef<Value *>{Handle,
+                          ConstantInt::get(Builder.getInt32Ty(), Offset)});
+    U.set(Ptr);
   }
 
-  /// Load a value of type `Ty` at offset `Offset` using the handle from the
-  /// last call to `createAndSetCurrentHandle`
-  Value *loadValue(IRBuilder<> &Builder, Type *Ty, size_t Offset,
-                   const Twine &Name = "") {
-    assert(Handle &&
-           "Expected a handle for this cbuffer global resource to be created "
-           "before loading a value from it");
-    const DataLayout &DL = getDataLayout();
-
-    size_t TargetOffset = MemberOffset + Offset;
-    CBufferRowIntrin Intrin(DL, Ty->getScalarType());
-    // The cbuffer consists of some number of 16-byte rows.
-    unsigned int CurrentRow = TargetOffset / hlsl::CBufferRowSizeInBytes;
-    unsigned int CurrentIndex =
-        (TargetOffset % hlsl::CBufferRowSizeInBytes) / Intrin.EltSize;
-
-    auto *CBufLoad = Builder.CreateIntrinsic(
-        Intrin.RetTy, Intrin.IID,
-        {Handle, ConstantInt::get(Builder.getInt32Ty(), CurrentRow)}, nullptr,
-        Name + ".load");
-    auto *Elt = Builder.CreateExtractValue(CBufLoad, {CurrentIndex++},
-                                           Name + ".extract");
-
-    Value *Result = nullptr;
-    unsigned int Remaining =
-        ((DL.getTypeSizeInBits(Ty) / 8) / Intrin.EltSize) - 1;
-
-    if (Remaining == 0) {
-      // We only have a single element, so we're done.
-      Result = Elt;
-
-      // However, if we loaded a <1 x T>, then we need to adjust the type here.
-      if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
-        assert(VT->getNumElements() == 1 &&
-               "Can't have multiple elements here");
-        Result = Builder.CreateInsertElement(PoisonValue::get(VT), Result,
-                                             Builder.getInt32(0), Name);
-      }
-      return Result;
-    }
-
-    // Walk each element and extract it, wrapping to new rows as needed.
-    SmallVector<Value *> Extracts{Elt};
-    while (Remaining--) {
-      CurrentIndex %= Intrin.NumElts;
-
-      if (CurrentIndex == 0)
-        CBufLoad = Builder.CreateIntrinsic(
-            Intrin.RetTy, Intrin.IID,
-            {Handle, ConstantInt::get(Builder.getInt32Ty(), ++CurrentRow)},
-            nullptr, Name + ".load");
-
-      Extracts.push_back(Builder.CreateExtractValue(CBufLoad, {CurrentIndex++},
-                                                    Name + ".extract"));
-    }
-
-    // Finally, we build up the original loaded value.
-    Result = PoisonValue::get(Ty);
-    for (int I = 0, E = Extracts.size(); I < E; ++I)
-      Result =
-          Builder.CreateInsertElement(Result, Extracts[I], Builder.getInt32(I),
-                                      Name + formatv(".upto{}", I));
-    return Result;
-  }
-};
-
-} // namespace
-
-/// Replace load via cbuffer global with a load from the cbuffer handle itself.
-static void replaceLoad(LoadInst *LI, CBufferResource &CBR,
-                        SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
-  size_t Offset = CBR.getOffsetForCBufferGEP(LI->getPointerOperand());
-  IRBuilder<> Builder(LI);
-  CBR.createAndSetCurrentHandle(Builder);
-  Value *Result = CBR.loadValue(Builder, LI->getType(), Offset, LI->getName());
-  LI->replaceAllUsesWith(Result);
-  DeadInsts.push_back(LI);
-}
-
-/// This function recursively copies N array elements from the cbuffer resource
-/// CBR to the MemCpy Destination. Recursion is used to unravel multidimensional
-/// arrays into a sequence of scalar/vector extracts and stores.
-static void copyArrayElemsForMemCpy(IRBuilder<> &Builder, MemCpyInst *MCI,
-                                    CBufferResource &CBR, ArrayType *ArrTy,
-                                    size_t ArrOffset, size_t N,
-                                    const Twine &Name = "") {
-  const DataLayout &DL = MCI->getDataLayout();
-  Type *ElemTy = ArrTy->getElementType();
-  size_t ElemTySize = DL.getTypeAllocSize(ElemTy);
-  for (unsigned I = 0; I < N; ++I) {
-    size_t Offset = ArrOffset + I * ElemTySize;
-
-    // Recursively copy nested arrays
-    if (ArrayType *ElemArrTy = dyn_cast<ArrayType>(ElemTy)) {
-      copyArrayElemsForMemCpy(Builder, MCI, CBR, ElemArrTy, Offset,
-                              ElemArrTy->getNumElements(), Name);
-      continue;
-    }
-
-    // Load CBuffer value and store it in Dest
-    APInt CBufArrayOffset(
-        DL.getIndexTypeSizeInBits(MCI->getSource()->getType()), Offset);
-    CBufArrayOffset =
-        hlsl::translateCBufArrayOffset(DL, CBufArrayOffset, ArrTy);
-    Value *CBufferVal =
-        CBR.loadValue(Builder, ElemTy, CBufArrayOffset.getZExtValue(), Name);
-    Value *GEP =
-        Builder.CreateInBoundsGEP(Builder.getInt8Ty(), MCI->getDest(),
-                                  {Builder.getInt32(Offset)}, Name + ".dest");
-    Builder.CreateStore(CBufferVal, GEP, MCI->isVolatile());
-  }
-}
-
-/// Replace memcpy from a cbuffer global with a memcpy from the cbuffer handle
-/// itself. Assumes the cbuffer global is an array, and the length of bytes to
-/// copy is divisible by array element allocation size.
-/// The memcpy source must also be a direct cbuffer global reference, not a GEP.
-static void replaceMemCpy(MemCpyInst *MCI, CBufferResource &CBR) {
-
-  ArrayType *ArrTy = dyn_cast<ArrayType>(CBR.getValueType());
-  assert(ArrTy && "MemCpy lowering is only supported for array types");
-
-  // This assumption vastly simplifies the implementation
-  if (MCI->getSource() != CBR.Member)
-    reportFatalUsageError(
-        "Expected MemCpy source to be a cbuffer global variable");
-
-  ConstantInt *Length = dyn_cast<ConstantInt>(MCI->getLength());
-  uint64_t ByteLength = Length->getZExtValue();
-
-  // If length to copy is zero, no memcpy is needed
-  if (ByteLength == 0) {
-    MCI->eraseFromParent();
-    return;
-  }
-
-  const DataLayout &DL = CBR.getDataLayout();
-
-  Type *ElemTy = ArrTy->getElementType();
-  size_t ElemSize = DL.getTypeAllocSize(ElemTy);
-  assert(ByteLength % ElemSize == 0 &&
-         "Length of bytes to MemCpy must be divisible by allocation size of "
-         "source/destination array elements");
-  size_t ElemsToCpy = ByteLength / ElemSize;
-
-  IRBuilder<> Builder(MCI);
-  CBR.createAndSetCurrentHandle(Builder);
-
-  copyArrayElemsForMemCpy(Builder, MCI, CBR, ArrTy, 0, ElemsToCpy,
-                          "memcpy." + MCI->getDest()->getName() + "." +
-                              MCI->getSource()->getName());
-
-  MCI->eraseFromParent();
-}
-
-static void replaceAccessesWithHandle(CBufferResource &CBR) {
-  SmallVector<WeakTrackingVH> DeadInsts;
-
-  SmallVector<User *> ToProcess{CBR.users()};
-  while (!ToProcess.empty()) {
-    User *Cur = ToProcess.pop_back_val();
-
-    // If we have a load instruction, replace the access.
-    if (auto *LI = dyn_cast<LoadInst>(Cur)) {
-      replaceLoad(LI, CBR, DeadInsts);
-      continue;
-    }
-
-    // If we have a memcpy instruction, replace it with multiple accesses and
-    // subsequent stores to the destination
-    if (auto *MCI = dyn_cast<MemCpyInst>(Cur)) {
-      replaceMemCpy(MCI, CBR);
-      continue;
-    }
-
-    // Otherwise, walk users looking for a load...
-    if (isa<GetElementPtrInst>(Cur) || isa<GEPOperator>(Cur)) {
-      ToProcess.append(Cur->user_begin(), Cur->user_end());
-      continue;
-    }
-
-    llvm_unreachable("Unexpected user of Global");
-  }
-  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
+  Global->removeFromParent();
 }
 
 static bool replaceCBufferAccesses(Module &M) {
-  std::optional<hlsl::CBufferMetadata> CBufMD = hlsl::CBufferMetadata::get(M);
+  std::optional<hlsl::CBufferMetadata> CBufMD = hlsl::CBufferMetadata::get(
+      M, [](Type *Ty) { return isa<llvm::dxil::PaddingExtType>(Ty); });
   if (!CBufMD)
     return false;
 
+  SmallVector<Constant *> CBufferGlobals;
+  for (const hlsl::CBufferMapping &Mapping : *CBufMD)
+    for (const hlsl::CBufferMember &Member : Mapping.Members)
+      CBufferGlobals.push_back(Member.GV);
+  convertUsersOfConstantsToInstructions(CBufferGlobals);
+
   for (const hlsl::CBufferMapping &Mapping : *CBufMD)
-    for (const hlsl::CBufferMember &Member : Mapping.Members) {
-      CBufferResource CBR(Mapping.Handle, Member.GV, Member.Offset);
-      replaceAccessesWithHandle(CBR);
-      Member.GV->removeFromParent();
-    }
+    for (const hlsl::CBufferMember &Member : Mapping.Members)
+      replaceUsersOfGlobal(Member.GV, Mapping.Handle, Member.Offset);
 
   CBufMD->eraseFromModule();
   return true;
diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index d507d71..5f18c37 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -29,20 +29,6 @@ static const int MaxVecSize = 4;
 
 using namespace llvm;
 
-// Recursively creates an array-like version of a given vector type.
-static Type *equivalentArrayTypeFromVector(Type *T) {
-  if (auto *VecTy = dyn_cast<VectorType>(T))
-    return ArrayType::get(VecTy->getElementType(),
-                          dyn_cast<FixedVectorType>(VecTy)->getNumElements());
-  if (auto *ArrayTy = dyn_cast<ArrayType>(T)) {
-    Type *NewElementType =
-        equivalentArrayTypeFromVector(ArrayTy->getElementType());
-    return ArrayType::get(NewElementType, ArrayTy->getNumElements());
-  }
-  // If it's not a vector or array, return the original type.
-  return T;
-}
-
 class DXILDataScalarizationLegacy : public ModulePass {
 
 public:
@@ -121,12 +107,25 @@ DataScalarizerVisitor::lookupReplacementGlobal(Value *CurrOperand) {
 static bool isVectorOrArrayOfVectors(Type *T) {
   if (isa<VectorType>(T))
     return true;
-  if (ArrayType *ArrType = dyn_cast<ArrayType>(T))
-    return isa<VectorType>(ArrType->getElementType()) ||
-           isVectorOrArrayOfVectors(ArrType->getElementType());
+  if (ArrayType *ArrayTy = dyn_cast<ArrayType>(T))
+    return isVectorOrArrayOfVectors(ArrayTy->getElementType());
   return false;
 }
 
+// Recursively rewrites vector types as arrays of the same element type and
+// length, looking through nested arrays; any other type is returned as-is.
+static Type *equivalentArrayTypeFromVector(Type *T) {
+  // cast<>, not dyn_cast<>: the result is dereferenced without a null check.
+  if (auto *VecTy = dyn_cast<VectorType>(T))
+    return ArrayType::get(VecTy->getElementType(),
+                          cast<FixedVectorType>(VecTy)->getNumElements());
+  if (auto *ArrayTy = dyn_cast<ArrayType>(T))
+    return ArrayType::get(
+        equivalentArrayTypeFromVector(ArrayTy->getElementType()),
+        ArrayTy->getNumElements());
+  return T;
+}
+
 bool DataScalarizerVisitor::visitAllocaInst(AllocaInst &AI) {
   Type *AllocatedType = AI.getAllocatedType();
   if (!isVectorOrArrayOfVectors(AllocatedType))
@@ -135,7 +134,7 @@ bool DataScalarizerVisitor::visitAllocaInst(AllocaInst &AI) {
   IRBuilder<> Builder(&AI);
   Type *NewType = equivalentArrayTypeFromVector(AllocatedType);
   AllocaInst *ArrAlloca =
-      Builder.CreateAlloca(NewType, nullptr, AI.getName() + ".scalarize");
+      Builder.CreateAlloca(NewType, nullptr, AI.getName() + ".scalarized");
   ArrAlloca->setAlignment(AI.getAlign());
   AI.replaceAllUsesWith(ArrAlloca);
   AI.eraseFromParent();
@@ -303,42 +302,44 @@ bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
 bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
   GEPOperator *GOp = cast<GEPOperator>(&GEPI);
   Value *PtrOperand = GOp->getPointerOperand();
-  Type *NewGEPType = GOp->getSourceElementType();
-  bool NeedsTransform = false;
-
-  // Unwrap GEP ConstantExprs to find the base operand and element type
-  while (auto *CE = dyn_cast<ConstantExpr>(PtrOperand)) {
-    if (auto *GEPCE = dyn_cast<GEPOperator>(CE)) {
-      GOp = GEPCE;
-      PtrOperand = GEPCE->getPointerOperand();
-      NewGEPType = GEPCE->getSourceElementType();
-    } else
-      break;
+  Type *GEPType = GOp->getSourceElementType();
+
+  // Replace a GEP ConstantExpr pointer operand with a GEP instruction so that
+  // it can be visited
+  if (auto *PtrOpGEPCE = dyn_cast<ConstantExpr>(PtrOperand);
+      PtrOpGEPCE && PtrOpGEPCE->getOpcode() == Instruction::GetElementPtr) {
+    GetElementPtrInst *OldGEPI =
+        cast<GetElementPtrInst>(PtrOpGEPCE->getAsInstruction());
+    OldGEPI->insertBefore(GEPI.getIterator());
+
+    IRBuilder<> Builder(&GEPI);
+    SmallVector<Value *> Indices(GEPI.indices());
+    Value *NewGEP =
+        Builder.CreateGEP(GEPI.getSourceElementType(), OldGEPI, Indices,
+                          GEPI.getName(), GEPI.getNoWrapFlags());
+    assert(isa<GetElementPtrInst>(NewGEP) &&
+           "Expected newly-created GEP to be an instruction");
+    GetElementPtrInst *NewGEPI = cast<GetElementPtrInst>(NewGEP);
+
+    GEPI.replaceAllUsesWith(NewGEPI);
+    GEPI.eraseFromParent();
+    visitGetElementPtrInst(*OldGEPI);
+    visitGetElementPtrInst(*NewGEPI);
+    return true;
   }
 
-  if (GlobalVariable *NewGlobal = lookupReplacementGlobal(PtrOperand)) {
-    NewGEPType = NewGlobal->getValueType();
-    PtrOperand = NewGlobal;
-    NeedsTransform = true;
-  } else if (AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrOperand)) {
-    Type *AllocatedType = Alloca->getAllocatedType();
-    if (isa<ArrayType>(AllocatedType) &&
-        AllocatedType != GOp->getResultElementType()) {
-      NewGEPType = AllocatedType;
-      NeedsTransform = true;
-    }
-  }
+  Type *NewGEPType = equivalentArrayTypeFromVector(GEPType);
+  Value *NewPtrOperand = PtrOperand;
+  if (GlobalVariable *NewGlobal = lookupReplacementGlobal(PtrOperand))
+    NewPtrOperand = NewGlobal;
 
+  bool NeedsTransform = NewPtrOperand != PtrOperand || NewGEPType != GEPType;
   if (!NeedsTransform)
     return false;
 
-  // Keep scalar GEPs scalar; dxil-flatten-arrays will do flattening later
-  if (!isa<ArrayType>(GOp->getSourceElementType()))
-    NewGEPType = GOp->getSourceElementType();
-
   IRBuilder<> Builder(&GEPI);
-  SmallVector<Value *, MaxVecSize> Indices(GOp->indices());
-  Value *NewGEP = Builder.CreateGEP(NewGEPType, PtrOperand, Indices,
+  SmallVector<Value *, MaxVecSize> Indices(GOp->idx_begin(), GOp->idx_end());
+  Value *NewGEP = Builder.CreateGEP(NewGEPType, NewPtrOperand, Indices,
                                     GOp->getName(), GOp->getNoWrapFlags());
 
   GOp->replaceAllUsesWith(NewGEP);
diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index ebb7c26..e0d2dbd 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -197,6 +197,7 @@ static Value *expand16BitIsNormal(CallInst *Orig) {
 
 static bool isIntrinsicExpansion(Function &F) {
   switch (F.getIntrinsicID()) {
+  case Intrinsic::assume:
   case Intrinsic::abs:
   case Intrinsic::atan2:
   case Intrinsic::exp:
@@ -988,6 +989,9 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) {
   case Intrinsic::abs:
     Result = expandAbs(Orig);
     break;
+  case Intrinsic::assume:
+    Orig->eraseFromParent();
+    return true;
   case Intrinsic::atan2:
     Result = expandAtan2Intrinsic(Orig);
     break;
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index 8720460..e46a393 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -904,8 +904,6 @@ public:
       case Intrinsic::dx_resource_casthandle:
       // NOTE: llvm.dbg.value is supported as is in DXIL.
       case Intrinsic::dbg_value:
-      // NOTE: llvm.assume is supported as is in DXIL.
-      case Intrinsic::assume:
       case Intrinsic::not_intrinsic:
         if (F.use_empty())
           F.eraseFromParent();
diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
index 6579d34..057d87b 100644
--- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
+++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
@@ -10,6 +10,7 @@
 #include "DirectX.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/DXILResource.h"
+#include "llvm/Frontend/HLSL/HLSLResource.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
@@ -20,6 +21,7 @@
 #include "llvm/IR/IntrinsicsDirectX.h"
 #include "llvm/IR/User.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 
 #define DEBUG_TYPE "dxil-resource-access"
@@ -44,16 +46,28 @@ static Value *calculateGEPOffset(GetElementPtrInst *GEP, Value *PrevOffset,
   APInt ConstantOffset(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
   if (GEP->accumulateConstantOffset(DL, ConstantOffset)) {
     APInt Scaled = ConstantOffset.udiv(ScalarSize);
-    return ConstantInt::get(Type::getInt32Ty(GEP->getContext()), Scaled);
+    return ConstantInt::get(DL.getIndexType(GEP->getType()), Scaled);
   }
 
-  auto IndexIt = GEP->idx_begin();
-  assert(cast<ConstantInt>(IndexIt)->getZExtValue() == 0 &&
-         "GEP is not indexing through pointer");
-  ++IndexIt;
-  Value *Offset = *IndexIt;
-  assert(++IndexIt == GEP->idx_end() && "Too many indices in GEP");
-  return Offset;
+  unsigned NumIndices = GEP->getNumIndices();
+
+  // If we have a single index we're indexing into a top level array. This
+  // generally only happens with cbuffers.
+  if (NumIndices == 1)
+    return *GEP->idx_begin();
+
+  // If we have two indices, this should be a simple access through a pointer.
+  if (NumIndices == 2) {
+    auto IndexIt = GEP->idx_begin();
+    assert(cast<ConstantInt>(IndexIt)->getZExtValue() == 0 &&
+           "GEP is not indexing through pointer");
+    ++IndexIt;
+    Value *Offset = *IndexIt;
+    assert(++IndexIt == GEP->idx_end() && "Too many indices in GEP");
+    return Offset;
+  }
+
+  llvm_unreachable("Unhandled GEP structure for resource access");
 }
 
 static void createTypedBufferStore(IntrinsicInst *II, StoreInst *SI,
@@ -171,6 +185,127 @@ static void createRawLoad(IntrinsicInst *II, LoadInst *LI, Value *Offset) {
   LI->replaceAllUsesWith(V);
 }
 
+namespace {
+/// Helper for building a `load.cbufferrow` intrinsic given a simple type.
+struct CBufferRowIntrin {
+  Intrinsic::ID IID;    // load_cbufferrow_{8,4,2} variant chosen below
+  Type *RetTy;          // struct of NumElts members, each of type Ty
+  unsigned int EltSize; // size of Ty in bytes
+  unsigned int NumElts; // number of Ty members in RetTy
+
+  CBufferRowIntrin(const DataLayout &DL, Type *Ty) {
+    assert(Ty == Ty->getScalarType() && "Expected scalar type");
+
+    switch (DL.getTypeSizeInBits(Ty)) {
+    case 16: // 8 elements of 2 bytes
+      IID = Intrinsic::dx_resource_load_cbufferrow_8;
+      RetTy = StructType::get(Ty, Ty, Ty, Ty, Ty, Ty, Ty, Ty);
+      EltSize = 2;
+      NumElts = 8;
+      break;
+    case 32: // 4 elements of 4 bytes
+      IID = Intrinsic::dx_resource_load_cbufferrow_4;
+      RetTy = StructType::get(Ty, Ty, Ty, Ty);
+      EltSize = 4;
+      NumElts = 4;
+      break;
+    case 64: // 2 elements of 8 bytes
+      IID = Intrinsic::dx_resource_load_cbufferrow_2;
+      RetTy = StructType::get(Ty, Ty);
+      EltSize = 8;
+      NumElts = 2;
+      break;
+    default:
+      llvm_unreachable("Only 16, 32, and 64 bit types supported");
+    }
+  }
+};
+} // namespace
+
+/// Rewrites uses of \p LI as load.cbufferrow calls on the handle of \p II.
+static void createCBufferLoad(IntrinsicInst *II, LoadInst *LI, Value *Offset,
+                              dxil::ResourceTypeInfo &RTI) { // RTI is unused
+  const DataLayout &DL = LI->getDataLayout();
+  Type *Ty = LI->getType();
+  assert(!isa<StructType>(Ty) && "Structs not handled yet");
+  CBufferRowIntrin Intrin(DL, Ty->getScalarType());
+
+  StringRef Name = LI->getName();
+  Value *Handle = II->getOperand(0);
+
+  IRBuilder<> Builder(LI);
+
+  ConstantInt *GlobalOffset = dyn_cast<ConstantInt>(II->getOperand(1));
+  assert(GlobalOffset && "CBuffer getpointer index must be constant");
+
+  unsigned int FixedOffset = GlobalOffset->getZExtValue(); // in bytes
+  // If we have a further constant offset we can just fold it in to the fixed
+  // offset.
+  if (auto *ConstOffset = dyn_cast_if_present<ConstantInt>(Offset)) {
+    FixedOffset += ConstOffset->getZExtValue();
+    Offset = nullptr;
+  }
+
+  Value *CurrentRow = ConstantInt::get(
+      Builder.getInt32Ty(), FixedOffset / hlsl::CBufferRowSizeInBytes);
+  unsigned int CurrentIndex =
+      (FixedOffset % hlsl::CBufferRowSizeInBytes) / Intrin.EltSize;
+
+  assert(!(CurrentIndex && Offset) &&
+         "Dynamic indexing into elements of cbuffer rows is not supported");
+  // At this point if we have a non-constant offset it has to be an array
+  // offset, so we can assume that it's a multiple of the row size.
+  if (Offset)
+    CurrentRow = FixedOffset ? Builder.CreateAdd(CurrentRow, Offset) : Offset;
+
+  auto *CBufLoad = Builder.CreateIntrinsic(
+      Intrin.RetTy, Intrin.IID, {Handle, CurrentRow}, nullptr, Name + ".load");
+  auto *Elt =
+      Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, Name + ".extract");
+
+  // At this point we've loaded the first scalar of our result, but our original
+  // type may have been a vector.
+  unsigned int Remaining =
+      ((DL.getTypeSizeInBits(Ty) / 8) / Intrin.EltSize) - 1; // besides Elt
+  if (Remaining == 0) {
+    // We only have a single element, so we're done.
+    Value *Result = Elt;
+
+    // However, if we loaded a <1 x T>, then we need to adjust the type.
+    if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
+      assert(VT->getNumElements() == 1 && "Can't have multiple elements here");
+      Result = Builder.CreateInsertElement(PoisonValue::get(VT), Result,
+                                           Builder.getInt32(0), Name);
+    }
+    LI->replaceAllUsesWith(Result);
+    return;
+  }
+
+  // Walk each element and extract it, wrapping to new rows as needed.
+  SmallVector<Value *> Extracts{Elt};
+  while (Remaining--) {
+    CurrentIndex %= Intrin.NumElts;
+
+    if (CurrentIndex == 0) {
+      CurrentRow = Builder.CreateAdd(CurrentRow,
+                                     ConstantInt::get(Builder.getInt32Ty(), 1));
+      CBufLoad = Builder.CreateIntrinsic(Intrin.RetTy, Intrin.IID,
+                                         {Handle, CurrentRow}, nullptr,
+                                         Name + ".load");
+    }
+
+    Extracts.push_back(Builder.CreateExtractValue(CBufLoad, {CurrentIndex++},
+                                                  Name + ".extract"));
+  }
+
+  // Finally, we build up the original loaded value.
+  Value *Result = PoisonValue::get(Ty);
+  for (int I = 0, E = Extracts.size(); I < E; ++I)
+    Result = Builder.CreateInsertElement(
+        Result, Extracts[I], Builder.getInt32(I), Name + formatv(".upto{}", I));
+  LI->replaceAllUsesWith(Result);
+}
+
 static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset,
                                 dxil::ResourceTypeInfo &RTI) {
   switch (RTI.getResourceKind()) {
@@ -179,6 +314,8 @@ static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset,
   case dxil::ResourceKind::RawBuffer:
   case dxil::ResourceKind::StructuredBuffer:
     return createRawLoad(II, LI, Offset);
+  case dxil::ResourceKind::CBuffer:
+    return createCBufferLoad(II, LI, Offset, RTI);
   case dxil::ResourceKind::Texture1D:
   case dxil::ResourceKind::Texture2D:
   case dxil::ResourceKind::Texture2DMS:
@@ -190,9 +327,8 @@ static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset,
   case dxil::ResourceKind::TextureCubeArray:
   case dxil::ResourceKind::FeedbackTexture2D:
   case dxil::ResourceKind::FeedbackTexture2DArray:
-  case dxil::ResourceKind::CBuffer:
   case dxil::ResourceKind::TBuffer:
-    // TODO: handle these
+    reportFatalUsageError("Load not yet implemented for resource type");
     return;
   case dxil::ResourceKind::Sampler:
   case dxil::ResourceKind::RTAccelerationStructure:
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h
index b990b6c..ec82aa9 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.h
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.h
@@ -21,7 +21,6 @@
 #include "llvm/IR/PassManager.h"
 #include "llvm/MC/DXContainerRootSignature.h"
 #include "llvm/Pass.h"
-#include <optional>
 
 namespace llvm {
 namespace dxil {
diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
index ce6e812..e0049dc 100644
--- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
+++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
@@ -100,6 +100,26 @@ static bool checkWaveOps(Intrinsic::ID IID) {
   }
 }
 
+/// Checks whether the status member of a {result, status} resource load is
+/// ever extracted; if so, the module needs the TiledResources shader flag.
+/// \param II A typedbuffer or rawbuffer resource load call (asserted).
+/// \returns true if any user of \p II is an extractvalue with sole index 1.
+bool checkIfStatusIsExtracted(const IntrinsicInst &II) {
+  [[maybe_unused]] Intrinsic::ID IID = II.getIntrinsicID();
+  assert((IID == Intrinsic::dx_resource_load_typedbuffer ||
+          IID == Intrinsic::dx_resource_load_rawbuffer) &&
+         "unexpected intrinsic ID");
+  for (const User *U : II.users()) {
+    // Resource load operations return a {result, status} pair, so a single
+    // index of 1 selects the status.
+    const auto *EVI = dyn_cast<ExtractValueInst>(U);
+    if (EVI && EVI->getNumIndices() == 1 && EVI->getIndices()[0] == 1)
+      return true;
+  }
+
+  return false;
+}
+
 /// Update the shader flags mask based on the given instruction.
 /// \param CSF Shader flags mask to update.
 /// \param I Instruction to check.
@@ -164,7 +184,7 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF,
     }
   }
 
-  if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+  if (const auto *II = dyn_cast<IntrinsicInst>(&I)) {
     switch (II->getIntrinsicID()) {
     default:
       break;
@@ -192,6 +212,13 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF,
           DRTM[cast<TargetExtType>(II->getArgOperand(0)->getType())];
       if (RTI.isTyped())
         CSF.TypedUAVLoadAdditionalFormats |= RTI.getTyped().ElementCount > 1;
+      if (!CSF.TiledResources && checkIfStatusIsExtracted(*II))
+        CSF.TiledResources = true;
+      break;
+    }
+    case Intrinsic::dx_resource_load_rawbuffer: {
+      if (!CSF.TiledResources && checkIfStatusIsExtracted(*II))
+        CSF.TiledResources = true;
       break;
     }
     }
diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.h b/llvm/lib/Target/DirectX/DXILShaderFlags.h
index f94f799..a082057 100644
--- a/llvm/lib/Target/DirectX/DXILShaderFlags.h
+++ b/llvm/lib/Target/DirectX/DXILShaderFlags.h
@@ -22,7 +22,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cstdint>
-#include <memory>
 
 namespace llvm {
 class Module;
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
index cf8b833..e1a472f 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
@@ -82,6 +82,7 @@ enum class EntryPropsTag {
   ASStateTag,
   WaveSize,
   EntryRootSig,
+  WaveRange = 23,
 };
 
 } // namespace
@@ -177,14 +178,15 @@ getTagValueAsMetadata(EntryPropsTag Tag, uint64_t Value, LLVMContext &Ctx) {
   case EntryPropsTag::ASStateTag:
   case EntryPropsTag::WaveSize:
   case EntryPropsTag::EntryRootSig:
+  case EntryPropsTag::WaveRange:
     llvm_unreachable("NYI: Unhandled entry property tag");
   }
   return MDVals;
 }
 
-static MDTuple *
-getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags,
-                       const Triple::EnvironmentType ShaderProfile) {
+static MDTuple *getEntryPropAsMetadata(Module &M, const EntryProperties &EP,
+                                       uint64_t EntryShaderFlags,
+                                       const ModuleMetadataInfo &MMDI) {
   SmallVector<Metadata *> MDVals;
   LLVMContext &Ctx = EP.Entry->getContext();
   if (EntryShaderFlags != 0)
@@ -195,12 +197,13 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags,
     // FIXME: support more props.
     // See https://github.com/llvm/llvm-project/issues/57948.
     // Add shader kind for lib entries.
-    if (ShaderProfile == Triple::EnvironmentType::Library &&
+    if (MMDI.ShaderProfile == Triple::EnvironmentType::Library &&
         EP.ShaderStage != Triple::EnvironmentType::Library)
       MDVals.append(getTagValueAsMetadata(EntryPropsTag::ShaderKind,
                                           getShaderStage(EP.ShaderStage), Ctx));
 
     if (EP.ShaderStage == Triple::EnvironmentType::Compute) {
+      // Handle mandatory "hlsl.numthreads"
       MDVals.emplace_back(ConstantAsMetadata::get(ConstantInt::get(
           Type::getInt32Ty(Ctx), static_cast<int>(EntryPropsTag::NumThreads))));
       Metadata *NumThreadVals[] = {ConstantAsMetadata::get(ConstantInt::get(
@@ -210,8 +213,48 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags,
                                    ConstantAsMetadata::get(ConstantInt::get(
                                        Type::getInt32Ty(Ctx), EP.NumThreadsZ))};
       MDVals.emplace_back(MDNode::get(Ctx, NumThreadVals));
+
+      // Handle optional "hlsl.wavesize". A field of the attribute counts as
+      // specified only when it is non-zero.
+      if (EP.WaveSizeMin != 0) {
+        bool IsWaveRange = VersionTuple(6, 8) <= MMDI.ShaderModelVersion;
+        bool IsWaveSize =
+            !IsWaveRange && VersionTuple(6, 6) <= MMDI.ShaderModelVersion;
+
+        if (!IsWaveRange && !IsWaveSize) {
+          reportError(M, "Shader model 6.6 or greater is required to specify "
+                         "the \"hlsl.wavesize\" function attribute");
+          return nullptr;
+        }
+
+        // A range is being specified if EP.WaveSizeMax != 0
+        if (EP.WaveSizeMax && !IsWaveRange) {
+          reportError(
+              M, "Shader model 6.8 or greater is required to specify "
+                 "wave size range values of the \"hlsl.wavesize\" function "
+                 "attribute");
+          return nullptr;
+        }
+
+        EntryPropsTag Tag =
+            IsWaveSize ? EntryPropsTag::WaveSize : EntryPropsTag::WaveRange;
+        MDVals.emplace_back(ConstantAsMetadata::get(
+            ConstantInt::get(Type::getInt32Ty(Ctx), static_cast<int>(Tag))));
+
+        SmallVector<Metadata *> WaveSizeVals = {ConstantAsMetadata::get(
+            ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizeMin))};
+        if (IsWaveRange) {
+          WaveSizeVals.push_back(ConstantAsMetadata::get(
+              ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizeMax)));
+          WaveSizeVals.push_back(ConstantAsMetadata::get(
+              ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizePref)));
+        }
+
+        MDVals.emplace_back(MDNode::get(Ctx, WaveSizeVals));
+      }
     }
   }
+
   if (MDVals.empty())
     return nullptr;
   return MDNode::get(Ctx, MDVals);
@@ -236,12 +279,11 @@ static MDTuple *constructEntryMetadata(const Function *EntryFn,
   return MDNode::get(Ctx, MDVals);
 }
 
-static MDTuple *emitEntryMD(const EntryProperties &EP, MDTuple *Signatures,
-                            MDNode *MDResources,
+static MDTuple *emitEntryMD(Module &M, const EntryProperties &EP,
+                            MDTuple *Signatures, MDNode *MDResources,
                             const uint64_t EntryShaderFlags,
-                            const Triple::EnvironmentType ShaderProfile) {
-  MDTuple *Properties =
-      getEntryPropAsMetadata(EP, EntryShaderFlags, ShaderProfile);
+                            const ModuleMetadataInfo &MMDI) {
+  MDTuple *Properties = getEntryPropAsMetadata(M, EP, EntryShaderFlags, MMDI);
   return constructEntryMetadata(EP.Entry, Signatures, MDResources, Properties,
                                 EP.Entry->getContext());
 }
@@ -523,10 +565,8 @@ static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM,
                    Twine(Triple::getEnvironmentTypeName(MMDI.ShaderProfile) +
                          "'"));
     }
-
-    EntryFnMDNodes.emplace_back(emitEntryMD(EntryProp, Signatures, ResourceMD,
-                                            EntryShaderFlags,
-                                            MMDI.ShaderProfile));
+    EntryFnMDNodes.emplace_back(emitEntryMD(
+        M, EntryProp, Signatures, ResourceMD, EntryShaderFlags, MMDI));
   }
 
   NamedMDNode *EntryPointsNamedMD =
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
index 26a8728..48a9085 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -1169,8 +1169,8 @@ void DXILBitcodeWriter::writeModuleInfo() {
   // We need to hardcode a triple and datalayout that's compatible with the
   // historical DXIL triple and datalayout from DXC.
   StringRef Triple = "dxil-ms-dx";
-  StringRef DL = "e-m:e-p:32:32-i1:8-i8:8-i16:32-i32:32-i64:64-"
-                 "f16:32-f32:32-f64:64-n8:16:32:64";
+  StringRef DL = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-"
+                 "f16:16-f32:32-f64:64-n8:16:32:64";
   writeStringRecord(Stream, bitc::MODULE_CODE_TRIPLE, Triple, 0 /*TODO*/);
   writeStringRecord(Stream, bitc::MODULE_CODE_DATALAYOUT, DL, 0 /*TODO*/);
 
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h
index 8707b08..7cbc092e 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h
@@ -18,9 +18,7 @@
 #include "llvm/MC/StringTableBuilder.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/MemoryBufferRef.h"
-#include <map>
 #include <memory>
-#include <string>
 #include <vector>
 
 namespace llvm {
diff --git a/llvm/lib/Target/DirectX/DirectX.td b/llvm/lib/Target/DirectX/DirectX.td
index 4d1d45b..1717d53 100644
--- a/llvm/lib/Target/DirectX/DirectX.td
+++ b/llvm/lib/Target/DirectX/DirectX.td
@@ -22,6 +22,8 @@ include "DXILStubs.td"
 // DirectX Subtarget features.
 //===----------------------------------------------------------------------===//
 
+defm : RemapAllTargetPseudoPointerOperands<DXILClass>;
+
 def DirectXInstrInfo : InstrInfo;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp
index bb2efa4..401881d 100644
--- a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp
+++ b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp
@@ -19,6 +19,6 @@
 using namespace llvm;
 
 DirectXInstrInfo::DirectXInstrInfo(const DirectXSubtarget &STI)
-    : DirectXGenInstrInfo(STI) {}
+    : DirectXGenInstrInfo(STI, RI) {}
 
 DirectXInstrInfo::~DirectXInstrInfo() {}
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index 84b1a31..fae9cbf 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -110,9 +110,9 @@ public:
   void addCodeGenPrepare() override {
     addPass(createDXILFinalizeLinkageLegacyPass());
     addPass(createGlobalDCEPass());
+    addPass(createDXILCBufferAccessLegacyPass());
     addPass(createDXILResourceAccessLegacyPass());
     addPass(createDXILIntrinsicExpansionLegacyPass());
-    addPass(createDXILCBufferAccessLegacyPass());
     addPass(createDXILDataScalarizationLegacyPass());
     ScalarizerPassOptions DxilScalarOptions;
     DxilScalarOptions.ScalarizeLoadStore = true;
@@ -206,7 +206,7 @@ DirectXTargetMachine::getTargetTransformInfo(const Function &F) const {
 
 DirectXTargetLowering::DirectXTargetLowering(const DirectXTargetMachine &TM,
                                              const DirectXSubtarget &STI)
-    : TargetLowering(TM) {
+    : TargetLowering(TM, STI) {
   addRegisterClass(MVT::i32, &dxil::DXILClassRegClass);
   computeRegisterProperties(STI.getRegisterInfo());
 }
diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
index 60dfd96..a755dd5 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
@@ -29,11 +29,12 @@ bool DirectXTTIImpl::isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID,
                                                             int OpdIdx) const {
   switch (ID) {
   case Intrinsic::dx_asdouble:
-  case Intrinsic::dx_isinf:
-  case Intrinsic::dx_isnan:
   case Intrinsic::dx_firstbitlow:
-  case Intrinsic::dx_firstbituhigh:
   case Intrinsic::dx_firstbitshigh:
+  case Intrinsic::dx_firstbituhigh:
+  case Intrinsic::dx_isinf:
+  case Intrinsic::dx_isnan:
+  case Intrinsic::dx_legacyf16tof32:
     return OpdIdx == 0;
   default:
     return OpdIdx == -1;
@@ -50,6 +51,7 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable(
   case Intrinsic::dx_frac:
   case Intrinsic::dx_isinf:
   case Intrinsic::dx_isnan:
+  case Intrinsic::dx_legacyf16tof32:
   case Intrinsic::dx_rsqrt:
   case Intrinsic::dx_saturate:
   case Intrinsic::dx_splitdouble:
@@ -62,6 +64,8 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable(
   case Intrinsic::dx_wave_reduce_usum:
   case Intrinsic::dx_imad:
   case Intrinsic::dx_umad:
+  case Intrinsic::dx_ddx_coarse:
+  case Intrinsic::dx_ddy_coarse:
     return true;
   default:
     return false;