Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td            | 27
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 8
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td                | 1
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td               | 171
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp     | 2
5 files changed, 178 insertions, 31 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 6c46b18..9f8a257 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1053,13 +1053,6 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
def AArch64uaddlv : SDNode<"AArch64ISD::UADDLV", SDT_AArch64uaddlp>;
def AArch64saddlv : SDNode<"AArch64ISD::SADDLV", SDT_AArch64uaddlp>;
-def AArch64uabd : PatFrags<(ops node:$lhs, node:$rhs),
- [(abdu node:$lhs, node:$rhs),
- (int_aarch64_neon_uabd node:$lhs, node:$rhs)]>;
-def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs),
- [(abds node:$lhs, node:$rhs),
- (int_aarch64_neon_sabd node:$lhs, node:$rhs)]>;
-
// Add Pairwise of two vectors
def AArch64addp_n : SDNode<"AArch64ISD::ADDP", SDT_AArch64Zip>;
// Add Long Pairwise
@@ -5667,8 +5660,7 @@ let Predicates = [HasFullFP16] in {
// Advanced SIMD two vector instructions.
//===----------------------------------------------------------------------===//
-defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
- AArch64uabd>;
+defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", abdu>;
// Match UABDL in log2-shuffle patterns.
def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)),
(zext (v8i8 V64:$opB))))),
@@ -6018,8 +6010,8 @@ defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>;
defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
- TriOpFrag<(add node:$LHS, (AArch64sabd node:$MHS, node:$RHS))> >;
-defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", AArch64sabd>;
+ TriOpFrag<(add node:$LHS, (abds node:$MHS, node:$RHS))> >;
+defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", abds>;
defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", avgfloors>;
defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>;
defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>;
@@ -6037,8 +6029,8 @@ defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>;
defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",
- TriOpFrag<(add node:$LHS, (AArch64uabd node:$MHS, node:$RHS))> >;
-defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", AArch64uabd>;
+ TriOpFrag<(add node:$LHS, (abdu node:$MHS, node:$RHS))> >;
+defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", abdu>;
defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", avgflooru>;
defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>;
defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>;
@@ -6759,10 +6751,8 @@ defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>
defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>;
defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>;
-defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal",
- AArch64sabd>;
-defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
- AArch64sabd>;
+defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", abds>;
+defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", abds>;
defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl",
BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw",
@@ -6780,8 +6770,7 @@ defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
-defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
- AArch64uabd>;
+defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", abdu>;
defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
BinOpFrag<(add (zanyext node:$LHS), (zanyext node:$RHS))>>;
defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
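
Note on the AArch64 hunks above: the target-private AArch64uabd/AArch64sabd
PatFrags are retired and the patterns now match the generic abdu/abds nodes
directly, so the NEON uabd/sabd intrinsics have to be translated to those
nodes before pattern matching; the GlobalISel side of that translation is in
the next file. As a reminder of the shared semantics, a minimal scalar
sketch, assuming the usual wrap-to-element-width behavior (illustrative,
not LLVM's implementation):

    #include <cstdint>

    // Illustrative scalar model of the generic absolute-difference nodes.
    // abdu: absolute difference with unsigned comparison.
    uint8_t abdu8(uint8_t a, uint8_t b) {
      return a > b ? a - b : b - a;
    }

    // abds: absolute difference with signed comparison; computed in a
    // wider type so a - b cannot overflow, then wrapped back to 8 bits.
    int8_t abds8(int8_t a, int8_t b) {
      int16_t d = int16_t(a) - int16_t(b);
      return int8_t(d < 0 ? -d : d);
    }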
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 473ba5e..bb0f667b 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -287,6 +287,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.moreElementsToNextPow2(0)
.lower();
+ getActionDefinitionsBuilder({G_ABDS, G_ABDU})
+ .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
+ .lower();
+
getActionDefinitionsBuilder(
{G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
.legalFor({{s32, s32}, {s64, s32}})
@@ -1794,6 +1798,10 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return LowerBinOp(AArch64::G_SMULL);
case Intrinsic::aarch64_neon_umull:
return LowerBinOp(AArch64::G_UMULL);
+ case Intrinsic::aarch64_neon_sabd:
+ return LowerBinOp(TargetOpcode::G_ABDS);
+ case Intrinsic::aarch64_neon_uabd:
+ return LowerBinOp(TargetOpcode::G_ABDU);
case Intrinsic::aarch64_neon_abs: {
// Lower the intrinsic to G_ABS.
MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)});
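
Note: the legalizer now treats the NEON intrinsics as mere spellings of
G_ABDS/G_ABDU. Per the rule added earlier in this file, both opcodes are
legal for v8s8, v16s8, v4s16, v8s16, v2s32, and v4s32, and lowered
otherwise. One common scalar expansion such a lowering can use is the
max/min identity below; this is a sketch of one valid expansion, not
necessarily the exact MIR sequence GlobalISel emits:

    #include <algorithm>
    #include <cstdint>

    // Illustrative: abdu(a, b) == umax(a, b) - umin(a, b), branch-free.
    uint32_t abdu_expand(uint32_t a, uint32_t b) {
      return std::max(a, b) - std::min(a, b);
    }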
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index a5bb83d..b5df4c6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -131,6 +131,7 @@ def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">;
+def hasTMACTAGroupSupport : Predicate<"Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()">;
def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">;
class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 70150bd..f329f48 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -600,12 +600,23 @@ defm CP_ASYNC_BULK_PREFETCH_CH : CP_ASYNC_BULK_PREFETCH_INTR<has_ch = 1>;
// TMA Async Bulk Tensor Copy Functions
//-------------------------------------
-class TMA_DIMS_UTIL<int dim> {
+class TMA_DIMS_UTIL<int dim, string mode = ""> {
// For example, when 'dim' is 3, this generates:
// an ins_dag: B32:$d0, B32:$d1, B32:$d2
// with base_str: $d0, $d1, $d2
dag ins_dag = !dag(ins, !listsplat(B32, dim), !foreach(i, !range(dim), "d" # i));
string base_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", ");
+
+ // Tile::Gather4/scatter4 actually operate on a 2D tensor,
+ // though they take 5 co-ordinates.
+ //
+ // The scatter-gather happens over 4 rows with a fixed
+ // column-index. The first co-ordinate represents the
+ // col-index followed by four row-indices.
+ int num_dims = !cond(
+ !eq(mode, "tile_scatter4") : 2,
+ !eq(mode, "tile_gather4") : 2,
+ true : dim); // for all other modes
}
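
Note: num_dims is the crux of the gather4/scatter4 support. Those modes
carry five coordinates (one column index followed by four row indices) but
address a 2D tensor, so every string that encodes a dimensionality must say
"2" rather than the coordinate count. A hypothetical C++ mirror of the
!cond, purely for illustration:

    #include <string>

    // Illustrative mirror of TMA_DIMS_UTIL<dim, mode>.num_dims:
    // gather4/scatter4 take 5 coordinates but operate on a 2D tensor.
    int numDims(int dim, const std::string &mode) {
      if (mode == "tile_scatter4" || mode == "tile_gather4")
        return 2;
      return dim; // all other modes: dims == coordinate count
    }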
class TMA_IM2COL_UTIL<int dim, string mode> {
@@ -692,14 +703,138 @@ foreach dim = [1, 2, 3, 4, 5] in {
}
}
+multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []> {
+ defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
+ defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
+ defvar asm_str_base = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";
+
+ defvar im2col_dag = TMA_IM2COL_UTIL<dim, mode>.ins_dag;
+ defvar im2col_str = TMA_IM2COL_UTIL<dim, mode>.base_str;
+ defvar asm_str = !if(!empty(im2col_str),
+ asm_str_base,
+ asm_str_base # ", {{" # im2col_str # "}}");
+
+ defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims;
+ defvar inst_name = "cp.async.bulk.tensor"
+ # "." # dim_val # "d"
+ # "." # "shared::cluster.global"
+ # "." # !subst("_", "::", mode)
+ # "." # "mbarrier::complete_tx::bytes";
+ defvar intr = !cast<Intrinsic>(
+ "int_nvvm_cp_async_bulk_tensor_g2s_" # mode # "_" # dim_val # "d");
+
+ defvar ins_dag = !con(
+ (ins ADDR:$dst, ADDR:$mbar, B64:$tmap),
+ dims_dag, im2col_dag,
+ (ins B16:$mc, B64:$ch, CTAGroupFlags:$cg));
+
+ defvar intr_dag_base = !con(
+ (intr addr:$dst, addr:$mbar, B64:$tmap),
+ !setdagop(dims_dag, intr),
+ !setdagop(im2col_dag, intr),
+ (intr B16:$mc, B64:$ch));
+ defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, timm:$cg));
+ defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, timm:$cg));
+ defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, timm:$cg));
+ defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, timm:$cg));
+
+ def "" : NVPTXInst<(outs), ins_dag,
+ inst_name # asm_str # ";",
+ [intr_dag_no_hints]>,
+ Requires<pred>;
+ def _MC : NVPTXInst<(outs), ins_dag,
+ inst_name # ".multicast::cluster" # asm_str # ", $mc;",
+ [intr_dag_with_mc]>,
+ Requires<pred>;
+ def _CH : NVPTXInst<(outs), ins_dag,
+ inst_name # ".L2::cache_hint" # asm_str # ", $ch;",
+ [intr_dag_with_ch]>,
+ Requires<pred>;
+ def _MC_CH : NVPTXInst<(outs), ins_dag,
+ inst_name # ".multicast::cluster.L2::cache_hint" # asm_str # ", $mc, $ch;",
+ [intr_dag_with_mc_ch]>,
+ Requires<pred>;
+}
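
Note: the four defs in this multiclass differ only in the trailing
(multicast, cache-hint) immediates of their intrinsic patterns: (0, 0)
selects the plain copy, (-1, 0) the multicast::cluster form, (0, -1) the
L2::cache_hint form, and (-1, -1) both. Restated as an illustrative helper
(the name is hypothetical):

    #include <string>

    // Illustrative: maps the (mc, ch) selector immediates to the matched
    // variant, where -1 (true) means "this hint operand is live".
    std::string g2sVariant(bool mc, bool ch) {
      if (mc && ch) return "_MC_CH"; // .multicast::cluster.L2::cache_hint
      if (mc)       return "_MC";    // .multicast::cluster
      if (ch)       return "_CH";    // .L2::cache_hint
      return "";                     // no hints
    }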
+foreach dim = 3...5 in {
+ foreach mode = ["im2col_w", "im2col_w_128"] in {
+ defm TMA_G2S_ # !toupper(mode) # "_" # dim # "D"
+ : TMA_TENSOR_G2S_INTR<dim, mode, [hasTMACTAGroupSupport]>;
+ }
+}
+defm TMA_G2S_TILE_GATHER4_2D : TMA_TENSOR_G2S_INTR<5, "tile_gather4",
+ [hasTMACTAGroupSupport]>;
+
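
Note: for TMA_G2S_TILE_GATHER4_2D the expansion works out as follows: the
defm passes dim = 5, but num_dims (and hence dim_val) is 2, and
!subst("_", "::", mode) turns tile_gather4 into tile::gather4, so the
matched mnemonic is
cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes,
selected from int_nvvm_cp_async_bulk_tensor_g2s_tile_gather4_2d. A small
stand-alone mirror of the string assembly, for illustration only:

    #include <iostream>
    #include <string>

    // Illustrative equivalent of !subst("_", "::", mode).
    std::string substColons(std::string mode) {
      for (std::string::size_type p = 0;
           (p = mode.find('_', p)) != std::string::npos; p += 2)
        mode.replace(p, 1, "::");
      return mode;
    }

    int main() {
      int dimVal = 2; // num_dims for tile_gather4, not the coordinate count
      std::cout << "cp.async.bulk.tensor." << dimVal
                << "d.shared::cluster.global." << substColons("tile_gather4")
                << ".mbarrier::complete_tx::bytes\n";
    }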
+multiclass TMA_TENSOR_G2S_CTA_INTR<int dim, string mode, list<Predicate> pred = []> {
+ defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
+ defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
+ defvar asm_str_base = " [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";
+
+ defvar im2col_dag = TMA_IM2COL_UTIL<dim, mode>.ins_dag;
+ defvar im2col_str = TMA_IM2COL_UTIL<dim, mode>.base_str;
+ defvar asm_str = !if(!empty(im2col_str),
+ asm_str_base,
+ asm_str_base # ", {{" # im2col_str # "}}");
+
+ defvar ins_dag = !con(
+ (ins ADDR:$dst, ADDR:$mbar, B64:$tmap),
+ dims_dag, im2col_dag,
+ (ins B64:$ch));
+
+ defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims;
+ defvar intr = !cast<Intrinsic>(
+ "int_nvvm_cp_async_bulk_tensor_g2s_cta_" # mode # "_" # dim_val # "d");
+ defvar intr_dag = !con(
+ (intr addr:$dst, addr:$mbar, B64:$tmap),
+ !setdagop(dims_dag, intr),
+ !setdagop(im2col_dag, intr),
+ (intr B64:$ch, 0));
+ defvar intr_dag_with_ch = !con(
+ (intr addr:$dst, addr:$mbar, B64:$tmap),
+ !setdagop(dims_dag, intr),
+ !setdagop(im2col_dag, intr),
+ (intr B64:$ch, -1));
+ defvar inst_name = "cp.async.bulk.tensor"
+ # "." # dim_val # "d"
+ # "." # "shared::cta.global"
+ # "." # !subst("_", "::", mode)
+ # "." # "mbarrier::complete_tx::bytes";
+
+ def "" : NVPTXInst<(outs), ins_dag,
+ inst_name # asm_str # ";",
+ [intr_dag]>,
+ Requires<pred>;
+ def _CH : NVPTXInst<(outs), ins_dag,
+ inst_name # ".L2::cache_hint" # asm_str # ", $ch;",
+ [intr_dag_with_ch]>,
+ Requires<pred>;
+}
+foreach dim = 1...5 in {
+ defm TMA_G2S_CTA_TILE_ # dim # "D"
+ : TMA_TENSOR_G2S_CTA_INTR<dim, "tile", [hasPTX<86>, hasSM<90>]>;
+}
+foreach dim = 3...5 in {
+ defm TMA_G2S_CTA_IM2COL_ # dim # "D"
+ : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col", [hasPTX<86>, hasSM<90>]>;
+
+ defm TMA_G2S_CTA_IM2COL_W_ # dim # "D"
+ : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w", [hasPTX<86>, hasSM<100>]>;
+
+ defm TMA_G2S_CTA_IM2COL_W_128_ # dim # "D"
+ : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128", [hasTMACTAGroupSupport]>;
+}
+defm TMA_G2S_CTA_TILE_GATHER4_2D : TMA_TENSOR_G2S_CTA_INTR<5, "tile_gather4",
+ [hasPTX<86>, hasSM<100>]>;
+
multiclass TMA_TENSOR_S2G_INTR<int dim, string mode,
list<Predicate> pred = [hasPTX<80>, hasSM<90>]> {
defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
defvar asm_str = " [$tmap, {{" # dims_str # "}}], [$src]";
+ defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims;
defvar intr = !cast<Intrinsic>(
- "int_nvvm_cp_async_bulk_tensor_s2g_" # mode # "_" # dim # d);
+ "int_nvvm_cp_async_bulk_tensor_s2g_" # mode # "_" # dim_val # "d");
+
defvar intr_dag = !con((intr addr:$src, B64:$tmap),
!setdagop(dims_dag, intr),
(intr B64:$ch, 0));
@@ -707,11 +842,13 @@ multiclass TMA_TENSOR_S2G_INTR<int dim, string mode,
!setdagop(dims_dag, intr),
(intr B64:$ch, -1));
- // For im2col mode, the actual asm_str is "im2col_no_offs"
- defvar mode_asm_str = !if(!eq(mode, "im2col"),
- "im2col_no_offs", mode);
+ // Fix-up the asm_str when it is im2col/scatter4.
+ defvar mode_asm_str = !cond(
+ !eq(mode, "im2col") : "im2col_no_offs",
+ !eq(mode, "tile_scatter4") : "tile::scatter4",
+ true : mode);
defvar prefix = "cp.async.bulk.tensor"
- # "." # dim # "d"
+ # "." # dim_val # "d"
# ".global.shared::cta"
# "." # mode_asm_str
# ".bulk_group";
@@ -729,10 +866,12 @@ multiclass TMA_TENSOR_S2G_INTR<int dim, string mode,
}
foreach dim = 1...5 in {
foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
- defvar suffix = !toupper(mode) # "_" # dim # D;
+ defvar suffix = !toupper(mode) # "_" # dim # "D";
defm TMA_TENSOR_S2G_ # suffix : TMA_TENSOR_S2G_INTR<dim, mode>;
}
}
+defm TMA_S2G_TILE_SCATTER4_2D : TMA_TENSOR_S2G_INTR<5, "tile_scatter4",
+ [hasTMACTAGroupSupport]>;
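
Note: the store direction gets the same treatment. tile_scatter4 is again a
five-coordinate access to a 2D tensor, so with dim_val = 2 the prefix built
above comes out as
cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group.
The asm fix-up rule, mirrored as a hypothetical helper:

    #include <string>

    // Illustrative mirror of the mode_asm_str !cond above.
    std::string s2gModeAsm(const std::string &mode) {
      if (mode == "im2col")        return "im2col_no_offs"; // S2G takes no offsets
      if (mode == "tile_scatter4") return "tile::scatter4";
      return mode; // "tile" is spelled as-is
    }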
def TMAReductionFlags : Operand<i32> {
let PrintMethod = "printTmaReductionMode";
@@ -786,13 +925,14 @@ multiclass TMA_TENSOR_PREFETCH_INTR<int dim, string mode,
asm_str_base,
asm_str_base # ", {{" # im2col_str # "}}");
+ defvar dim_val = TMA_DIMS_UTIL<dim, mode>.num_dims;
defvar inst_name = "cp.async.bulk.prefetch.tensor"
- # "." # dim # "d"
+ # "." # dim_val # "d"
# "." # "L2.global"
- # "." # mode;
+ # "." # !subst("_", "::", mode);
defvar intr = !cast<Intrinsic>(
- "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim # d);
+ "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim_val # "d");
defvar ins_dag = !con((ins B64:$tmap),
dims_dag,
@@ -818,10 +958,19 @@ multiclass TMA_TENSOR_PREFETCH_INTR<int dim, string mode,
}
foreach dim = 1...5 in {
foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
- defvar suffix = !toupper(mode) # "_" # dim # D;
+ defvar suffix = !toupper(mode) # "_" # dim # "D";
defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR<dim, mode>;
}
}
+foreach dim = 3...5 in {
+ foreach mode = ["im2col_w", "im2col_w_128"] in {
+ defvar suffix = !toupper(mode) # "_" # dim # "D";
+ defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR<dim, mode,
+ [hasTMACTAGroupSupport]>;
+ }
+}
+defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4",
+ [hasTMACTAGroupSupport]>;
//Prefetch and Prefetchu
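
Note: by the same expansion, TMA_TENSOR_PF_TILE_GATHER4_2D matches
cp.async.bulk.prefetch.tensor.2d.L2.global.tile::gather4, and the
im2col_w/im2col_w_128 prefetch variants pick up their im2col::w and
im2col::w::128 spellings through the same !subst of "_" with "::".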
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 3d060c6..387d289 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -567,7 +567,7 @@ void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
// DataFragment, so that we can get the size of instructions later in
// MCAssembler::relaxBoundaryAlign. The easiest way is to insert a new empty
// DataFragment.
- OS.insert(OS.getContext().allocFragment<MCFragment>());
+ OS.newFragment();
// Update the maximum alignment on the current section if necessary.
MCSection *Sec = OS.getCurrentSectionOnly();