diff options
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 1 | ||||
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 17 | ||||
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 787 | ||||
-rw-r--r-- | llvm/test/CodeGen/NVPTX/atomics.ll | 22 |
4 files changed, 184 insertions, 643 deletions
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 8a4b833..b566cdd 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -994,6 +994,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom); + setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand); // No FPOW or FREM in PTX. // Now deduce the information based on the above mentioned diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index fe9bb62..7d0c47f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -216,16 +216,25 @@ class fpimm_pos_inf<ValueType vt> // Utility class to wrap up information about a register and DAG type for more // convenient iteration and parameterization -class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm> { +class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm, SDNode imm_node, + bit supports_imm = 1> { ValueType Ty = ty; NVPTXRegClass RC = rc; Operand Imm = imm; + SDNode ImmNode = imm_node; + bit SupportsImm = supports_imm; int Size = ty.Size; } -def I16RT : RegTyInfo<i16, Int16Regs, i16imm>; -def I32RT : RegTyInfo<i32, Int32Regs, i32imm>; -def I64RT : RegTyInfo<i64, Int64Regs, i64imm>; +def I16RT : RegTyInfo<i16, Int16Regs, i16imm, imm>; +def I32RT : RegTyInfo<i32, Int32Regs, i32imm, imm>; +def I64RT : RegTyInfo<i64, Int64Regs, i64imm, imm>; + +def F32RT : RegTyInfo<f32, Float32Regs, f32imm, fpimm>; +def F64RT : RegTyInfo<f64, Float64Regs, f64imm, fpimm>; +def F16RT : RegTyInfo<f16, Int16Regs, f16imm, fpimm, supports_imm = 0>; +def BF16RT : RegTyInfo<bf16, Int16Regs, bf16imm, fpimm, supports_imm = 0>; + // Template for instructions which take three int64, int32, or int16 args. // The instructions are named "<OpcStr><Width>" (e.g. "add.s64"). diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index b2e05a56..34cb63e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1975,529 +1975,135 @@ def INT_FNS_iii : INT_FNS_MBO<(ins i32imm:$mask, i32imm:$base, i32imm:$ // Atomic Functions //----------------------------------- -class ATOMIC_GLOBAL_CHK <dag ops, dag frag> - : PatFrag<ops, frag, AS_match.global>; -class ATOMIC_SHARED_CHK <dag ops, dag frag> - : PatFrag<ops, frag, AS_match.shared>; -class ATOMIC_GENERIC_CHK <dag ops, dag frag> - : PatFrag<ops, frag, AS_match.generic>; - -multiclass F_ATOMIC_2< - ValueType regT, NVPTXRegClass regclass, - string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp, - Operand IMMType, SDNode IMM, list<Predicate> Pred = []> { - let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def r : NVPTXInst<(outs regclass:$dst), (ins ADDR:$addr, regclass:$b), - "atom" # SpaceStr # OpcStr # TypeStr # " \t$dst, [$addr], $b;", - [(set (regT regclass:$dst), (IntOp addr:$addr, (regT regclass:$b)))]>, - Requires<Pred>; - if !not(!or(!eq(TypeStr, ".f16"), !eq(TypeStr, ".bf16"))) then - def i : NVPTXInst<(outs regclass:$dst), (ins ADDR:$addr, IMMType:$b), - "atom" # SpaceStr # OpcStr # TypeStr # " \t$dst, [$addr], $b;", - [(set (regT regclass:$dst), (IntOp addr:$addr, IMM:$b))]>, - Requires<Pred>; - } -} +class ATOMIC_GLOBAL_CHK <dag frag> + : PatFrag<!setdagop(frag, ops), frag, AS_match.global>; +class ATOMIC_SHARED_CHK <dag frag> + : PatFrag<!setdagop(frag, ops), frag, AS_match.shared>; +class ATOMIC_GENERIC_CHK <dag frag> + : PatFrag<!setdagop(frag, ops), frag, AS_match.generic>; + -// has 2 operands, neg the second one -multiclass F_ATOMIC_2_NEG< - ValueType regT, NVPTXRegClass regclass, - string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp, - list<Predicate> Pred = []> { +multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str, + SDPatternOperator op, list<Predicate> preds> { + defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b;"; let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def reg : NVPTXInst<(outs regclass:$dst), (ins ADDR:$addr, regclass:$b), - !strconcat( - "{{ \n\t", - ".reg \t.s", TypeStr, " temp; \n\t", - "neg.s", TypeStr, " \ttemp, $b; \n\t", - "atom", SpaceStr, OpcStr, ".u", TypeStr, " \t$dst, [$addr], temp; \n\t", - "}}"), - [(set (regT regclass:$dst), (IntOp addr:$addr, (regT regclass:$b)))]>, - Requires<Pred>; + def r : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b), + asm_str, + [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b))]>, + Requires<preds>; + if t.SupportsImm then + def i : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b), + asm_str, + [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b)))]>, + Requires<preds>; } } // has 3 operands -multiclass F_ATOMIC_3< - ValueType regT, NVPTXRegClass regclass, string SemStr, - string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp, - Operand IMMType, list<Predicate> Pred = []> { +multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string as_str, string op_str, + SDPatternOperator op, list<Predicate> preds> { + defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;"; let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def rr : NVPTXInst<(outs regclass:$dst), - (ins ADDR:$addr, regclass:$b, regclass:$c), - "atom" # SemStr # SpaceStr # OpcStr # TypeStr # " \t$dst, [$addr], $b, $c;", - [(set (regT regclass:$dst), (IntOp addr:$addr, regT:$b, regT:$c))]>, - Requires<Pred>; - - def ir : NVPTXInst<(outs regclass:$dst), - (ins ADDR:$addr, IMMType:$b, regclass:$c), - "atom" # SemStr # SpaceStr # OpcStr # TypeStr # " \t$dst, [$addr], $b, $c;", - [(set (regT regclass:$dst), (IntOp addr:$addr, imm:$b, regT:$c))]>, - Requires<Pred>; - - def ri : NVPTXInst<(outs regclass:$dst), - (ins ADDR:$addr, regclass:$b, IMMType:$c), - "atom" # SemStr # SpaceStr # OpcStr # TypeStr # " \t$dst, [$addr], $b, $c;", - [(set (regT regclass:$dst), (IntOp addr:$addr, regT:$b, imm:$c))]>, - Requires<Pred>; - - def ii : NVPTXInst<(outs regclass:$dst), - (ins ADDR:$addr, IMMType:$b, IMMType:$c), - "atom" # SemStr # SpaceStr # OpcStr # TypeStr # " \t$dst, [$addr], $b, $c;", - [(set (regT regclass:$dst), (IntOp addr:$addr, imm:$b, imm:$c))]>, - Requires<Pred>; + def rr : NVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.RC:$b, t.RC:$c), + asm_str, + [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, t.Ty:$c))]>, + Requires<preds>; + + def ir : NVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.Imm:$b, t.RC:$c), + asm_str, + [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c))]>, + Requires<preds>; + + def ri : NVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.RC:$b, t.Imm:$c), + asm_str, + [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)))]>, + Requires<preds>; + + def ii : NVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), + asm_str, + [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)))]>, + Requires<preds>; } } +multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, list<Predicate> preds = []> { + defvar frag_pat = (frag node:$a, node:$b); + defm _G : F_ATOMIC_2<t, "", ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>; + defm _S : F_ATOMIC_2<t, "", ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>; + defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>; +} + +multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, string op_str, list<Predicate> preds = []> { + defvar frag_pat = (frag node:$a, node:$b, node:$c); + defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>; + defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>; + defm _GEN : F_ATOMIC_3<t, sem_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>; +} + // atom_add +defm INT_PTX_ATOM_ADD_32 : F_ATOMIC_2_AS<I32RT, atomic_load_add_i32, "add.u32">; +defm INT_PTX_ATOM_ADD_64 : F_ATOMIC_2_AS<I64RT, atomic_load_add_i64, "add.u64">; -def atomic_load_add_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_add_i32 node:$a, node:$b)>; -def atomic_load_add_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_add_i32 node:$a, node:$b)>; -def atomic_load_add_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_add_i32 node:$a, node:$b)>; -def atomic_load_add_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_add_i64 node:$a, node:$b)>; -def atomic_load_add_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_add_i64 node:$a, node:$b)>; -def atomic_load_add_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_add_i64 node:$a, node:$b)>; -def atomic_load_add_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_fadd node:$a, node:$b)>; -def atomic_load_add_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_fadd node:$a, node:$b)>; -def atomic_load_add_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_fadd node:$a, node:$b)>; - -defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32", ".add", - atomic_load_add_i32_g, i32imm, imm>; -defm INT_PTX_ATOM_ADD_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".u32", ".add", - atomic_load_add_i32_s, i32imm, imm>; -defm INT_PTX_ATOM_ADD_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".u32", ".add", - atomic_load_add_i32_gen, i32imm, imm>; -defm INT_PTX_ATOM_ADD_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32", - ".add", atomic_load_add_i32_gen, i32imm, imm>; - -defm INT_PTX_ATOM_ADD_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64", ".add", - atomic_load_add_i64_g, i64imm, imm>; -defm INT_PTX_ATOM_ADD_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".u64", ".add", - atomic_load_add_i64_s, i64imm, imm>; -defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".u64", ".add", - atomic_load_add_i64_gen, i64imm, imm>; -defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64", - ".add", atomic_load_add_i64_gen, i64imm, imm>; - -defm INT_PTX_ATOM_ADD_G_F16 : F_ATOMIC_2<f16, Int16Regs, ".global", ".f16", ".add.noftz", - atomic_load_add_g, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>; -defm INT_PTX_ATOM_ADD_S_F16 : F_ATOMIC_2<f16, Int16Regs, ".shared", ".f16", ".add.noftz", - atomic_load_add_s, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>; -defm INT_PTX_ATOM_ADD_GEN_F16 : F_ATOMIC_2<f16, Int16Regs, "", ".f16", ".add.noftz", - atomic_load_add_gen, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>; - -defm INT_PTX_ATOM_ADD_G_BF16 : F_ATOMIC_2<bf16, Int16Regs, ".global", ".bf16", ".add.noftz", - atomic_load_add_g, bf16imm, fpimm, [hasSM<90>, hasPTX<78>]>; -defm INT_PTX_ATOM_ADD_S_BF16 : F_ATOMIC_2<bf16, Int16Regs, ".shared", ".bf16", ".add.noftz", - atomic_load_add_s, bf16imm, fpimm, [hasSM<90>, hasPTX<78>]>; -defm INT_PTX_ATOM_ADD_GEN_BF16 : F_ATOMIC_2<bf16, Int16Regs, "", ".bf16", ".add.noftz", - atomic_load_add_gen, bf16imm, fpimm, [hasSM<90>, hasPTX<78>]>; - -defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2<f32, Float32Regs, ".global", ".f32", ".add", - atomic_load_add_g, f32imm, fpimm>; -defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<f32, Float32Regs, ".shared", ".f32", ".add", - atomic_load_add_s, f32imm, fpimm>; -defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2<f32, Float32Regs, "", ".f32", ".add", - atomic_load_add_gen, f32imm, fpimm>; - -defm INT_PTX_ATOM_ADD_G_F64 : F_ATOMIC_2<f64, Float64Regs, ".global", ".f64", ".add", - atomic_load_add_g, f64imm, fpimm, [hasAtomAddF64]>; -defm INT_PTX_ATOM_ADD_S_F64 : F_ATOMIC_2<f64, Float64Regs, ".shared", ".f64", ".add", - atomic_load_add_s, f64imm, fpimm, [hasAtomAddF64]>; -defm INT_PTX_ATOM_ADD_GEN_F64 : F_ATOMIC_2<f64, Float64Regs, "", ".f64", ".add", - atomic_load_add_gen, f64imm, fpimm, [hasAtomAddF64]>; - -// atom_sub - -def atomic_load_sub_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_sub_i32 node:$a, node:$b)>; -def atomic_load_sub_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_sub_i32 node:$a, node:$b)>; -def atomic_load_sub_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_sub_i32 node:$a, node:$b)>; -def atomic_load_sub_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_sub_i64 node:$a, node:$b)>; -def atomic_load_sub_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_sub_i64 node:$a, node:$b)>; -def atomic_load_sub_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_sub_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_SUB_G_32 : F_ATOMIC_2_NEG<i32, Int32Regs, ".global", "32", ".add", - atomic_load_sub_i32_g>; -defm INT_PTX_ATOM_SUB_G_64 : F_ATOMIC_2_NEG<i64, Int64Regs, ".global", "64", ".add", - atomic_load_sub_i64_g>; -defm INT_PTX_ATOM_SUB_GEN_32 : F_ATOMIC_2_NEG<i32, Int32Regs, "", "32", ".add", - atomic_load_sub_i32_gen>; -defm INT_PTX_ATOM_SUB_GEN_32_USE_G : F_ATOMIC_2_NEG<i32, Int32Regs, ".global", "32", - ".add", atomic_load_sub_i32_gen>; -defm INT_PTX_ATOM_SUB_S_32 : F_ATOMIC_2_NEG<i32, Int32Regs, ".shared", "32", ".add", - atomic_load_sub_i32_s>; -defm INT_PTX_ATOM_SUB_S_64 : F_ATOMIC_2_NEG<i64, Int64Regs, ".shared", "64", ".add", - atomic_load_sub_i64_s>; -defm INT_PTX_ATOM_SUB_GEN_64 : F_ATOMIC_2_NEG<i64, Int64Regs, "", "64", ".add", - atomic_load_sub_i64_gen>; -defm INT_PTX_ATOM_SUB_GEN_64_USE_G : F_ATOMIC_2_NEG<i64, Int64Regs, ".global", "64", - ".add", atomic_load_sub_i64_gen>; +defm INT_PTX_ATOM_ADD_F16 : F_ATOMIC_2_AS<F16RT, atomic_load_fadd, "add.noftz.f16", [hasSM<70>, hasPTX<63>]>; +defm INT_PTX_ATOM_ADD_BF16 : F_ATOMIC_2_AS<BF16RT, atomic_load_fadd, "add.noftz.bf16", [hasSM<90>, hasPTX<78>]>; +defm INT_PTX_ATOM_ADD_F32 : F_ATOMIC_2_AS<F32RT, atomic_load_fadd, "add.f32">; +defm INT_PTX_ATOM_ADD_F64 : F_ATOMIC_2_AS<F64RT, atomic_load_fadd, "add.f64", [hasAtomAddF64]>; // atom_swap - -def atomic_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_swap_i32 node:$a, node:$b)>; -def atomic_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_swap_i32 node:$a, node:$b)>; -def atomic_swap_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_swap_i32 node:$a, node:$b)>; -def atomic_swap_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_swap_i64 node:$a, node:$b)>; -def atomic_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_swap_i64 node:$a, node:$b)>; -def atomic_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_swap_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_SWAP_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", ".exch", - atomic_swap_i32_g, i32imm, imm>; -defm INT_PTX_ATOM_SWAP_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".b32", ".exch", - atomic_swap_i32_s, i32imm, imm>; -defm INT_PTX_ATOM_SWAP_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".b32", ".exch", - atomic_swap_i32_gen, i32imm, imm>; -defm INT_PTX_ATOM_SWAP_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", - ".exch", atomic_swap_i32_gen, i32imm, imm>; -defm INT_PTX_ATOM_SWAP_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", ".exch", - atomic_swap_i64_g, i64imm, imm>; -defm INT_PTX_ATOM_SWAP_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".b64", ".exch", - atomic_swap_i64_s, i64imm, imm>; -defm INT_PTX_ATOM_SWAP_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".exch", - atomic_swap_i64_gen, i64imm, imm>; -defm INT_PTX_ATOM_SWAP_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", - ".exch", atomic_swap_i64_gen, i64imm, imm>; +defm INT_PTX_ATOM_SWAP_32 : F_ATOMIC_2_AS<I32RT, atomic_swap_i32, "exch.b32">; +defm INT_PTX_ATOM_SWAP_64 : F_ATOMIC_2_AS<I64RT, atomic_swap_i64, "exch.b64">; // atom_max - -def atomic_load_max_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b) - , (atomic_load_max_i32 node:$a, node:$b)>; -def atomic_load_max_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_max_i32 node:$a, node:$b)>; -def atomic_load_max_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_max_i32 node:$a, node:$b)>; -def atomic_load_max_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b) - , (atomic_load_max_i64 node:$a, node:$b)>; -def atomic_load_max_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_max_i64 node:$a, node:$b)>; -def atomic_load_max_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_max_i64 node:$a, node:$b)>; -def atomic_load_umax_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_umax_i32 node:$a, node:$b)>; -def atomic_load_umax_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_umax_i32 node:$a, node:$b)>; -def atomic_load_umax_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_umax_i32 node:$a, node:$b)>; -def atomic_load_umax_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_umax_i64 node:$a, node:$b)>; -def atomic_load_umax_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_umax_i64 node:$a, node:$b)>; -def atomic_load_umax_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_umax_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".s32", - ".max", atomic_load_max_i32_g, i32imm, imm>; -defm INT_PTX_ATOM_LOAD_MAX_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".s32", - ".max", atomic_load_max_i32_s, i32imm, imm>; -defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".s32", ".max", - atomic_load_max_i32_gen, i32imm, imm>; -defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", - ".s32", ".max", atomic_load_max_i32_gen, i32imm, imm>; -defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".s64", - ".max", atomic_load_max_i64_g, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".s64", - ".max", atomic_load_max_i64_s, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".s64", ".max", - atomic_load_max_i64_gen, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", - ".s64", ".max", atomic_load_max_i64_gen, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32", - ".max", atomic_load_umax_i32_g, i32imm, imm>; -defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".u32", - ".max", atomic_load_umax_i32_s, i32imm, imm>; -defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".u32", ".max", - atomic_load_umax_i32_gen, i32imm, imm>; -defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", - ".u32", ".max", atomic_load_umax_i32_gen, i32imm, imm>; -defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64", - ".max", atomic_load_umax_i64_g, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".u64", - ".max", atomic_load_umax_i64_s, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".u64", ".max", - atomic_load_umax_i64_gen, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", - ".u64", ".max", atomic_load_umax_i64_gen, i64imm, imm, [hasSM<32>]>; +defm INT_PTX_ATOMIC_MAX_32 : F_ATOMIC_2_AS<I32RT, atomic_load_max_i32, "max.s32">; +defm INT_PTX_ATOMIC_MAX_64 : F_ATOMIC_2_AS<I64RT, atomic_load_max_i64, "max.s64", [hasSM<32>]>; +defm INT_PTX_ATOMIC_UMAX_32 : F_ATOMIC_2_AS<I32RT, atomic_load_umax_i32, "max.u32">; +defm INT_PTX_ATOMIC_UMAX_64 : F_ATOMIC_2_AS<I64RT, atomic_load_umax_i64, "max.u64", [hasSM<32>]>; // atom_min - -def atomic_load_min_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_min_i32 node:$a, node:$b)>; -def atomic_load_min_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_min_i32 node:$a, node:$b)>; -def atomic_load_min_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_min_i32 node:$a, node:$b)>; -def atomic_load_min_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_min_i64 node:$a, node:$b)>; -def atomic_load_min_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_min_i64 node:$a, node:$b)>; -def atomic_load_min_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_min_i64 node:$a, node:$b)>; -def atomic_load_umin_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_umin_i32 node:$a, node:$b)>; -def atomic_load_umin_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_umin_i32 node:$a, node:$b)>; -def atomic_load_umin_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_umin_i32 node:$a, node:$b)>; -def atomic_load_umin_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_umin_i64 node:$a, node:$b)>; -def atomic_load_umin_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_umin_i64 node:$a, node:$b)>; -def atomic_load_umin_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_umin_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".s32", - ".min", atomic_load_min_i32_g, i32imm, imm>; -defm INT_PTX_ATOM_LOAD_MIN_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".s32", - ".min", atomic_load_min_i32_s, i32imm, imm>; -defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".s32", ".min", - atomic_load_min_i32_gen, i32imm, imm>; -defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", - ".s32", ".min", atomic_load_min_i32_gen, i32imm, imm>; -defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".s64", - ".min", atomic_load_min_i64_g, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".s64", - ".min", atomic_load_min_i64_s, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".s64", ".min", - atomic_load_min_i64_gen, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", - ".s64", ".min", atomic_load_min_i64_gen, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32", - ".min", atomic_load_umin_i32_g, i32imm, imm>; -defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".u32", - ".min", atomic_load_umin_i32_s, i32imm, imm>; -defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".u32", ".min", - atomic_load_umin_i32_gen, i32imm, imm>; -defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", - ".u32", ".min", atomic_load_umin_i32_gen, i32imm, imm>; -defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64", - ".min", atomic_load_umin_i64_g, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".u64", - ".min", atomic_load_umin_i64_s, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".u64", ".min", - atomic_load_umin_i64_gen, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", - ".u64", ".min", atomic_load_umin_i64_gen, i64imm, imm, [hasSM<32>]>; +defm INT_PTX_ATOMIC_MIN_32 : F_ATOMIC_2_AS<I32RT, atomic_load_min_i32, "min.s32">; +defm INT_PTX_ATOMIC_MIN_64 : F_ATOMIC_2_AS<I64RT, atomic_load_min_i64, "min.s64", [hasSM<32>]>; +defm INT_PTX_ATOMIC_UMIN_32 : F_ATOMIC_2_AS<I32RT, atomic_load_umin_i32, "min.u32">; +defm INT_PTX_ATOMIC_UMIN_64 : F_ATOMIC_2_AS<I64RT, atomic_load_umin_i64, "min.u64", [hasSM<32>]>; // atom_inc atom_dec - -def atomic_load_inc_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>; -def atomic_load_inc_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>; -def atomic_load_inc_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>; -def atomic_load_dec_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>; -def atomic_load_dec_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>; -def atomic_load_dec_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>; - -defm INT_PTX_ATOM_INC_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32", ".inc", - atomic_load_inc_32_g, i32imm, imm>; -defm INT_PTX_ATOM_INC_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".u32", ".inc", - atomic_load_inc_32_s, i32imm, imm>; -defm INT_PTX_ATOM_INC_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".u32", ".inc", - atomic_load_inc_32_gen, i32imm, imm>; -defm INT_PTX_ATOM_INC_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32", - ".inc", atomic_load_inc_32_gen, i32imm, imm>; -defm INT_PTX_ATOM_DEC_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32", ".dec", - atomic_load_dec_32_g, i32imm, imm>; -defm INT_PTX_ATOM_DEC_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".u32", ".dec", - atomic_load_dec_32_s, i32imm, imm>; -defm INT_PTX_ATOM_DEC_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".u32", ".dec", - atomic_load_dec_32_gen, i32imm, imm>; -defm INT_PTX_ATOM_DEC_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".u32", - ".dec", atomic_load_dec_32_gen, i32imm, imm>; +defm INT_PTX_ATOM_INC_32 : F_ATOMIC_2_AS<I32RT, int_nvvm_atomic_load_inc_32, "inc.u32">; +defm INT_PTX_ATOM_DEC_32 : F_ATOMIC_2_AS<I32RT, int_nvvm_atomic_load_dec_32, "dec.u32">; // atom_and - -def atomic_load_and_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_and_i32 node:$a, node:$b)>; -def atomic_load_and_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_and_i32 node:$a, node:$b)>; -def atomic_load_and_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_and_i32 node:$a, node:$b)>; -def atomic_load_and_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_and_i64 node:$a, node:$b)>; -def atomic_load_and_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_and_i64 node:$a, node:$b)>; -def atomic_load_and_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_and_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", ".and", - atomic_load_and_i32_g, i32imm, imm>; -defm INT_PTX_ATOM_AND_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".b32", ".and", - atomic_load_and_i32_s, i32imm, imm>; -defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".b32", ".and", - atomic_load_and_i32_gen, i32imm, imm>; -defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", - ".and", atomic_load_and_i32_gen, i32imm, imm>; -defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", ".and", - atomic_load_and_i64_g, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".b64", ".and", - atomic_load_and_i64_s, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".and", - atomic_load_and_i64_gen, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", - ".and", atomic_load_and_i64_gen, i64imm, imm, [hasSM<32>]>; +defm INT_PTX_ATOM_AND_32 : F_ATOMIC_2_AS<I32RT, atomic_load_and_i32, "and.b32">; +defm INT_PTX_ATOM_AND_64 : F_ATOMIC_2_AS<I64RT, atomic_load_and_i64, "and.b64", [hasSM<32>]>; // atom_or - -def atomic_load_or_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_or_i32 node:$a, node:$b)>; -def atomic_load_or_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_or_i32 node:$a, node:$b)>; -def atomic_load_or_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_or_i32 node:$a, node:$b)>; -def atomic_load_or_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_or_i64 node:$a, node:$b)>; -def atomic_load_or_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_or_i64 node:$a, node:$b)>; -def atomic_load_or_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_or_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", ".or", - atomic_load_or_i32_g, i32imm, imm>; -defm INT_PTX_ATOM_OR_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".b32", ".or", - atomic_load_or_i32_gen, i32imm, imm>; -defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", - ".or", atomic_load_or_i32_gen, i32imm, imm>; -defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".b32", ".or", - atomic_load_or_i32_s, i32imm, imm>; -defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", ".or", - atomic_load_or_i64_g, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".or", - atomic_load_or_i64_gen, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", - ".or", atomic_load_or_i64_gen, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".b64", ".or", - atomic_load_or_i64_s, i64imm, imm, [hasSM<32>]>; +defm INT_PTX_ATOM_OR_32 : F_ATOMIC_2_AS<I32RT, atomic_load_or_i32, "or.b32">; +defm INT_PTX_ATOM_OR_64 : F_ATOMIC_2_AS<I64RT, atomic_load_or_i64, "or.b64", [hasSM<32>]>; // atom_xor +defm INT_PTX_ATOM_XOR_32 : F_ATOMIC_2_AS<I32RT, atomic_load_xor_i32, "xor.b32">; +defm INT_PTX_ATOM_XOR_64 : F_ATOMIC_2_AS<I64RT, atomic_load_xor_i64, "xor.b64", [hasSM<32>]>; -def atomic_load_xor_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_xor_i32 node:$a, node:$b)>; -def atomic_load_xor_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_xor_i32 node:$a, node:$b)>; -def atomic_load_xor_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_xor_i32 node:$a, node:$b)>; -def atomic_load_xor_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_xor_i64 node:$a, node:$b)>; -def atomic_load_xor_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_xor_i64 node:$a, node:$b)>; -def atomic_load_xor_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_xor_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", ".xor", - atomic_load_xor_i32_g, i32imm, imm>; -defm INT_PTX_ATOM_XOR_S_32 : F_ATOMIC_2<i32, Int32Regs, ".shared", ".b32", ".xor", - atomic_load_xor_i32_s, i32imm, imm>; -defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2<i32, Int32Regs, "", ".b32", ".xor", - atomic_load_xor_i32_gen, i32imm, imm>; -defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2<i32, Int32Regs, ".global", ".b32", - ".xor", atomic_load_xor_i32_gen, i32imm, imm>; -defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", ".xor", - atomic_load_xor_i64_g, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2<i64, Int64Regs, ".shared", ".b64", ".xor", - atomic_load_xor_i64_s, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".xor", - atomic_load_xor_i64_gen, i64imm, imm, [hasSM<32>]>; -defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64", - ".xor", atomic_load_xor_i64_gen, i64imm, imm, [hasSM<32>]>; - -multiclass ternary_atomic_op_as { - // one record per address space - def NAME#_generic: PatFrag<(ops node:$ptr, node:$cmp, node:$val), - (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), - AS_match.generic>; - - def NAME#_global: PatFrag<(ops node:$ptr, node:$cmp, node:$val), - (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), - AS_match.global>; - - def NAME#_shared: PatFrag<(ops node:$ptr, node:$cmp, node:$val), - (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), - AS_match.shared>; -} - -// generate pattern fragments for size x memory order -// NOTE: i8 cmpxchg is not supported in ptx, and AtomicExpandPass will emulate all i8 cmpxchgs -// using larger-bitwidth cas -foreach size = ["i16", "i32", "i64"] in { - foreach order = ["", "_monotonic", "_acquire", "_release", "_acq_rel", "_seq_cst"] in { - defm atomic_cmp_swap#_#size#order: ternary_atomic_op_as; - } -} - -// eg. with type = 32, order = ".acquire", addrspace = ".global", -// atomic_cmp_swap_pat = atomic_cmp_swap_i32_acquire_global. -// preds = [hasSM<70>, hasPTX<63>] -// F_ATOMIC_3<i32, Int32Regs, ".acquire", ".global", ".b32", -// ".cas", atomic_cmp_swap_i32_acquire_global, i32imm, -// [hasSM<70>, hasPTX<63>]> -multiclass INT_PTX_ATOM_CAS<string atomic_cmp_swap_pat, string type, - string order, string addrspace, list<Predicate> preds> - : F_ATOMIC_3<!cast<ValueType>("i"#type), - !cast<NVPTXRegClass>("Int"#type#"Regs"), - order, - addrspace, - ".b"#type, - ".cas", - !cast<PatFrag>(atomic_cmp_swap_pat), - !cast<Operand>("i"#type#"imm"), - preds>; // Define atom.cas for all combinations of size x addrspace x memory order // supported in PTX *and* on the hardware. -foreach size = ["32", "64"] in { - foreach addrspace = ["generic", "global", "shared"] in { - defvar cas_addrspace_string = !if(!eq(addrspace, "generic"), "", "."#addrspace); - foreach order = ["acquire", "release", "acq_rel", "monotonic"] in { - defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order); - // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it. - // Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions- - // for SM70+, and "old" ones which lower to "atom.cas", for earlier archs. - defm INT_PTX_ATOM_CAS_#size#_#order#addrspace - : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size, - cas_order_string, cas_addrspace_string, - [hasSM<70>, hasPTX<63>]>; - defm INT_PTX_ATOM_CAS_#size#_#order#_old#addrspace - : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size, - "", cas_addrspace_string, []>; - } +foreach t = [I32RT, I64RT] in { + foreach order = ["acquire", "release", "acq_rel", "monotonic"] in { + defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order); + defvar atomic_cmp_swap_pat = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order); + // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it. + // Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions- + // for SM70+, and "old" ones which lower to "atom.cas", for earlier archs. + defm INT_PTX_ATOM_CAS_#t.Size#_#order + : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>; + defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old + : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "cas.b"#t.Size, []>; } } // Note that 16-bit CAS support in PTX is emulated. -defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, "", ".global", ".b16", ".cas", - atomic_cmp_swap_i16_global, i16imm, [hasSM<70>, hasPTX<63>]>; -defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, "", ".shared", ".b16", ".cas", - atomic_cmp_swap_i16_shared, i16imm, [hasSM<70>, hasPTX<63>]>; -defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", "", ".b16", ".cas", - atomic_cmp_swap_i16_generic, i16imm, [hasSM<70>, hasPTX<63>]>; +defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "cas.b16", [hasSM<70>, hasPTX<63>]>; // Support for scoped atomic operations. Matches // int_nvvm_atomic_{op}_{space}_{type}_{scope} @@ -2505,185 +2111,116 @@ defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", "", ".b16", ".cas" // NOTE: not all possible combinations are implemented // 'space' is limited to generic as it's the only one needed to support CUDA. // 'scope' = 'gpu' is default and is handled by regular atomic instructions. -class ATOM23_impl<string AsmStr, ValueType regT, NVPTXRegClass regclass, list<Predicate> Preds, - dag ins, dag Operands> - : NVPTXInst<(outs regclass:$result), ins, - AsmStr, - [(set regT:$result, Operands)]>, - Requires<Preds>; // Define instruction variants for all addressing modes. -multiclass ATOM2P_impl<string AsmStr, Intrinsic Intr, - ValueType regT, NVPTXRegClass regclass, Operand ImmType, - SDNode Imm, ValueType ImmTy, - list<Predicate> Preds> { - let AddedComplexity = 1 in { - def : ATOM23_impl<AsmStr, regT, regclass, Preds, - (ins ADDR:$src, regclass:$b), - (Intr addr:$src, regT:$b)>; - } - // tablegen can't infer argument types from Intrinsic (though it can - // from Instruction) so we have to enforce specific type on - // immediates via explicit cast to ImmTy. - def : ATOM23_impl<AsmStr, regT, regclass, Preds, - (ins ADDR:$src, ImmType:$b), - (Intr addr:$src, (ImmTy Imm:$b))>; -} - -multiclass ATOM3P_impl<string AsmStr, Intrinsic Intr, - ValueType regT, NVPTXRegClass regclass, - Operand ImmType, SDNode Imm, ValueType ImmTy, - list<Predicate> Preds> { - // Variants for register/immediate permutations of $b and $c - let AddedComplexity = 2 in { - def : ATOM23_impl<AsmStr, regT, regclass, Preds, - (ins ADDR:$src, regclass:$b, regclass:$c), - (Intr addr:$src, regT:$b, regT:$c)>; - } - let AddedComplexity = 1 in { - def : ATOM23_impl<AsmStr, regT, regclass, Preds, - (ins ADDR:$src, ImmType:$b, regclass:$c), - (Intr addr:$src, (ImmTy Imm:$b), regT:$c)>; - def : ATOM23_impl<AsmStr, regT, regclass, Preds, - (ins ADDR:$src, regclass:$b, ImmType:$c), - (Intr addr:$src, regT:$b, (ImmTy Imm:$c))>; - } - def : ATOM23_impl<AsmStr, regT, regclass, Preds, - (ins ADDR:$src, ImmType:$b, ImmType:$c), - (Intr addr:$src, (ImmTy Imm:$b), (ImmTy Imm:$c))>; -} // Constructs intrinsic name and instruction asm strings. multiclass ATOM2N_impl<string OpStr, string IntTypeStr, string TypeStr, string ScopeStr, string SpaceStr, - ValueType regT, NVPTXRegClass regclass, Operand ImmType, SDNode Imm, - ValueType ImmTy, list<Predicate> Preds> { - defm : ATOM2P_impl<"atom" # !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr) - # !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr) - # "." # OpStr # "." # TypeStr - # " \t$result, [$src], $b;", - !cast<Intrinsic>( - "int_nvvm_atomic_" # OpStr - # "_" # SpaceStr # "_" # IntTypeStr - # !if(!empty(ScopeStr), "", "_" # ScopeStr)), - regT, regclass, ImmType, Imm, ImmTy, Preds>; + RegTyInfo t, list<Predicate> Preds> { + defm "" : F_ATOMIC_2<t, + as_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr), + sem_str = !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr), + op_str = OpStr # "." # TypeStr, + op = !cast<Intrinsic>( + "int_nvvm_atomic_" # OpStr + # "_" # SpaceStr # "_" # IntTypeStr + # !if(!empty(ScopeStr), "", "_" # ScopeStr)), + preds = Preds>; } multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr, string ScopeStr, string SpaceStr, - ValueType regT, NVPTXRegClass regclass, Operand ImmType, SDNode Imm, - ValueType ImmTy, list<Predicate> Preds> { - defm : ATOM3P_impl<"atom" # !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr) - # !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr) - # "." # OpStr # "." # TypeStr - # " \t$result, [$src], $b, $c;", - !cast<Intrinsic>( - "int_nvvm_atomic_" # OpStr - # "_" # SpaceStr # "_" # IntTypeStr - # !if(!empty(ScopeStr), "", "_" # ScopeStr)), - regT, regclass, ImmType, Imm, ImmTy, Preds>; -} - -// Constructs variants for different address spaces. -// For now we only need variants for generic space pointers. -multiclass ATOM2A_impl<string OpStr, string IntTypeStr, string TypeStr, - string ScopeStr, ValueType regT, NVPTXRegClass regclass, Operand ImmType, - SDNode Imm, ValueType ImmTy, list<Predicate> Preds> { - defm _gen_ : ATOM2N_impl<OpStr, IntTypeStr, TypeStr, ScopeStr, "gen", - regT, regclass, ImmType, Imm, ImmTy, Preds>; -} -multiclass ATOM3A_impl<string OpStr, string IntTypeStr, string TypeStr, - string ScopeStr, ValueType regT, NVPTXRegClass regclass, Operand ImmType, - SDNode Imm, ValueType ImmTy, list<Predicate> Preds> { - defm _gen_ : ATOM3N_impl<OpStr, IntTypeStr, TypeStr, ScopeStr, "gen", - regT, regclass, ImmType, Imm, ImmTy, Preds>; + RegTyInfo t, list<Predicate> Preds> { + defm "" : F_ATOMIC_3<t, + as_str = !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr), + sem_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr), + op_str = OpStr # "." # TypeStr, + op = !cast<Intrinsic>( + "int_nvvm_atomic_" # OpStr + # "_" # SpaceStr # "_" # IntTypeStr + # !if(!empty(ScopeStr), "", "_" # ScopeStr)), + preds = Preds>; } // Constructs variants for different scopes of atomic op. multiclass ATOM2S_impl<string OpStr, string IntTypeStr, string TypeStr, - ValueType regT, NVPTXRegClass regclass, Operand ImmType, SDNode Imm, - ValueType ImmTy, list<Predicate> Preds> { + RegTyInfo t, list<Predicate> Preds> { // .gpu scope is default and is currently covered by existing // atomics w/o explicitly specified scope. - defm _cta : ATOM2A_impl<OpStr, IntTypeStr, TypeStr, "cta", - regT, regclass, ImmType, Imm, ImmTy, - !listconcat(Preds,[hasAtomScope])>; - defm _sys : ATOM2A_impl<OpStr, IntTypeStr, TypeStr, "sys", - regT, regclass, ImmType, Imm, ImmTy, - !listconcat(Preds,[hasAtomScope])>; + foreach scope = ["cta", "sys"] in { + // For now we only need variants for generic space pointers. + foreach space = ["gen"] in { + defm _#scope#space : ATOM2N_impl<OpStr, IntTypeStr, TypeStr, scope, space, + t, !listconcat(Preds, [hasAtomScope])>; + } + } } multiclass ATOM3S_impl<string OpStr, string IntTypeStr, string TypeStr, - ValueType regT, NVPTXRegClass regclass, Operand ImmType, SDNode Imm, ValueType ImmTy, - list<Predicate> Preds> { + RegTyInfo t, list<Predicate> Preds> { // No need to define ".gpu"-scoped atomics. They do the same thing // as the regular, non-scoped atomics defined elsewhere. - defm _cta : ATOM3A_impl<OpStr, IntTypeStr, TypeStr, "cta", - regT, regclass, ImmType, Imm, ImmTy, - !listconcat(Preds,[hasAtomScope])>; - defm _sys : ATOM3A_impl<OpStr, IntTypeStr, TypeStr, "sys", - regT, regclass, ImmType, Imm, ImmTy, - !listconcat(Preds,[hasAtomScope])>; + foreach scope = ["cta", "sys"] in { + // For now we only need variants for generic space pointers. + foreach space = ["gen"] in { + defm _#scope#space : ATOM3N_impl<OpStr, IntTypeStr, TypeStr, scope, space, + t, !listconcat(Preds,[hasAtomScope])>; + } + } } // atom.add multiclass ATOM2_add_impl<string OpStr> { - defm _s32 : ATOM2S_impl<OpStr, "i", "s32", i32, Int32Regs, i32imm, imm, i32, []>; - defm _u32 : ATOM2S_impl<OpStr, "i", "u32", i32, Int32Regs, i32imm, imm, i32, []>; - defm _u64 : ATOM2S_impl<OpStr, "i", "u64", i64, Int64Regs, i64imm, imm, i64, []>; - defm _bf16 : ATOM2S_impl<OpStr, "f", "bf16", bf16, Int16Regs, bf16imm, fpimm, bf16, - [hasSM<90>, hasPTX<78>]>; - defm _f16 : ATOM2S_impl<OpStr, "f", "f16", f16, Int16Regs, f16imm, fpimm, f16, - [hasSM<70>, hasPTX<63>]>; - defm _f32 : ATOM2S_impl<OpStr, "f", "f32", f32, Float32Regs, f32imm, fpimm, f32, - []>; - defm _f64 : ATOM2S_impl<OpStr, "f", "f64", f64, Float64Regs, f64imm, fpimm, f64, - [hasAtomAddF64]>; + defm _s32 : ATOM2S_impl<OpStr, "i", "s32", I32RT, []>; + defm _u32 : ATOM2S_impl<OpStr, "i", "u32", I32RT, []>; + defm _u64 : ATOM2S_impl<OpStr, "i", "u64", I64RT, []>; + defm _bf16 : ATOM2S_impl<OpStr, "f", "bf16", BF16RT, [hasSM<90>, hasPTX<78>]>; + defm _f16 : ATOM2S_impl<OpStr, "f", "f16", F16RT, []>; + defm _f32 : ATOM2S_impl<OpStr, "f", "f32", F32RT, []>; + defm _f64 : ATOM2S_impl<OpStr, "f", "f64", F64RT, []>; } // atom.{and,or,xor} multiclass ATOM2_bitwise_impl<string OpStr> { - defm _b32 : ATOM2S_impl<OpStr, "i", "b32", i32, Int32Regs, i32imm, imm, i32, []>; - defm _b64 : ATOM2S_impl<OpStr, "i", "b64", i64, Int64Regs, i64imm, imm, i64, - [hasAtomBitwise64]>; + defm _b32 : ATOM2S_impl<OpStr, "i", "b32", I32RT, []>; + defm _b64 : ATOM2S_impl<OpStr, "i", "b64", I64RT, [hasAtomBitwise64]>; } // atom.exch multiclass ATOM2_exch_impl<string OpStr> { - defm _b32 : ATOM2S_impl<OpStr, "i", "b32", i32, Int32Regs, i32imm, imm, i32, []>; - defm _b64 : ATOM2S_impl<OpStr, "i", "b64", i64, Int64Regs, i64imm, imm, i64, []>; + defm _b32 : ATOM2S_impl<OpStr, "i", "b32", I32RT, []>; + defm _b64 : ATOM2S_impl<OpStr, "i", "b64", I64RT, []>; } // atom.{min,max} multiclass ATOM2_minmax_impl<string OpStr> { - defm _s32 : ATOM2S_impl<OpStr, "i", "s32", i32, Int32Regs, i32imm, imm, i32, []>; - defm _u32 : ATOM2S_impl<OpStr, "i", "u32", i32, Int32Regs, i32imm, imm, i32, []>; - defm _s64 : ATOM2S_impl<OpStr, "i", "s64", i64, Int64Regs, i64imm, imm, i64, - [hasAtomMinMax64]>; - defm _u64 : ATOM2S_impl<OpStr, "i", "u64", i64, Int64Regs, i64imm, imm, i64, - [hasAtomMinMax64]>; + defm _s32 : ATOM2S_impl<OpStr, "i", "s32", I32RT, []>; + defm _u32 : ATOM2S_impl<OpStr, "i", "u32", I32RT, []>; + defm _s64 : ATOM2S_impl<OpStr, "i", "s64", I64RT, [hasAtomMinMax64]>; + defm _u64 : ATOM2S_impl<OpStr, "i", "u64", I64RT, [hasAtomMinMax64]>; } // atom.{inc,dec} multiclass ATOM2_incdec_impl<string OpStr> { - defm _u32 : ATOM2S_impl<OpStr, "i", "u32", i32, Int32Regs, i32imm, imm, i32, []>; + defm _u32 : ATOM2S_impl<OpStr, "i", "u32", I32RT, []>; } // atom.cas multiclass ATOM3_cas_impl<string OpStr> { - defm _b16 : ATOM3S_impl<OpStr, "i", "b16", i16, Int16Regs, i16imm, imm, i16, []>; - defm _b32 : ATOM3S_impl<OpStr, "i", "b32", i32, Int32Regs, i32imm, imm, i32, []>; - defm _b64 : ATOM3S_impl<OpStr, "i", "b64", i64, Int64Regs, i64imm, imm, i64, []>; -} - -defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">; -defm INT_PTX_SATOM_AND : ATOM2_bitwise_impl<"and">; -defm INT_PTX_SATOM_CAS : ATOM3_cas_impl<"cas">; -defm INT_PTX_SATOM_DEC : ATOM2_incdec_impl<"dec">; -defm INT_PTX_SATOM_EXCH: ATOM2_exch_impl<"exch">; -defm INT_PTX_SATOM_INC : ATOM2_incdec_impl<"inc">; -defm INT_PTX_SATOM_MAX : ATOM2_minmax_impl<"max">; -defm INT_PTX_SATOM_MIN : ATOM2_minmax_impl<"min">; -defm INT_PTX_SATOM_OR : ATOM2_bitwise_impl<"or">; -defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">; + defm _b16 : ATOM3S_impl<OpStr, "i", "b16", I16RT, []>; + defm _b32 : ATOM3S_impl<OpStr, "i", "b32", I32RT, []>; + defm _b64 : ATOM3S_impl<OpStr, "i", "b64", I64RT, []>; +} + +defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">; +defm INT_PTX_SATOM_AND : ATOM2_bitwise_impl<"and">; +defm INT_PTX_SATOM_CAS : ATOM3_cas_impl<"cas">; +defm INT_PTX_SATOM_DEC : ATOM2_incdec_impl<"dec">; +defm INT_PTX_SATOM_EXCH : ATOM2_exch_impl<"exch">; +defm INT_PTX_SATOM_INC : ATOM2_incdec_impl<"inc">; +defm INT_PTX_SATOM_MAX : ATOM2_minmax_impl<"max">; +defm INT_PTX_SATOM_MIN : ATOM2_minmax_impl<"min">; +defm INT_PTX_SATOM_OR : ATOM2_bitwise_impl<"or">; +defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">; //----------------------------------- // Support for ldu on sm_20 or later diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll index e1fbb53..e1d9aaf 100644 --- a/llvm/test/CodeGen/NVPTX/atomics.ll +++ b/llvm/test/CodeGen/NVPTX/atomics.ll @@ -40,18 +40,15 @@ define i64 @atom1(ptr %addr, i64 %val) { define i32 @atom2(ptr %subr, i32 %val) { ; CHECK-LABEL: atom2( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [atom2_param_0]; ; CHECK-NEXT: ld.param.u32 %r1, [atom2_param_1]; -; CHECK-NEXT: { -; CHECK-NEXT: .reg .s32 temp; -; CHECK-NEXT: neg.s32 temp, %r1; -; CHECK-NEXT: atom.add.u32 %r2, [%rd1], temp; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: neg.s32 %r2, %r1; +; CHECK-NEXT: atom.add.u32 %r3, [%rd1], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %ret = atomicrmw sub ptr %subr, i32 %val seq_cst ret i32 %ret @@ -61,17 +58,14 @@ define i32 @atom2(ptr %subr, i32 %val) { define i64 @atom3(ptr %subr, i64 %val) { ; CHECK-LABEL: atom3( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [atom3_param_0]; ; CHECK-NEXT: ld.param.u64 %rd2, [atom3_param_1]; -; CHECK-NEXT: { -; CHECK-NEXT: .reg .s64 temp; -; CHECK-NEXT: neg.s64 temp, %rd2; -; CHECK-NEXT: atom.add.u64 %rd3, [%rd1], temp; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: neg.s64 %rd3, %rd2; +; CHECK-NEXT: atom.add.u64 %rd4, [%rd1], %rd3; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-NEXT: ret; %ret = atomicrmw sub ptr %subr, i64 %val seq_cst ret i64 %ret |