diff options
Diffstat (limited to 'flang/lib/Optimizer/Builder/IntrinsicCall.cpp')
| -rw-r--r-- | flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 136 | 
1 files changed, 115 insertions, 21 deletions
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index ca3e1cd..b9ea8b1 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -290,10 +290,22 @@ static constexpr IntrinsicHandler handlers[]{      {"atan2pi", &I::genAtanpi},      {"atand", &I::genAtand},      {"atanpi", &I::genAtanpi}, +    {"atomicadd_r2x2", +     &I::genAtomicAddVector, +     {{{"a", asAddr}, {"v", asAddr}}}, +     false}, +    {"atomicadd_r4x2", +     &I::genAtomicAddVector, +     {{{"a", asAddr}, {"v", asAddr}}}, +     false},      {"atomicaddd", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false},      {"atomicaddf", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false},      {"atomicaddi", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false},      {"atomicaddl", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, +    {"atomicaddr2", +     &I::genAtomicAddR2, +     {{{"a", asAddr}, {"v", asAddr}}}, +     false},      {"atomicandi", &I::genAtomicAnd, {{{"a", asAddr}, {"v", asValue}}}, false},      {"atomiccasd",       &I::genAtomicCas, @@ -3119,7 +3131,6 @@ static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc,  mlir::Value IntrinsicLibrary::genAtomicAdd(mlir::Type resultType,                                             llvm::ArrayRef<mlir::Value> args) {    assert(args.size() == 2); -    mlir::LLVM::AtomicBinOp binOp =        mlir::isa<mlir::IntegerType>(args[1].getType())            ? mlir::LLVM::AtomicBinOp::add @@ -3127,6 +3138,85 @@ mlir::Value IntrinsicLibrary::genAtomicAdd(mlir::Type resultType,    return genAtomBinOp(builder, loc, binOp, args[0], args[1]);  } +fir::ExtendedValue +IntrinsicLibrary::genAtomicAddR2(mlir::Type resultType, +                                 llvm::ArrayRef<fir::ExtendedValue> args) { +  assert(args.size() == 2); + +  mlir::Value a = fir::getBase(args[0]); + +  if (mlir::isa<fir::BaseBoxType>(a.getType())) { +    a = fir::BoxAddrOp::create(builder, loc, a); +  } + +  auto loc = builder.getUnknownLoc(); +  auto f16Ty = builder.getF16Type(); +  auto i32Ty = builder.getI32Type(); +  auto vecF16Ty = mlir::VectorType::get({2}, f16Ty); +  mlir::Type idxTy = builder.getIndexType(); +  auto f16RefTy = fir::ReferenceType::get(f16Ty); +  auto zero = builder.createIntegerConstant(loc, idxTy, 0); +  auto one = builder.createIntegerConstant(loc, idxTy, 1); +  auto v1Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, +                                           fir::getBase(args[1]), zero); +  auto v2Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, +                                           fir::getBase(args[1]), one); +  auto v1 = fir::LoadOp::create(builder, loc, v1Coord); +  auto v2 = fir::LoadOp::create(builder, loc, v2Coord); +  mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecF16Ty); +  mlir::Value vec1 = mlir::LLVM::InsertElementOp::create( +      builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0)); +  mlir::Value vec2 = mlir::LLVM::InsertElementOp::create( +      builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1)); +  auto res = genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); +  auto i32VecTy = mlir::VectorType::get({1}, i32Ty); +  mlir::Value vecI32 = +      mlir::vector::BitCastOp::create(builder, loc, i32VecTy, res); +  return mlir::vector::ExtractOp::create(builder, loc, vecI32, +                                         mlir::ArrayRef<int64_t>{0}); +} + +fir::ExtendedValue +IntrinsicLibrary::genAtomicAddVector(mlir::Type resultType, +                                     llvm::ArrayRef<fir::ExtendedValue> args) { +  assert(args.size() == 2); +  mlir::Value res = fir::AllocaOp::create( +      builder, loc, fir::SequenceType::get({2}, resultType)); +  mlir::Value a = fir::getBase(args[0]); +  if (mlir::isa<fir::BaseBoxType>(a.getType())) { +    a = fir::BoxAddrOp::create(builder, loc, a); +  } +  auto vecTy = mlir::VectorType::get({2}, resultType); +  auto refTy = fir::ReferenceType::get(resultType); +  mlir::Type i32Ty = builder.getI32Type(); +  mlir::Type idxTy = builder.getIndexType(); +  mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0); +  mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); +  mlir::Value v1Coord = fir::CoordinateOp::create(builder, loc, refTy, +                                                  fir::getBase(args[1]), zero); +  mlir::Value v2Coord = fir::CoordinateOp::create(builder, loc, refTy, +                                                  fir::getBase(args[1]), one); +  mlir::Value v1 = fir::LoadOp::create(builder, loc, v1Coord); +  mlir::Value v2 = fir::LoadOp::create(builder, loc, v2Coord); +  mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy); +  mlir::Value vec1 = mlir::LLVM::InsertElementOp::create( +      builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0)); +  mlir::Value vec2 = mlir::LLVM::InsertElementOp::create( +      builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1)); +  mlir::Value add = +      genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); +  mlir::Value r1 = mlir::LLVM::ExtractElementOp::create( +      builder, loc, add, builder.createIntegerConstant(loc, i32Ty, 0)); +  mlir::Value r2 = mlir::LLVM::ExtractElementOp::create( +      builder, loc, add, builder.createIntegerConstant(loc, i32Ty, 1)); +  mlir::Value c1 = fir::CoordinateOp::create(builder, loc, refTy, res, zero); +  mlir::Value c2 = fir::CoordinateOp::create(builder, loc, refTy, res, one); +  fir::StoreOp::create(builder, loc, r1, c1); +  fir::StoreOp::create(builder, loc, r2, c2); +  mlir::Value ext = builder.createIntegerConstant(loc, idxTy, 2); +  return fir::ArrayBoxValue(res, {ext}); +} +  mlir::Value IntrinsicLibrary::genAtomicSub(mlir::Type resultType,                                             llvm::ArrayRef<mlir::Value> args) {    assert(args.size() == 2); @@ -3345,13 +3435,12 @@ IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType,    assert(args.size() == 2);    mlir::Value barrier = convertPtrToNVVMSpace(        builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); -  mlir::Value token = fir::AllocaOp::create(builder, loc, resultType); -  // TODO: the MBarrierArriveExpectTxOp is not taking the state argument and -  // currently just the sink symbol `_`. -  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive -  mlir::NVVM::MBarrierArriveExpectTxOp::create(builder, loc, barrier, args[1], -                                               {}); -  return fir::LoadOp::create(builder, loc, token); +  return mlir::NVVM::InlinePtxOp::create(builder, loc, {resultType}, +                                         {barrier, args[1]}, {}, +                                         "mbarrier.arrive.expect_tx.release." +                                         "cta.shared::cta.b64 %0, [%1], %2;", +                                         {}) +      .getResult(0);  }  // BARRIER_INIT (CUDA) @@ -3359,8 +3448,8 @@ void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) {    assert(args.size() == 2);    mlir::Value barrier = convertPtrToNVVMSpace(        builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); -  mlir::NVVM::MBarrierInitSharedOp::create(builder, loc, barrier, -                                           fir::getBase(args[1]), {}); +  mlir::NVVM::MBarrierInitOp::create(builder, loc, barrier, +                                     fir::getBase(args[1]), {});    auto kind = mlir::NVVM::ProxyKindAttr::get(        builder.getContext(), mlir::NVVM::ProxyKind::async_shared);    auto space = mlir::NVVM::SharedSpaceAttr::get( @@ -3392,13 +3481,15 @@ IntrinsicLibrary::genBarrierTryWait(mlir::Type resultType,    builder.setInsertionPointToStart(afterBlock);    auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());    auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); -  mlir::Value ret = -      mlir::NVVM::InlinePtxOp::create( -          builder, loc, {resultType}, {barrier, args[1], ns}, {}, -          ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; " -          "selp.b32 %0, 1, 0, p;", -          {}) -          .getResult(0); +  mlir::Value ret = mlir::NVVM::InlinePtxOp::create( +                        builder, loc, {resultType}, {barrier, args[1], ns}, {}, +                        "{\n" +                        "  .reg .pred p;\n" +                        "  mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" +                        "  selp.b32 %0, 1, 0, p;\n" +                        "}", +                        {}) +                        .getResult(0);    mlir::scf::YieldOp::create(builder, loc, ret);    builder.setInsertionPointAfter(whileOp);    return whileOp.getResult(0); @@ -3413,8 +3504,11 @@ IntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType,    auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]);    return mlir::NVVM::InlinePtxOp::create(               builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, -             ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; " -             "selp.b32 %0, 1, 0, p;", +             "{\n" +             "  .reg .pred p;\n" +             "  mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" +             "  selp.b32 %0, 1, 0, p;\n" +             "}",               {})        .getResult(0);  } @@ -9455,7 +9549,7 @@ void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) {        builder, loc, dst, src, fir::getBase(args[2]), {}, {});    mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, -                                  "cp.async.bulk.commit_group", {}); +                                  "cp.async.bulk.commit_group;", {});    mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc,                                               builder.getI32IntegerAttr(0), {});  } @@ -9471,7 +9565,7 @@ static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc,    mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src,                                                       size, {}, {});    mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, -                                  "cp.async.bulk.commit_group", {}); +                                  "cp.async.bulk.commit_group;", {});    mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc,                                               builder.getI32IntegerAttr(0), {});  }  | 
