//===---- XeGPUUtils.cpp - MLIR Utilities for XeGPUOps ------------------===//
//
// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utility methods for working with the XeGPU dialect.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Index/IR/IndexOps.h"
#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/ValueRange.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/Support/FormatVariadic.h"
#include <cstdint>
#include <numeric>

using namespace mlir;

/// Convert an ArrayRef<ValueRange> into a flat SmallVector<Value>.
SmallVector<Value> xegpu::flattenValues(ArrayRef<ValueRange> values) {
  SmallVector<Value> result;
  for (const auto &vals : values)
    llvm::append_range(result, vals);
  return result;
}

FailureOr<VectorType>
mlir::xegpu::getDistributedVectorType(xegpu::TensorDescType tdescTy) {
  auto layout = llvm::dyn_cast_if_present<LayoutAttr>(tdescTy.getLayout());
  // This only works for subgroup-level layouts, which carry only lane_layout
  // and lane_data and are used to distribute SIMD code into SIMT code.
  if (!layout || !layout.isForSubgroup())
    return failure();

  SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
  SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
  auto tdescShape = tdescTy.getShape();
  auto elementType = tdescTy.getElementType();

  // Compute sgSize by multiplying the elements of laneLayout,
  // e.g. for a 2D layout, sgSize = laneLayout[0] * laneLayout[1];
  // for a 1D layout, sgSize = laneLayout[0].
  int64_t sgSize = llvm::product_of(laneLayout);

  // Case 1: regular loads/stores.
  auto scatterAttr = tdescTy.getEncodingOfType<ScatterTensorDescAttr>();
  if (scatterAttr) {
    auto chunkSize = scatterAttr.getChunkSize().getInt();
    // Verify that the first dimension of the tensor descriptor shape is
    // distributable.
    assert(tdescShape[0] == laneLayout[0] &&
           "tensor descriptor shape is not distributable");
    return VectorType::get({chunkSize}, elementType);
  }

  // Case 2: block loads/stores.
  // Check if the tensor descriptor shape is distributable.
  int64_t tensorSize = 1;
  for (auto [tdescDim, laneDim, laneDataDim] :
       llvm::zip_equal(tdescShape, laneLayout, laneData)) {
    assert((tdescDim % (laneDim * laneDataDim) == 0) &&
           "tensor descriptor shape is not distributable");
    tensorSize *= tdescDim;
  }
  // tensorSize must be adjusted for array_length.
  tensorSize *= tdescTy.getArrayLength();

  return VectorType::get({tensorSize / sgSize}, elementType);
}
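// Illustrative example (the shapes and layout values below are hypothetical,
// not taken from a test): a block tensor_desc of shape 8x16xf16 with
// lane_layout = [1, 16], lane_data = [1, 1] and array_length = 1 gives
// tensorSize = 8 * 16 = 128 and sgSize = 16, so each lane receives
// vector<8xf16>. For a scattered tensor_desc with chunk_size = 4, each lane
// receives vector<4xf16> irrespective of the descriptor shape.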
FailureOr<VectorType>
mlir::xegpu::getDistributedVectorType(VectorType originalType,
                                      xegpu::LayoutAttr layout) {
  int64_t rank = originalType.getRank();
  // Distributed vector types are only supported for 1D, 2D and 3D vectors.
  if (rank < 1 || rank > 3)
    return failure();
  ArrayRef<int64_t> shape = originalType.getShape();
  // arrayLength is 1 for 1D and 2D vectors, and equal to the first dimension
  // of a 3D vector.
  int arrayLength = 1;
  if (rank == 3) {
    arrayLength = shape[0];
    shape = shape.drop_front();
  }
  auto helperTdescTy = xegpu::TensorDescType::get(
      shape, originalType.getElementType(), arrayLength,
      /*boundary_check=*/true,
      /*memory_space=*/xegpu::MemorySpace::Global, layout);
  return xegpu::getDistributedVectorType(helperTdescTy);
}

std::string xegpu::getLayoutName(const OpOperand &operand) {
  const StringRef prefix("layout_operand_");
  unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
  return llvm::formatv("{0}{1}", prefix, idx).str();
}

std::string xegpu::getLayoutName(const OpResult result) {
  const StringRef prefix = "layout_result_";
  return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str();
}

xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
  if (!value)
    return nullptr;

  if (auto tdescTy =
          dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
    return tdescTy.getLayoutAttr();

  if (auto result = dyn_cast<OpResult>(value)) {
    Operation *defOp = result.getDefiningOp();
    assert(defOp && "result must have a defining op");

    // For ConvertLayoutOp, the layout is stored in the targetLayoutAttr.
    if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(defOp))
      return convertOp.getTargetLayoutAttr();

    // For LoadNdOp, the layout is stored in the tensor descriptor.
    if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
      return getDistributeLayoutAttr(loadNd.getTensorDesc());

    // For LoadMatrixOp, the layout is attached as a property of the op.
    if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(defOp))
      return loadOp.getLayoutAttr();

    // For StoreMatrixOp, the layout is attached as a property of the op.
    if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(defOp))
      return storeOp.getLayoutAttr();

    std::string layoutName = getLayoutName(result);
    if (defOp->hasAttr(layoutName))
      return defOp->getAttrOfType<DistributeLayoutAttr>(layoutName);
  }

  if (auto arg = dyn_cast<BlockArgument>(value)) {
    auto parentOp = arg.getOwner()->getParentOp();
    if (auto loop = dyn_cast<LoopLikeOpInterface>(parentOp)) {
      OpOperand *tiedInit = loop.getTiedLoopInit(arg);
      if (tiedInit)
        return getDistributeLayoutAttr(tiedInit->get());
    }
  }

  return nullptr;
}

xegpu::DistributeLayoutAttr
xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
  Operation *op = opr.getOwner();

  if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(op))
    return loadOp.getLayoutAttr();

  if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(op))
    return storeOp.getLayoutAttr();

  std::string layoutName = xegpu::getLayoutName(opr);
  if (op->hasAttr(layoutName))
    return op->getAttrOfType<DistributeLayoutAttr>(layoutName);

  return getDistributeLayoutAttr(opr.get());
}

template <typename T>
void xegpu::setDistributeLayoutAttr(const T &operandOrResult,
                                    const DistributeLayoutAttr layout) {
  Operation *owner = operandOrResult.getOwner();
  std::string name = xegpu::getLayoutName(operandOrResult);
  if (layout && !owner->hasAttrOfType<DistributeLayoutAttr>(name))
    owner->setAttr(name, layout);
}

// Explicit instantiation for OpResult.
template void xegpu::setDistributeLayoutAttr<mlir::OpResult>(
    const mlir::OpResult &result,
    const mlir::xegpu::DistributeLayoutAttr layout);

// Explicit instantiation for OpOperand.
template void xegpu::setDistributeLayoutAttr<mlir::OpOperand>(
    const mlir::OpOperand &operand,
    const mlir::xegpu::DistributeLayoutAttr layout);

void xegpu::setDistributeLayoutAttrs(
    Operation *op, function_ref<DistributeLayoutAttr(Value)> getLayoutImpl) {
  op->walk([&](Operation *nestOp) {
    // LoadMatrixOp/StoreMatrixOp carry their layout as an op property, so no
    // discardable attribute is attached to them.
    if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(nestOp))
      return;
    for (OpOperand &opr : nestOp->getOpOperands()) {
      auto layout = getLayoutImpl(opr.get());
      setDistributeLayoutAttr(opr, layout);
    }
    for (OpResult result : nestOp->getOpResults()) {
      auto layout = getLayoutImpl(result);
      setDistributeLayoutAttr(result, layout);
    }
  });
}
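// Illustrative usage sketch (the `layoutAnalysis` object and `funcOp` below
// are hypothetical, not part of this file): a propagation pass that has
// computed a layout per value can stamp the "layout_operand_*" /
// "layout_result_*" attributes onto every nested op with:
//
//   xegpu::setDistributeLayoutAttrs(funcOp, [&](Value v) {
//     return layoutAnalysis.getLayout(v);
//   });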
template <typename T>
void xegpu::removeLayoutAttr(const T &operandOrResult) {
  Operation *owner = operandOrResult.getOwner();
  std::string name = xegpu::getLayoutName(operandOrResult);
  if (owner->hasAttrOfType<DistributeLayoutAttr>(name))
    owner->removeAttr(name);
}

// Explicit instantiation for OpResult.
template void
xegpu::removeLayoutAttr<mlir::OpResult>(const mlir::OpResult &result);

// Explicit instantiation for OpOperand.
template void
xegpu::removeLayoutAttr<mlir::OpOperand>(const mlir::OpOperand &operand);

void xegpu::removeLayoutAttrs(Operation *op) {
  op->walk([&](Operation *nestOp) {
    for (OpOperand &opr : nestOp->getOpOperands())
      removeLayoutAttr(opr);
    for (OpResult result : nestOp->getOpResults())
      removeLayoutAttr(result);
  });
}

SmallVector<Value>
xegpu::extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc,
                                        Value value, ArrayRef<int64_t> shape) {
  auto vecTy = dyn_cast<VectorType>(value.getType());
  if (!vecTy)
    return {value};

  ArrayRef<int64_t> srcShape = vecTy.getShape();
  if (!computeShapeRatio(srcShape, shape))
    return {value};

  SmallVector<Value> result;
  for (SmallVector<int64_t> offsets : StaticTileOffsetRange(srcShape, shape)) {
    SmallVector<int64_t> staticStrides(offsets.size(), 1);
    result.push_back(vector::ExtractStridedSliceOp::create(
        builder, loc, value, offsets, shape, staticStrides));
  }

  return result;
}

Value xegpu::createVectorWithShapeFromValues(OpBuilder &builder, Location loc,
                                             ValueRange values,
                                             ArrayRef<int64_t> shape) {
  VectorType inputTy = dyn_cast<VectorType>(values[0].getType());
  assert(llvm::all_of(values.getTypes(),
                      [&](Type type) { return type == inputTy; }) &&
         "values must be of the same VectorType");

  Type elemTy = inputTy.getElementType();
  ArrayRef<int64_t> tileShape = inputTy.getShape();

  VectorType resultTy = VectorType::get(shape, elemTy);
  auto zeroAttr = builder.getZeroAttr(elemTy);
  Value result = arith::ConstantOp::create(
      builder, loc, resultTy, DenseElementsAttr::get(resultTy, zeroAttr));

  for (auto [src, offsets] :
       llvm::zip_equal(values, StaticTileOffsetRange(shape, tileShape))) {
    SmallVector<int64_t> staticStrides(offsets.size(), 1);
    result = vector::InsertStridedSliceOp::create(builder, loc, src, result,
                                                  offsets, staticStrides);
  }
  return result;
}
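// Illustrative round trip (the shapes below are hypothetical): extracting
// 2x16 tiles from a vector<4x32xf32> value yields 4 slices, ordered by tile
// offset (0,0), (0,16), (2,0), (2,16). Feeding those slices back into
// createVectorWithShapeFromValues with shape [4, 32] reassembles the original
// vector by inserting each tile at the same offsets.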
// Performs the SCF structural type conversion in three steps: (1) convert
// VectorType values flowing through SCF structural ops into RankedTensorType,
// (2) propagate the DistributeLayoutAttr onto those tensor types as their
// encoding, and (3) convert the tensors back to vectors using the
// caller-provided converter.
void xegpu::doSCFStructuralTypeConversionWithTensorType(
    Operation *op, TypeConverter converter) {
  MLIRContext *context = op->getContext();

  auto materializeCast = [](OpBuilder &builder, Type type, ValueRange inputs,
                            Location loc) -> Value {
    return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
        .getResult(0);
  };

  { // Step 1: convert VectorType to RankedTensorType for SCF structural ops.
    TypeConverter converter;
    converter.addConversion([](Type type) -> Type { return type; });
    converter.addConversion([](VectorType type) -> Type {
      return RankedTensorType::get(type.getShape(), type.getElementType());
    });
    converter.addSourceMaterialization(materializeCast);
    converter.addTargetMaterialization(materializeCast);

    mlir::ConversionTarget target(*context);
    target.addLegalOp<UnrealizedConversionCastOp>();

    mlir::RewritePatternSet patterns(context);
    scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
                                                         target);
    (void)mlir::applyPartialConversion(op, target, std::move(patterns));
  }

  { // Step 2: propagate the layout attribute to the RankedTensorType by
    // inspecting the UnrealizedConversionCastOps created for
    // VectorType-to-RankedTensorType casts.
    op->walk([](UnrealizedConversionCastOp castOp) {
      if (castOp.getNumOperands() != 1 || castOp.getNumResults() != 1)
        return WalkResult::skip();

      Value input = castOp.getInputs()[0];
      Value result = castOp.getResults()[0];
      auto inputTy = dyn_cast<VectorType>(input.getType());
      auto resultTy = dyn_cast<RankedTensorType>(result.getType());

      // Only look at ops casting from VectorType to RankedTensorType.
      if (!inputTy || !resultTy)
        return WalkResult::skip();

      xegpu::DistributeLayoutAttr layout =
          xegpu::getDistributeLayoutAttr(input);
      if (!layout)
        return WalkResult::skip();

      RankedTensorType newTy = resultTy.cloneWithEncoding(layout);
      result.setType(newTy);

      // Update the region iter args if the user is a LoopLike op.
      for (OpOperand &use : result.getUses()) {
        if (auto loop = dyn_cast<LoopLikeOpInterface>(use.getOwner())) {
          BlockArgument arg = loop.getTiedLoopRegionIterArg(&use);
          arg.setType(newTy);
        }
        // scf.while has two regions; the BlockArguments of the after region
        // are not exposed by LoopLikeOpInterface.
        if (auto whileOp = dyn_cast<scf::WhileOp>(use.getOwner())) {
          unsigned idx = use.getOperandNumber();
          BlockArgument arg = whileOp.getAfterArguments()[idx];
          arg.setType(newTy);
        }
      }
      return WalkResult::advance();
    });

    // Use scf.yield as the anchor to update the result types of its parent op.
    op->walk([](scf::YieldOp yieldOp) {
      Operation *parentOp = yieldOp->getParentOp();
      for (OpResult r : parentOp->getOpResults()) {
        unsigned idx = r.getResultNumber();
        Type resultTy = r.getType();
        Type yieldTy = yieldOp.getResults()[idx].getType();
        if (isa<RankedTensorType>(resultTy) && yieldTy != resultTy)
          r.setType(yieldTy);
      }
    });
  }

  { // Step 3: perform the conversion from RankedTensorType to VectorType based
    // on the DistributeLayoutAttr.

    // Handle the UnrealizedConversionCastOps introduced by the first step.
    // For VectorType-to-RankedTensorType casts, simply forward the inputs.
    // For RankedTensorType-to-VectorType casts, update the inputs with the
    // ones from the adaptor.
    class UnrealizedConversionCastOpPattern
        : public OpConversionPattern<mlir::UnrealizedConversionCastOp> {
      using OpConversionPattern<
          mlir::UnrealizedConversionCastOp>::OpConversionPattern;

      mlir::LogicalResult
      matchAndRewrite(mlir::UnrealizedConversionCastOp op,
                      OneToNOpAdaptor adaptor,
                      ConversionPatternRewriter &rewriter) const override {
        auto inputs = op.getOperands();
        auto outputs = op.getOutputs();

        if (inputs.size() != 1 || outputs.size() != 1)
          return failure();

        auto inputTy = inputs[0].getType();
        auto outputTy = outputs[0].getType();

        if (isa<VectorType>(inputTy) && isa<RankedTensorType>(outputTy)) {
          rewriter.replaceOpWithMultiple(op, adaptor.getInputs());
          return success();
        }

        if (isa<RankedTensorType>(inputTy) && isa<VectorType>(outputTy)) {
          SmallVector<Value> values = xegpu::flattenValues(adaptor.getInputs());
          auto newOp = UnrealizedConversionCastOp::create(rewriter, op.getLoc(),
                                                          outputTy, values);
          rewriter.replaceOp(op, newOp);
          return success();
        }
        return failure();
      }
    };

    converter.addSourceMaterialization(materializeCast);
    converter.addTargetMaterialization([&](OpBuilder &builder, TypeRange type,
                                           ValueRange inputs, Location loc) {
      return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
          .getResults();
    });

    mlir::ConversionTarget target(*context);
    target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
        [](UnrealizedConversionCastOp op) {
          auto isTensorTy = [](Type type) {
            return isa<RankedTensorType>(type);
          };
          return llvm::none_of(op->getOperandTypes(), isTensorTy) &&
                 llvm::none_of(op->getResultTypes(), isTensorTy);
        });

    mlir::RewritePatternSet patterns(context);
    patterns.insert<UnrealizedConversionCastOpPattern>(context);
    scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
                                                         target);
    (void)mlir::applyPartialConversion(op, target, std::move(patterns));
  }
}
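// Illustrative usage sketch (the converter set-up below is an assumption
// about the caller, not part of this file): a distribution pass typically
// builds a TypeConverter that maps each SIMD vector type to its per-lane
// (SIMT) counterpart and then runs the structural conversion over a function:
//
//   TypeConverter converter;
//   converter.addConversion([](Type t) { return t; });
//   converter.addConversion([](VectorType t) -> Type {
//     // Return the distributed (per-lane) vector type for `t`.
//     return getDistributedTypeFor(t); // hypothetical helper
//   });
//   xegpu::doSCFStructuralTypeConversionWithTensorType(funcOp, converter);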
std::optional<std::string> xegpu::getChipStr(Operation *op) {
  auto gpuModuleOp = op->getParentOfType<gpu::GPUModuleOp>();

  if (!gpuModuleOp)
    return std::nullopt;

  auto targetAttrs = gpuModuleOp.getTargets();
  if (targetAttrs) {
    for (auto &attr : *targetAttrs) {
      auto xevmAttr = llvm::dyn_cast<xevm::XeVMTargetAttr>(attr);
      if (xevmAttr)
        return xevmAttr.getChip().str();
    }
  }

  return std::nullopt;
}

/// Generates element-wise addition ops for two arrays of the same length.
SmallVector<OpFoldResult> xegpu::addElementwise(OpBuilder &builder,
                                                Location loc,
                                                ArrayRef<OpFoldResult> lhs,
                                                ArrayRef<OpFoldResult> rhs) {
  assert(lhs.size() == rhs.size() && "lhs and rhs must have the same size");
  SmallVector<OpFoldResult> results;
  for (auto [l, r] : llvm::zip_equal(lhs, rhs)) {
    auto lval = getValueOrCreateConstantIndexOp(builder, loc, l);
    auto rval = getValueOrCreateConstantIndexOp(builder, loc, r);
    results.push_back(builder.createOrFold<index::AddOp>(loc, lval, rval));
  }
  return results;
}

/// Generates element-wise addition ops for two arrays with automatic
/// alignment. When the input arrays have different sizes, the shorter array is
/// right-aligned with the longer array, and the unmatched leading elements of
/// the longer array are preserved unchanged. This is commonly used for offset
/// computation, where higher-dimensional offsets need to be added to
/// lower-dimensional adjustments.
///
/// Example:
///   lhs = [l1, l2, l3], rhs = [r1, r2]
///   Result: [l1, l2 + r1, l3 + r2]
SmallVector<OpFoldResult>
xegpu::addWithRightAligned(OpBuilder &builder, Location loc,
                           ArrayRef<OpFoldResult> lhs,
                           ArrayRef<OpFoldResult> rhs) {
  // Ensure that a is the longer of the two arrays.
  ArrayRef<OpFoldResult> a = lhs.size() >= rhs.size() ? lhs : rhs;
  ArrayRef<OpFoldResult> b = lhs.size() >= rhs.size() ? rhs : lhs;
  SmallVector<OpFoldResult> results(a.take_front(a.size() - b.size()));
  a = a.slice(a.size() - b.size());
  results.append(addElementwise(builder, loc, a, b));
  return results;
}