mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941

//===- Tiling.cpp - Implementation of linalg Tiling -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the linalg dialect Tiling pass.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Passes.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/ValueRange.h"
#include "mlir/Transforms/FoldUtils.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/CommandLine.h"
#include <utility>

namespace mlir {
#define GEN_PASS_DEF_LINALGTILINGPASS
#include "mlir/Dialect/Linalg/Passes.h.inc"
} // namespace mlir

using namespace mlir;
using namespace mlir::affine;
using namespace mlir::linalg;
using namespace mlir::scf;

#define DEBUG_TYPE "linalg-tiling"

/// Builds the ranges of the loops that remain after tiling, together with a
/// map from the original loop index to the position of its range.
///
/// `map` is applied to `allShapeSizes` to obtain one size per loop.
/// `allTileSizes` is expected to hold one entry per result of `map`. A loop
/// whose tile size is the constant zero produces neither a range nor a map
/// entry; every other loop produces `Range{0, loopSize, tileSize}`.
std::tuple<SmallVector<Range, 4>, LoopIndexToRangeIndexMap>
mlir::linalg::makeTiledLoopRanges(RewriterBase &b, Location loc, AffineMap map,
                                  ArrayRef<OpFoldResult> allShapeSizes,
                                  ArrayRef<OpFoldResult> allTileSizes) {
  assert(allTileSizes.size() == map.getNumResults());
  // Express the shape sizes in loop order by applying `map`.
  SmallVector<OpFoldResult> loopSizes =
      makeComposedFoldedMultiResultAffineApply(b, loc, map, allShapeSizes);

  // Single pass over the tile sizes (which are in loop order): loops with a
  // constant-zero tile size are skipped, all others are appended to the result
  // and recorded in the index map at the position they were appended.
  SmallVector<Range, 4> ranges;
  LoopIndexToRangeIndexMap loopIndexToRangeIndex;
  for (int loopIdx = 0, numLoops = allTileSizes.size(); loopIdx < numLoops;
       ++loopIdx) {
    OpFoldResult tileSize = allTileSizes[loopIdx];
    if (isConstantIntValue(tileSize, 0))
      continue;
    loopIndexToRangeIndex[loopIdx] = static_cast<int>(ranges.size());
    ranges.push_back(Range{b.getIndexAttr(0), loopSizes[loopIdx], tileSize});
  }
  return std::make_tuple(ranges, loopIndexToRangeIndex);
}

/// Collects, for every loop of `op`, the induction variable in `ivs` that
/// `loopIndexToRangeIndex` associates with it, and forwards the resulting list
/// to `offsetIndices`. Loops without an entry in the map keep a null value.
void mlir::linalg::transformIndexOps(
    RewriterBase &b, LinalgOp op, SmallVectorImpl<Value> &ivs,
    const LoopIndexToRangeIndexMap &loopIndexToRangeIndex) {
  unsigned numLoops = op.getNumLoops();
  SmallVector<Value> perLoopIvs(numLoops, nullptr);
  for (unsigned loopIdx = 0; loopIdx < numLoops; ++loopIdx) {
    auto mapped = loopIndexToRangeIndex.find(loopIdx);
    // Only loops that have a range entry get an induction variable.
    if (mapped != loopIndexToRangeIndex.end())
      perLoopIvs[loopIdx] = ivs[mapped->second];
  }
  offsetIndices(b, op, getAsOpFoldResult(perLoopIvs));
}

/// Checks that the index-typed `value` is strictly positive. A value that is
/// an attribute is checked with a C++ `assert`; any other value gets a
/// `cf.assert` on the comparison `value > 0`, emitted through `b`.
static void emitIsPositiveIndexAssertion(ImplicitLocOpBuilder &b,
                                         OpFoldResult value) {
  auto staticValue = llvm::dyn_cast_if_present<Attribute>(value);
  if (!staticValue) {
    // Dynamic case: emit the comparison against zero and assert on it.
    Value zero = b.create<arith::ConstantIndexOp>(0);
    Value isPositive = b.create<arith::CmpIOp>(arith::CmpIPredicate::sgt,
                                               value.get<Value>(), zero);
    b.create<cf::AssertOp>(
        isPositive,
        b.getStringAttr("expected strictly positive tile size and divisor"));
    return;
  }

  // Static case: nothing is emitted, the check happens right here.
  assert(cast<IntegerAttr>(staticValue).getValue().isStrictlyPositive() &&
         "expected strictly positive tile size and divisor");
}

/// Computes two static tile sizes, both multiples of `divisor`, and their trip
/// counts for loop `dimension` of `op`.
///
/// With `tripCount` the static range of the loop:
///   a              = tripCount / divisor
///   t              = ceil(targetSize / divisor)
///   totalTripCount = ceil(a / t)
///   lowTileSize    = (a / totalTripCount) * divisor
///   highTileSize   = lowTileSize + divisor
///   highTripCount  = a % totalTripCount
///   lowTripCount   = totalTripCount - highTripCount
///
/// Returns failure when no tile fits (`tripCount` is smaller than `divisor`,
/// which makes `totalTripCount` zero) or when the computed tiles do not add up
/// to exactly `tripCount`.
///
/// Preconditions (asserted): `op` has a static shape, `targetSize` and
/// `divisor` are strictly positive, and `dimension` is a valid loop index.
FailureOr<StaticMultiSizeSpecification>
mlir::linalg::computeStaticMultiTileSizes(LinalgOp op, unsigned dimension,
                                          int64_t targetSize, int64_t divisor) {
  assert(!op.hasDynamicShape() &&
         "cannot compute static multi-tile sizes for an op with dynamic shape");
  assert(targetSize > 0 && "target size must be strictly positive");
  assert(divisor > 0 && "divisor must be strictly positive");
  assert(dimension < op.getNumLoops() && "dimension overflow");

  StaticMultiSizeSpecification spec;
  int64_t tripCount = op.getStaticLoopRanges()[dimension];
  int64_t a = tripCount / divisor;
  int64_t t = (targetSize + divisor - 1) / divisor;
  int64_t totalTripCount = (a + t - 1) / t;

  // `a == 0` (trip count smaller than the divisor) yields a zero total trip
  // count. The division and modulo below would then divide by zero, so report
  // failure instead: no tile that is a multiple of `divisor` fits.
  if (totalTripCount == 0)
    return failure();

  spec.lowTileSize = (a / totalTripCount) * divisor;
  spec.highTileSize = spec.lowTileSize + divisor;
  spec.highTripCount = a % totalTripCount;
  spec.lowTripCount = totalTripCount - spec.highTripCount;

  // The two tile groups must cover the iteration space exactly.
  int64_t coveredSize = spec.lowTileSize * spec.lowTripCount +
                        spec.highTileSize * spec.highTripCount;
  if (coveredSize != tripCount)
    return failure();
  return spec;
}

/// Emits the IR that computes two tile sizes, both derived from multiples of
/// `divisor`, and their trip counts for loop `dimension` of `op`. This mirrors
/// the static computation, but operates on values through affine apply ops.
///
/// Returns failure if `dimension` is not a loop of `op`. When `emitAssertions`
/// is set, assertions are emitted that `targetSize` and `divisor` are strictly
/// positive and that the computed tiles cover the trip count exactly.
FailureOr<MultiSizeSpecification>
mlir::linalg::computeMultiTileSizes(OpBuilder &builder, LinalgOp op,
                                    unsigned dimension, OpFoldResult targetSize,
                                    OpFoldResult divisor, bool emitAssertions) {
  // Reject a dimension that is not one of the op's loops.
  if (dimension >= op.getNumLoops())
    return failure();

  // Everything below is computed on values rather than attributes.
  Location loc = op.getLoc();
  ImplicitLocOpBuilder b(loc, builder);
  if (emitAssertions) {
    emitIsPositiveIndexAssertion(b, targetSize);
    emitIsPositiveIndexAssertion(b, divisor);
  }
  Value targetSizeValue =
      getValueOrCreateConstantIndexOp(builder, loc, targetSize);
  Value divisorValue = getValueOrCreateConstantIndexOp(builder, loc, divisor);

  // Trip count of the requested iteration space dimension: map the flat list
  // of operand dimensions to loop order and pick the entry for `dimension`.
  SmallVector<OpFoldResult> operandDims =
      op.createFlatListOfOperandDims(b, loc);
  SmallVector<OpFoldResult> loopSizes =
      makeComposedFoldedMultiResultAffineApply(
          b, loc, op.getShapesToLoopsMap(), operandDims);
  Value tripCount =
      getValueOrCreateConstantIndexOp(b, loc, loopSizes[dimension]);

  // Helper emitting a composed affine.apply of `expr` on `operands`.
  AffineExpr s0 = b.getAffineSymbolExpr(0);
  AffineExpr s1 = b.getAffineSymbolExpr(1);
  AffineExpr s2 = b.getAffineSymbolExpr(2);
  auto emitApply = [&](AffineExpr expr,
                       ArrayRef<OpFoldResult> operands) -> Value {
    return affine::makeComposedAffineApply(b, loc, expr, operands);
  };

  // tripCount floordiv divisor.
  Value divisorUnits = emitApply(s0.floorDiv(s1), {tripCount, divisorValue});
  // ceil(targetSize / divisor).
  Value targetUnits = emitApply((s0 + s1 - 1).floorDiv(s1),
                                {targetSizeValue, divisorValue});
  // ceil(divisorUnits / targetUnits): the total number of tiles.
  Value totalTiles =
      emitApply((s0 + s1 - 1).floorDiv(s1), {divisorUnits, targetUnits});
  // (divisorUnits floordiv totalTiles) * divisor: the low tile size.
  Value lowTileSize = emitApply(s0.floorDiv(s1) * s2,
                                {divisorUnits, totalTiles, divisorValue});
  // divisorUnits mod totalTiles: the number of high tiles.
  Value highTripCount = emitApply(s0 % s1, {divisorUnits, totalTiles});
  // totalTiles - highTripCount: the number of low tiles.
  Value lowTripCount = emitApply(s0 - s1, {totalTiles, highTripCount});

  MultiSizeSpecification spec;
  spec.lowTileSize = lowTileSize;
  spec.highTileSize = emitApply(s0 + s1, {lowTileSize, divisorValue});
  spec.lowTripCount = lowTripCount;
  spec.highTripCount = highTripCount;

  // Optionally emit a check that the tiles cover the dimension exactly. For
  // example, with an iteration dimension of size 15 and a target size of 8 it
  // is impossible to find two tile sizes both divisible by 8 that fully cover
  // the original space dimension.
  if (emitAssertions) {
    AffineExpr s3 = b.getAffineSymbolExpr(3);
    Value coveredSize =
        emitApply(s0 * s1 + s2 * s3, {spec.lowTileSize, spec.lowTripCount,
                                      spec.highTileSize, spec.highTripCount});
    Value coversExactly = b.create<arith::CmpIOp>(arith::CmpIPredicate::eq,
                                                  coveredSize, tripCount);
    b.create<cf::AssertOp>(
        coversExactly,
        b.getStringAttr("could not compute dynamic multi-size tile shapes"));
  }

  return spec;
}

/// Returns true only when `tileSize`, `numThreads` and `iterationSize` are all
/// constants and the largest tile offset, `tileSize * (numThreads - 1)`, is
/// strictly smaller than `iterationSize`. Returns false if any of the three is
/// not a constant.
static bool canOmitTileOffsetInBoundsCheck(OpFoldResult tileSize,
                                           OpFoldResult numThreads,
                                           OpFoldResult iterationSize) {
  std::optional<int64_t> staticTileSize = getConstantIntValue(tileSize);
  if (!staticTileSize)
    return false;
  std::optional<int64_t> staticNumThreads = getConstantIntValue(numThreads);
  if (!staticNumThreads)
    return false;
  std::optional<int64_t> staticIterSize = getConstantIntValue(iterationSize);
  if (!staticIterSize)
    return false;

  // Offset of the tile handled by the last thread.
  int64_t maxTileOffset = *staticTileSize * (*staticNumThreads - 1);
  return maxTileOffset < *staticIterSize;
}

/// Builds the folded affine maximum over all of `vals`, using a
/// multi-dimensional identity map with one dimension per value.
static OpFoldResult buildMax(OpBuilder &b, Location loc,
                             ArrayRef<OpFoldResult> vals) {
  AffineMap identityMap =
      AffineMap::getMultiDimIdentityMap(vals.size(), loc.getContext());
  return affine::makeComposedFoldedAffineMax(b, loc, identityMap, vals);
}

/// Builds the folded affine minimum over all of `vals`, using a
/// multi-dimensional identity map with one dimension per value.
static OpFoldResult buildMin(OpBuilder &b, Location loc,
                             ArrayRef<OpFoldResult> vals) {
  AffineMap identityMap =
      AffineMap::getMultiDimIdentityMap(vals.size(), loc.getContext());
  return affine::makeComposedFoldedAffineMin(b, loc, identityMap, vals);
}

/// Appends to `tiledOffsets` and `tiledSizes` one offset and one size per
/// entry of `loopRanges`, to be used when tiling to a given number of threads.
/// All IR is created at the start of the body of `forallOp`.
///
/// A loop that has no entry in `numThreads`, or whose entry is the constant
/// zero, is not tiled: its full range offset and size are used. For every
/// other loop the tile size is taken from `nominalTileSizes` when provided,
/// and computed as `ceilDiv(size, numThreads)` otherwise; the offset is
/// `offset + threadId * tileSize`. Unless `omitTileOffsetBoundsCheck` is set
/// (or the check can be proven unnecessary from constants), the resulting size
/// is clamped from below at zero.
static void calculateTileOffsetsAndSizes(
    RewriterBase &b, Location loc, scf::ForallOp forallOp,
    ArrayRef<OpFoldResult> numThreads, SmallVector<Range> loopRanges,
    bool omitTileOffsetBoundsCheck,
    std::optional<ArrayRef<OpFoldResult>> nominalTileSizes,
    SmallVector<OpFoldResult> &tiledOffsets,
    SmallVector<OpFoldResult> &tiledSizes) {
  OpBuilder::InsertionGuard guard(b);
  b.setInsertionPointToStart(forallOp.getBody(0));

  // Induction variables and thread counts are indexed by tiled loop only, so
  // the constant-zero thread counts are filtered out.
  SmallVector<Value> threadIds = forallOp.getInductionVars();
  SmallVector<OpFoldResult> nonZeroNumThreads;
  for (OpFoldResult threadCount : numThreads)
    if (!isConstantIntValue(threadCount, 0))
      nonZeroNumThreads.push_back(threadCount);

  // Affine building blocks shared by all tiled loops.
  AffineExpr d0, d1, s0, s1, s2;
  bindDims(b.getContext(), d0, d1);
  bindSymbols(b.getContext(), s0, s1, s2);

  int64_t nLoops = loopRanges.size();
  tiledOffsets.reserve(nLoops);
  tiledSizes.reserve(nLoops);
  unsigned threadIdIdx = 0;
  for (unsigned loopIdx = 0; loopIdx < nLoops; ++loopIdx) {
    const Range &range = loopRanges[loopIdx];
    bool isTiled = loopIdx < numThreads.size() &&
                   !isConstantIntValue(numThreads[loopIdx], 0);
    // Untiled loop: take the whole domain.
    if (!isTiled) {
      tiledOffsets.push_back(range.offset);
      tiledSizes.push_back(range.size);
      continue;
    }

    // Tiled loop: compute the per-thread offset and size.
    OpFoldResult size = range.size;
    OpFoldResult offset = range.offset;
    OpFoldResult threadId = threadIds[threadIdIdx];
    OpFoldResult threadCount = nonZeroNumThreads[threadIdIdx];

    // Symbolic fixed max size per thread.
    // TODO: floor + 0/1 depending on case for better load-balancing.
    OpFoldResult tileSizePerThread =
        nominalTileSizes.has_value()
            ? (*nominalTileSizes)[loopIdx]
            : makeComposedFoldedAffineApply(
                  b, loc, s0.ceilDiv(s1),
                  ArrayRef<OpFoldResult>{size, threadCount});

    // Offset of this thread's tile: offset + threadId * tileSizePerThread.
    OpFoldResult offsetPerThread = makeComposedFoldedAffineApply(
        b, loc, d0 + d1 * s0, {offset, threadId, tileSizePerThread});

    // offset + numThreads * tileSizePerThread - size. When this does not fold
    // to zero, the tile size is bounded by what is left past the thread's
    // offset.
    OpFoldResult residualTileSize = makeComposedFoldedAffineApply(
        b, loc, d0 + d1 * s0 - s1,
        {offset, threadCount, tileSizePerThread, size});
    if (!isConstantIntValue(residualTileSize, 0)) {
      OpFoldResult sizeMinusOffsetPerThread = makeComposedFoldedAffineApply(
          b, loc, -d0 + s0, {offsetPerThread, size});
      tileSizePerThread =
          buildMin(b, loc, {sizeMinusOffsetPerThread, tileSizePerThread});
    }

    tiledOffsets.push_back(offsetPerThread);

    // TODO: if tileSizePerThread <= 0 early exit.
    bool skipLowerClamp =
        omitTileOffsetBoundsCheck ||
        canOmitTileOffsetInBoundsCheck(tileSizePerThread, threadCount, size);
    if (!skipLowerClamp)
      tileSizePerThread =
          buildMax(b, loc, {b.getIndexAttr(0), tileSizePerThread});

    tiledSizes.push_back(tileSizePerThread);
    ++threadIdIdx;
  }
}

/// Returns a vector of bools, one per entry of `numThreads`, telling whether
/// `linalgOp` can be tiled along that axis without incurring a race condition.
///
/// An axis is flagged unsafe when its thread count is either dynamic or a
/// constant greater than 1 and the corresponding iterator type is not
/// "parallel". Entries of `numThreads` beyond the number of iterator types of
/// the op have no iterator type to inspect and are reported as safe.
///
/// `ctx` is currently unused.
SmallVector<bool> safeToTileToForall(mlir::MLIRContext *ctx, LinalgOp linalgOp,
                                     ArrayRef<OpFoldResult> numThreads) {
  auto iterators = linalgOp.getIteratorTypesArray();
  SmallVector<bool> safeToTile(numThreads.size(), true);

  // Never index `iterators` past its end: `numThreads` may carry more entries
  // than the op has loops.
  size_t numAxes = numThreads.size() < iterators.size() ? numThreads.size()
                                                        : iterators.size();
  for (size_t i = 0; i != numAxes; ++i) {
    // A constant thread count of at most 1 does not distribute the axis, so
    // it is safe regardless of the iterator type.
    if (auto attr = llvm::dyn_cast_if_present<Attribute>(numThreads[i])) {
      if (cast<IntegerAttr>(attr).getValue().getSExtValue() <= 1)
        continue;
    }
    safeToTile[i] = iterators[i] == utils::IteratorType::parallel;
  }
  return safeToTile;
}

/// Rewrite a TilingInterface `op` to a tiled `scf.forall`. The tiling is
/// specified by the number of tiles/threads `numThreads` and the optional
/// nominal tile sizes `nominalTileSizes`. If `nominalTileSizes` is not
/// specified, then it is derived from `numThreads` as
/// `ceilDiv(dimSize[i], numThreads[i])`. If non-empty, the `mapping` is added
/// as an attribute to the resulting `scf.forall`. A zero entry indicates that
/// the dimension is not tiled, and can be thought of as tiling by the full
/// size of data.
/// It is the user's responsibility to ensure that `numThreads` is a valid
/// tiling specification (i.e. that only tiles parallel dimensions, e.g. in the
/// Linalg case). If the dimension is not parallelizable, a warning is issued to
/// notify the user that the generated code is not safe to parallelize. If
/// `omitTileOffsetBoundsCheck` is true, then the function will assume that
/// `tileSize[i] * (numThread[i] -1) <= dimSize[i]` holds.
static FailureOr<ForallTilingResult> tileToForallOpImpl(
    RewriterBase &b, TilingInterface op, ArrayRef<OpFoldResult> numThreads,
    std::optional<ArrayRef<OpFoldResult>> nominalTileSizes,
    std::optional<ArrayAttr> mapping, bool omitTileOffsetBoundsCheck) {
  Location loc = op->getLoc();
  OpBuilder::InsertionGuard outerGuard(b);

  // The iteration domain must be non-empty and every range must have a
  // constant stride of 1.
  SmallVector<Range> loopRanges = op.getIterationDomain(b);
  if (loopRanges.empty())
    return op->emitOpError("expected non-empty loop ranges");
  for (const Range &range : loopRanges)
    if (!isConstantIntValue(range.stride, 1))
      return op->emitOpError("only stride-1 supported atm");

  // Gather destination tensors.
  SmallVector<Value> dest;
  if (failed(tensor::getOrCreateDestinations(b, loc, op, dest)))
    return op->emitOpError("failed to get destination tensors");

  // Materialize every thread count that is not the constant zero as a value;
  // these become the upper bounds of the scf.forall.
  SmallVector<Value> materializedNonZeroNumThreads;
  for (OpFoldResult threadCount : numThreads) {
    if (isConstantIntValue(threadCount, 0))
      continue;
    materializedNonZeroNumThreads.push_back(
        getValueOrCreateConstantIndexOp(b, loc, threadCount));
  }

  // For Linalg ops, check if tiling is thread safe and print a warning for
  // every axis where it is not.
  if (auto linalgOp = dyn_cast<LinalgOp>(op.getOperation())) {
    SmallVector<bool> tilingSafety =
        safeToTileToForall(b.getContext(), linalgOp, numThreads);
    for (size_t axis = 0, numAxes = tilingSafety.size(); axis < numAxes;
         ++axis) {
      if (tilingSafety[axis])
        continue;
      op.emitWarning() << "tiling is not thread safe at axis #" << axis;
    }
  }

  // 1. Create the ForallOp. We don't use the lambda body-builder
  // version because we require the use of RewriterBase in the body, so we
  // manually move the insertion point to the body below.
  scf::ForallOp forallOp = b.create<scf::ForallOp>(
      loc, getAsOpFoldResult((materializedNonZeroNumThreads)), dest, mapping);

  // 2. Compute the per-thread tile offsets and sizes inside the ForallOp body.
  SmallVector<OpFoldResult> tiledOffsets, tiledSizes;
  calculateTileOffsetsAndSizes(b, loc, forallOp, numThreads, loopRanges,
                               omitTileOffsetBoundsCheck, nominalTileSizes,
                               tiledOffsets, tiledSizes);

  // 3. Clone the tileable op and update its destination operands to use the
  // output bbArgs of the ForallOp.
  ArrayRef<BlockArgument> destBbArgs = forallOp.getRegionIterArgs();
  Operation *tiledOp = nullptr;
  SmallVector<Value> tiledValues;
  {
    // 3.a. RAII guard, inserting within forallOp, before terminator.
    OpBuilder::InsertionGuard bodyGuard(b);
    b.setInsertionPoint(forallOp.getTerminator());
    Operation *clonedOp = b.clone(*op.getOperation());
    if (auto dpsOp = dyn_cast<DestinationStyleOpInterface>(clonedOp)) {
      for (OpOperand &init : dpsOp.getDpsInitsMutable()) {
        // Only tensor inits are swapped with the corresponding block argument
        // of the scf.forall op. Memref inits remain as is.
        if (!isa<TensorType>(init.get().getType()))
          continue;
        auto *destIt = llvm::find(dest, init.get());
        assert(destIt != dest.end() && "could not find destination tensor");
        unsigned destIdx = std::distance(dest.begin(), destIt);
        init.set(destBbArgs[destIdx]);
      }
    }

    // 4. Tile the cloned op and delete the clone.
    FailureOr<TilingResult> tilingResult =
        cast<TilingInterface>(clonedOp).getTiledImplementation(b, tiledOffsets,
                                                               tiledSizes);
    if (failed(tilingResult))
      return clonedOp->emitError("Failed to tile op: ");
    if (tilingResult->tiledOps.size() != 1) {
      return clonedOp->emitError("expected a single produced tiled op, got ")
             << tilingResult->tiledOps.size();
    }

    b.eraseOp(clonedOp);
    tiledOp = tilingResult->tiledOps.front();
    tiledValues = tilingResult->tiledValues;
  }

  // 5. Parallel insert back into the result tensor, one insertion per
  // (result number, tiled value, destination block argument) triple.
  for (auto it : llvm::zip(llvm::seq(unsigned(0), unsigned(dest.size())),
                           tiledValues, destBbArgs)) {
    unsigned resultNumber = std::get<0>(it);
    Value tiledValue = std::get<1>(it);
    Value destBbArg = std::get<2>(it);

    // 5.a. Partial subset information is inserted just before the terminator.
    OpBuilder::InsertionGuard insertGuard(b);
    b.setInsertionPoint(forallOp.getTerminator());

    SmallVector<OpFoldResult> resultOffsets, resultSizes;
    if (failed(op.getResultTilePosition(b, resultNumber, tiledOffsets,
                                        tiledSizes, resultOffsets,
                                        resultSizes)))
      return op->emitOpError("output offsets couldn't be calculated");
    SmallVector<OpFoldResult> strides(resultSizes.size(), b.getIndexAttr(1));

    // 5.b. Parallel insertions are inserted at the end of the combining
    // terminator.
    b.setInsertionPointToEnd(forallOp.getTerminator().getBody());
    b.create<tensor::ParallelInsertSliceOp>(loc, tiledValue, destBbArg,
                                            resultOffsets, resultSizes,
                                            strides);
  }
  return ForallTilingResult{forallOp, tiledOp};
}

/// Tiles `op` to an `scf.forall` driven by the thread counts `numThreads`.
/// No nominal tile sizes are supplied and the tile offset bounds check is not
/// omitted; `mapping` is forwarded unchanged.
FailureOr<ForallTilingResult>
linalg::tileToForallOp(RewriterBase &b, TilingInterface op,
                       ArrayRef<OpFoldResult> numThreads,
                       std::optional<ArrayAttr> mapping) {
  std::optional<ArrayRef<OpFoldResult>> noNominalTileSizes = std::nullopt;
  const bool omitTileOffsetBoundsCheck = false;
  return tileToForallOpImpl(b, op, numThreads, noNominalTileSizes, mapping,
                            omitTileOffsetBoundsCheck);
}

/// Tiles `op` to an `scf.forall` driven by `tileSizes`. The number of threads
/// of each loop is derived as `ceilDiv(loopSize, tileSize)`; a constant-zero
/// tile size is passed through unchanged as the thread count. `tileSizes` is
/// forwarded as the nominal tile sizes and the tile offset bounds check is
/// omitted.
FailureOr<ForallTilingResult>
linalg::tileToForallOpUsingTileSizes(RewriterBase &b, TilingInterface op,
                                     ArrayRef<OpFoldResult> tileSizes,
                                     std::optional<ArrayAttr> mapping) {
  SmallVector<Range> loopRanges = op.getIterationDomain(b);
  unsigned nLoops = loopRanges.size();

  // ceilDiv(s0, s1), applied below to (loop size, tile size).
  AffineExpr s0, s1;
  bindSymbols(b.getContext(), s0, s1);
  AffineExpr ceilDivExpr = s0.ceilDiv(s1);

  SmallVector<OpFoldResult> numThreads;
  numThreads.reserve(nLoops);
  for (const auto &it : llvm::zip(tileSizes, loopRanges)) {
    OpFoldResult tileSize = std::get<0>(it);
    const Range &loopRange = std::get<1>(it);
    // A constant-zero tile size is kept as is.
    if (isConstantIntValue(tileSize, 0)) {
      numThreads.push_back(tileSize);
      continue;
    }
    OpFoldResult numTiles = makeComposedFoldedAffineApply(
        b, op.getLoc(), ceilDivExpr, {loopRange.size, tileSize});
    numThreads.push_back(numTiles);
  }
  return tileToForallOpImpl(b, op, numThreads,
                            /*nominalTileSizes=*/tileSizes, mapping,
                            /*omitTileOffsetBoundsCheck=*/true);
}

/// Tiles `op` by `tileSizes` and wraps the tiled clone in a loop nest generated
/// through `GenerateLoopNest<LoopTy>`.
///
/// - Only the first `op.getNumLoops()` entries of `tileSizes` are used.
/// - If every used tile size is the constant 0, no loop is created: `op` is
///   simply cloned and returned with the clone's results as `tensorResults`
///   (and an empty `loops` list).
/// - Returns failure when `op.getShapesToLoopsMap()` is null.
/// - A non-empty `options.interchangeVector` permutes the loop ranges and the
///   iterator types; its inverse is applied to the induction variables inside
///   the loop body.
/// - When `options.distribution` is set, processor info is requested for the
///   leading run of parallel loops only.
///
/// The returned `TiledLinalgOp` holds the tiled op, one entry per induction
/// variable in `loops` (nullptr when the induction variable is not a block
/// argument), and the results of the first non-null loop if there is one,
/// otherwise the tensor results captured while building the loop body.
template <typename LoopTy>
static FailureOr<TiledLinalgOp>
tileLinalgOpImpl(RewriterBase &b, LinalgOp op, ArrayRef<OpFoldResult> tileSizes,
                 const LinalgTilingOptions &options) {
  // Restore the caller's insertion point on every exit path.
  OpBuilder::InsertionGuard g(b);

  auto nLoops = op.getNumLoops();
  // Initial tile sizes may be too big, only take the first nLoops.
  tileSizes = tileSizes.take_front(nLoops);

  // Nothing to tile: all (remaining) tile sizes are the constant zero. Clone
  // the op as-is and return it without creating any loop.
  if (llvm::all_of(tileSizes, [](OpFoldResult ofr) {
        return getConstantIntValue(ofr) == static_cast<int64_t>(0);
      })) {
    TiledLinalgOp tiledOp;
    tiledOp.op = cast<LinalgOp>(b.clone(*op.getOperation()));
    tiledOp.tensorResults.assign(tiledOp.op->result_begin(),
                                 tiledOp.op->result_end());
    return tiledOp;
  }

  // 1. Build the tiled loop ranges.
  SmallVector<OpFoldResult> allShapeSizes =
      op.createFlatListOfOperandDims(b, op.getLoc());
  AffineMap shapeSizesToLoopsMap = op.getShapesToLoopsMap();
  if (!shapeSizesToLoopsMap)
    return failure();

  auto [loopRanges, loopIndexToRangeIndex] = makeTiledLoopRanges(
      b, op.getLoc(), shapeSizesToLoopsMap, allShapeSizes, tileSizes);

  // Keep only the iterator types of the loops that have an entry in
  // `loopIndexToRangeIndex`.
  SmallVector<utils::IteratorType, 4> iteratorTypes;
  for (const auto &attr : enumerate(op.getIteratorTypesArray())) {
    if (loopIndexToRangeIndex.count(attr.index()))
      iteratorTypes.push_back(attr.value());
  }
  // If interchangeVector is empty, use the identity. Build the permutation map
  // otherwise.
  auto invPermutationMap =
      AffineMap::getMultiDimIdentityMap(tileSizes.size(), b.getContext());
  if (!options.interchangeVector.empty()) {
    // Based on the pruned iterations (due to zero tile size), recompute the
    // interchange vector.
    SmallVector<unsigned, 4> interchangeVector;
    interchangeVector.reserve(options.interchangeVector.size());
    for (auto pos : options.interchangeVector) {
      auto it = loopIndexToRangeIndex.find(pos);
      // Positions without a range index (pruned loops) are dropped.
      if (it == loopIndexToRangeIndex.end())
        continue;
      interchangeVector.push_back(it->second);
    }
    // Interchange vector is guaranteed to be a permutation,
    // `inversePermutation` must succeed.
    invPermutationMap = inversePermutation(
        AffineMap::getPermutationMap(interchangeVector, b.getContext()));
    assert(invPermutationMap);
    SmallVector<int64_t> permutation(interchangeVector.begin(),
                                     interchangeVector.end());
    applyPermutationToVector(loopRanges, permutation);
    applyPermutationToVector(iteratorTypes, permutation);
  }

  // Handle distribution. Create a vector of the same size of loops that are to
  // be tiled.
  SmallVector<linalg::ProcInfo> procInfo;
  if (options.distribution) {
    // Default every loop to "not distributed"; entries for the leading
    // parallel loops are overwritten below.
    procInfo.resize(
        iteratorTypes.size(),
        linalg::ProcInfo{nullptr, nullptr, linalg::DistributionMethod::None});
    // Collect loop ranges of tiled loops, loops that are parallel.
    SmallVector<Range> parallelLoopRanges;
    for (const auto &iteratorType : llvm::enumerate(iteratorTypes)) {
      // Stop at the first non-parallel iterator: only the leading parallel
      // loops are considered.
      if (!isParallelIterator(iteratorType.value()))
        break;
      parallelLoopRanges.push_back(loopRanges[iteratorType.index()]);
    }
    auto returnedProcInfo =
        options.distribution->procInfo(b, op.getLoc(), parallelLoopRanges);
    unsigned procIdIdx = 0;
    // Update the distribution information for the loops.
    for (const auto &iteratorType : llvm::enumerate(iteratorTypes)) {
      if (!isParallelIterator(iteratorType.value()))
        break;
      procInfo[iteratorType.index()] = returnedProcInfo[procIdIdx++];
    }
  }

  // 2. Create the tiled loops.
  // `res`, `ivs` and `tensorResults` are written by the body builder below and
  // read again after the loop nest has been generated.
  LinalgOp res = op;
  SmallVector<Value, 4> ivs, tensorResults;
  auto tiledLoopBodyBuilder =
      [&](OpBuilder &builder, Location loc, ValueRange localIvs,
          ValueRange operandValuesToUse) -> scf::ValueVector {
    // Record the induction variables for steps 3 and 4.
    ivs.assign(localIvs.begin(), localIvs.end());

    // When an `interchangeVector` is present, it has been applied to the
    // loop ranges and the iterator types. Apply its inverse to the
    // resulting loop `ivs` to match the op definition.
    SmallVector<Value, 4> interchangedIvs;
    if (!options.interchangeVector.empty()) {
      for (AffineExpr result : invPermutationMap.getResults())
        interchangedIvs.push_back(
            ivs[cast<AffineDimExpr>(result).getPosition()]);
    } else {
      interchangedIvs.assign(ivs.begin(), ivs.end());
    }

    // Tile the `operandValuesToUse` that either match the `op` operands
    // themselves or the tile loop arguments forwarding them.
    assert(operandValuesToUse.size() ==
               static_cast<size_t>(op->getNumOperands()) &&
           "expect the number of operands and inputs and outputs to match");
    SmallVector<Value> valuesToTile = operandValuesToUse;
    SmallVector<OpFoldResult> sizeBounds =
        makeComposedFoldedMultiResultAffineApply(b, loc, shapeSizesToLoopsMap,
                                                 allShapeSizes);
    SmallVector<Value> tiledOperands = makeTiledShapes(
        b, loc, op, valuesToTile, getAsOpFoldResult(interchangedIvs), tileSizes,
        sizeBounds,
        /*omitPartialTileCheck=*/false);

    // Clone the op onto the tiled operands and hand the values produced by
    // `insertSlicesBack` to the loop as its yielded values.
    SmallVector<Type> resultTensorTypes =
        getTensorOutputTypes(op, tiledOperands);
    res = clone(b, op, resultTensorTypes, tiledOperands);
    tensorResults =
        insertSlicesBack(builder, loc, op, tiledOperands, res->getResults());
    return scf::ValueVector(tensorResults.begin(), tensorResults.end());
  };
  GenerateLoopNest<LoopTy>::doit(b, op.getLoc(), loopRanges, op, iteratorTypes,
                                 tiledLoopBodyBuilder, procInfo);

  // 3. Transform IndexOp results w.r.t. the tiling.
  transformIndexOps(b, res, ivs, loopIndexToRangeIndex);

  // 4. Gather the newly created loops and return them with the new op.
  SmallVector<Operation *, 8> loops;
  loops.reserve(ivs.size());
  for (auto iv : ivs) {
    if (isa<BlockArgument>(iv)) {
      // The loop is the parent op of the block owning the induction variable.
      loops.push_back(cast<BlockArgument>(iv).getOwner()->getParentOp());
      assert(loops.back() && "no owner found for induction variable!");
    } else {
      // TODO: Instead of doing this, try to recover the ops used instead of the
      // loop.
      loops.push_back(nullptr);
    }
  }

  // 5. Get the tensor results from the outermost loop if available. Otherwise
  // use the previously captured `tensorResults`.
  Operation *outermostLoop = nullptr;
  // Pick the first non-null entry of `loops`; the assignment inside the
  // condition is intentional.
  for (Operation *loop : loops)
    if ((outermostLoop = loop))
      break;

  return TiledLinalgOp{
      res, loops, outermostLoop ? outermostLoop->getResults() : tensorResults};
}

/// Tiles the single reduction dimension of `op` across the threads of an
/// `scf.forall`, producing one partial reduction per thread, and then merges
/// the partial results after the loop.
///
/// \param b          Rewriter used to create and replace IR. The caller's
///                   insertion point is restored on exit.
/// \param op         Op to tile. It must also be a `DestinationStyleOpInterface`
///                   and a `LinalgOp`, have exactly one result and exactly one
///                   reduction dimension.
/// \param numThreads Thread count per loop dimension; it must have an entry
///                   for the reduction dimension. Constant-zero entries are
///                   filtered out when building the `scf.forall`.
/// \param tileSizes  Optional tile sizes. When empty, the cloned op is tiled
///                   through `TilingInterface::getTiledImplementation`. When
///                   non-empty it must have as many entries as `numThreads`,
///                   and the cloned op is tiled with `scf.for` loops that are
///                   then mapped to the forall induction variables; exactly
///                   one such loop must be produced.
/// \param mapping    Optional mapping attribute forwarded to the `scf.forall`.
///
/// \returns the initial tensors, the `scf.forall`, the tiled op and the merge
/// ops on success. On failure either a match failure is reported on `b` or an
/// error is emitted on the cloned/original op.
FailureOr<linalg::ForallReductionTilingResult> linalg::tileReductionUsingForall(
    RewriterBase &b, PartialReductionOpInterface op,
    ArrayRef<OpFoldResult> numThreads, ArrayRef<OpFoldResult> tileSizes,
    std::optional<ArrayAttr> mapping) {
  Location loc = op.getLoc();
  OpBuilder::InsertionGuard g(b);

  // Ops implementing PartialReductionOpInterface are expected to implement
  // TilingInterface.
  // TODO: proper core mechanism to tie interfaces together.
  auto tilingInterfaceOp = cast<TilingInterface>(op.getOperation());

  // Ops implementing PartialReductionOpInterface are not necessarily
  // destination-style ops, so this is checked rather than assumed.
  // TODO: proper core mechanism to tie interfaces together.
  // TODO: this function requires a pair of interfaces ..
  auto destinationStyleOp =
      dyn_cast<DestinationStyleOpInterface>(op.getOperation());
  if (!destinationStyleOp)
    return b.notifyMatchFailure(op, "not a destination style op");

  // Actually this only work for Linalg ops atm.
  auto linalgOp = dyn_cast<linalg::LinalgOp>(op.getOperation());
  if (!linalgOp)
    return b.notifyMatchFailure(op, "not a linalg op");

  SmallVector<Range> iterationDomain = tilingInterfaceOp.getIterationDomain(b);
  if (op->getNumResults() != 1)
    return b.notifyMatchFailure(
        op, "don't support ops with multiple results for now");

  SmallVector<unsigned> redDims;
  linalgOp.getReductionDims(redDims);
  if (redDims.size() != 1)
    return b.notifyMatchFailure(
        op, "only support ops with one reduction dimension.");
  if (!tileSizes.empty() && tileSizes.size() != numThreads.size())
    return b.notifyMatchFailure(op, "if tile sizes are present it must have as "
                                    "many elements as number of threads");
  int reductionDim = static_cast<int>(redDims.front());

  if (redDims.front() >= numThreads.size())
    return b.notifyMatchFailure(
        op, "reduction dimension must be mapped to threads");

  // 1. Create the initial tensor value.
  FailureOr<SmallVector<Value>> maybeInitTensors =
      op.generateInitialTensorForPartialReduction(b, loc, numThreads,
                                                  reductionDim);
  if (failed(maybeInitTensors))
    return b.notifyMatchFailure(
        op, "Failed to create inital tensors for partial reduction");
  SmallVector<Value> &initTensors = maybeInitTensors.value();

  // Gather destination tensors.
  SmallVector<Value> dest;
  if (failed(tensor::getOrCreateDestinations(b, loc, op, dest)))
    return b.notifyMatchFailure(op, "failed to get destination tensors");

  Operation *tiledOp = nullptr;

  // Only thread counts that are not the constant zero become forall bounds.
  SmallVector<OpFoldResult> nonZeroNumThreads =
      llvm::to_vector(llvm::make_filter_range(numThreads, [](OpFoldResult ofr) {
        return !isConstantIntValue(ofr, 0);
      }));
  SmallVector<Value> materializedNonZeroNumThreads =
      getValueOrCreateConstantIndexOp(b, loc, nonZeroNumThreads);

  // 2. Create the ForallOp with an empty region.
  scf::ForallOp forallOp = b.create<scf::ForallOp>(
      loc, getAsOpFoldResult(materializedNonZeroNumThreads), initTensors,
      mapping);

  // 3. Calculate the tile offsets and sizes for the subsequent loop that will
  // be nested under `forallOp`.
  SmallVector<OpFoldResult> tiledOffsets, tiledSizes;
  calculateTileOffsetsAndSizes(b, loc, forallOp, numThreads, iterationDomain,
                               /*omitTileOffsetBoundsCheck =*/false,
                               /*nominalTileSizes=*/std::nullopt, tiledOffsets,
                               tiledSizes);

  // 4. Clone the tileable op and update its destination operands to use the
  // output bbArgs of the ForallOp.
  SmallVector<Value> tilingResults;
  ArrayRef<BlockArgument> destBbArgs = forallOp.getRegionIterArgs();
  {
    // 4.a. RAII guard, inserting within forallOp, before terminator.
    OpBuilder::InsertionGuard g(b);
    b.setInsertionPoint(forallOp.getTerminator());

    SmallVector<Value> tiledDpsInitOperands;
    for (Value initOperand : destinationStyleOp.getDpsInits()) {
      auto *it = llvm::find(dest, initOperand);
      assert(it != dest.end() && "dest operand not found in dest");
      unsigned destNum = std::distance(dest.begin(), it);
      SmallVector<OpFoldResult> strides(numThreads.size(), b.getIndexAttr(1));
      SmallVector<OpFoldResult> outOffsets(numThreads.size(),
                                           b.getIndexAttr(0));
      // Along the reduction dimension each thread works on a unit-sized slice
      // located at the first forall induction variable.
      SmallVector<OpFoldResult> sizes = tiledSizes;
      sizes[reductionDim] = b.getIndexAttr(1);
      outOffsets[reductionDim] = forallOp.getInductionVars()[0];
      // TODO: use SubsetExtractOpInterface once it is available.
      tiledDpsInitOperands.push_back(b.create<tensor::ExtractSliceOp>(
          loc, cast<RankedTensorType>(initOperand.getType()),
          destBbArgs[destNum], outOffsets, sizes, strides));
    }

    // 4.b. Clone the op and update init operands.
    // We cannot use a IRMapping here because it can replace
    // different OpOperands with the same value.
    Operation *clonedOp = b.clone(*op.getOperation());
    b.modifyOpInPlace(clonedOp, [&]() {
      for (auto [initOperandPtr, tiledInitValue] : llvm::zip_equal(
               cast<DestinationStyleOpInterface>(clonedOp).getDpsInitsMutable(),
               tiledDpsInitOperands)) {
        initOperandPtr.set(tiledInitValue);
      }
    });

    // 5. Tile the cloned op and delete the clone.
    if (tileSizes.empty()) {
      FailureOr<TilingResult> tilingResult =
          cast<TilingInterface>(clonedOp).getTiledImplementation(
              b, tiledOffsets, tiledSizes);
      if (failed(tilingResult))
        return clonedOp->emitError("Failed to tile op: ");
      if (tilingResult->tiledOps.size() != 1) {
        return clonedOp->emitError("expected a single produced tiled op, got ")
               << tilingResult->tiledOps.size();
      }
      tiledOp = tilingResult->tiledOps.front();
      tilingResults = tilingResult->tiledValues;
    } else {
      LinalgTilingOptions options;
      FailureOr<TiledLinalgOp> maybeTiled = tileLinalgOpImpl<scf::ForOp>(
          b, cast<LinalgOp>(clonedOp), tileSizes, options);
      if (failed(maybeTiled))
        return b.notifyMatchFailure(op, "failed tileLinalgOpImpl");

      // Validate the loop count before accessing `loops`: `tileLinalgOpImpl`
      // returns no loop at all when every tile size is zero, in which case
      // `loops.back()` would be an out-of-bounds access.
      if (maybeTiled->loops.size() != 1) {
        return clonedOp->emitError("expected a single produced loop");
      }

      SmallVector<Value> ids = forallOp.getInductionVars();
      mapLoopToProcessorIds(cast<scf::ForOp>(maybeTiled->loops.front()), ids,
                            materializedNonZeroNumThreads);
      tiledOp = maybeTiled->op;
      tilingResults = maybeTiled->loops.front()->getResults();
    }

    b.eraseOp(clonedOp);
  }

  // 6. Insert the partial reductions back into a new tensor.
  for (auto [index, result, bbArg] : llvm::zip(
           llvm::seq<unsigned>(0, dest.size()), tilingResults, destBbArgs)) {
    // 6.a. Partial subset information is inserted just before the terminator.
    OpBuilder::InsertionGuard g(b);
    b.setInsertionPoint(forallOp.getTerminator());

    SmallVector<OpFoldResult> resultOffsets, resultSizes;
    if (failed(tilingInterfaceOp.getResultTilePosition(
            b, index, tiledOffsets, tiledSizes, resultOffsets, resultSizes)))
      return op->emitOpError("output offsets couldn't be calculated");
    // Re-insert the reduction dimension into the result offsets/sizes: it is
    // addressed by the first forall induction variable with unit size, while
    // the other dimensions consume `resultOffsets`/`resultSizes` in order.
    SmallVector<OpFoldResult> resultOffsetsRank, resultSizesRank;
    int64_t offIdx = 0;
    int64_t sizeIdx = 0;
    for (int64_t i = 0, e = numThreads.size(); i < e; ++i) {
      if (i == reductionDim) {
        resultOffsetsRank.push_back(forallOp.getInductionVars()[0]);
        resultSizesRank.push_back(b.getIndexAttr(1));
        continue;
      }
      resultOffsetsRank.push_back(resultOffsets[offIdx++]);
      resultSizesRank.push_back(resultSizes[sizeIdx++]);
    }
    SmallVector<OpFoldResult> strides(resultSizesRank.size(),
                                      b.getIndexAttr(1));

    // 6.b. Parallel insertions are inserted at the end of the combining
    // terminator.
    b.setInsertionPointToEnd(forallOp.getTerminator().getBody());
    b.create<tensor::ParallelInsertSliceOp>(
        loc, result, bbArg, resultOffsetsRank, resultSizesRank, strides);
  }

  // 7. Merge the partial reductions.
  b.setInsertionPointAfter(forallOp);
  FailureOr<MergeResult> mergeResult =
      op.mergeReductions(b, loc, forallOp->getResults(), reductionDim);
  if (failed(mergeResult)) {
    return failure();
  }
  b.replaceOp(op, mergeResult->replacements);

  // 8. Return.
  ForallReductionTilingResult results;
  results.initialValues = initTensors;
  results.loops = forallOp;
  results.parallelTiledOps.push_back(tiledOp);
  results.mergeOps.append(mergeResult->mergeOps);
  return results;
}

/// Options-driven overload: computes the tile sizes through
/// `options.tileSizeComputationFunction` (with the builder positioned at `op`),
/// pads them with zeros up to the number of loops of `op`, and forwards to the
/// tile-size based `tileLinalgOpImpl<LoopTy>`. Fails if no tile size
/// computation function is set.
template <typename LoopTy>
static FailureOr<TiledLinalgOp>
tileLinalgOpImpl(RewriterBase &b, LinalgOp op,
                 const LinalgTilingOptions &options) {
  if (!options.tileSizeComputationFunction)
    return failure();

  // Anything the computation function creates goes right before `op`; the
  // caller's insertion point is restored on exit.
  OpBuilder::InsertionGuard guard(b);
  b.setInsertionPoint(op);

  // Enforce the convention that "tiling by zero" skips tiling a particular
  // dimension. This convention is significantly simpler to handle instead of
  // adjusting affine maps to account for missing dimensions.
  const auto numLoops = op.getNumLoops();
  SmallVector<OpFoldResult> paddedTileSizes =
      getAsOpFoldResult(options.tileSizeComputationFunction(b, op));
  while (paddedTileSizes.size() < numLoops)
    paddedTileSizes.push_back(b.getIndexAttr(0));

  return tileLinalgOpImpl<LoopTy>(b, op, paddedTileSizes, options);
}

/// Dispatches tiling of `op` on `options.loopType`: `Loops` tiles with
/// `scf::ForOp`, `ParallelLoops` with `scf::ParallelOp`. Any other loop type
/// yields failure.
FailureOr<TiledLinalgOp>
mlir::linalg::tileLinalgOp(RewriterBase &b, LinalgOp op,
                           const LinalgTilingOptions &options) {
  const LinalgTilingLoopType requestedLoopType = options.loopType;
  if (requestedLoopType == LinalgTilingLoopType::Loops)
    return tileLinalgOpImpl<scf::ForOp>(b, op, options);
  if (requestedLoopType == LinalgTilingLoopType::ParallelLoops)
    return tileLinalgOpImpl<scf::ParallelOp>(b, op, options);
  // Unsupported loop type.
  return failure();
}

namespace {
/// Type-list helper: `CanonicalizationPatternList<Ops...>::insert(patterns)`
/// calls `Op::getCanonicalizationPatterns(patterns, patterns.getContext())`
/// for every `Op` in the list, in left-to-right order. An empty list is a
/// no-op.
template <typename... OpTypes>
class CanonicalizationPatternList {
public:
  static void insert(RewritePatternSet &patterns) {
    // Comma fold: operands are evaluated strictly left to right.
    (OpTypes::getCanonicalizationPatterns(patterns, patterns.getContext()),
     ...);
  }
};
} // namespace

/// Builds a fresh pattern set on `ctx`, fills it through
/// `populateLinalgTilingCanonicalizationPatterns`, and returns it.
RewritePatternSet
mlir::linalg::getLinalgTilingCanonicalizationPatterns(MLIRContext *ctx) {
  RewritePatternSet canonicalizationPatterns(ctx);
  populateLinalgTilingCanonicalizationPatterns(canonicalizationPatterns);
  return canonicalizationPatterns;
}

/// Adds to `patterns`, in this order: the canonicalization patterns of a fixed
/// list of affine, arith, memref, scf and tensor ops, the canonicalization
/// patterns of the loaded Linalg dialect, and the canonicalization patterns of
/// every op listed in `LinalgStructuredOps.cpp.inc`.
void mlir::linalg::populateLinalgTilingCanonicalizationPatterns(
    RewritePatternSet &patterns) {
  // Fixed ops; the helper registers them in list order.
  CanonicalizationPatternList<
      affine::AffineApplyOp, affine::AffineForOp, affine::AffineMinOp,
      affine::AffineMaxOp, arith::ConstantIndexOp,

      memref::SubViewOp, memref::ViewOp,

      scf::ForOp, scf::ParallelOp,

      tensor::CastOp, tensor::EmptyOp, tensor::ExtractSliceOp,
      tensor::InsertSliceOp, tensor::PadOp>::insert(patterns);

  // Dialect-level canonicalization patterns.
  MLIRContext *context = patterns.getContext();
  context->getLoadedDialect<LinalgDialect>()->getCanonicalizationPatterns(
      patterns);

  // All generated Linalg structured ops.
  CanonicalizationPatternList<
#define GET_OP_LIST
#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
      >::insert(patterns);
}