Diffstat (limited to 'llvm/test/CodeGen/LoongArch')
-rw-r--r--  llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll  |  63
-rw-r--r--  llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll | 200
-rw-r--r--  llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll   |  63
-rw-r--r--  llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll  |  88
-rw-r--r--  llvm/test/CodeGen/LoongArch/sink-fold-addi.ll   | 758
5 files changed, 900 insertions, 272 deletions
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll
index ba2118f..b3155c9 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll
@@ -106,6 +106,69 @@ define void @ctlz_v4i64(ptr %src, ptr %dst) nounwind {
ret void
}

+define void @not_ctlz_v32i8(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a0, 0
+; CHECK-NEXT: xvxori.b $xr0, $xr0, 255
+; CHECK-NEXT: xvclz.b $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <32 x i8>, ptr %src
+ %neg = xor <32 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %neg, i1 false)
+ store <32 x i8> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v16i16(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a0, 0
+; CHECK-NEXT: xvrepli.b $xr1, -1
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvclz.h $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <16 x i16>, ptr %src
+ %neg = xor <16 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %neg, i1 false)
+ store <16 x i16> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v8i32(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a0, 0
+; CHECK-NEXT: xvrepli.b $xr1, -1
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvclz.w $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <8 x i32>, ptr %src
+ %neg = xor <8 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %neg, i1 false)
+ store <8 x i32> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v4i64(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a0, 0
+; CHECK-NEXT: xvrepli.b $xr1, -1
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvclz.d $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <4 x i64>, ptr %src
+ %neg = xor <4 x i64> %v, <i64 -1, i64 -1, i64 -1, i64 -1>
+ %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %neg, i1 false)
+ store <4 x i64> %res, ptr %dst
+ ret void
+}
+
declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>)
declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>)
declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
diff --git a/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll
index 79407c3..fa5f27e 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll
@@ -7,38 +7,8 @@ define void @ceil_v8f32(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: ceil_v8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.s $vr1, $vr1
-; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4
-; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrp.s $vr2, $vr2
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 6
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 48
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.s $vr1, $vr1
-; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0
-; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0
-; CHECK-NEXT: vfrintrp.s $vr3, $vr3
-; CHECK-NEXT: vextrins.w $vr3, $vr1, 16
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr3, $vr1, 32
-; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
-; CHECK-NEXT: vfrintrp.s $vr0, $vr0
-; CHECK-NEXT: vextrins.w $vr3, $vr0, 48
-; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2
-; CHECK-NEXT: xvst $xr3, $a0, 0
+; CHECK-NEXT: xvfrintrp.s $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <8 x float>, ptr %a0
@@ -52,21 +22,7 @@ define void @ceil_v4f64(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: ceil_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.d $vr1, $vr1
-; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2
-; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrp.d $vr2, $vr2
-; CHECK-NEXT: vextrins.d $vr2, $vr1, 16
-; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.d $vr1, $vr1
-; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT: vfrintrp.d $vr0, $vr0
-; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
-; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2
+; CHECK-NEXT: xvfrintrp.d $xr0, $xr0
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -81,38 +37,8 @@ define void @floor_v8f32(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: floor_v8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.s $vr1, $vr1
-; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4
-; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrm.s $vr2, $vr2
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 6
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 48
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.s $vr1, $vr1
-; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0
-; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0
-; CHECK-NEXT: vfrintrm.s $vr3, $vr3
-; CHECK-NEXT: vextrins.w $vr3, $vr1, 16
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr3, $vr1, 32
-; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
-; CHECK-NEXT: vfrintrm.s $vr0, $vr0
-; CHECK-NEXT: vextrins.w $vr3, $vr0, 48
-; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2
-; CHECK-NEXT: xvst $xr3, $a0, 0
+; CHECK-NEXT: xvfrintrm.s $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <8 x float>, ptr %a0
@@ -126,21 +52,7 @@ define void @floor_v4f64(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: floor_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.d $vr1, $vr1
-; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2
-; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrm.d $vr2, $vr2
-; CHECK-NEXT: vextrins.d $vr2, $vr1, 16
-; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.d $vr1, $vr1
-; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT: vfrintrm.d $vr0, $vr0
-; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
-; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2
+; CHECK-NEXT: xvfrintrm.d $xr0, $xr0
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -155,38 +67,8 @@ define void @trunc_v8f32(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: trunc_v8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.s $vr1, $vr1
-; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4
-; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrz.s $vr2, $vr2
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 6
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 48
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.s $vr1, $vr1
-; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0
-; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0
-; CHECK-NEXT: vfrintrz.s $vr3, $vr3
-; CHECK-NEXT: vextrins.w $vr3, $vr1, 16
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr3, $vr1, 32
-; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
-; CHECK-NEXT: vfrintrz.s $vr0, $vr0
-; CHECK-NEXT: vextrins.w $vr3, $vr0, 48
-; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2
-; CHECK-NEXT: xvst $xr3, $a0, 0
+; CHECK-NEXT: xvfrintrz.s $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <8 x float>, ptr %a0
@@ -200,21 +82,7 @@ define void @trunc_v4f64(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: trunc_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.d $vr1, $vr1
-; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2
-; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrz.d $vr2, $vr2
-; CHECK-NEXT: vextrins.d $vr2, $vr1, 16
-; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.d $vr1, $vr1
-; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT: vfrintrz.d $vr0, $vr0
-; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
-; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2
+; CHECK-NEXT: xvfrintrz.d $xr0, $xr0
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -229,38 +97,8 @@ define void @roundeven_v8f32(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: roundeven_v8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.s $vr1, $vr1
-; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4
-; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrne.s $vr2, $vr2
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 6
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 48
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.s $vr1, $vr1
-; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0
-; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0
-; CHECK-NEXT: vfrintrne.s $vr3, $vr3
-; CHECK-NEXT: vextrins.w $vr3, $vr1, 16
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr3, $vr1, 32
-; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
-; CHECK-NEXT: vfrintrne.s $vr0, $vr0
-; CHECK-NEXT: vextrins.w $vr3, $vr0, 48
-; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2
-; CHECK-NEXT: xvst $xr3, $a0, 0
+; CHECK-NEXT: xvfrintrne.s $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <8 x float>, ptr %a0
@@ -274,21 +112,7 @@ define void @roundeven_v4f64(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: roundeven_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.d $vr1, $vr1
-; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2
-; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrne.d $vr2, $vr2
-; CHECK-NEXT: vextrins.d $vr2, $vr1, 16
-; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.d $vr1, $vr1
-; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT: vfrintrne.d $vr0, $vr0
-; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
-; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2
+; CHECK-NEXT: xvfrintrne.d $xr0, $xr0
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll
index a9a38e8..6ac7d51 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll
@@ -106,6 +106,69 @@ define void @ctlz_v2i64(ptr %src, ptr %dst) nounwind {
ret void
}

+define void @not_ctlz_v16i8(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a0, 0
+; CHECK-NEXT: vxori.b $vr0, $vr0, 255
+; CHECK-NEXT: vclz.b $vr0, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <16 x i8>, ptr %src
+ %neg = xor <16 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %neg, i1 false)
+ store <16 x i8> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v8i16(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, -1
+; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT: vclz.h $vr0, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <8 x i16>, ptr %src
+ %neg = xor <8 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %neg, i1 false)
+ store <8 x i16> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v4i32(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, -1
+; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT: vclz.w $vr0, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <4 x i32>, ptr %src
+ %neg = xor <4 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %neg, i1 false)
+ store <4 x i32> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v2i64(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, -1
+; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT: vclz.d $vr0, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <2 x i64>, ptr %src
+ %neg = xor <2 x i64> %v, <i64 -1, i64 -1>
+ %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %neg, i1 false)
+ store <2 x i64> %res, ptr %dst
+ ret void
+}
+
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll
index 1ca6290..cb01ac0 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll
@@ -7,22 +7,8 @@ define void @ceil_v4f32(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: ceil_v4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.s $vr1, $vr1
-; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0
-; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrp.s $vr2, $vr2
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
; CHECK-NEXT: vfrintrp.s $vr0, $vr0
-; CHECK-NEXT: vextrins.w $vr2, $vr0, 48
-; CHECK-NEXT: vst $vr2, $a0, 0
+; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <4 x float>, ptr %a0
@@ -36,13 +22,7 @@ define void @ceil_v2f64(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: ceil_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.d $vr1, $vr1
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
; CHECK-NEXT: vfrintrp.d $vr0, $vr0
-; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -57,22 +37,8 @@ define void @floor_v4f32(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: floor_v4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.s $vr1, $vr1
-; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0
-; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrm.s $vr2, $vr2
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
; CHECK-NEXT: vfrintrm.s $vr0, $vr0
-; CHECK-NEXT: vextrins.w $vr2, $vr0, 48
-; CHECK-NEXT: vst $vr2, $a0, 0
+; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <4 x float>, ptr %a0
@@ -86,13 +52,7 @@ define void @floor_v2f64(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: floor_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.d $vr1, $vr1
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
; CHECK-NEXT: vfrintrm.d $vr0, $vr0
-; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -107,22 +67,8 @@ define void @trunc_v4f32(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: trunc_v4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.s $vr1, $vr1
-; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0
-; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrz.s $vr2, $vr2
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
; CHECK-NEXT: vfrintrz.s $vr0, $vr0
-; CHECK-NEXT: vextrins.w $vr2, $vr0, 48
-; CHECK-NEXT: vst $vr2, $a0, 0
+; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <4 x float>, ptr %a0
@@ -136,13 +82,7 @@ define void @trunc_v2f64(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: trunc_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.d $vr1, $vr1
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
; CHECK-NEXT: vfrintrz.d $vr0, $vr0
-; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -157,22 +97,8 @@ define void @roundeven_v4f32(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: roundeven_v4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.s $vr1, $vr1
-; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0
-; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrne.s $vr2, $vr2
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
; CHECK-NEXT: vfrintrne.s $vr0, $vr0
-; CHECK-NEXT: vextrins.w $vr2, $vr0, 48
-; CHECK-NEXT: vst $vr2, $a0, 0
+; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <4 x float>, ptr %a0
@@ -186,13 +112,7 @@ define void @roundeven_v2f64(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: roundeven_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.d $vr1, $vr1
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
; CHECK-NEXT: vfrintrne.d $vr0, $vr0
-; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
new file mode 100644
index 0000000..9a806a1
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
@@ -0,0 +1,758 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx --verify-machineinstrs < %s \
+; RUN: | FileCheck --check-prefix=LA32 %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx --verify-machineinstrs < %s \
+; RUN: | FileCheck --check-prefix=LA64 %s
+
+%struct.S = type { i64, i64, i8 }
+%struct.F = type { float, double, float }
+%struct.V = type { <4 x i32>, <4 x i32>, <16 x i16> }
+
+define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_i64:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -48
+; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s5, $sp, 16 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s6, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a1, $a0, 4
+; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 8
+; LA32-NEXT: bnez $a1, .LBB0_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: move $s5, $zero
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s6, $zero
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB0_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: ld.w $a0, $s2, 4
+; LA32-NEXT: ld.w $a1, $s2, 0
+; LA32-NEXT: add.w $a0, $a0, $s6
+; LA32-NEXT: add.w $s3, $a1, $s3
+; LA32-NEXT: sltu $a1, $s3, $a1
+; LA32-NEXT: addi.w $s4, $s4, 1
+; LA32-NEXT: sltui $a2, $s4, 1
+; LA32-NEXT: add.w $s5, $s5, $a2
+; LA32-NEXT: xor $a2, $s4, $s1
+; LA32-NEXT: xor $a3, $s5, $s0
+; LA32-NEXT: or $a2, $a2, $a3
+; LA32-NEXT: add.w $s6, $a0, $a1
+; LA32-NEXT: bnez $a2, .LBB0_2
+; LA32-NEXT: b .LBB0_4
+; LA32-NEXT: .LBB0_3:
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s6, $zero
+; LA32-NEXT: .LBB0_4: # %for.cond.cleanup
+; LA32-NEXT: st.w $s3, $s2, 0
+; LA32-NEXT: st.w $s6, $s2, 4
+; LA32-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 48
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_i64:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -48
+; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: slli.d $a1, $a0, 4
+; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 8
+; LA64-NEXT: blez $s0, .LBB0_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: move $s2, $zero
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB0_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: ld.d $a0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: add.d $s2, $a0, $s2
+; LA64-NEXT: bnez $s0, .LBB0_2
+; LA64-NEXT: b .LBB0_4
+; LA64-NEXT: .LBB0_3:
+; LA64-NEXT: move $s2, $zero
+; LA64-NEXT: .LBB0_4: # %for.cond.cleanup
+; LA64-NEXT: st.d $s2, $s1, 0
+; LA64-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 48
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 1
+ %cmp4 = icmp sgt i64 %n, 0
+ br i1 %cmp4, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %s.05 = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ call void @f(ptr %a)
+ %0 = load i64, ptr %y
+ %add = add nsw i64 %0, %s.05
+ %inc = add nuw nsw i64 %i.06, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %s.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ store i64 %s.0.lcssa, ptr %y
+ ret void
+}
+
+define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_f32:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -48
+; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a1, $a0, 4
+; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 16
+; LA32-NEXT: bnez $a1, .LBB1_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: movgr2fr.w $fs0, $zero
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB1_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: fld.s $fa0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: fadd.s $fs0, $fa0, $fs0
+; LA32-NEXT: bnez $a0, .LBB1_2
+; LA32-NEXT: b .LBB1_4
+; LA32-NEXT: .LBB1_3:
+; LA32-NEXT: movgr2fr.w $fs0, $zero
+; LA32-NEXT: .LBB1_4: # %for.cond.cleanup
+; LA32-NEXT: fst.s $fs0, $s2, 0
+; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload
+; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 48
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_f32:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -48
+; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: slli.d $a1, $a0, 4
+; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 16
+; LA64-NEXT: blez $s0, .LBB1_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: movgr2fr.w $fs0, $zero
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB1_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: fld.s $fa0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: fadd.s $fs0, $fa0, $fs0
+; LA64-NEXT: bnez $s0, .LBB1_2
+; LA64-NEXT: b .LBB1_4
+; LA64-NEXT: .LBB1_3:
+; LA64-NEXT: movgr2fr.w $fs0, $zero
+; LA64-NEXT: .LBB1_4: # %for.cond.cleanup
+; LA64-NEXT: fst.s $fs0, $s1, 0
+; LA64-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 48
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 2
+ %cmp4 = icmp sgt i64 %n, 0
+ br i1 %cmp4, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %s.05 = phi float [ 0.0, %entry ], [ %add, %for.body ]
+ call void @f(ptr %a)
+ %0 = load float, ptr %y
+ %add = fadd float %0, %s.05
+ %inc = add nuw nsw i64 %i.06, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %s.0.lcssa = phi float [ 0.0, %entry ], [ %add, %for.body ]
+ store float %s.0.lcssa, ptr %y
+ ret void
+}
+
+define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_v4i32:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -48
+; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a0, $a0, 6
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 16
+; LA32-NEXT: bnez $a1, .LBB2_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: vrepli.b $vr0, 0
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB2_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: vld $vr0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT: vadd.w $vr1, $vr0, $vr1
+; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT: bnez $a0, .LBB2_2
+; LA32-NEXT: b .LBB2_4
+; LA32-NEXT: .LBB2_3:
+; LA32-NEXT: vrepli.b $vr0, 0
+; LA32-NEXT: .LBB2_4: # %for.cond.cleanup
+; LA32-NEXT: vst $vr0, $s2, 0
+; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 48
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_v4i32:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -48
+; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT: slli.d $a0, $a0, 6
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 16
+; LA64-NEXT: blez $a1, .LBB2_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: vrepli.b $vr0, 0
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB2_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: vld $vr0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT: vadd.w $vr1, $vr0, $vr1
+; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT: bnez $s0, .LBB2_2
+; LA64-NEXT: b .LBB2_4
+; LA64-NEXT: .LBB2_3:
+; LA64-NEXT: vrepli.b $vr0, 0
+; LA64-NEXT: .LBB2_4: # %for.cond.cleanup
+; LA64-NEXT: vst $vr0, $s1, 0
+; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 48
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 1
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %sum.0 = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ call void @f(ptr %a)
+ %v = load <4 x i32>, ptr %y
+ %addv = add <4 x i32> %v, %sum.0
+ %inc = add nuw nsw i64 %i.0, 1
+ %exitcond = icmp eq i64 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.lcssa = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ store <4 x i32> %sum.lcssa, ptr %y
+ ret void
+}
+
+define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_v16i16:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -80
+; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 68 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 64 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 60 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 56 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 52 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a0, $a0, 6
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 32
+; LA32-NEXT: bnez $a1, .LBB3_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: xvrepli.b $xr0, 0
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB3_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: xvld $xr0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT: xvadd.h $xr1, $xr0, $xr1
+; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT: bnez $a0, .LBB3_2
+; LA32-NEXT: b .LBB3_4
+; LA32-NEXT: .LBB3_3:
+; LA32-NEXT: xvrepli.b $xr0, 0
+; LA32-NEXT: .LBB3_4: # %for.cond.cleanup
+; LA32-NEXT: xvst $xr0, $s2, 0
+; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 64 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 68 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 80
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_v16i16:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -80
+; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill
+; LA64-NEXT: slli.d $a0, $a0, 6
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 32
+; LA64-NEXT: blez $a1, .LBB3_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: xvrepli.b $xr0, 0
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB3_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: xvld $xr0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT: xvadd.h $xr1, $xr0, $xr1
+; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT: bnez $s0, .LBB3_2
+; LA64-NEXT: b .LBB3_4
+; LA64-NEXT: .LBB3_3:
+; LA64-NEXT: xvrepli.b $xr0, 0
+; LA64-NEXT: .LBB3_4: # %for.cond.cleanup
+; LA64-NEXT: xvst $xr0, $s1, 0
+; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 80
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 2
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %sum.0 = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ call void @f(ptr %a)
+ %v = load <16 x i16>, ptr %y
+ %addv = add <16 x i16> %v, %sum.0
+ %inc = add nuw nsw i64 %i.0, 1
+ %exitcond = icmp eq i64 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.lcssa = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ store <16 x i16> %sum.lcssa, ptr %y
+ ret void
+}
+
+define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_extracti8:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -48
+; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a1, $a0, 4
+; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 16
+; LA32-NEXT: bnez $a1, .LBB4_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: vrepli.b $vr0, 0
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB4_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: vldrepl.b $vr0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT: vadd.b $vr1, $vr0, $vr1
+; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT: bnez $a0, .LBB4_2
+; LA32-NEXT: b .LBB4_4
+; LA32-NEXT: .LBB4_3:
+; LA32-NEXT: vrepli.b $vr0, 0
+; LA32-NEXT: .LBB4_4: # %for.cond.cleanup
+; LA32-NEXT: vstelm.b $vr0, $s2, 0, 1
+; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 48
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_extracti8:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -48
+; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: slli.d $a1, $a0, 4
+; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 16
+; LA64-NEXT: blez $s0, .LBB4_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: vrepli.b $vr0, 0
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB4_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: vldrepl.b $vr0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT: vadd.b $vr1, $vr0, $vr1
+; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT: bnez $s0, .LBB4_2
+; LA64-NEXT: b .LBB4_4
+; LA64-NEXT: .LBB4_3:
+; LA64-NEXT: vrepli.b $vr0, 0
+; LA64-NEXT: .LBB4_4: # %for.cond.cleanup
+; LA64-NEXT: vstelm.b $vr0, $s1, 0, 1
+; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 48
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 2
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %sum.0 = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ call void @f(ptr %a)
+ %e = load i8, ptr %y
+ %ins0 = insertelement <16 x i8> poison, i8 %e, i32 0
+ %v = shufflevector <16 x i8> %ins0, <16 x i8> poison, <16 x i32> zeroinitializer
+ %addv = add <16 x i8> %v, %sum.0
+ %inc = add nuw nsw i64 %i.0, 1
+ %exitcond = icmp eq i64 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.lcssa = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ %res = extractelement <16 x i8> %sum.lcssa, i32 1
+ store i8 %res, ptr %y
+ ret void
+}
+
+define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_extractf64:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -80
+; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 68 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 64 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 60 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 56 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 52 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a1, $a0, 4
+; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 8
+; LA32-NEXT: bnez $a1, .LBB5_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: xvrepli.b $xr0, 0
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB5_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: xvldrepl.d $xr0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT: xvfadd.d $xr1, $xr0, $xr1
+; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT: bnez $a0, .LBB5_2
+; LA32-NEXT: b .LBB5_4
+; LA32-NEXT: .LBB5_3:
+; LA32-NEXT: xvrepli.b $xr0, 0
+; LA32-NEXT: .LBB5_4: # %for.cond.cleanup
+; LA32-NEXT: xvstelm.d $xr0, $s2, 0, 1
+; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 64 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 68 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 80
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_extractf64:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -80
+; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: slli.d $a1, $a0, 4
+; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 8
+; LA64-NEXT: blez $s0, .LBB5_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: xvrepli.b $xr0, 0
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB5_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: xvldrepl.d $xr0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT: xvfadd.d $xr1, $xr0, $xr1
+; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT: bnez $s0, .LBB5_2
+; LA64-NEXT: b .LBB5_4
+; LA64-NEXT: .LBB5_3:
+; LA64-NEXT: xvrepli.b $xr0, 0
+; LA64-NEXT: .LBB5_4: # %for.cond.cleanup
+; LA64-NEXT: xvstelm.d $xr0, $s1, 0, 1
+; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 80
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 1
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %sum.0 = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ call void @f(ptr %a)
+ %e = load double, ptr %y
+ %ins0 = insertelement <4 x double> poison, double %e, i32 0
+ %v = shufflevector <4 x double> %ins0, <4 x double> poison, <4 x i32> zeroinitializer
+ %addv = fadd <4 x double> %v, %sum.0
+ %inc = add nuw nsw i64 %i.0, 1
+ %exitcond = icmp eq i64 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.lcssa = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ %res = extractelement <4 x double> %sum.lcssa, i32 1
+ store double %res, ptr %y
+ ret void
+}
+
+declare void @f(ptr)