author    Diana Picus <Diana-Magda.Picus@amd.com>  2025-09-26 10:03:13 +0200
committer GitHub <noreply@github.com>  2025-09-26 10:03:13 +0200
commit    b8375c5824dcb2bf6f1a2b2cd6824128fa66932a (patch)
tree      7e2a954d77bd7675a775761f8b27c0bfdeb1489e
parent    82d978a5b7e400289323bf75987ac9bcf29ef668 (diff)
parent    3257dc35fe9ed872788e90c948cb4bb593b8fa05 (diff)
Merge branch 'main' into users/rovka/remove-dvgpr-target-features
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp                        |  2
 llvm/lib/Target/ARM/ARMISelLowering.cpp                            |  8
 llvm/lib/Target/ARM/ARMSubtarget.cpp                               |  3
 llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp                | 39
 llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir                    |  1
 llvm/test/CodeGen/ARM/fnegs.ll                                     | 19
 llvm/test/CodeGen/ARM/fnmscs.ll                                    | 36
 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll      |  9
 llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll | 48
 llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll        |  4
 10 files changed, 91 insertions(+), 78 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 7ec9885..f291191 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2730,7 +2730,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
HasVMemStore = true;
}
for (const MachineOperand &Op : MI.all_uses()) {
- if (!TRI->isVectorRegister(*MRI, Op.getReg()))
+ if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
continue;
RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op);
// Vgpr use
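
Note on the hunk above: operand uses coming from DBG_VALUE and related instructions are marked as debug operands, and a heuristic that counts them would make -g builds place waitcnts differently from non-debug builds. A minimal sketch of the idiom, using the standard MachineOperand API (hypothetical helper, not code from this patch):

    // Debug operand uses must never influence codegen decisions,
    // otherwise compiling with -g changes the generated code.
    static bool isCodegenRelevantUse(const MachineOperand &Op) {
      return Op.isReg() && Op.isUse() && !Op.isDebug();
    }

The RUN line added to waitcnt-vmcnt-loop.mir below exercises this by debugifying the MIR before the pass runs.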
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 9a247bb..78b7066 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -5573,7 +5573,7 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
llvm_unreachable("Unknown VFP cmp argument!");
}
-/// OptimizeVFPBrcond - With nnan, it's legal to optimize some
+/// OptimizeVFPBrcond - With nnan and without daz, it's legal to optimize some
/// f32 and even f64 comparisons to integer ones.
SDValue
ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
@@ -5729,9 +5729,9 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
}
SDNodeFlags Flags = Op->getFlags();
- if ((getTargetMachine().Options.UnsafeFPMath || Flags.hasNoNaNs()) &&
- (DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() &&
- DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE()) &&
+ if (Flags.hasNoNaNs() &&
+ DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() &&
+ DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE() &&
(CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE ||
CC == ISD::SETUNE)) {
if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
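
The guard above replaces the blanket UnsafeFPMath test with the precise conditions under which rewriting an FP equality as an integer comparison of the bit patterns (see the doc comment on OptimizeVFPBrcond) stays sound: no NaNs on the operation, and IEEE denormal handling for both f32 and f64. A hedged illustration of the denormal hazard:

    // Under a flush-to-zero (daz) denormal mode, a denormal compares equal
    // to 0.0f as a float, yet its bit pattern is nonzero, so an integer
    // compare of the bits would disagree with the FP compare.
    uint32_t DenormBits = 0x00000001; // smallest positive f32 denormal
    float Denorm = llvm::bit_cast<float>(DenormBits);
    // daz: (Denorm == 0.0f) is true, but (DenormBits == 0) is false.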
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index f059294..3329bea 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -222,8 +222,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
// NEON f32 ops are non-IEEE 754 compliant. Darwin is ok with it by default.
const FeatureBitset &Bits = getFeatureBits();
if ((Bits[ARM::ProcA5] || Bits[ARM::ProcA8]) && // Where this matters
- (Options.UnsafeFPMath || isTargetDarwin() ||
- DM == DenormalMode::getPreserveSign()))
+ (isTargetDarwin() || DM == DenormalMode::getPreserveSign()))
HasNEONForFP = true;
if (isRWPI())
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 7bf6493..5d4a8fd 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -1603,7 +1603,7 @@ static SDValue lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(const SDLoc &DL,
/// value is necessary in order to fit the above form.
static SDValue
lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
- SDValue V1, SDValue V2, SelectionDAG &DAG,
+ SDValue V1, SelectionDAG &DAG,
const LoongArchSubtarget &Subtarget) {
int SplatIndex = -1;
for (const auto &M : Mask) {
@@ -1996,8 +1996,8 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
SDValue Result;
// TODO: Add more comparison patterns.
if (V2.isUndef()) {
- if ((Result = lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, V2, DAG,
- Subtarget)))
+ if ((Result =
+ lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, DAG, Subtarget)))
return Result;
if ((Result =
lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget)))
@@ -2053,7 +2053,7 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
/// value is necessary in order to fit the above form.
static SDValue
lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
- SDValue V1, SDValue V2, SelectionDAG &DAG,
+ SDValue V1, SelectionDAG &DAG,
const LoongArchSubtarget &Subtarget) {
int SplatIndex = -1;
for (const auto &M : Mask) {
@@ -2096,10 +2096,29 @@ lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget);
}
+/// Lower VECTOR_SHUFFLE into XVPERMI (if possible).
+static SDValue
+lowerVECTOR_SHUFFLE_XVPERMI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+ SDValue V1, SelectionDAG &DAG,
+ const LoongArchSubtarget &Subtarget) {
+ // Only consider XVPERMI_D.
+ if (Mask.size() != 4 || (VT != MVT::v4i64 && VT != MVT::v4f64))
+ return SDValue();
+
+ unsigned MaskImm = 0;
+ for (unsigned i = 0; i < Mask.size(); ++i) {
+ if (Mask[i] == -1)
+ continue;
+ MaskImm |= Mask[i] << (i * 2);
+ }
+
+ return DAG.getNode(LoongArchISD::XVPERMI, DL, VT, V1,
+ DAG.getConstant(MaskImm, DL, Subtarget.getGRLenVT()));
+}
+
/// Lower VECTOR_SHUFFLE into XVPERM (if possible).
static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- SelectionDAG &DAG,
+ MVT VT, SDValue V1, SelectionDAG &DAG,
const LoongArchSubtarget &Subtarget) {
// LoongArch LASX only have XVPERM_W.
if (Mask.size() != 8 || (VT != MVT::v8i32 && VT != MVT::v8f32))
@@ -2540,14 +2559,16 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
SDValue Result;
// TODO: Add more comparison patterns.
if (V2.isUndef()) {
- if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, Mask, VT, V1, V2, DAG,
- Subtarget)))
+ if ((Result =
+ lowerVECTOR_SHUFFLE_XVREPLVEI(DL, Mask, VT, V1, DAG, Subtarget)))
return Result;
if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, Mask, VT, V1, V2, DAG,
Subtarget)))
return Result;
if ((Result =
- lowerVECTOR_SHUFFLE_XVPERM(DL, Mask, VT, V1, V2, DAG, Subtarget)))
+ lowerVECTOR_SHUFFLE_XVPERMI(DL, Mask, VT, V1, DAG, Subtarget)))
+ return Result;
+ if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, Mask, VT, V1, DAG, Subtarget)))
return Result;
// TODO: This comment may be enabled in the future to better match the
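
The new lowerVECTOR_SHUFFLE_XVPERMI hook packs each of the four 64-bit mask elements into a 2-bit field of the xvpermi.d immediate, lowest element first. A worked example, assuming that field layout (which the encoding loop above implies): for the mask <1, 0, 3, 2>, i.e. swapping adjacent elements,

    unsigned MaskImm = 0;
    int Mask[4] = {1, 0, 3, 2};
    for (unsigned i = 0; i < 4; ++i)
      MaskImm |= Mask[i] << (i * 2); // 1 | (0<<2) | (3<<4) | (2<<6) = 0xB1

giving 177, which is exactly the xvpermi.d $xr0, $xr0, 177 that appears in the updated vec-shuffle-byte-rotate.ll test below.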
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
index 0ddd2aa..0d54bfa 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
@@ -1,4 +1,5 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s -debugify-and-strip-all-safe | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX12 %s
diff --git a/llvm/test/CodeGen/ARM/fnegs.ll b/llvm/test/CodeGen/ARM/fnegs.ll
index 435a600..6055b8f 100644
--- a/llvm/test/CodeGen/ARM/fnegs.ll
+++ b/llvm/test/CodeGen/ARM/fnegs.ll
@@ -10,11 +10,11 @@
; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - \
; RUN: | FileCheck %s -check-prefix=CORTEXA8
-; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math %s -o - \
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --denormal-fp-math=preserve-sign %s -o - \
; RUN: | FileCheck %s -check-prefix=CORTEXA8U
; RUN: llc -mtriple=arm-darwin -mcpu=cortex-a8 %s -o - \
-; RUN: | FileCheck %s -check-prefix=CORTEXA8U
+; RUN: | FileCheck %s -check-prefix=CORTEXA8U-DARWIN
; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - \
; RUN: | FileCheck %s -check-prefix=CORTEXA9
@@ -41,7 +41,10 @@ entry:
; CORTEXA8: vneg.f32 s{{.*}}, s{{.*}}
; CORTEXA8U-LABEL: test1:
-; CORTEXA8U: vneg.f32 d{{.*}}, d{{.*}}
+; CORTEXA8U: vsub.f32 d{{.*}}, d{{.*}}, d{{.*}}
+
+; CORTEXA8U-DARWIN-LABEL: test1:
+; CORTEXA8U-DARWIN: vneg.f32 d{{.*}}, d{{.*}}
; CORTEXA9-LABEL: test1:
; CORTEXA9: vneg.f32 s{{.*}}, s{{.*}}
@@ -110,9 +113,13 @@ define <2 x float> @fneg_bitcast(i64 %i) {
; CORTEXA8-NOT: vneg.f32
; CORTEXA8U-LABEL: fneg_bitcast:
-; CORTEXA8U-DAG: eor r0, r0, #-2147483648
-; CORTEXA8U-DAG: eor r1, r1, #-2147483648
-; CORTEXA8U-NOT: vneg.f32
+; CORTEXA8U-DAG: vmov.i32 d{{.*}}, #0x80000000
+; CORTEXA8U-DAG: vsub.f32 d{{.*}}, d{{.*}}, d{{.*}}
+
+; CORTEXA8U-DARWIN-LABEL: fneg_bitcast:
+; CORTEXA8U-DARWIN-DAG: eor r0, r0, #-2147483648
+; CORTEXA8U-DARWIN-DAG: eor r1, r1, #-2147483648
+; CORTEXA8U-DARWIN-NOT: vneg.f32
; CORTEXA9-LABEL: fneg_bitcast:
; CORTEXA9-DAG: eor r0, r0, #-2147483648
diff --git a/llvm/test/CodeGen/ARM/fnmscs.ll b/llvm/test/CodeGen/ARM/fnmscs.ll
index 0fa878c..49f9dcf 100644
--- a/llvm/test/CodeGen/ARM/fnmscs.ll
+++ b/llvm/test/CodeGen/ARM/fnmscs.ll
@@ -13,11 +13,11 @@
; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 -regalloc=basic %s -o - \
; RUN: | FileCheck %s -check-prefix=A8
-; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math %s -o - \
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --denormal-fp-math=preserve-sign %s -o - \
; RUN: | FileCheck %s -check-prefix=A8U
; RUN: llc -mtriple=arm-darwin -mcpu=cortex-a8 %s -o - \
-; RUN: | FileCheck %s -check-prefix=A8U
+; RUN: | FileCheck %s -check-prefix=A8U-DARWIN
define float @t1(float %acc, float %a, float %b) nounwind {
entry:
@@ -31,15 +31,20 @@ entry:
; NEON: vnmla.f32
; A8U-LABEL: t1:
-; A8U: vnmul.f32 s{{[0-9]}}, s{{[0-9]}}, s{{[0-9]}}
-; A8U: vsub.f32 d{{[0-9]}}, d{{[0-9]}}, d{{[0-9]}}
+; A8U: vmov.i32 d{{[0-9]+}}, #0x80000000
+; A8U: vsub.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; A8U: vsub.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+
+; A8U-DARWIN-LABEL: t1:
+; A8U-DARWIN: vnmul.f32 s{{[0-9]}}, s{{[0-9]}}, s{{[0-9]}}
+; A8U-DARWIN: vsub.f32 d{{[0-9]}}, d{{[0-9]}}, d{{[0-9]}}
; A8-LABEL: t1:
; A8: vnmul.f32 s{{[0-9]}}, s{{[0-9]}}, s{{[0-9]}}
; A8: vsub.f32 s{{[0-9]}}, s{{[0-9]}}, s{{[0-9]}}
%0 = fmul float %a, %b
%1 = fsub float -0.0, %0
- %2 = fsub float %1, %acc
+ %2 = fsub float %1, %acc
ret float %2
}
@@ -55,8 +60,13 @@ entry:
; NEON: vnmla.f32
; A8U-LABEL: t2:
-; A8U: vnmul.f32 s{{[01234]}}, s{{[01234]}}, s{{[01234]}}
-; A8U: vsub.f32 d{{[0-9]}}, d{{[0-9]}}, d{{[0-9]}}
+; A8U: vmov.i32 d{{[0-9]+}}, #0x80000000
+; A8U: vsub.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; A8U: vsub.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+
+; A8U-DARWIN-LABEL: t2:
+; A8U-DARWIN: vnmul.f32 s{{[01234]}}, s{{[01234]}}, s{{[01234]}}
+; A8U-DARWIN: vsub.f32 d{{[0-9]}}, d{{[0-9]}}, d{{[0-9]}}
; A8-LABEL: t2:
; A8: vnmul.f32 s{{[01234]}}, s{{[01234]}}, s{{[01234]}}
@@ -79,8 +89,12 @@ entry:
; NEON: vnmla.f64
; A8U-LABEL: t3:
-; A8U: vnmul.f64 d
; A8U: vsub.f64 d
+; A8U: vsub.f64 d
+
+; A8U-DARWIN-LABEL: t3:
+; A8U-DARWIN: vnmul.f64 d
+; A8U-DARWIN: vsub.f64 d
; A8-LABEL: t3:
; A8: vnmul.f64 d
@@ -103,8 +117,12 @@ entry:
; NEON: vnmla.f64
; A8U-LABEL: t4:
-; A8U: vnmul.f64 d
; A8U: vsub.f64 d
+; A8U: vsub.f64 d
+
+; A8U-DARWIN-LABEL: t4:
+; A8U-DARWIN: vnmul.f64 d
+; A8U-DARWIN: vsub.f64 d
; A8-LABEL: t4:
; A8: vnmul.f64 d
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
index 3053942..0b8015d 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
@@ -7,13 +7,12 @@
define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: shufflevector_v4f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvpickve.d $xr2, $xr1, 3
-; CHECK-NEXT: xvpermi.d $xr3, $xr0, 238
-; CHECK-NEXT: xvrepl128vei.d $xr3, $xr3, 1
-; CHECK-NEXT: vextrins.d $vr3, $vr2, 16
+; CHECK-NEXT: xvpermi.d $xr2, $xr0, 3
+; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3
+; CHECK-NEXT: vextrins.d $vr2, $vr3, 16
; CHECK-NEXT: xvpickve.d $xr1, $xr1, 2
; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
-; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2
+; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2
; CHECK-NEXT: ret
entry:
%c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 6, i32 3, i32 7>
diff --git a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
index 24f1b31..245f764 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll
@@ -6,11 +6,8 @@ define <32 x i8> @shuffle_v32i8(<32 x i8> %a) {
; CHECK-LABEL: shuffle_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI0_1)
-; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
-; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
+; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI0_0)
+; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
; CHECK-NEXT: xvshuf.h $xr1, $xr2, $xr0
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
; CHECK-NEXT: ret
@@ -34,11 +31,8 @@ define <16 x i16> @shuffle_v16i16(<16 x i16> %a) {
; CHECK-LABEL: shuffle_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0)
-; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_1)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI2_1)
-; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
-; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
+; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI2_0)
+; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
; CHECK-NEXT: xvshuf.w $xr1, $xr2, $xr0
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
; CHECK-NEXT: ret
@@ -72,10 +66,7 @@ define <8 x i32> @shuffle_v8i32(<8 x i32> %a) {
define <8 x i32> @shuffle_v8i32_same_lane(<8 x i32> %a) {
; CHECK-LABEL: shuffle_v8i32_same_lane:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI5_0)
-; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr0
-; CHECK-NEXT: xvori.b $xr0, $xr1, 0
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 225
; CHECK-NEXT: ret
%shuffle = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %shuffle
@@ -84,14 +75,7 @@ define <8 x i32> @shuffle_v8i32_same_lane(<8 x i32> %a) {
define <4 x i64> @shuffle_v4i64(<4 x i64> %a) {
; CHECK-LABEL: shuffle_v4i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_1)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI6_1)
-; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
-; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
-; CHECK-NEXT: xvshuf.d $xr1, $xr2, $xr0
-; CHECK-NEXT: xvori.b $xr0, $xr1, 0
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 39
; CHECK-NEXT: ret
%shuffle = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
ret <4 x i64> %shuffle
@@ -100,10 +84,7 @@ define <4 x i64> @shuffle_v4i64(<4 x i64> %a) {
define <4 x i64> @shuffle_v4i64_same_lane(<4 x i64> %a) {
; CHECK-LABEL: shuffle_v4i64_same_lane:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr0
-; CHECK-NEXT: xvori.b $xr0, $xr1, 0
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 225
; CHECK-NEXT: ret
%shuffle = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
ret <4 x i64> %shuffle
@@ -136,14 +117,7 @@ define <8 x float> @shuffle_v8f32_same_lane(<8 x float> %a) {
define <4 x double> @shuffle_v4f64(<4 x double> %a) {
; CHECK-LABEL: shuffle_v4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_0)
-; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI10_0)
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_1)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI10_1)
-; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
-; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
-; CHECK-NEXT: xvshuf.d $xr1, $xr2, $xr0
-; CHECK-NEXT: xvori.b $xr0, $xr1, 0
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 39
; CHECK-NEXT: ret
%shuffle = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
ret <4 x double> %shuffle
@@ -152,11 +126,7 @@ define <4 x double> @shuffle_v4f64(<4 x double> %a) {
define <4 x double> @shuffle_v4f64_same_lane(<4 x double> %a) {
; CHECK-LABEL: shuffle_v4f64_same_lane:
; CHECK: # %bb.0:
-; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_0)
-; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI11_0)
-; CHECK-NEXT: xvpermi.d $xr0, $xr0, 78
-; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr0
-; CHECK-NEXT: xvori.b $xr0, $xr1, 0
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 75
; CHECK-NEXT: ret
%shuffle = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
ret <4 x double> %shuffle
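
As a sanity check on the immediates in these tests, decoding reverses the 2-bit packing from the new lowering (hypothetical helper mirroring the encoding loop):

    static void decodeXVPermiD(unsigned Imm, int Sel[4]) {
      for (unsigned i = 0; i < 4; ++i)
        Sel[i] = (Imm >> (i * 2)) & 0x3;
    }
    // 225 -> {1, 0, 2, 3}, 39 -> {3, 1, 2, 0}, 75 -> {3, 2, 0, 1}:
    // each matches its test's shufflevector mask, read in 64-bit elements.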
diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll
index c0fa734..2007f85 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll
@@ -127,9 +127,7 @@ define <4 x i64> @byte_rotate_v4i64_2(<4 x i64> %a, <4 x i64> %b) nounwind {
define <4 x i64> @byte_rotate_v4i64_3(<4 x i64> %a) nounwind {
; CHECK-LABEL: byte_rotate_v4i64_3:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvbsrl.v $xr1, $xr0, 8
-; CHECK-NEXT: xvbsll.v $xr0, $xr0, 8
-; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvpermi.d $xr0, $xr0, 177
; CHECK-NEXT: ret
%shuffle = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
ret <4 x i64> %shuffle