Skip to content

Conversation

@mleleszi
Copy link
Member

#173036

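This patch removes the one-use restriction on the select operands in foldShuffleOfSelects. The fold can now also trigger when a select has other users, as long as the cost model still finds the new sequence cheaper; the cost of each select that has additional uses is added to the new cost.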

@github-actions
Copy link

github-actions bot commented Dec 20, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@llvmbot
Copy link
Member

llvmbot commented Dec 20, 2025

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-vectorizers

Author: Marcell Leleszi (mleleszi)

Changes

#173036

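This patch removes the one-use restriction on the select operands in foldShuffleOfSelects. The fold can now also trigger when a select has other users, as long as the cost model still finds the new sequence cheaper; the cost of each select that has additional uses is added to the new cost.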


Full diff: https://github.com/llvm/llvm-project/pull/173166.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/VectorCombine.cpp (+18-8)
  • (modified) llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll (+84)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 9239cb1b989b2..e581c225aec6f 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2547,12 +2547,14 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
 bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
   ArrayRef<int> Mask;
   Value *C1, *T1, *F1, *C2, *T2, *F2;
-  if (!match(&I, m_Shuffle(
-                     m_OneUse(m_Select(m_Value(C1), m_Value(T1), m_Value(F1))),
-                     m_OneUse(m_Select(m_Value(C2), m_Value(T2), m_Value(F2))),
-                     m_Mask(Mask))))
+  if (!match(&I, m_Shuffle(m_Select(m_Value(C1), m_Value(T1), m_Value(F1)),
+                           m_Select(m_Value(C2), m_Value(T2), m_Value(F2)),
+                           m_Mask(Mask))))
     return false;
 
+  auto *Sel1 = cast<Instruction>(I.getOperand(0));
+  auto *Sel2 = cast<Instruction>(I.getOperand(1));
+
   auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
   auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
   if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
@@ -2570,11 +2572,14 @@ bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
   auto *DstVecTy = cast<FixedVectorType>(I.getType());
   auto SK = TargetTransformInfo::SK_PermuteTwoSrc;
   auto SelOp = Instruction::Select;
-  InstructionCost OldCost = TTI.getCmpSelInstrCost(
+
+  InstructionCost CostSel1 = TTI.getCmpSelInstrCost(
       SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
-  OldCost += TTI.getCmpSelInstrCost(SelOp, SrcVecTy, C2VecTy,
-                                    CmpInst::BAD_ICMP_PREDICATE, CostKind);
-  OldCost +=
+  InstructionCost CostSel2 = TTI.getCmpSelInstrCost(
+      SelOp, SrcVecTy, C2VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
+
+  InstructionCost OldCost =
+      CostSel1 + CostSel2 +
       TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
                          {I.getOperand(0), I.getOperand(1)}, &I);
 
@@ -2590,6 +2595,11 @@ bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
   NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
                                     CmpInst::BAD_ICMP_PREDICATE, CostKind);
 
+  if (!Sel1->hasOneUse())
+    NewCost += CostSel1;
+  if (!Sel2->hasOneUse())
+    NewCost += CostSel2;
+
   LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
                     << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
                     << "\n");
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll
index 7883eb42aefac..cf57a503c2197 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll
@@ -637,3 +637,87 @@ define <4 x i32> @src_v2tov4_i32_change_to_other_vector(<2 x i1> %a, <2 x i1> %b
   %res = shufflevector <2 x i32> %select.xz, <2 x i32> %select.yx, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   ret <4 x i32> %res
 }
+
+define <4 x i32> @src_v2tov4_i32_multiuse_sel1(<2 x i1> %a, <2 x i1> %b, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z, ptr %p) {
+; CHECK-LABEL: define <4 x i32> @src_v2tov4_i32_multiuse_sel1(
+; CHECK-SAME: <2 x i1> [[A:%.*]], <2 x i1> [[B:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[SELECT_XZ:%.*]] = select <2 x i1> [[A]], <2 x i32> [[X]], <2 x i32> [[Z]]
+; CHECK-NEXT:    store <2 x i32> [[SELECT_XZ]], ptr [[P]], align 8
+; CHECK-NEXT:    [[SELECT_YX:%.*]] = select <2 x i1> [[B]], <2 x i32> [[Y]], <2 x i32> [[X]]
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <2 x i32> [[SELECT_XZ]], <2 x i32> [[SELECT_YX]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %select.xz = select <2 x i1> %a, <2 x i32> %x, <2 x i32> %z
+  store <2 x i32> %select.xz, ptr %p
+  %select.yx = select <2 x i1> %b, <2 x i32> %y, <2 x i32> %x
+  %res = shufflevector <2 x i32> %select.xz, <2 x i32> %select.yx, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @src_v2tov4_i32_multiuse_sel2(<2 x i1> %a, <2 x i1> %b, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z, ptr %p) {
+; CHECK-LABEL: define <4 x i32> @src_v2tov4_i32_multiuse_sel2(
+; CHECK-SAME: <2 x i1> [[A:%.*]], <2 x i1> [[B:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[SELECT_XZ:%.*]] = select <2 x i1> [[A]], <2 x i32> [[X]], <2 x i32> [[Z]]
+; CHECK-NEXT:    [[SELECT_YX:%.*]] = select <2 x i1> [[B]], <2 x i32> [[Y]], <2 x i32> [[X]]
+; CHECK-NEXT:    store <2 x i32> [[SELECT_YX]], ptr [[P]], align 8
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <2 x i32> [[SELECT_XZ]], <2 x i32> [[SELECT_YX]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %select.xz = select <2 x i1> %a, <2 x i32> %x, <2 x i32> %z
+  %select.yx = select <2 x i1> %b, <2 x i32> %y, <2 x i32> %x
+  store <2 x i32> %select.yx, ptr %p
+  %res = shufflevector <2 x i32> %select.xz, <2 x i32> %select.yx, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @src_v2tov4_i32_multiuse_both(<2 x i1> %a, <2 x i1> %b, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z, ptr %p1, ptr %p2) {
+; CHECK-LABEL: define <4 x i32> @src_v2tov4_i32_multiuse_both(
+; CHECK-SAME: <2 x i1> [[A:%.*]], <2 x i1> [[B:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[SELECT_XZ:%.*]] = select <2 x i1> [[A]], <2 x i32> [[X]], <2 x i32> [[Z]]
+; CHECK-NEXT:    store <2 x i32> [[SELECT_XZ]], ptr [[P1]], align 8
+; CHECK-NEXT:    [[SELECT_YX:%.*]] = select <2 x i1> [[B]], <2 x i32> [[Y]], <2 x i32> [[X]]
+; CHECK-NEXT:    store <2 x i32> [[SELECT_YX]], ptr [[P2]], align 8
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <2 x i32> [[SELECT_XZ]], <2 x i32> [[SELECT_YX]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %select.xz = select <2 x i1> %a, <2 x i32> %x, <2 x i32> %z
+  store <2 x i32> %select.xz, ptr %p1
+  %select.yx = select <2 x i1> %b, <2 x i32> %y, <2 x i32> %x
+  store <2 x i32> %select.yx, ptr %p2
+  %res = shufflevector <2 x i32> %select.xz, <2 x i32> %select.yx, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %res
+}
+
+define <2 x i32> @src_v1024tov2_i32_multiuse_sel1(<1024 x i1> %a, <1024 x i1> %b, <1024 x i32> %x, <1024 x i32> %y, <1024 x i32> %z, ptr %p1, ptr %p2) {
+; SSE-LABEL: define <2 x i32> @src_v1024tov2_i32_multiuse_sel1(
+; SSE-SAME: <1024 x i1> [[A:%.*]], <1024 x i1> [[B:%.*]], <1024 x i32> [[X:%.*]], <1024 x i32> [[Y:%.*]], <1024 x i32> [[Z:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR0]] {
+; SSE-NEXT:    [[SELECT_XZ:%.*]] = select <1024 x i1> [[A]], <1024 x i32> [[X]], <1024 x i32> [[Z]]
+; SSE-NEXT:    store <1024 x i32> [[SELECT_XZ]], ptr [[P1]], align 4096
+; SSE-NEXT:    [[SELECT_YX:%.*]] = select <1024 x i1> [[B]], <1024 x i32> [[Y]], <1024 x i32> [[X]]
+; SSE-NEXT:    [[RES:%.*]] = shufflevector <1024 x i32> [[SELECT_XZ]], <1024 x i32> [[SELECT_YX]], <2 x i32> <i32 0, i32 1024>
+; SSE-NEXT:    ret <2 x i32> [[RES]]
+;
+; AVX2-LABEL: define <2 x i32> @src_v1024tov2_i32_multiuse_sel1(
+; AVX2-SAME: <1024 x i1> [[A:%.*]], <1024 x i1> [[B:%.*]], <1024 x i32> [[X:%.*]], <1024 x i32> [[Y:%.*]], <1024 x i32> [[Z:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT:    [[SELECT_XZ:%.*]] = select <1024 x i1> [[A]], <1024 x i32> [[X]], <1024 x i32> [[Z]]
+; AVX2-NEXT:    store <1024 x i32> [[SELECT_XZ]], ptr [[P1]], align 4096
+; AVX2-NEXT:    [[SELECT_YX:%.*]] = select <1024 x i1> [[B]], <1024 x i32> [[Y]], <1024 x i32> [[X]]
+; AVX2-NEXT:    [[RES:%.*]] = shufflevector <1024 x i32> [[SELECT_XZ]], <1024 x i32> [[SELECT_YX]], <2 x i32> <i32 0, i32 1024>
+; AVX2-NEXT:    ret <2 x i32> [[RES]]
+;
+; AVX512-LABEL: define <2 x i32> @src_v1024tov2_i32_multiuse_sel1(
+; AVX512-SAME: <1024 x i1> [[A:%.*]], <1024 x i1> [[B:%.*]], <1024 x i32> [[X:%.*]], <1024 x i32> [[Y:%.*]], <1024 x i32> [[Z:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT:    [[SELECT_XZ:%.*]] = select <1024 x i1> [[A]], <1024 x i32> [[X]], <1024 x i32> [[Z]]
+; AVX512-NEXT:    store <1024 x i32> [[SELECT_XZ]], ptr [[P1]], align 4096
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <1024 x i1> [[A]], <1024 x i1> [[B]], <2 x i32> <i32 0, i32 1024>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <1024 x i32> [[X]], <1024 x i32> [[Y]], <2 x i32> <i32 0, i32 1024>
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <1024 x i32> [[Z]], <1024 x i32> [[X]], <2 x i32> <i32 0, i32 1024>
+; AVX512-NEXT:    [[RES:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]]
+; AVX512-NEXT:    ret <2 x i32> [[RES]]
+;
+  %select.xz = select <1024 x i1> %a, <1024 x i32> %x, <1024 x i32> %z
+  store <1024 x i32> %select.xz, ptr %p1
+  %select.yx = select <1024 x i1> %b, <1024 x i32> %y, <1024 x i32> %x
+  %res = shufflevector <1024 x i32> %select.xz, <1024 x i32> %select.yx, <2 x i32> <i32 0, i32 1024>
+  ret <2 x i32> %res
+}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants