Diffstat (limited to 'llvm/lib')
42 files changed, 815 insertions, 274 deletions
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 84ee8c0..11d8294 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -2854,14 +2854,18 @@ bool DependenceInfo::testMIV(const SCEV *Src, const SCEV *Dst,           banerjeeMIVtest(Src, Dst, Loops, Result);  } -// Given a product, e.g., 10*X*Y, returns the first constant operand, -// in this case 10. If there is no constant part, returns std::nullopt. -static std::optional<APInt> getConstantPart(const SCEV *Expr) { +/// Given a SCEVMulExpr, returns its first operand if its first operand is a +/// constant and the product doesn't overflow in a signed sense. Otherwise, +/// returns std::nullopt. For example, given (10 * X * Y)<nsw>, it returns 10. +/// Notably, if it doesn't have nsw, the multiplication may overflow, and if +/// so, it may not be a multiple of 10. +static std::optional<APInt> getConstanCoefficient(const SCEV *Expr) {    if (const auto *Constant = dyn_cast<SCEVConstant>(Expr))      return Constant->getAPInt();    if (const auto *Product = dyn_cast<SCEVMulExpr>(Expr))      if (const auto *Constant = dyn_cast<SCEVConstant>(Product->getOperand(0))) -      return Constant->getAPInt(); +      if (Product->hasNoSignedWrap()) +        return Constant->getAPInt();    return std::nullopt;  } @@ -2887,7 +2891,7 @@ bool DependenceInfo::accumulateCoefficientsGCD(const SCEV *Expr,    if (AddRec->getLoop() == CurLoop) {      CurLoopCoeff = Step;    } else { -    std::optional<APInt> ConstCoeff = getConstantPart(Step); +    std::optional<APInt> ConstCoeff = getConstanCoefficient(Step);      // If the coefficient is the product of a constant and other stuff, we can      // use the constant in the GCD computation. @@ -2940,7 +2944,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,      const SCEV *Coeff = AddRec->getStepRecurrence(*SE);      // If the coefficient is the product of a constant and other stuff,      // we can use the constant in the GCD computation. -    std::optional<APInt> ConstCoeff = getConstantPart(Coeff); +    std::optional<APInt> ConstCoeff = getConstanCoefficient(Coeff);      if (!ConstCoeff)        return false;      RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff->abs()); @@ -2958,7 +2962,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,      const SCEV *Coeff = AddRec->getStepRecurrence(*SE);      // If the coefficient is the product of a constant and other stuff,      // we can use the constant in the GCD computation. -    std::optional<APInt> ConstCoeff = getConstantPart(Coeff); +    std::optional<APInt> ConstCoeff = getConstanCoefficient(Coeff);      if (!ConstCoeff)        return false;      RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff->abs()); @@ -2979,7 +2983,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,        } else if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Operand)) {          // Search for constant operand to participate in GCD;          // If none found; return false. 
-        std::optional<APInt> ConstOp = getConstantPart(Product); +        std::optional<APInt> ConstOp = getConstanCoefficient(Product);          if (!ConstOp)            return false;          ExtraGCD = APIntOps::GreatestCommonDivisor(ExtraGCD, ConstOp->abs()); @@ -3032,7 +3036,7 @@ bool DependenceInfo::gcdMIVtest(const SCEV *Src, const SCEV *Dst,      Delta = SE->getMinusSCEV(SrcCoeff, DstCoeff);      // If the coefficient is the product of a constant and other stuff,      // we can use the constant in the GCD computation. -    std::optional<APInt> ConstCoeff = getConstantPart(Delta); +    std::optional<APInt> ConstCoeff = getConstanCoefficient(Delta);      if (!ConstCoeff)        // The difference of the two coefficients might not be a product        // or constant, in which case we give up on this direction. diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 8aa488f..f65d88a 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1443,7 +1443,7 @@ getBBAddrMapFeature(const MachineFunction &MF, int NumMBBSectionRanges,            MF.hasBBSections() && NumMBBSectionRanges > 1,            // Use static_cast to avoid breakage of tests on windows.            static_cast<bool>(BBAddrMapSkipEmitBBEntries), HasCalls, -          static_cast<bool>(EmitBBHash)}; +          static_cast<bool>(EmitBBHash), false};  }  void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) { diff --git a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp index fbcd614..485b44ae 100644 --- a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp +++ b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp @@ -287,6 +287,25 @@ Error BasicBlockSectionsProfileReader::ReadV1Profile() {        }        continue;      } +    case 'h': { // Basic block hash specifier. +      // Skip the profile when the profile iterator (FI) refers to the +      // past-the-end element. 
+      if (FI == ProgramPathAndClusterInfo.end()) +        continue; +      for (auto BBIDHashStr : Values) { +        auto [BBIDStr, HashStr] = BBIDHashStr.split(':'); +        unsigned long long BBID = 0, Hash = 0; +        if (getAsUnsignedInteger(BBIDStr, 10, BBID)) +          return createProfileParseError(Twine("unsigned integer expected: '") + +                                         BBIDStr + "'"); +        if (getAsUnsignedInteger(HashStr, 16, Hash)) +          return createProfileParseError( +              Twine("unsigned integer expected in hex format: '") + HashStr + +              "'"); +        FI->second.BBHashes[BBID] = Hash; +      } +      continue; +    }      default:        return createProfileParseError(Twine("invalid specifier: '") +                                       Twine(Specifier) + "'"); diff --git a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp index df88490..b546e81 100644 --- a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp +++ b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp @@ -12,7 +12,6 @@  #include "llvm/TargetParser/Triple.h"  namespace llvm { -extern llvm::cl::opt<bool> DebugInfoCorrelate;  extern llvm::cl::opt<llvm::InstrProfCorrelator::ProfCorrelatorKind>      ProfileCorrelate;  } // namespace llvm @@ -64,8 +63,7 @@ TargetLibraryInfoImpl *createTLII(const llvm::Triple &TargetTriple,  }  std::string getDefaultProfileGenName() { -  return llvm::DebugInfoCorrelate || -                 llvm::ProfileCorrelate != InstrProfCorrelator::NONE +  return llvm::ProfileCorrelate != InstrProfCorrelator::NONE               ? "default_%m.proflite"               : "default_%m.profraw";  } diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp index 6da97f9..354c51d 100644 --- a/llvm/lib/Object/ELF.cpp +++ b/llvm/lib/Object/ELF.cpp @@ -831,17 +831,17 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,    };    uint8_t Version = 0; -  uint8_t Feature = 0; +  uint16_t Feature = 0;    BBAddrMap::Features FeatEnable{};    while (!ULEBSizeErr && !MetadataDecodeErr && Cur &&           Cur.tell() < Content.size()) {      Version = Data.getU8(Cur);      if (!Cur)        break; -    if (Version < 2 || Version > 4) +    if (Version < 2 || Version > 5)        return createError("unsupported SHT_LLVM_BB_ADDR_MAP version: " +                           Twine(static_cast<int>(Version))); -    Feature = Data.getU8(Cur); // Feature byte +    Feature = Version < 5 ? Data.getU8(Cur) : Data.getU16(Cur);      if (!Cur)        break;      auto FeatEnableOrErr = BBAddrMap::Features::decode(Feature); @@ -858,6 +858,11 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,                           "basic block hash feature is enabled: version = " +                           Twine(static_cast<int>(Version)) +                           " feature = " + Twine(static_cast<int>(Feature))); +    if (FeatEnable.PostLinkCfg && Version < 5) +      return createError("version should be >= 5 for SHT_LLVM_BB_ADDR_MAP when " +                         "post link cfg feature is enabled: version = " + +                         Twine(static_cast<int>(Version)) + +                         " feature = " + Twine(static_cast<int>(Feature)));      uint32_t NumBlocksInBBRange = 0;      uint32_t NumBBRanges = 1;      typename ELFFile<ELFT>::uintX_t RangeBaseAddress = 0; @@ -946,6 +951,10 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,          uint64_t BBF = FeatEnable.BBFreq                             ? 
readULEB128As<uint64_t>(Data, Cur, ULEBSizeErr)                             : 0; +        uint32_t PostLinkBBFreq = +            FeatEnable.PostLinkCfg +                ? readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr) +                : 0;          // Branch probability          llvm::SmallVector<PGOAnalysisMap::PGOBBEntry::SuccessorEntry, 2> @@ -955,13 +964,20 @@ decodeBBAddrMapImpl(const ELFFile<ELFT> &EF,            for (uint64_t I = 0; I < SuccCount; ++I) {              uint32_t BBID = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr);              uint32_t BrProb = readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr); +            uint32_t PostLinkFreq = +                FeatEnable.PostLinkCfg +                    ? readULEB128As<uint32_t>(Data, Cur, ULEBSizeErr) +                    : 0; +              if (PGOAnalyses) -              Successors.push_back({BBID, BranchProbability::getRaw(BrProb)}); +              Successors.push_back( +                  {BBID, BranchProbability::getRaw(BrProb), PostLinkFreq});            }          }          if (PGOAnalyses) -          PGOBBEntries.push_back({BlockFrequency(BBF), std::move(Successors)}); +          PGOBBEntries.push_back( +              {BlockFrequency(BBF), PostLinkBBFreq, std::move(Successors)});        }        if (PGOAnalyses) diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp index 8b75fbe..8530785 100644 --- a/llvm/lib/ObjectYAML/ELFEmitter.cpp +++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp @@ -1465,13 +1465,19 @@ void ELFState<ELFT>::writeSectionContent(    for (const auto &[Idx, E] : llvm::enumerate(*Section.Entries)) {      // Write version and feature values.      if (Section.Type == llvm::ELF::SHT_LLVM_BB_ADDR_MAP) { -      if (E.Version > 4) +      if (E.Version > 5)          WithColor::warning() << "unsupported SHT_LLVM_BB_ADDR_MAP version: "                               << static_cast<int>(E.Version)                               << "; encoding using the most recent version";        CBA.write(E.Version); -      CBA.write(E.Feature); -      SHeader.sh_size += 2; +      SHeader.sh_size += 1; +      if (E.Version < 5) { +        CBA.write(static_cast<uint8_t>(E.Feature)); +        SHeader.sh_size += 1; +      } else { +        CBA.write<uint16_t>(E.Feature, ELFT::Endianness); +        SHeader.sh_size += 2; +      }      }      auto FeatureOrErr = llvm::object::BBAddrMap::Features::decode(E.Feature);      bool MultiBBRangeFeatureEnabled = false; @@ -1556,11 +1562,15 @@ void ELFState<ELFT>::writeSectionContent(      for (const auto &PGOBBE : PGOBBEntries) {        if (PGOBBE.BBFreq)          SHeader.sh_size += CBA.writeULEB128(*PGOBBE.BBFreq); +      if (FeatureOrErr->PostLinkCfg || PGOBBE.PostLinkBBFreq.has_value()) +        SHeader.sh_size += CBA.writeULEB128(PGOBBE.PostLinkBBFreq.value_or(0));        if (PGOBBE.Successors) {          SHeader.sh_size += CBA.writeULEB128(PGOBBE.Successors->size()); -        for (const auto &[ID, BrProb] : *PGOBBE.Successors) { +        for (const auto &[ID, BrProb, PostLinkBrFreq] : *PGOBBE.Successors) {            SHeader.sh_size += CBA.writeULEB128(ID);            SHeader.sh_size += CBA.writeULEB128(BrProb); +          if (FeatureOrErr->PostLinkCfg || PostLinkBrFreq.has_value()) +            SHeader.sh_size += CBA.writeULEB128(PostLinkBrFreq.value_or(0));          }        }      } diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index f8a84b0..e5e5fc2 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ 
-1886,7 +1886,7 @@ void MappingTraits<ELFYAML::BBAddrMapEntry>::mapping(      IO &IO, ELFYAML::BBAddrMapEntry &E) {    assert(IO.getContext() && "The IO context is not initialized");    IO.mapRequired("Version", E.Version); -  IO.mapOptional("Feature", E.Feature, Hex8(0)); +  IO.mapOptional("Feature", E.Feature, Hex16(0));    IO.mapOptional("NumBBRanges", E.NumBBRanges);    IO.mapOptional("BBRanges", E.BBRanges);  } @@ -1920,6 +1920,7 @@ void MappingTraits<ELFYAML::PGOAnalysisMapEntry::PGOBBEntry>::mapping(      IO &IO, ELFYAML::PGOAnalysisMapEntry::PGOBBEntry &E) {    assert(IO.getContext() && "The IO context is not initialized");    IO.mapOptional("BBFreq", E.BBFreq); +  IO.mapOptional("PostLinkBBFreq", E.PostLinkBBFreq);    IO.mapOptional("Successors", E.Successors);  } @@ -1929,6 +1930,7 @@ void MappingTraits<ELFYAML::PGOAnalysisMapEntry::PGOBBEntry::SuccessorEntry>::    assert(IO.getContext() && "The IO context is not initialized");    IO.mapRequired("ID", E.ID);    IO.mapRequired("BrProb", E.BrProb); +  IO.mapOptional("PostLinkBrFreq", E.PostLinkBrFreq);  }  void MappingTraits<ELFYAML::GnuHashHeader>::mapping(IO &IO, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 9ce1224..aed325c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -221,12 +221,22 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {  bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {    const DebugLoc &DL = I.getDebugLoc();    MachineBasicBlock *BB = I.getParent(); +  Register VCCReg = I.getOperand(1).getReg(); +  MachineInstr *Cmp; + +  if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { +    unsigned CmpOpc = +        STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; +    Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0); +  } else { +    // For gfx7 and earlier, S_CMP_LG_U64 doesn't exist, so we use S_OR_B64 +    // which sets SCC as a side effect. +    Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); +    Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst) +              .addReg(VCCReg) +              .addReg(VCCReg); +  } -  unsigned CmpOpc = -      STI.isWave64() ? 
AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; -  MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)) -                          .addReg(I.getOperand(1).getReg()) -                          .addImm(0);    if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))      return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 5407566..b84c30e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -500,6 +500,16 @@ void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {    MI.eraseFromParent();  } +void RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) { +  auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg()); +  auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg()); +  auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo}); +  auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi}); +  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), +                          {ResLo.getReg(0), ResHi.getReg(0)}); +  MI.eraseFromParent(); +} +  static bool isSignedBFE(MachineInstr &MI) {    if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))      return (GI->is(Intrinsic::amdgcn_sbfe)); @@ -804,6 +814,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,      }      break;    } +  case UnpackAExt: +    return lowerUnpackAExt(MI);    case WidenMMOToS32:      return widenMMOToS32(cast<GAnyLoad>(MI));    } @@ -1120,7 +1132,8 @@ void RegBankLegalizeHelper::applyMappingDst(        assert(RB == SgprRB);        Register NewDst = MRI.createVirtualRegister(SgprRB_S32);        Op.setReg(NewDst); -      B.buildTrunc(Reg, NewDst); +      if (!MRI.use_empty(Reg)) +        B.buildTrunc(Reg, NewDst);        break;      }      case InvalidMapping: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index d937815..ad3ff1d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -124,6 +124,7 @@ private:    void lowerSplitTo32Select(MachineInstr &MI);    void lowerSplitTo32SExtInReg(MachineInstr &MI);    void lowerUnpackMinMax(MachineInstr &MI); +  void lowerUnpackAExt(MachineInstr &MI);  };  } // end namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index a67b12a..01abd35 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -470,7 +470,19 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,        .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})        .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})        .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) -      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); +      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) +      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackAExt}) +      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) +      .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}}) +      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}); + +  addRulesForGOpcs({G_UADDO, G_USUBO}, Standard) +      .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}}) +      .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}}); + +  addRulesForGOpcs({G_UADDE, G_USUBE}, Standard) +      .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr32AExtBoolInReg}}) +      .Div(S32, {{Vgpr32, 
Vcc}, {Vgpr32, Vgpr32, Vcc}});    addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 93e0efd..030bd75 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -223,7 +223,8 @@ enum LoweringMethodID {    UniCstExt,    SplitLoad,    WidenLoad, -  WidenMMOToS32 +  WidenMMOToS32, +  UnpackAExt  };  enum FastRulesTypes { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 75a94ac..b28c50e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1315,6 +1315,9 @@ void AMDGPUPassConfig::addIRPasses() {        isPassEnabled(EnableImageIntrinsicOptimizer))      addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM)); +  if (EnableUniformIntrinsicCombine) +    addPass(createAMDGPUUniformIntrinsicCombineLegacyPass()); +    // This can be disabled by passing ::Disable here or on the command line    // with --expand-variadics-override=disable.    addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering)); @@ -2066,6 +2069,8 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {    if (isPassEnabled(EnableImageIntrinsicOptimizer))      addPass(AMDGPUImageIntrinsicOptimizerPass(TM)); +  if (EnableUniformIntrinsicCombine) +    addPass(AMDGPUUniformIntrinsicCombinePass());    // This can be disabled by passing ::Disable here or on the command line    // with --expand-variadics-override=disable.    addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering)); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b34ab2a..8bb2808 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7035,9 +7035,15 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,    SDLoc SL(N);    if (Src.getOpcode() == ISD::SETCC) { +    SDValue Op0 = Src.getOperand(0); +    SDValue Op1 = Src.getOperand(1); +    // Need to expand bfloat to float for comparison (setcc). +    if (Op0.getValueType() == MVT::bf16) { +      Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0); +      Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1); +    }      // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...) 
-    return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0), -                       Src.getOperand(1), Src.getOperand(2)); +    return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));    }    if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {      // (ballot 0) -> 0 diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index a4d3d62..6b06534 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -22109,6 +22109,11 @@ bool ARMTargetLowering::isComplexDeinterleavingOperationSupported(            ScalarTy->isIntegerTy(32));  } +ArrayRef<MCPhysReg> ARMTargetLowering::getRoundingControlRegisters() const { +  static const MCPhysReg RCRegs[] = {ARM::FPSCR_RM}; +  return RCRegs; +} +  Value *ARMTargetLowering::createComplexDeinterleavingIR(      IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,      ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 357d2c5..bf3438b 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -1009,6 +1009,8 @@ class VectorType;      bool isUnsupportedFloatingType(EVT VT) const; +    ArrayRef<MCPhysReg> getRoundingControlRegisters() const override; +      SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal,                      SDValue ARMcc, SDValue Flags, SelectionDAG &DAG) const;      SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, diff --git a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp index 3b810d0..79863e1 100644 --- a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp @@ -34,7 +34,7 @@ class HexagonCopyHoisting : public MachineFunctionPass {  public:    static char ID; -  HexagonCopyHoisting() : MachineFunctionPass(ID), MFN(nullptr), MRI(nullptr) {} +  HexagonCopyHoisting() : MachineFunctionPass(ID) {}    StringRef getPassName() const override { return "Hexagon Copy Hoisting"; } @@ -56,8 +56,8 @@ public:    void moveCopyInstr(MachineBasicBlock *DestBB,                       std::pair<Register, Register> Key, MachineInstr *MI); -  MachineFunction *MFN; -  MachineRegisterInfo *MRI; +  MachineFunction *MFN = nullptr; +  MachineRegisterInfo *MRI = nullptr;    std::vector<DenseMap<std::pair<Register, Register>, MachineInstr *>>        CopyMIList;  }; diff --git a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp index 93418f7..a10c937 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp @@ -34,13 +34,13 @@ STATISTIC(HexagonNumStoreAbsConversions,  namespace {  class HexagonGenMemAbsolute : public MachineFunctionPass { -  const HexagonInstrInfo *TII; -  MachineRegisterInfo *MRI; -  const TargetRegisterInfo *TRI; +  const HexagonInstrInfo *TII = nullptr; +  MachineRegisterInfo *MRI = nullptr; +  const TargetRegisterInfo *TRI = nullptr;  public:    static char ID; -  HexagonGenMemAbsolute() : MachineFunctionPass(ID), TII(0), MRI(0), TRI(0) {} +  HexagonGenMemAbsolute() : MachineFunctionPass(ID) {}    StringRef getPassName() const override {      return "Hexagon Generate Load/Store Set Absolute Address Instruction"; diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp 
index b9cdd6a..ce2de75 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -544,7 +544,7 @@ int HexagonSubtarget::updateLatency(MachineInstr &SrcInst,    if (!hasV60Ops())      return Latency; -  auto &QII = static_cast<const HexagonInstrInfo &>(*getInstrInfo()); +  const HexagonInstrInfo &QII = *getInstrInfo();    // BSB scheduling.    if (QII.isHVXVec(SrcInst) || useBSBScheduling())      Latency = (Latency + 1) >> 1; diff --git a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp index 71bdfc66..5a85f34 100644 --- a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp @@ -43,7 +43,7 @@ namespace {  class HexagonTfrCleanup : public MachineFunctionPass {  public:    static char ID; -  HexagonTfrCleanup() : MachineFunctionPass(ID), HII(0), TRI(0) {} +  HexagonTfrCleanup() : MachineFunctionPass(ID) {}    StringRef getPassName() const override { return "Hexagon TFR Cleanup"; }    void getAnalysisUsage(AnalysisUsage &AU) const override {      AU.setPreservesAll(); @@ -52,8 +52,8 @@ public:    bool runOnMachineFunction(MachineFunction &MF) override;  private: -  const HexagonInstrInfo *HII; -  const TargetRegisterInfo *TRI; +  const HexagonInstrInfo *HII = nullptr; +  const TargetRegisterInfo *TRI = nullptr;    typedef DenseMap<unsigned, uint64_t> ImmediateMap; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 9a6afa1..b25a054 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3995,6 +3995,7 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits,      case RISCV::CTZW:      case RISCV::CPOPW:      case RISCV::SLLI_UW: +    case RISCV::ABSW:      case RISCV::FMV_W_X:      case RISCV::FCVT_H_W:      case RISCV::FCVT_H_W_INX: diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 1c930ac..56881f7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -433,6 +433,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,    if (Subtarget.hasStdExtP() ||        (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) {      setOperationAction(ISD::ABS, XLenVT, Legal); +    if (Subtarget.is64Bit()) +      setOperationAction(ISD::ABS, MVT::i32, Custom);    } else if (Subtarget.hasShortForwardBranchOpt()) {      // We can use PseudoCCSUB to implement ABS.      setOperationAction(ISD::ABS, XLenVT, Legal); @@ -14816,8 +14818,16 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,      assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&             "Unexpected custom legalisation"); +    if (Subtarget.hasStdExtP()) { +      SDValue Src = +          DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0)); +      SDValue Abs = DAG.getNode(RISCVISD::ABSW, DL, MVT::i64, Src); +      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Abs)); +      return; +    } +      if (Subtarget.hasStdExtZbb()) { -      // Emit a special ABSW node that will be expanded to NEGW+MAX at isel. +      // Emit a special node that will be expanded to NEGW+MAX at isel.        // This allows us to remember that the result is sign extended. Expanding        // to NEGW+MAX here requires a Freeze which breaks ComputeNumSignBits.        
SDValue Src = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, @@ -20290,6 +20300,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,      break;    } +  case RISCVISD::ABSW:    case RISCVISD::CLZW:    case RISCVISD::CTZW: {      // Only the lower 32 bits of the first operand are read @@ -21862,6 +21873,7 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(    case RISCVISD::REMUW:    case RISCVISD::ROLW:    case RISCVISD::RORW: +  case RISCVISD::ABSW:    case RISCVISD::FCVT_W_RV64:    case RISCVISD::FCVT_WU_RV64:    case RISCVISD::STRICT_FCVT_W_RV64: diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index cc085bb..4cbbba3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -1461,5 +1461,10 @@ let Predicates = [HasStdExtP, IsRV32] in {  // Codegen patterns  //===----------------------------------------------------------------------===// +def riscv_absw : RVSDNode<"ABSW", SDTIntUnaryOp>; +  let Predicates = [HasStdExtP] in  def : PatGpr<abs, ABS>; + +let Predicates = [HasStdExtP, IsRV64] in +def : PatGpr<riscv_absw, ABSW>; diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp index d08115b..ea98cdb 100644 --- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp +++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp @@ -172,6 +172,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI,        case RISCV::CTZW:        case RISCV::CPOPW:        case RISCV::SLLI_UW: +      case RISCV::ABSW:        case RISCV::FMV_W_X:        case RISCV::FCVT_H_W:        case RISCV::FCVT_H_W_INX: diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 624cff2..49beada 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48778,10 +48778,9 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,        SDValue BC0 = peekThroughBitcasts(Op0);        if (BC0.getOpcode() == X86ISD::PCMPEQ &&            ISD::isBuildVectorAllZeros(BC0.getOperand(1).getNode())) { -        SDLoc DL(EFLAGS);          CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE); -        SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0)); -        return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X); +        SDValue X = DAG.getBitcast(OpVT, DAG.getFreeze(BC0.getOperand(0))); +        return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, X, X);        }      }    } @@ -48837,7 +48836,7 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,                MVT FloatSVT = MVT::getFloatingPointVT(EltBits);                MVT FloatVT =                    MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits); -              Res = DAG.getBitcast(FloatVT, Res); +              Res = DAG.getBitcast(FloatVT, DAG.getFreeze(Res));                return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);              } else if (EltBits == 16) {                MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8; @@ -48856,8 +48855,30 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,      }      // TESTZ(X,-1) == TESTZ(X,X) -    if (ISD::isBuildVectorAllOnes(Op1.getNode())) +    if (ISD::isBuildVectorAllOnes(Op1.getNode())) { +      Op0 = DAG.getFreeze(Op0);        return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0); +    } + +    // Attempt to convert PTESTZ(X,SIGNMASK) -> VTESTPD/PSZ(X,X) on AVX targets. 
+    if (EFLAGS.getOpcode() == X86ISD::PTEST && Subtarget.hasAVX()) { +      KnownBits KnownOp1 = DAG.computeKnownBits(Op1); +      assert(KnownOp1.getBitWidth() == 64 && +             "Illegal PTEST vector element width"); +      if (KnownOp1.isConstant()) { +        const APInt &Mask = KnownOp1.getConstant(); +        if (Mask.isSignMask()) { +          MVT FpVT = MVT::getVectorVT(MVT::f64, OpVT.getSizeInBits() / 64); +          Op0 = DAG.getBitcast(FpVT, DAG.getFreeze(Op0)); +          return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Op0, Op0); +        } +        if (Mask.isSplat(32) && Mask.trunc(32).isSignMask()) { +          MVT FpVT = MVT::getVectorVT(MVT::f32, OpVT.getSizeInBits() / 32); +          Op0 = DAG.getBitcast(FpVT, DAG.getFreeze(Op0)); +          return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Op0, Op0); +        } +      } +    }      // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)      // TODO: Add COND_NE handling? @@ -53480,6 +53501,80 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,    return SDValue();  } +// Look for a RMW operation that only touches one bit of a larger than legal +// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value. +static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, +                              SelectionDAG &DAG, +                              const X86Subtarget &Subtarget) { +  using namespace SDPatternMatch; + +  // Only handle normal stores and its chain was a matching normal load. +  auto *Ld = dyn_cast<LoadSDNode>(St->getChain()); +  if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld || +      !ISD::isNormalLoad(Ld) || !Ld->isSimple() || +      Ld->getBasePtr() != St->getBasePtr() || +      Ld->getOffset() != St->getOffset()) +    return SDValue(); + +  SDValue LoadVal(Ld, 0); +  SDValue StoredVal = St->getValue(); +  EVT VT = StoredVal.getValueType(); + +  // Only narrow larger than legal scalar integers. +  if (!VT.isScalarInteger() || +      VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32)) +    return SDValue(); + +  // BTR: X & ~(1 << ShAmt) +  // BTS: X | (1 << ShAmt) +  // BTC: X ^ (1 << ShAmt) +  SDValue ShAmt; +  if (!StoredVal.hasOneUse() || +      !(sd_match(StoredVal, m_And(m_Specific(LoadVal), +                                  m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || +        sd_match(StoredVal, +                 m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || +        sd_match(StoredVal, +                 m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))))) +    return SDValue(); + +  // Ensure the shift amount is in bounds. +  KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); +  if (KnownAmt.getMaxValue().uge(VT.getSizeInBits())) +    return SDValue(); + +  // Split the shift into an alignment shift that moves the active i32 block to +  // the bottom bits for truncation and a modulo shift that can act on the i32. +  EVT AmtVT = ShAmt.getValueType(); +  SDValue AlignAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, +                                 DAG.getSignedConstant(-32LL, DL, AmtVT)); +  SDValue ModuloAmt = +      DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT)); + +  // Compute the byte offset for the i32 block that is changed by the RMW. +  // combineTruncate will adjust the load for us in a similar way. 
+  EVT PtrVT = St->getBasePtr().getValueType(); +  SDValue PtrBitOfs = DAG.getZExtOrTrunc(AlignAmt, DL, PtrVT); +  SDValue PtrByteOfs = DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs, +                                   DAG.getShiftAmountConstant(3, PtrVT, DL)); +  SDValue NewPtr = DAG.getMemBasePlusOffset(St->getBasePtr(), PtrByteOfs, DL, +                                            SDNodeFlags::NoUnsignedWrap); + +  // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store. +  SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt); +  X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); + +  SDValue Mask = +      DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), +                  DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); +  if (StoredVal.getOpcode() == ISD::AND) +    Mask = DAG.getNOT(DL, Mask, MVT::i32); + +  SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); +  return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), +                      Align(), St->getMemOperand()->getFlags()); +} +  static SDValue combineStore(SDNode *N, SelectionDAG &DAG,                              TargetLowering::DAGCombinerInfo &DCI,                              const X86Subtarget &Subtarget) { @@ -53706,6 +53801,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,      }    } +  if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget)) +    return R; +    // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)    //         store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)    if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && @@ -54660,8 +54758,9 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,    // truncation, see if we can convert the shift into a pointer offset instead.    // Limit this to normal (non-ext) scalar integer loads.    if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL && -      Src.hasOneUse() && Src.getOperand(0).hasOneUse() && -      ISD::isNormalLoad(Src.getOperand(0).getNode())) { +      Src.hasOneUse() && ISD::isNormalLoad(Src.getOperand(0).getNode()) && +      (Src.getOperand(0).hasOneUse() || +       !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, SrcVT))) {      auto *Ld = cast<LoadSDNode>(Src.getOperand(0));      if (Ld->isSimple() && VT.isByteSized() &&          isPowerOf2_64(VT.getSizeInBits())) { @@ -56459,6 +56558,7 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,  static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,                              TargetLowering::DAGCombinerInfo &DCI,                              const X86Subtarget &Subtarget) { +  using namespace SDPatternMatch;    const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();    const SDValue LHS = N->getOperand(0);    const SDValue RHS = N->getOperand(1); @@ -56517,6 +56617,37 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,        if (SDValue AndN = MatchAndCmpEq(RHS, LHS))          return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC); +      // If we're performing a bit test on a larger than legal type, attempt +      // to (aligned) shift down the value to the bottom 32-bits and then +      // perform the bittest on the i32 value. +      // ICMP_ZERO(AND(X,SHL(1,IDX))) +      // --> ICMP_ZERO(AND(TRUNC(SRL(X,AND(IDX,-32))),SHL(1,AND(IDX,31)))) +      if (isNullConstant(RHS) && +          OpVT.getScalarSizeInBits() > (Subtarget.is64Bit() ? 
64 : 32)) { +        SDValue X, ShAmt; +        if (sd_match(LHS, m_OneUse(m_And(m_Value(X), +                                         m_Shl(m_One(), m_Value(ShAmt)))))) { +          // Only attempt this if the shift amount is known to be in bounds. +          KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); +          if (KnownAmt.getMaxValue().ult(OpVT.getScalarSizeInBits())) { +            EVT AmtVT = ShAmt.getValueType(); +            SDValue AlignAmt = +                DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, +                            DAG.getSignedConstant(-32LL, DL, AmtVT)); +            SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, +                                            DAG.getConstant(31, DL, AmtVT)); +            SDValue Mask = DAG.getNode( +                ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), +                DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); +            X = DAG.getNode(ISD::SRL, DL, OpVT, X, AlignAmt); +            X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); +            X = DAG.getNode(ISD::AND, DL, MVT::i32, X, Mask); +            return DAG.getSetCC(DL, VT, X, DAG.getConstant(0, DL, MVT::i32), +                                CC); +          } +        } +      } +        // cmpeq(trunc(x),C) --> cmpeq(x,C)        // cmpne(trunc(x),C) --> cmpne(x,C)        // iff x upper bits are zero. diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 7795cce..b5548d4 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -69,14 +69,6 @@ namespace llvm {  // Command line option to enable vtable value profiling. Defined in  // ProfileData/InstrProf.cpp: -enable-vtable-value-profiling=  extern cl::opt<bool> EnableVTableValueProfiling; -// TODO: Remove -debug-info-correlate in next LLVM release, in favor of -// -profile-correlate=debug-info. -cl::opt<bool> DebugInfoCorrelate( -    "debug-info-correlate", -    cl::desc("Use debug info to correlate profiles. (Deprecated, use " -             "-profile-correlate=debug-info)"), -    cl::init(false)); -  LLVM_ABI cl::opt<InstrProfCorrelator::ProfCorrelatorKind> ProfileCorrelate(      "profile-correlate",      cl::desc("Use debug info or binary file to correlate profiles."), @@ -1047,7 +1039,7 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {    // in lightweight mode. We need to move the value profile pointer to the    // Counter struct to get this working.    
assert( -      !DebugInfoCorrelate && ProfileCorrelate == InstrProfCorrelator::NONE && +      ProfileCorrelate == InstrProfCorrelator::NONE &&        "Value profiling is not yet supported with lightweight instrumentation");    GlobalVariable *Name = Ind->getName();    auto It = ProfileDataMap.find(Name); @@ -1504,7 +1496,7 @@ static inline Constant *getVTableAddrForProfData(GlobalVariable *GV) {  }  void InstrLowerer::getOrCreateVTableProfData(GlobalVariable *GV) { -  assert(!DebugInfoCorrelate && +  assert(ProfileCorrelate != InstrProfCorrelator::DEBUG_INFO &&           "Value profiling is not supported with lightweight instrumentation");    if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())      return; @@ -1584,8 +1576,7 @@ GlobalVariable *InstrLowerer::setupProfileSection(InstrProfInstBase *Inc,    // Use internal rather than private linkage so the counter variable shows up    // in the symbol table when using debug info for correlation. -  if ((DebugInfoCorrelate || -       ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) && +  if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO &&        TT.isOSBinFormatMachO() && Linkage == GlobalValue::PrivateLinkage)      Linkage = GlobalValue::InternalLinkage; @@ -1691,8 +1682,7 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {    auto *CounterPtr = setupProfileSection(Inc, IPSK_cnts);    PD.RegionCounters = CounterPtr; -  if (DebugInfoCorrelate || -      ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) { +  if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) {      LLVMContext &Ctx = M.getContext();      Function *Fn = Inc->getParent()->getParent();      if (auto *SP = Fn->getSubprogram()) { @@ -1737,7 +1727,7 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {  void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {    // When debug information is correlated to profile data, a data variable    // is not needed. 
-  if (DebugInfoCorrelate || ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) +  if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO)      return;    GlobalVariable *NamePtr = Inc->getName(); diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 71736cf..af53fa0 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -456,7 +456,7 @@ createIRLevelProfileFlagVar(Module &M,      ProfileVersion |= VARIANT_MASK_INSTR_ENTRY;    if (PGOInstrumentLoopEntries)      ProfileVersion |= VARIANT_MASK_INSTR_LOOP_ENTRIES; -  if (DebugInfoCorrelate || ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) +  if (ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO)      ProfileVersion |= VARIANT_MASK_DBG_CORRELATE;    if (PGOFunctionEntryCoverage)      ProfileVersion |= diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 7ebcc21..4ba4ba3 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -162,8 +162,6 @@ class IndVarSimplify {                                   const SCEV *ExitCount,                                   PHINode *IndVar, SCEVExpander &Rewriter); -  bool sinkUnusedInvariants(Loop *L); -  public:    IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,                   const DataLayout &DL, TargetLibraryInfo *TLI, @@ -1079,85 +1077,6 @@ linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,    return true;  } -//===----------------------------------------------------------------------===// -//  sinkUnusedInvariants. A late subpass to cleanup loop preheaders. -//===----------------------------------------------------------------------===// - -/// If there's a single exit block, sink any loop-invariant values that -/// were defined in the preheader but not used inside the loop into the -/// exit block to reduce register pressure in the loop. -bool IndVarSimplify::sinkUnusedInvariants(Loop *L) { -  BasicBlock *ExitBlock = L->getExitBlock(); -  if (!ExitBlock) return false; - -  BasicBlock *Preheader = L->getLoopPreheader(); -  if (!Preheader) return false; - -  bool MadeAnyChanges = false; -  for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) { - -    // Skip BB Terminator. -    if (Preheader->getTerminator() == &I) -      continue; - -    // New instructions were inserted at the end of the preheader. -    if (isa<PHINode>(I)) -      break; - -    // Don't move instructions which might have side effects, since the side -    // effects need to complete before instructions inside the loop.  Also don't -    // move instructions which might read memory, since the loop may modify -    // memory. Note that it's okay if the instruction might have undefined -    // behavior: LoopSimplify guarantees that the preheader dominates the exit -    // block. -    if (I.mayHaveSideEffects() || I.mayReadFromMemory()) -      continue; - -    // Skip debug or pseudo instructions. -    if (I.isDebugOrPseudoInst()) -      continue; - -    // Skip eh pad instructions. -    if (I.isEHPad()) -      continue; - -    // Don't sink alloca: we never want to sink static alloca's out of the -    // entry block, and correctly sinking dynamic alloca's requires -    // checks for stacksave/stackrestore intrinsics. -    // FIXME: Refactor this check somehow? 
-    if (isa<AllocaInst>(&I)) -      continue; - -    // Determine if there is a use in or before the loop (direct or -    // otherwise). -    bool UsedInLoop = false; -    for (Use &U : I.uses()) { -      Instruction *User = cast<Instruction>(U.getUser()); -      BasicBlock *UseBB = User->getParent(); -      if (PHINode *P = dyn_cast<PHINode>(User)) { -        unsigned i = -          PHINode::getIncomingValueNumForOperand(U.getOperandNo()); -        UseBB = P->getIncomingBlock(i); -      } -      if (UseBB == Preheader || L->contains(UseBB)) { -        UsedInLoop = true; -        break; -      } -    } - -    // If there is, the def must remain in the preheader. -    if (UsedInLoop) -      continue; - -    // Otherwise, sink it to the exit block. -    I.moveBefore(ExitBlock->getFirstInsertionPt()); -    SE->forgetValue(&I); -    MadeAnyChanges = true; -  } - -  return MadeAnyChanges; -} -  static void replaceExitCond(BranchInst *BI, Value *NewCond,                              SmallVectorImpl<WeakTrackingVH> &DeadInsts) {    auto *OldCond = BI->getCondition(); @@ -2065,10 +1984,6 @@ bool IndVarSimplify::run(Loop *L) {    // The Rewriter may not be used from this point on. -  // Loop-invariant instructions in the preheader that aren't used in the -  // loop may be sunk below the loop to reduce register pressure. -  Changed |= sinkUnusedInvariants(L); -    // rewriteFirstIterationLoopExitValues does not rely on the computation of    // trip count and therefore can further simplify exit values in addition to    // rewriteLoopExitValues. diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index b2c526b..d13b990 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -211,9 +211,15 @@ static Instruction *cloneInstructionInExitBlock(  static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,                               MemorySSAUpdater &MSSAU); -static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest, -                                  ICFLoopSafetyInfo &SafetyInfo, -                                  MemorySSAUpdater &MSSAU, ScalarEvolution *SE); +static void moveInstructionBefore( +    Instruction &I, BasicBlock::iterator Dest, ICFLoopSafetyInfo &SafetyInfo, +    MemorySSAUpdater &MSSAU, ScalarEvolution *SE, +    MemorySSA::InsertionPlace Point = MemorySSA::BeforeTerminator); + +static bool sinkUnusedInvariantsFromPreheaderToExit( +    Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo, +    MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT, +    SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE);  static void foreachMemoryAccess(MemorySSA *MSSA, Loop *L,                                  function_ref<void(Instruction *)> Fn); @@ -471,6 +477,12 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI,                                      TLI, TTI, L, MSSAU, &SafetyInfo, Flags, ORE)              : sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L,                           MSSAU, &SafetyInfo, Flags, ORE); + +  // sink pre-header defs that are unused in-loop into the unique exit to reduce +  // pressure. 
+  Changed |= sinkUnusedInvariantsFromPreheaderToExit(L, AA, &SafetyInfo, MSSAU, +                                                     SE, DT, Flags, ORE); +    Flags.setIsSink(false);    if (Preheader)      Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L, @@ -1456,19 +1468,80 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,  static void moveInstructionBefore(Instruction &I, BasicBlock::iterator Dest,                                    ICFLoopSafetyInfo &SafetyInfo, -                                  MemorySSAUpdater &MSSAU, -                                  ScalarEvolution *SE) { +                                  MemorySSAUpdater &MSSAU, ScalarEvolution *SE, +                                  MemorySSA::InsertionPlace Point) {    SafetyInfo.removeInstruction(&I);    SafetyInfo.insertInstructionTo(&I, Dest->getParent());    I.moveBefore(*Dest->getParent(), Dest);    if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>(            MSSAU.getMemorySSA()->getMemoryAccess(&I))) -    MSSAU.moveToPlace(OldMemAcc, Dest->getParent(), -                      MemorySSA::BeforeTerminator); +    MSSAU.moveToPlace(OldMemAcc, Dest->getParent(), Point);    if (SE)      SE->forgetBlockAndLoopDispositions(&I);  } +// If there's a single exit block, sink any loop-invariant values that were +// defined in the preheader but not used inside the loop into the exit block +// to reduce register pressure in the loop. +static bool sinkUnusedInvariantsFromPreheaderToExit( +    Loop *L, AAResults *AA, ICFLoopSafetyInfo *SafetyInfo, +    MemorySSAUpdater &MSSAU, ScalarEvolution *SE, DominatorTree *DT, +    SinkAndHoistLICMFlags &SinkFlags, OptimizationRemarkEmitter *ORE) { +  BasicBlock *ExitBlock = L->getExitBlock(); +  if (!ExitBlock) +    return false; + +  BasicBlock *Preheader = L->getLoopPreheader(); +  if (!Preheader) +    return false; + +  bool MadeAnyChanges = false; + +  for (Instruction &I : llvm::make_early_inc_range(llvm::reverse(*Preheader))) { + +    // Skip terminator. +    if (Preheader->getTerminator() == &I) +      continue; + +    // New instructions were inserted at the end of the preheader. +    if (isa<PHINode>(I)) +      break; + +    // Don't move instructions which might have side effects, since the side +    // effects need to complete before instructions inside the loop. Note that +    // it's okay if the instruction might have undefined behavior: LoopSimplify +    // guarantees that the preheader dominates the exit block. +    if (I.mayHaveSideEffects()) +      continue; + +    if (!canSinkOrHoistInst(I, AA, DT, L, MSSAU, true, SinkFlags, nullptr)) +      continue; + +    // Determine if there is a use in or before the loop (direct or +    // otherwise). 
+    bool UsedInLoopOrPreheader = false; +    for (Use &U : I.uses()) { +      auto *UserI = cast<Instruction>(U.getUser()); +      BasicBlock *UseBB = UserI->getParent(); +      if (auto *PN = dyn_cast<PHINode>(UserI)) { +        UseBB = PN->getIncomingBlock(U); +      } +      if (UseBB == Preheader || L->contains(UseBB)) { +        UsedInLoopOrPreheader = true; +        break; +      } +    } +    if (UsedInLoopOrPreheader) +      continue; + +    moveInstructionBefore(I, ExitBlock->getFirstInsertionPt(), *SafetyInfo, +                          MSSAU, SE, MemorySSA::Beginning); +    MadeAnyChanges = true; +  } + +  return MadeAnyChanges; +} +  static Instruction *sinkThroughTriviallyReplaceablePHI(      PHINode *TPN, Instruction *I, LoopInfo *LI,      SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies, diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 1a279b6..001215a 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1318,6 +1318,11 @@ public:    /// the loop, in which case some special-case heuristics may be used.    bool AllFixupsOutsideLoop = true; +  /// This records whether all of the fixups using this LSRUse are unconditional +  /// within the loop, meaning they will be executed on every path to the loop +  /// latch. This includes fixups before early exits. +  bool AllFixupsUnconditional = true; +    /// RigidFormula is set to true to guarantee that this use will be associated    /// with a single formula--the one that initially matched. Some SCEV    /// expressions cannot be expanded. This allows LSR to consider the registers @@ -1421,16 +1426,22 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,      if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||          TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {        const SCEV *Start; -      const SCEVConstant *Step; -      if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step)))) +      const APInt *Step; +      if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) {          // If the step size matches the base offset, we could use pre-indexed          // addressing. -        if (((AMK & TTI::AMK_PreIndexed) && F.BaseOffset.isFixed() && -             Step->getAPInt() == F.BaseOffset.getFixedValue()) || -            ((AMK & TTI::AMK_PostIndexed) && !isa<SCEVConstant>(Start) && -             SE->isLoopInvariant(Start, L))) +        bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) && +                           F.BaseOffset.isFixed() && +                           *Step == F.BaseOffset.getFixedValue(); +        bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) && +                            !isa<SCEVConstant>(Start) && +                            SE->isLoopInvariant(Start, L); +        // We can only pre or post index when the load/store is unconditional. +        if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional)            LoopCost = 0; +      }      } +      // If the loop counts down to zero and we'll be using a hardware loop then      // the addrec will be combined into the hardware loop instruction.      
if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() && @@ -1783,6 +1794,9 @@ void LSRUse::print(raw_ostream &OS) const {    if (AllFixupsOutsideLoop)      OS << ", all-fixups-outside-loop"; +  if (AllFixupsUnconditional) +    OS << ", all-fixups-unconditional"; +    if (WidestFixupType)      OS << ", widest fixup type: " << *WidestFixupType;  } @@ -2213,6 +2227,7 @@ class LSRInstance {    void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);    void CountRegisters(const Formula &F, size_t LUIdx);    bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F); +  bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const;    void CollectLoopInvariantFixupsAndFormulae(); @@ -3607,6 +3622,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {      LF.PostIncLoops = TmpPostIncLoops;      LF.Offset = Offset;      LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); +    LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);      // Create SCEV as Formula for calculating baseline cost      if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) { @@ -3680,6 +3696,14 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {    return true;  } +/// Test whether this fixup will be executed each time the corresponding IV +/// increment instruction is executed. +bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const { +  // If the fixup block dominates the IV increment block then there is no path +  // through the loop to the increment that doesn't pass through the fixup. +  return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent()); +} +  /// Check for other uses of loop-invariant values which we're tracking. These  /// other uses will pin these values in registers, making them less profitable  /// for elimination. @@ -3803,6 +3827,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {          LF.OperandValToReplace = U;          LF.Offset = Offset;          LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); +        LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);          if (!LU.WidestFixupType ||              SE.getTypeSizeInBits(LU.WidestFixupType) <              SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) @@ -4940,6 +4965,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {        LLVM_DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs()); dbgs() << '\n');        LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; +      LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional;        // Transfer the fixups of LU to LUThatHas.        
for (LSRFixup &Fixup : LU.Fixups) { diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index e043d07..08be5df 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1534,8 +1534,8 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,    bool SrcNotDom = false;    auto CaptureTrackingWithModRef = -      [&](Instruction *AI, -          function_ref<bool(Instruction *)> ModRefCallback) -> bool { +      [&](Instruction *AI, function_ref<bool(Instruction *)> ModRefCallback, +          bool &AddressCaptured) -> bool {      SmallVector<Instruction *, 8> Worklist;      Worklist.push_back(AI);      unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking(); @@ -1559,8 +1559,9 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,          if (!Visited.insert(&U).second)            continue;          UseCaptureInfo CI = DetermineUseCaptureKind(U, AI); -        if (capturesAnything(CI.UseCC)) +        if (capturesAnyProvenance(CI.UseCC))            return false; +        AddressCaptured |= capturesAddress(CI.UseCC);          if (UI->mayReadOrWriteMemory()) {            if (UI->isLifetimeStartOrEnd()) { @@ -1627,7 +1628,9 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,      return true;    }; -  if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback)) +  bool DestAddressCaptured = false; +  if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback, +                                 DestAddressCaptured))      return false;    // Bailout if Dest may have any ModRef before Store.    if (!ReachabilityWorklist.empty() && @@ -1653,7 +1656,14 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,      return true;    }; -  if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback)) +  bool SrcAddressCaptured = false; +  if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback, +                                 SrcAddressCaptured)) +    return false; + +  // If both the source and destination address are captured, the fact that they +  // are no longer two separate allocations may be observed. +  if (DestAddressCaptured && SrcAddressCaptured)      return false;    // We can do the transformation. 
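A small standalone sketch (invented; observe is a hypothetical stand-in for any address-only capture) of the case the new DestAddressCaptured/SrcAddressCaptured bailout above guards against: once the addresses of both allocas escape, merging Src into Dest could be observed through pointer identity.

    #include <cstdio>
    #include <cstring>

    // Stand-in for an address-only capture: the callee may compare the pointers
    // but does not read or write through them.
    static void observe(const void *A, const void *B) {
      std::printf("%p %p same=%d\n", A, B, static_cast<int>(A == B));
    }

    int main() {
      char Src[32] = {1};
      char Dest[32] = {};
      observe(Src, Dest);                 // both alloca addresses are captured
      std::memcpy(Dest, Src, sizeof(Src)); // full-size copy between the two allocas
      return Dest[0];
    }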
First, move the SrcAlloca to the start of the diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 5af6c96..bb6c879 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -81,6 +81,7 @@ STATISTIC(  STATISTIC(NumInvariantConditionsInjected,            "Number of invariant conditions injected and unswitched"); +namespace llvm {  static cl::opt<bool> EnableNonTrivialUnswitch(      "enable-nontrivial-unswitch", cl::init(false), cl::Hidden,      cl::desc("Forcibly enables non-trivial loop unswitching rather than " @@ -131,11 +132,17 @@ static cl::opt<bool> InjectInvariantConditions(  static cl::opt<unsigned> InjectInvariantConditionHotnesThreshold(      "simple-loop-unswitch-inject-invariant-condition-hotness-threshold", -    cl::Hidden, cl::desc("Only try to inject loop invariant conditions and " -                         "unswitch on them to eliminate branches that are " -                         "not-taken 1/<this option> times or less."), +    cl::Hidden, +    cl::desc("Only try to inject loop invariant conditions and " +             "unswitch on them to eliminate branches that are " +             "not-taken 1/<this option> times or less."),      cl::init(16)); +static cl::opt<bool> EstimateProfile("simple-loop-unswitch-estimate-profile", +                                     cl::Hidden, cl::init(true)); +extern cl::opt<bool> ProfcheckDisableMetadataFixes; +} // namespace llvm +  AnalysisKey ShouldRunExtraSimpleLoopUnswitch::Key;  namespace {  struct CompareDesc { @@ -268,13 +275,42 @@ static bool areLoopExitPHIsLoopInvariant(const Loop &L,    llvm_unreachable("Basic blocks should never be empty!");  } -/// Copy a set of loop invariant values \p ToDuplicate and insert them at the +/// Copy a set of loop invariant values \p Invariants and insert them at the  /// end of \p BB and conditionally branch on the copied condition. We only  /// branch on a single value. +/// We attempt to estimate the profile of the resulting conditional branch from +/// \p ComputeProfFrom, which is the original conditional branch we're +/// unswitching. +/// When \p Direction is true, the \p Invariants form a disjunction, and the +/// branch conditioned on it exits the loop on the "true" case. When \p +/// Direction is false, the \p Invariants form a conjunction and the branch +/// exits on the "false" case.  static void buildPartialUnswitchConditionalBranch(      BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction,      BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze, -    const Instruction *I, AssumptionCache *AC, const DominatorTree &DT) { +    const Instruction *I, AssumptionCache *AC, const DominatorTree &DT, +    const BranchInst &ComputeProfFrom) { + +  SmallVector<uint32_t> BranchWeights; +  bool HasBranchWeights = EstimateProfile && !ProfcheckDisableMetadataFixes && +                          extractBranchWeights(ComputeProfFrom, BranchWeights); +  // If Direction is true, that means we had a disjunction and that the "true" +  // case exits. The probability of the disjunction of the subset of terms is at +  // most as high as the original one. So, if the probability is higher than the +  // one we'd assign in absence of a profile (i.e. 0.5), we will use 0.5, +  // but if it's lower, we will use the original probability. 
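A short worked example of the clamp described in the preceding paragraph, with invented weights and Direction == true (disjunction, the "true" edge exits):

    // branch_weights = {4, 60}: exit probability 4/64 = 0.0625 <= 0.5, so the
    //                           original !prof metadata is reused on the new branch.
    // branch_weights = {60, 4}: exit probability 60/64 ~= 0.94 > 0.5, so
    //                           HasBranchWeights is cleared and the new branch is
    //                           given explicitly-unknown branch weights (see
    //                           setExplicitlyUnknownBranchWeightsIfProfiled below).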
+  // Conversely, if Direction is false, that means we had a conjunction, and the +  // probability of exiting is captured in the second branch weight. That +  // probability is a disjunction (of the negation of the original terms). The +  // same reasoning applies as above. +  // Issue #165649: should we expect BFI to conserve, and use that to calculate +  // the branch weights? +  if (HasBranchWeights && +      static_cast<double>(BranchWeights[Direction ? 0 : 1]) / +              static_cast<double>(sum_of(BranchWeights)) > +          0.5) +    HasBranchWeights = false; +    IRBuilder<> IRB(&BB);    IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); @@ -287,8 +323,14 @@ static void buildPartialUnswitchConditionalBranch(    Value *Cond = Direction ? IRB.CreateOr(FrozenInvariants)                            : IRB.CreateAnd(FrozenInvariants); -  IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, -                   Direction ? &NormalSucc : &UnswitchedSucc); +  auto *BR = IRB.CreateCondBr( +      Cond, Direction ? &UnswitchedSucc : &NormalSucc, +      Direction ? &NormalSucc : &UnswitchedSucc, +      HasBranchWeights ? ComputeProfFrom.getMetadata(LLVMContext::MD_prof) +                       : nullptr); +  if (!HasBranchWeights) +    setExplicitlyUnknownBranchWeightsIfProfiled( +        *BR, *BR->getParent()->getParent(), DEBUG_TYPE);  }  /// Copy a set of loop invariant values, and conditionally branch on them. @@ -658,7 +700,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,               " condition!");      buildPartialUnswitchConditionalBranch(          *OldPH, Invariants, ExitDirection, *UnswitchedBB, *NewPH, -        FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT); +        FreezeLoopUnswitchCond, OldPH->getTerminator(), nullptr, DT, BI);    }    // Update the dominator tree with the added edge. @@ -2477,7 +2519,7 @@ static void unswitchNontrivialInvariants(      else {        buildPartialUnswitchConditionalBranch(            *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, -          FreezeLoopUnswitchCond, BI, &AC, DT); +          FreezeLoopUnswitchCond, BI, &AC, DT, *BI);      }      DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 9829d4d..11db0ec 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -674,6 +674,79 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT,    return SplitBlock(BB, BB->getTerminator(), DT, LI, MSSAU, BBName);  } +/// Helper function to update the cycle or loop information after inserting a +/// new block between a callbr instruction and one of its target blocks.  Adds +/// the new block to the innermost cycle or loop that the callbr instruction and +/// the original target block share. 
+/// \p LCI            cycle or loop information to update +/// \p CallBrBlock    block containing the callbr instruction +/// \p CallBrTarget   new target block of the callbr instruction +/// \p Succ           original target block of the callbr instruction +template <typename TI, typename T> +static bool updateCycleLoopInfo(TI *LCI, BasicBlock *CallBrBlock, +                                BasicBlock *CallBrTarget, BasicBlock *Succ) { +  static_assert(std::is_same_v<TI, CycleInfo> || std::is_same_v<TI, LoopInfo>, +                "type must be CycleInfo or LoopInfo"); +  if (!LCI) +    return false; + +  T *LC; +  if constexpr (std::is_same_v<TI, CycleInfo>) +    LC = LCI->getSmallestCommonCycle(CallBrBlock, Succ); +  else +    LC = LCI->getSmallestCommonLoop(CallBrBlock, Succ); +  if (!LC) +    return false; + +  if constexpr (std::is_same_v<TI, CycleInfo>) +    LCI->addBlockToCycle(CallBrTarget, LC); +  else +    LC->addBasicBlockToLoop(CallBrTarget, *LCI); + +  return true; +} + +BasicBlock *llvm::SplitCallBrEdge(BasicBlock *CallBrBlock, BasicBlock *Succ, +                                  unsigned SuccIdx, DomTreeUpdater *DTU, +                                  CycleInfo *CI, LoopInfo *LI, +                                  bool *UpdatedLI) { +  CallBrInst *CallBr = dyn_cast<CallBrInst>(CallBrBlock->getTerminator()); +  assert(CallBr && "expected callbr terminator"); +  assert(SuccIdx < CallBr->getNumSuccessors() && +         Succ == CallBr->getSuccessor(SuccIdx) && "invalid successor index"); + +  // Create a new block between callbr and the specified successor. +  // splitBlockBefore cannot be re-used here since it cannot split if the split +  // point is a PHI node (because BasicBlock::splitBasicBlockBefore cannot +  // handle that). But we don't need to rewire every part of a potential PHI +  // node. We only care about the edge between CallBrBlock and the original +  // successor. +  BasicBlock *CallBrTarget = +      BasicBlock::Create(CallBrBlock->getContext(), +                         CallBrBlock->getName() + ".target." + Succ->getName(), +                         CallBrBlock->getParent()); +  // Rewire control flow from the new target block to the original successor. +  Succ->replacePhiUsesWith(CallBrBlock, CallBrTarget); +  // Rewire control flow from callbr to the new target block. +  CallBr->setSuccessor(SuccIdx, CallBrTarget); +  // Jump from the new target block to the original successor. 
+  BranchInst::Create(Succ, CallBrTarget); + +  bool Updated = +      updateCycleLoopInfo<LoopInfo, Loop>(LI, CallBrBlock, CallBrTarget, Succ); +  if (UpdatedLI) +    *UpdatedLI = Updated; +  updateCycleLoopInfo<CycleInfo, Cycle>(CI, CallBrBlock, CallBrTarget, Succ); +  if (DTU) { +    DTU->applyUpdates({{DominatorTree::Insert, CallBrBlock, CallBrTarget}}); +    if (DTU->getDomTree().dominates(CallBrBlock, Succ)) +      DTU->applyUpdates({{DominatorTree::Delete, CallBrBlock, Succ}, +                         {DominatorTree::Insert, CallBrTarget, Succ}}); +  } + +  return CallBrTarget; +} +  void llvm::setUnwindEdgeTo(Instruction *TI, BasicBlock *Succ) {    if (auto *II = dyn_cast<InvokeInst>(TI))      II->setUnwindDest(Succ); diff --git a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp index 0046a00..287a177 100644 --- a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp +++ b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp @@ -13,6 +13,7 @@  #include "llvm/Transforms/Utils/ControlFlowUtils.h"  #include "llvm/ADT/SetVector.h"  #include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/Instructions.h"  #include "llvm/IR/ValueHandle.h" @@ -281,7 +282,9 @@ std::pair<BasicBlock *, bool> ControlFlowHub::finalize(    for (auto [BB, Succ0, Succ1] : Branches) {  #ifndef NDEBUG -    assert(Incoming.insert(BB).second && "Duplicate entry for incoming block."); +    assert( +        (Incoming.insert(BB).second || isa<CallBrInst>(BB->getTerminator())) && +        "Duplicate entry for incoming block.");  #endif      if (Succ0)        Outgoing.insert(Succ0); diff --git a/llvm/lib/Transforms/Utils/FixIrreducible.cpp b/llvm/lib/Transforms/Utils/FixIrreducible.cpp index 45e1d12..804af22 100644 --- a/llvm/lib/Transforms/Utils/FixIrreducible.cpp +++ b/llvm/lib/Transforms/Utils/FixIrreducible.cpp @@ -79,6 +79,53 @@  // Limitation: The pass cannot handle switch statements and indirect  //             branches. Both must be lowered to plain branches first.  // +// CallBr support: CallBr is handled as a more general branch instruction which +// can have multiple successors. The pass redirects the edges to intermediate +// target blocks that unconditionally branch to the original callbr target +// blocks. This allows the control flow hub to know to which of the original +// target blocks to jump to. +// Example input CFG: +//                        Entry (callbr) +//                       /     \ +//                      v       v +//                      H ----> B +//                      ^      /| +//                       `----' | +//                              v +//                             Exit +// +// becomes: +//                        Entry (callbr) +//                       /     \ +//                      v       v +//                 target.H   target.B +//                      |       | +//                      v       v +//                      H ----> B +//                      ^      /| +//                       `----' | +//                              v +//                             Exit +// +// Note +// OUTPUT CFG: Converted to a natural loop with a new header N. +// +//                        Entry (callbr) +//                       /     \ +//                      v       v +//                 target.H   target.B +//                      \       / +//                       \     / +//                        v   v +//                          N <---. 
+//                         / \     \ +//                        /   \     | +//                       v     v    / +//                       H --> B --' +//                             | +//                             v +//                            Exit +//  //===----------------------------------------------------------------------===//  #include "llvm/Transforms/Utils/FixIrreducible.h" @@ -231,6 +278,7 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT,      return false;    LLVM_DEBUG(dbgs() << "Processing cycle:\n" << CI.print(&C) << "\n";); +  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);    ControlFlowHub CHub;    SetVector<BasicBlock *> Predecessors; @@ -242,18 +290,32 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT,    }    for (BasicBlock *P : Predecessors) { -    auto *Branch = cast<BranchInst>(P->getTerminator()); -    // Exactly one of the two successors is the header. -    BasicBlock *Succ0 = Branch->getSuccessor(0) == Header ? Header : nullptr; -    BasicBlock *Succ1 = Succ0 ? nullptr : Header; -    if (!Succ0) -      assert(Branch->getSuccessor(1) == Header); -    assert(Succ0 || Succ1); -    CHub.addBranch(P, Succ0, Succ1); - -    LLVM_DEBUG(dbgs() << "Added internal branch: " << P->getName() << " -> " -                      << (Succ0 ? Succ0->getName() : "") << " " -                      << (Succ1 ? Succ1->getName() : "") << "\n"); +    if (BranchInst *Branch = dyn_cast<BranchInst>(P->getTerminator())) { +      // Exactly one of the two successors is the header. +      BasicBlock *Succ0 = Branch->getSuccessor(0) == Header ? Header : nullptr; +      BasicBlock *Succ1 = Succ0 ? nullptr : Header; +      assert(Succ0 || Branch->getSuccessor(1) == Header); +      assert(Succ0 || Succ1); +      CHub.addBranch(P, Succ0, Succ1); + +      LLVM_DEBUG(dbgs() << "Added internal branch: " << printBasicBlock(P) +                        << " -> " << printBasicBlock(Succ0) +                        << (Succ0 && Succ1 ? " " : "") << printBasicBlock(Succ1) +                        << '\n'); +    } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(P->getTerminator())) { +      for (unsigned I = 0; I < CallBr->getNumSuccessors(); ++I) { +        BasicBlock *Succ = CallBr->getSuccessor(I); +        if (Succ != Header) +          continue; +        BasicBlock *NewSucc = SplitCallBrEdge(P, Succ, I, &DTU, &CI, LI); +        CHub.addBranch(NewSucc, Succ); +        LLVM_DEBUG(dbgs() << "Added internal branch: " +                          << printBasicBlock(NewSucc) << " -> " +                          << printBasicBlock(Succ) << '\n'); +      } +    } else { +      llvm_unreachable("unsupported block terminator"); +    }    }    // Redirect external incoming edges. This includes the edges on the header. @@ -266,17 +328,32 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT,    }    for (BasicBlock *P : Predecessors) { -    auto *Branch = cast<BranchInst>(P->getTerminator()); -    BasicBlock *Succ0 = Branch->getSuccessor(0); -    Succ0 = C.contains(Succ0) ? Succ0 : nullptr; -    BasicBlock *Succ1 = -        Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); -    Succ1 = Succ1 && C.contains(Succ1) ? Succ1 : nullptr; -    CHub.addBranch(P, Succ0, Succ1); - -    LLVM_DEBUG(dbgs() << "Added external branch: " << P->getName() << " -> " -                      << (Succ0 ? Succ0->getName() : "") << " " -                      << (Succ1 ? 
Succ1->getName() : "") << "\n"); +    if (BranchInst *Branch = dyn_cast<BranchInst>(P->getTerminator()); Branch) { +      BasicBlock *Succ0 = Branch->getSuccessor(0); +      Succ0 = C.contains(Succ0) ? Succ0 : nullptr; +      BasicBlock *Succ1 = +          Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); +      Succ1 = Succ1 && C.contains(Succ1) ? Succ1 : nullptr; +      CHub.addBranch(P, Succ0, Succ1); + +      LLVM_DEBUG(dbgs() << "Added external branch: " << printBasicBlock(P) +                        << " -> " << printBasicBlock(Succ0) +                        << (Succ0 && Succ1 ? " " : "") << printBasicBlock(Succ1) +                        << '\n'); +    } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(P->getTerminator())) { +      for (unsigned I = 0; I < CallBr->getNumSuccessors(); ++I) { +        BasicBlock *Succ = CallBr->getSuccessor(I); +        if (!C.contains(Succ)) +          continue; +        BasicBlock *NewSucc = SplitCallBrEdge(P, Succ, I, &DTU, &CI, LI); +        CHub.addBranch(NewSucc, Succ); +        LLVM_DEBUG(dbgs() << "Added external branch: " +                          << printBasicBlock(NewSucc) << " -> " +                          << printBasicBlock(Succ) << '\n'); +      } +    } else { +      llvm_unreachable("unsupported block terminator"); +    }    }    // Redirect all the backedges through a "hub" consisting of a series @@ -292,7 +369,6 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT,    SetVector<BasicBlock *> Entries;    Entries.insert(C.entry_rbegin(), C.entry_rend()); -  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);    CHub.finalize(&DTU, GuardBlocks, "irr");  #if defined(EXPENSIVE_CHECKS)    assert(DT.verify(DominatorTree::VerificationLevel::Full)); @@ -325,8 +401,6 @@ static bool FixIrreducibleImpl(Function &F, CycleInfo &CI, DominatorTree &DT,    LLVM_DEBUG(dbgs() << "===== Fix irreducible control-flow in function: "                      << F.getName() << "\n"); -  assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); -    bool Changed = false;    for (Cycle *TopCycle : CI.toplevel_cycles()) {      for (Cycle *C : depth_first(TopCycle)) { diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 6312831..7a2b8da 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -460,25 +460,10 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool UseEpilogRemainder,    Loop *NewLoop = NewLoops[L];    assert(NewLoop && "L should have been cloned"); -  MDNode *LoopID = NewLoop->getLoopID(); - -  // Only add loop metadata if the loop is not going to be completely -  // unrolled. -  if (UnrollRemainder) -    return NewLoop; - -  std::optional<MDNode *> NewLoopID = makeFollowupLoopID( -      LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder}); -  if (NewLoopID) { -    NewLoop->setLoopID(*NewLoopID); - -    // Do not setLoopAlreadyUnrolled if loop attributes have been defined -    // explicitly. -    return NewLoop; -  }    // Add unroll disable metadata to disable future unrolling for this loop. 
-  NewLoop->setLoopAlreadyUnrolled(); +  if (!UnrollRemainder) +    NewLoop->setLoopAlreadyUnrolled();    return NewLoop;  } diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp index 9f338db..94c5c170 100644 --- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -12,7 +12,11 @@  //  // Limitation: This assumes that all terminators in the CFG are direct branches  //             (the "br" instruction). The presence of any other control flow -//             such as indirectbr, switch or callbr will cause an assert. +//             such as indirectbr or switch will cause an assert. +//             The callbr terminator is supported by creating intermediate +//             target blocks that unconditionally branch to the original target +//             blocks. These intermediate target blocks can then be redirected +//             through the ControlFlowHub as usual.  //  //===----------------------------------------------------------------------===// @@ -150,25 +154,55 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {    SmallVector<BasicBlock *, 8> ExitingBlocks;    L->getExitingBlocks(ExitingBlocks); +  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); +  SmallVector<BasicBlock *, 8> CallBrTargetBlocksToFix;    // Redirect exiting edges through a control flow hub.    ControlFlowHub CHub; -  for (auto *BB : ExitingBlocks) { -    auto *Branch = cast<BranchInst>(BB->getTerminator()); -    BasicBlock *Succ0 = Branch->getSuccessor(0); -    Succ0 = L->contains(Succ0) ? nullptr : Succ0; - -    BasicBlock *Succ1 = -        Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); -    Succ1 = L->contains(Succ1) ? nullptr : Succ1; -    CHub.addBranch(BB, Succ0, Succ1); - -    LLVM_DEBUG(dbgs() << "Added exiting branch: " << BB->getName() << " -> {"                      << (Succ0 ? Succ0->getName() : "<none>") << ", "                      << (Succ1 ? Succ1->getName() : "<none>") << "}\n"); + +  for (unsigned I = 0; I < ExitingBlocks.size(); ++I) { +    BasicBlock *BB = ExitingBlocks[I]; +    if (BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator())) { +      BasicBlock *Succ0 = Branch->getSuccessor(0); +      Succ0 = L->contains(Succ0) ? nullptr : Succ0; + +      BasicBlock *Succ1 = +          Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1); +      Succ1 = L->contains(Succ1) ? nullptr : Succ1; +      CHub.addBranch(BB, Succ0, Succ1); + +      LLVM_DEBUG(dbgs() << "Added exiting branch: " << printBasicBlock(BB) +                        << " -> " << printBasicBlock(Succ0) +                        << (Succ0 && Succ1 ? " " : "") << printBasicBlock(Succ1) +                        << '\n'); +    } else if (CallBrInst *CallBr = dyn_cast<CallBrInst>(BB->getTerminator())) { +      for (unsigned J = 0; J < CallBr->getNumSuccessors(); ++J) { +        BasicBlock *Succ = CallBr->getSuccessor(J); +        if (L->contains(Succ)) +          continue; +        bool UpdatedLI = false; +        BasicBlock *NewSucc = +            SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI); +        // Even if CallBr and Succ do not have a common parent loop, we need to +        // add the new target block to the parent loop of the current loop. 
+        if (!UpdatedLI) +          CallBrTargetBlocksToFix.push_back(NewSucc); +        // ExitingBlocks is later used to restore SSA, so we need to make sure +        // that the blocks used for phi nodes in the guard blocks match the +        // predecessors of the guard blocks, which, in the case of callbr, are +        // the new intermediate target blocks instead of the callbr blocks +        // themselves. +        ExitingBlocks[I] = NewSucc; +        CHub.addBranch(NewSucc, Succ); +        LLVM_DEBUG(dbgs() << "Added exiting branch: " +                          << printBasicBlock(NewSucc) << " -> " +                          << printBasicBlock(Succ) << '\n'); +      } +    } else { +      llvm_unreachable("unsupported block terminator"); +    }    }    SmallVector<BasicBlock *, 8> GuardBlocks; -  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);    BasicBlock *LoopExitBlock;    bool ChangedCFG;    std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize( @@ -187,10 +221,19 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {    // The guard blocks were created outside the loop, so they need to become    // members of the parent loop. -  if (auto ParentLoop = L->getParentLoop()) { +  // Same goes for the callbr target blocks.  Although we try to add them to the +  // smallest common parent loop of the callbr block and the corresponding +  // original target block, there might not have been such a loop, in which case +  // the newly created callbr target blocks are not part of any loop. For nested +  // loops, this might result in them leading to a loop with multiple entry +  // points. +  if (auto *ParentLoop = L->getParentLoop()) {      for (auto *G : GuardBlocks) {        ParentLoop->addBasicBlockToLoop(G, LI);      } +    for (auto *C : CallBrTargetBlocksToFix) { +      ParentLoop->addBasicBlockToLoop(C, LI); +    }      ParentLoop->verifyLoop();    } @@ -218,8 +261,6 @@ bool UnifyLoopExitsLegacyPass::runOnFunction(Function &F) {    auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); -  assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); -    return runImpl(LI, DT);  } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f7968ab..25bf49d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3908,7 +3908,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(          continue;        VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, -                            *CM.PSE.getSE()); +                            *CM.PSE.getSE(), OrigLoop);        precomputeCosts(*Plan, VF, CostCtx);        auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());        for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { @@ -4166,7 +4166,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {        // Add on other costs that are modelled in VPlan, but not in the legacy        // cost model.        
VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, -                            *CM.PSE.getSE()); +                            *CM.PSE.getSE(), OrigLoop);        VPRegionBlock *VectorRegion = P->getVectorLoopRegion();        assert(VectorRegion && "Expected to have a vector region!");        for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( @@ -5750,13 +5750,18 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {               getMemoryInstructionCost(I, ElementCount::getFixed(1))));          UpdateMemOpUserCost(cast<LoadInst>(I));        } else if (const auto *Group = getInterleavedAccessGroup(I)) { -        // Scalarize an interleave group of address loads. -        for (unsigned I = 0; I < Group->getFactor(); ++I) { -          if (Instruction *Member = Group->getMember(I)) { -            setWideningDecision( -                Member, VF, CM_Scalarize, -                (VF.getKnownMinValue() * -                 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); +        // Scalarize all members of this interleaved group when any member +        // is used as an address. The address-used load skips scalarization +        // overhead, other members include it. +        for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) { +          if (Instruction *Member = Group->getMember(Idx)) { +            InstructionCost Cost = +                AddrDefs.contains(Member) +                    ? (VF.getKnownMinValue() * +                       getMemoryInstructionCost(Member, +                                                ElementCount::getFixed(1))) +                    : getMemInstScalarizationCost(Member, VF); +            setWideningDecision(Member, VF, CM_Scalarize, Cost);              UpdateMemOpUserCost(cast<LoadInst>(Member));            }          } @@ -6871,7 +6876,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,  InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,                                                 ElementCount VF) const { -  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE()); +  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE(), +                        OrigLoop);    InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);    // Now compute and add the VPlan-based cost. @@ -7105,12 +7111,13 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {    // case, don't trigger the assertion, as the extra simplifications may cause a    // different VF to be picked by the VPlan-based cost model.    VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, -                        *CM.PSE.getSE()); +                        *CM.PSE.getSE(), OrigLoop);    precomputeCosts(BestPlan, BestFactor.Width, CostCtx);    // Verify that the VPlan-based and legacy cost models agree, except for VPlans    // with early exits and plans with additional VPlan simplifications. The    // legacy cost model doesn't properly model costs for such loops.    
assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() || +          !Legal->getLAI()->getSymbolicStrides().empty() ||            planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),                                                  CostCtx, OrigLoop,                                                  BestFactor.Width) || @@ -8335,11 +8342,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(                &R) ||            (isa<VPInstruction>(&R) && !UnderlyingValue))          continue; - -      // FIXME: VPlan0, which models a copy of the original scalar loop, should -      // not use VPWidenPHIRecipe to model the phis. -      assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) && -             UnderlyingValue && "unsupported recipe"); +      assert(isa<VPInstruction>(&R) && UnderlyingValue && "unsupported recipe");        // TODO: Gradually replace uses of underlying instruction by analyses on        // VPlan. @@ -8440,7 +8443,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(    // and mulacc-reduction are implemented.    if (!CM.foldTailWithEVL()) {      VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, -                          *CM.PSE.getSE()); +                          *CM.PSE.getSE(), OrigLoop);      VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,                               CostCtx, Range);    } @@ -9910,7 +9913,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {      bool ForceVectorization =          Hints.getForce() == LoopVectorizeHints::FK_Enabled;      VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, -                          CM.CostKind, *CM.PSE.getSE()); +                          CM.CostKind, *CM.PSE.getSE(), L);      if (!ForceVectorization &&          !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,                                       LVP.getPlanFor(VF.Width), SEL, diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index 2aaabd9..965426f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -350,13 +350,14 @@ struct VPCostContext {    SmallPtrSet<Instruction *, 8> SkipCostComputation;    TargetTransformInfo::TargetCostKind CostKind;    ScalarEvolution &SE; +  const Loop *L;    VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,                  const VPlan &Plan, LoopVectorizationCostModel &CM,                  TargetTransformInfo::TargetCostKind CostKind, -                ScalarEvolution &SE) +                ScalarEvolution &SE, const Loop *L)        : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM), -        CostKind(CostKind), SE(SE) {} +        CostKind(CostKind), SE(SE), L(L) {}    /// Return the cost for \p UI with \p VF using the legacy cost model as    /// fallback until computing the cost of all recipes migrates to VPlan. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 9a63c80..bde62dd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3167,26 +3167,30 @@ bool VPReplicateRecipe::shouldPack() const {    });  } -/// Returns true if \p Ptr is a pointer computation for which the legacy cost -/// model computes a SCEV expression when computing the address cost. 
-static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) { +/// Returns a SCEV expression for \p Ptr if it is a pointer computation for +/// which the legacy cost model computes a SCEV expression when computing the +/// address cost. Computing SCEVs for VPValues is incomplete and returns +/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In +/// those cases we fall back to the legacy cost model. Otherwise return nullptr. +static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE, +                                        const Loop *L) {    auto *PtrR = Ptr->getDefiningRecipe();    if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&                    cast<VPReplicateRecipe>(PtrR)->getOpcode() ==                        Instruction::GetElementPtr) ||                   isa<VPWidenGEPRecipe>(PtrR) ||                   match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue())))) -    return false; +    return nullptr;    // We are looking for a GEP where all indices are either loop invariant or    // inductions.    for (VPValue *Opd : drop_begin(PtrR->operands())) {      if (!Opd->isDefinedOutsideLoopRegions() &&          !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd)) -      return false; +      return nullptr;    } -  return true; +  return vputils::getSCEVExprForVPValue(Ptr, SE, L);  }  /// Returns true if \p V is used as part of the address of another load or @@ -3354,9 +3358,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,      bool IsLoad = UI->getOpcode() == Instruction::Load;      const VPValue *PtrOp = getOperand(!IsLoad); -    // TODO: Handle cases where we need to pass a SCEV to -    // getAddressComputationCost. -    if (shouldUseAddressAccessSCEV(PtrOp)) +    const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L); +    if (isa_and_nonnull<SCEVCouldNotCompute>(PtrSCEV))        break;      Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); @@ -3374,7 +3377,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,      InstructionCost ScalarCost =          ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(                                PtrTy, UsedByLoadStoreAddress ? 
nullptr : &Ctx.SE, -                              nullptr, Ctx.CostKind); +                              PtrSCEV, Ctx.CostKind);      if (isSingleScalar())        return ScalarCost; diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 4db92e7..54348c6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -75,7 +75,8 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) {           B == Plan.getBackedgeTakenCount();  } -const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) { +const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V, +                                           ScalarEvolution &SE, const Loop *L) {    if (V->isLiveIn()) {      if (Value *LiveIn = V->getLiveInIRValue())        return SE.getSCEV(LiveIn); @@ -86,6 +87,52 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {    return TypeSwitch<const VPRecipeBase *, const SCEV *>(V->getDefiningRecipe())        .Case<VPExpandSCEVRecipe>(            [](const VPExpandSCEVRecipe *R) { return R->getSCEV(); }) +      .Case<VPCanonicalIVPHIRecipe>([&SE, L](const VPCanonicalIVPHIRecipe *R) { +        if (!L) +          return SE.getCouldNotCompute(); +        const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L); +        return SE.getAddRecExpr(Start, SE.getOne(Start->getType()), L, +                                SCEV::FlagAnyWrap); +      }) +      .Case<VPDerivedIVRecipe>([&SE, L](const VPDerivedIVRecipe *R) { +        const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L); +        const SCEV *IV = getSCEVExprForVPValue(R->getOperand(1), SE, L); +        const SCEV *Scale = getSCEVExprForVPValue(R->getOperand(2), SE, L); +        if (any_of(ArrayRef({Start, IV, Scale}), IsaPred<SCEVCouldNotCompute>)) +          return SE.getCouldNotCompute(); + +        return SE.getAddExpr(SE.getTruncateOrSignExtend(Start, IV->getType()), +                             SE.getMulExpr(IV, SE.getTruncateOrSignExtend( +                                                   Scale, IV->getType()))); +      }) +      .Case<VPScalarIVStepsRecipe>([&SE, L](const VPScalarIVStepsRecipe *R) { +        const SCEV *IV = getSCEVExprForVPValue(R->getOperand(0), SE, L); +        const SCEV *Step = getSCEVExprForVPValue(R->getOperand(1), SE, L); +        if (isa<SCEVCouldNotCompute>(IV) || isa<SCEVCouldNotCompute>(Step)) +          return SE.getCouldNotCompute(); +        return SE.getMulExpr(SE.getTruncateOrSignExtend(IV, Step->getType()), +                             Step); +      }) +      .Case<VPReplicateRecipe>([&SE, L](const VPReplicateRecipe *R) { +        if (R->getOpcode() != Instruction::GetElementPtr) +          return SE.getCouldNotCompute(); + +        const SCEV *Base = getSCEVExprForVPValue(R->getOperand(0), SE, L); +        if (isa<SCEVCouldNotCompute>(Base)) +          return SE.getCouldNotCompute(); + +        SmallVector<const SCEV *> IndexExprs; +        for (VPValue *Index : drop_begin(R->operands())) { +          const SCEV *IndexExpr = getSCEVExprForVPValue(Index, SE, L); +          if (isa<SCEVCouldNotCompute>(IndexExpr)) +            return SE.getCouldNotCompute(); +          IndexExprs.push_back(IndexExpr); +        } + +        Type *SrcElementTy = cast<GetElementPtrInst>(R->getUnderlyingInstr()) +                                 ->getSourceElementType(); +        return SE.getGEPExpr(Base, IndexExprs, SrcElementTy); +      })        
.Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); });  } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 37cd413..c21a0e7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -37,7 +37,8 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr);  /// Return the SCEV expression for \p V. Returns SCEVCouldNotCompute if no  /// SCEV expression could be constructed. -const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE); +const SCEV *getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, +                                  const Loop *L = nullptr);  /// Returns true if \p VPV is a single scalar, either because it produces the  /// same value for all lanes or only has its first lane used. | 
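A rough illustration (IR names invented, resulting expression approximate) of what the extended getSCEVExprForVPValue can now produce for a replicated getelementptr float, ptr %A, i64 <canonical IV> inside loop %loop:

    // {0,+,1}<%loop>               -- built by the VPCanonicalIVPHIRecipe case
    // ((4 * {0,+,1}<%loop>) + %A)  -- VPReplicateRecipe (GEP) case via SE.getGEPExpr
    //
    // VPReplicateRecipe::computeCost then passes this SCEV to
    // getAddressComputationCost instead of a null pointer, mirroring the SCEV the
    // legacy cost model uses when costing the address computation.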
