Pārlūkot izejas kodu

Optimization phases for LLVM upgrade

Brian Fiete 1 gadu atpakaļ
vecāks
revīzija
d5b0e6d06d

+ 2 - 0
BeefLibs/corlib/src/Delegate.bf

@@ -1,5 +1,6 @@
 namespace System
 {
+	[AlwaysInclude]
 	class Delegate : IHashable
 	{
 	    void* mFuncPtr;
@@ -61,6 +62,7 @@ namespace System
 
 	delegate void Action();
 
+	[AlwaysInclude]
 	struct Function : int
 	{
 

+ 2 - 0
IDEHelper/COFF.cpp

@@ -3289,6 +3289,8 @@ void COFF::ParseCompileUnit_Symbols(DbgCompileUnit* compileUnit, uint8* sectionD
 			break;
 		case 0x1179:
 			break;
+		case 0x1180:
+			break;
 		case 7:
 			// Unknown
 			break;

+ 363 - 508
IDEHelper/Compiler/BfIRCodeGen.cpp

@@ -69,6 +69,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -92,6 +93,7 @@
 #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
 #include "llvm/Transforms/InstCombine/InstCombine.h"
 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include "llvm/Passes/PassBuilder.h"
 
 //#include "llvm/Bitcode/ReaderWriter.h"
 
@@ -5605,548 +5607,400 @@ llvm::Type* BfIRCodeGen::GetLLVMTypeById(int id)
 	return GetTypeEntry(id).mType->mLLVMType;
 }
 
-static int GetOptLevel(BfOptLevel optLevel)
+// LLVM/Clang 18.1.4
+static void addSanitizers(const llvm::Triple& TargetTriple,	BfCodeGenOptions& CodeGenOpts, llvm::PassBuilder& PB) 
 {
-	switch (optLevel)
-	{
-	case BfOptLevel_O1: return 1;
-	case BfOptLevel_O2: return 2;
-	case BfOptLevel_O3: return 3;
-	default: return 0;
-	}
-}
+#if 0	
+	auto SanitizersCallback = [&](llvm::ModulePassManager& MPM, llvm::OptimizationLevel Level) {
+			if (CodeGenOpts.hasSanitizeCoverage()) 
+			{
+				auto SancovOpts = getSancovOptsFromCGOpts(CodeGenOpts);
+				MPM.addPass(SanitizerCoveragePass(
+					SancovOpts, CodeGenOpts.SanitizeCoverageAllowlistFiles,
+					CodeGenOpts.SanitizeCoverageIgnorelistFiles));
+			}
 
-//enum CFLAAType { None, Steensgaard, Andersen, Both };
+			if (CodeGenOpts.hasSanitizeBinaryMetadata()) {
+				MPM.addPass(SanitizerBinaryMetadataPass(
+					getSanitizerBinaryMetadataOptions(CodeGenOpts),
+					CodeGenOpts.SanitizeMetadataIgnorelistFiles));
+			}
 
-static void AddInitialAliasAnalysisPasses(llvm::legacy::PassManagerBase &PM, const BfCodeGenOptions& options)
-{
+			auto MSanPass = [&](SanitizerMask Mask, bool CompileKernel) {
+				if (LangOpts.Sanitize.has(Mask)) {
+					int TrackOrigins = CodeGenOpts.SanitizeMemoryTrackOrigins;
+					bool Recover = CodeGenOpts.SanitizeRecover.has(Mask);
+
+					MemorySanitizerOptions options(TrackOrigins, Recover, CompileKernel,
+						CodeGenOpts.SanitizeMemoryParamRetval);
+					MPM.addPass(MemorySanitizerPass(options));
+					if (Level != OptimizationLevel::O0) {
+						// MemorySanitizer inserts complex instrumentation that mostly follows
+						// the logic of the original code, but operates on "shadow" values. It
+						// can benefit from re-running some general purpose optimization
+						// passes.
+						MPM.addPass(RequireAnalysisPass<GlobalsAA, llvm::Module>());
+						FunctionPassManager FPM;
+						FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */));
+						FPM.addPass(InstCombinePass());
+						FPM.addPass(JumpThreadingPass());
+						FPM.addPass(GVNPass());
+						FPM.addPass(InstCombinePass());
+						MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+					}
+				}
+				};
+			MSanPass(SanitizerKind::Memory, false);
+			MSanPass(SanitizerKind::KernelMemory, true);
 
-	//TODO:
-// 	switch (options.mUseCFLAA) {
-// 	case BfCFLAAType_Steensgaard:
-// 		PM.add(llvm::createCFLSteensAAWrapperPass());
-// 		break;
-// 	case BfCFLAAType_Andersen:
-// 		PM.add(llvm::createCFLAndersAAWrapperPass());
-// 		break;
-// 	case BfCFLAAType_Both:
-// 		PM.add(llvm::createCFLSteensAAWrapperPass());
-// 		PM.add(llvm::createCFLAndersAAWrapperPass());
-// 		break;
-// 	default:
-// 		break;
-// 	}
+			if (LangOpts.Sanitize.has(SanitizerKind::Thread)) {
+				MPM.addPass(ModuleThreadSanitizerPass());
+				MPM.addPass(createModuleToFunctionPassAdaptor(ThreadSanitizerPass()));
+			}
 
-	// Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
-	// BasicAliasAnalysis wins if they disagree. This is intended to help
-	// support "obvious" type-punning idioms.
-	PM.add(llvm::createTypeBasedAAWrapperPass());
-	PM.add(llvm::createScopedNoAliasAAWrapperPass());
-}
+			auto ASanPass = [&](SanitizerMask Mask, bool CompileKernel) {
+				if (LangOpts.Sanitize.has(Mask)) {
+					bool UseGlobalGC = asanUseGlobalsGC(TargetTriple, CodeGenOpts);
+					bool UseOdrIndicator = CodeGenOpts.SanitizeAddressUseOdrIndicator;
+					llvm::AsanDtorKind DestructorKind =
+						CodeGenOpts.getSanitizeAddressDtor();
+					AddressSanitizerOptions Opts;
+					Opts.CompileKernel = CompileKernel;
+					Opts.Recover = CodeGenOpts.SanitizeRecover.has(Mask);
+					Opts.UseAfterScope = CodeGenOpts.SanitizeAddressUseAfterScope;
+					Opts.UseAfterReturn = CodeGenOpts.getSanitizeAddressUseAfterReturn();
+					MPM.addPass(AddressSanitizerPass(Opts, UseGlobalGC, UseOdrIndicator,
+						DestructorKind));
+				}
+				};
+			ASanPass(SanitizerKind::Address, false);
+			ASanPass(SanitizerKind::KernelAddress, true);
+
+			auto HWASanPass = [&](SanitizerMask Mask, bool CompileKernel) {
+				if (LangOpts.Sanitize.has(Mask)) {
+					bool Recover = CodeGenOpts.SanitizeRecover.has(Mask);
+					MPM.addPass(HWAddressSanitizerPass(
+						{ CompileKernel, Recover,
+						/*DisableOptimization=*/CodeGenOpts.OptimizationLevel == 0 }));
+				}
+				};
+			HWASanPass(SanitizerKind::HWAddress, false);
+			HWASanPass(SanitizerKind::KernelHWAddress, true);
 
-static void AddInstructionCombiningPass(llvm::legacy::PassManagerBase &PM, const BfCodeGenOptions& options)
-{
-	bool ExpensiveCombines = GetOptLevel(options.mOptLevel) > 2;
-	//TODO: PM.add(llvm::createInstructionCombiningPass(options.mExpensiveCombines));
+			if (LangOpts.Sanitize.has(SanitizerKind::DataFlow)) {
+				MPM.addPass(DataFlowSanitizerPass(LangOpts.NoSanitizeFiles));
+			}
+		};
+	if (ClSanitizeOnOptimizerEarlyEP) {
+		PB.registerOptimizerEarlyEPCallback(
+			[SanitizersCallback](ModulePassManager& MPM, OptimizationLevel Level) {
+				ModulePassManager NewMPM;
+				SanitizersCallback(NewMPM, Level);
+				if (!NewMPM.isEmpty()) {
+					// Sanitizers can abandon<GlobalsAA>.
+					NewMPM.addPass(RequireAnalysisPass<GlobalsAA, llvm::Module>());
+					MPM.addPass(std::move(NewMPM));
+				}
+			});
+	}
+	else {
+		// LastEP does not need GlobalsAA.
+		PB.registerOptimizerLastEPCallback(SanitizersCallback);
+	}
+#endif
 }
 
-static void AddFunctionSimplificationPasses(llvm::legacy::PassManagerBase &MPM, const BfCodeGenOptions& options)
+// LLVM/Clang 18.1.4
+static void addKCFIPass(const llvm::Triple& TargetTriple, const BfCodeGenOptions& codeGenOpts, llvm::PassBuilder& PB) 
 {
-	//TODO:
+#if 0
+	// If the back-end supports KCFI operand bundle lowering, skip KCFIPass.
+	if (TargetTriple.getArch() == llvm::Triple::x86_64 ||
+		TargetTriple.isAArch64(64) || TargetTriple.isRISCV())
+		return;
 
-	/*
-	// Start of function pass.
-	// Break up aggregate allocas, using SSAUpdater.
-	MPM.add(llvm::createSROAPass());
-	MPM.add(llvm::createEarlyCSEPass(options.mEnableEarlyCSEMemSSA)); // Catch trivial redundancies
-	//if (EnableGVNHoist)
-	if (options.mEnableGVNHoist)
-		MPM.add(llvm::createGVNHoistPass());
-	if (options.mEnableGVNSink)
-	{
-		MPM.add(llvm::createGVNSinkPass());
-		MPM.add(llvm::createCFGSimplificationPass());
-	}
+	// Ensure we lower KCFI operand bundles with -O0.
+	PB.registerOptimizerLastEPCallback(
+		[&](ModulePassManager& MPM, OptimizationLevel Level) {
+			if (Level == OptimizationLevel::O0 &&
+				LangOpts.Sanitize.has(SanitizerKind::KCFI))
+				MPM.addPass(createModuleToFunctionPassAdaptor(KCFIPass()));
+		});
+
+	// When optimizations are requested, run KCIFPass after InstCombine to
+	// avoid unnecessary checks.
+	PB.registerPeepholeEPCallback(
+		[&](FunctionPassManager& FPM, OptimizationLevel Level) {
+			if (Level != OptimizationLevel::O0 &&
+				LangOpts.Sanitize.has(SanitizerKind::KCFI))
+				FPM.addPass(KCFIPass());
+		});
+#endif
+}
 
-	// Speculative execution if the target has divergent branches; otherwise nop.
-	MPM.add(llvm::createSpeculativeExecutionIfHasBranchDivergencePass());
-	MPM.add(llvm::createJumpThreadingPass());         // Thread jumps.
-	MPM.add(llvm::createCorrelatedValuePropagationPass()); // Propagate conditionals
-	MPM.add(llvm::createCFGSimplificationPass());     // Merge & remove BBs
-	// Combine silly seq's
-	if (GetOptLevel(options.mOptLevel) > 2)
-		MPM.add(llvm::createAggressiveInstCombinerPass());
-	AddInstructionCombiningPass(MPM, options);
-	if (options.mSizeLevel == 0 && !options.mDisableLibCallsShrinkWrap)
-		MPM.add(llvm::createLibCallsShrinkWrapPass());
-	//AddExtensionsToPM(llvm::EP_Peephole, MPM);
-
-	// Optimize memory intrinsic calls based on the profiled size information.
-	if (options.mSizeLevel == 0)
-		MPM.add(llvm::createPGOMemOPSizeOptLegacyPass());
-
-	MPM.add(llvm::createTailCallEliminationPass()); // Eliminate tail calls
-	MPM.add(llvm::createCFGSimplificationPass());     // Merge & remove BBs
-	MPM.add(llvm::createReassociatePass());           // Reassociate expressions
-
-	// Begin the loop pass pipeline.
-	if (options.mEnableSimpleLoopUnswitch) {
-		// The simple loop unswitch pass relies on separate cleanup passes. Schedule
-		// them first so when we re-process a loop they run before other loop
-		// passes.
-		MPM.add(llvm::createLoopInstSimplifyPass());
-		MPM.add(llvm::createLoopSimplifyCFGPass());
-	}
-	// Rotate Loop - disable header duplication at -Oz
-	MPM.add(llvm::createLoopRotatePass(options.mSizeLevel == 2 ? 0 : -1));
-	MPM.add(llvm::createLICMPass());                  // Hoist loop invariants
-	if (options.mEnableSimpleLoopUnswitch)
-		MPM.add(llvm::createSimpleLoopUnswitchLegacyPass());
-	else
-		MPM.add(llvm::createLoopUnswitchPass(options.mSizeLevel || GetOptLevel(options.mOptLevel) < 3, options.mDivergentTarget));
-	// FIXME: We break the loop pass pipeline here in order to do full
-	// simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the
-	// need for this.
-	MPM.add(llvm::createCFGSimplificationPass());
-	AddInstructionCombiningPass(MPM, options);
-	// We resume loop passes creating a second loop pipeline here.
-	MPM.add(llvm::createIndVarSimplifyPass());        // Canonicalize indvars
-	MPM.add(llvm::createLoopIdiomPass());             // Recognize idioms like memset.
-	//addExtensionsToPM(EP_LateLoopOptimizations, MPM);
-	MPM.add(llvm::createLoopDeletionPass());          // Delete dead loops
-
-	if (options.mEnableLoopInterchange)
-		MPM.add(llvm::createLoopInterchangePass()); // Interchange loops
-
-	MPM.add(llvm::createSimpleLoopUnrollPass(GetOptLevel(options.mOptLevel),
-		options.mDisableUnrollLoops)); // Unroll small loops
-	//addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
-	// This ends the loop pass pipelines.
-
-	if (GetOptLevel(options.mOptLevel) > 1) {
-		MPM.add(llvm::createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds
-		MPM.add(options.mNewGVN ? llvm::createNewGVNPass()
-			: llvm::createGVNPass(options.mDisableGVNLoadPRE)); // Remove redundancies
-	}
-	MPM.add(llvm::createMemCpyOptPass());             // Remove memcpy / form memset
-	MPM.add(llvm::createSCCPPass());                  // Constant prop with SCCP
-
-	// Delete dead bit computations (instcombine runs after to fold away the dead
-	// computations, and then ADCE will run later to exploit any new DCE
-	// opportunities that creates).
-	MPM.add(llvm::createBitTrackingDCEPass());        // Delete dead bit computations
-
-	// Run instcombine after redundancy elimination to exploit opportunities
-	// opened up by them.
-	AddInstructionCombiningPass(MPM, options);
-	//addExtensionsToPM(EP_Peephole, MPM);
-	MPM.add(llvm::createJumpThreadingPass());         // Thread jumps
-	MPM.add(llvm::createCorrelatedValuePropagationPass());
-	MPM.add(llvm::createDeadStoreEliminationPass());  // Delete dead stores
-	MPM.add(llvm::createLICMPass());
-
-	//addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
-
-	if (options.mRerollLoops)
-		MPM.add(llvm::createLoopRerollPass());
-	if (!options.mRunSLPAfterLoopVectorization && options.mSLPVectorize)
-		MPM.add(llvm::createSLPVectorizerPass()); // Vectorize parallel scalar chains.
-
-	MPM.add(llvm::createAggressiveDCEPass());         // Delete dead instructions
-	MPM.add(llvm::createCFGSimplificationPass()); // Merge & remove BBs
-	// Clean up after everything.
-	AddInstructionCombiningPass(MPM, options);
-	//addExtensionsToPM(EP_Peephole, MPM);
-
-// 	if (options.mEnableCHR && options.mOptLevel >= 3 &&
-// 		(!PGOInstrUse.empty() || !PGOSampleUse.empty()))
-// 		MPM.add(createControlHeightReductionLegacyPass());
-	*/
+/// Check whether we should emit a module summary for regular LTO.
+  /// The module summary should be emitted by default for regular LTO
+  /// except for ld64 targets.
+  ///
+  /// \return True if the module summary should be emitted.
+static bool shouldEmitRegularLTOSummary(const llvm::Triple& targetTriple, const BfCodeGenOptions& codeGenOptions, bool PrepareForLTO)
+{
+	return PrepareForLTO /*&& !CodeGenOpts.DisableLLVMPasses*/ &&
+		targetTriple.getVendor() != llvm::Triple::Apple;
 }
 
-static void PopulateModulePassManager(llvm::legacy::PassManagerBase &MPM, const BfCodeGenOptions& options)
+/// Check whether we should emit a flag for UnifiedLTO.
+/// The UnifiedLTO module flag should be set when UnifiedLTO is enabled for
+/// ThinLTO or Full LTO with module summaries.
+static bool shouldEmitUnifiedLTOModueFlag(const llvm::Triple& targetTriple, const BfCodeGenOptions& codeGenOptions, bool PrepareForLTO)
 {
-// 	if (!PGOSampleUse.empty()) {
-// 		MPM.add(createPruneEHPass());
-// 		MPM.add(createSampleProfileLoaderPass(PGOSampleUse));
-// 	}
+	return false;
+	/*return CodeGenOpts.UnifiedLTO &&
+		(CodeGenOpts.PrepareForThinLTO || shouldEmitRegularLTOSummary());*/
+}
 
+void BfIRCodeGen::RunOptimizationPipeline(const llvm::Triple& targetTriple)
+{
+	bool verifyModule = true;
+
+	std::optional<llvm::PGOOptions> pgoOptions;
+	mLLVMTargetMachine->setPGOOption(pgoOptions);
+
+	llvm::PipelineTuningOptions pto;
+	pto.LoopUnrolling = !mCodeGenOptions.mDisableUnrollLoops;
+	// For historical reasons, loop interleaving is set to mirror setting for loop unrolling.
+	pto.LoopInterleaving = !mCodeGenOptions.mDisableUnrollLoops;
+	pto.LoopVectorization = mCodeGenOptions.mLoopVectorize;
+	pto.SLPVectorization = mCodeGenOptions.mSLPVectorize;
+	pto.MergeFunctions = mCodeGenOptions.mMergeFunctions;
 	//TODO:
-	/*
-	llvm::Pass* Inliner;
-	bool prepareForLTO = false;
-	bool prepareForThinLTO = options.mLTOType == BfLTOType_Thin;
-	bool performThinLTO = false;
-	bool enableNonLTOGlobalsModRef = false;
-
-	if (GetOptLevel(options.mOptLevel) > 0)
-		Inliner = llvm::createFunctionInliningPass(GetOptLevel(options.mOptLevel), options.mSizeLevel, false);
-	else
-		Inliner = llvm::createAlwaysInlinerLegacyPass();
+	//pto.CallGraphProfile = ???
+	//pto.UnifiedLTO = ???
+
+	llvm::LoopAnalysisManager LAM;
+	llvm::FunctionAnalysisManager FAM;
+	llvm::CGSCCAnalysisManager CGAM;
+	llvm::ModuleAnalysisManager MAM;
+
+	llvm::PassInstrumentationCallbacks PIC;
+// 	PrintPassOptions PrintPassOpts;
+// 	PrintPassOpts.Indent = DebugPassStructure;
+// 	PrintPassOpts.SkipAnalyses = DebugPassStructure;
+// 	StandardInstrumentations SI(
+// 		TheModule->getContext(),
+// 		(CodeGenOpts.DebugPassManager || DebugPassStructure),
+// 		CodeGenOpts.VerifyEach, PrintPassOpts);
+// 	SI.registerCallbacks(PIC, &MAM);
+	llvm::PassBuilder PB(mLLVMTargetMachine, pto, pgoOptions, &PIC);
+
+	// Register all the basic analyses with the managers.
+	PB.registerModuleAnalyses(MAM);
+	PB.registerCGSCCAnalyses(CGAM);
+	PB.registerFunctionAnalyses(FAM);
+	PB.registerLoopAnalyses(LAM);
+	PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+	
+	//llvm::ModulePassManager MPM;
+	// Add a verifier pass, before any other passes, to catch CodeGen issues.
 
-	// Allow forcing function attributes as a debugging and tuning aid.
-	MPM.add(llvm::createForceFunctionAttrsLegacyPass());
+	llvm::ModulePassManager MPM;
+	if (verifyModule)
+		MPM.addPass(llvm::VerifierPass());	
 
-	// If all optimizations are disabled, just run the always-inline pass and,
-	// if enabled, the function merging pass.
-	if (GetOptLevel(options.mOptLevel) == 0) {
-		//addPGOInstrPasses(MPM);
-		if (Inliner) {
-			MPM.add(Inliner);
-			Inliner = nullptr;
-		}
+	bool disableLLVMPasses = false;
+	if (!disableLLVMPasses)
+	{
+		llvm::OptimizationLevel Level;
 
-		// FIXME: The BarrierNoopPass is a HACK! The inliner pass above implicitly
-		// creates a CGSCC pass manager, but we don't want to add extensions into
-		// that pass manager. To prevent this we insert a no-op module pass to reset
-		// the pass manager to get the same behavior as EP_OptimizerLast in non-O0
-		// builds. The function merging pass is
-		if (options.mMergeFunctions)
-			MPM.add(llvm::createMergeFunctionsPass());
-// 		else if (GlobalExtensionsNotEmpty() || !Extensions.empty())
-// 			MPM.add(createBarrierNoopPass());
+		bool PrepareForLTO = false;
+		bool PrepareForThinLTO = mCodeGenOptions.mLTOType == BfLTOType_Thin;
+		//bool performThinLTO = false;
 
-		if (performThinLTO)
+		Level = llvm::OptimizationLevel::O0;
+		switch (mCodeGenOptions.mOptLevel)
 		{
-			// Drop available_externally and unreferenced globals. This is necessary
-			// with ThinLTO in order to avoid leaving undefined references to dead
-			// globals in the object file.
-			MPM.add(llvm::createEliminateAvailableExternallyPass());
-			MPM.add(llvm::createGlobalDCEPass());
-		}
-
-		//addExtensionsToPM(EP_EnabledOnOptLevel0, MPM);
-
-		if (prepareForLTO || prepareForThinLTO) {
-			MPM.add(llvm::createCanonicalizeAliasesPass());
-			// Rename anon globals to be able to export them in the summary.
-			// This has to be done after we add the extensions to the pass manager
-			// as there could be passes (e.g. Adddress sanitizer) which introduce
-			// new unnamed globals.
-			MPM.add(llvm::createNameAnonGlobalPass());
+		case BfOptLevel_O0:
+			Level = llvm::OptimizationLevel::O0;
+			break;
+		case BfOptLevel_O1:
+			Level = llvm::OptimizationLevel::O1;
+			break;
+		case BfOptLevel_O2:
+			Level = llvm::OptimizationLevel::O2;
+			break;
+		case BfOptLevel_O3:
+			Level = llvm::OptimizationLevel::O3;
+			break;
+		case BfOptLevel_Og:			
+			Level = llvm::OptimizationLevel::O1;
+			break;
 		}
-		return;
-	}
-
-	// Add LibraryInfo if we have some.
-// 	if (LibraryInfo)
-// 		MPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
-
-	AddInitialAliasAnalysisPasses(MPM, options);
-
-	// For ThinLTO there are two passes of indirect call promotion. The
-	// first is during the compile phase when PerformThinLTO=false and
-	// intra-module indirect call targets are promoted. The second is during
-	// the ThinLTO backend when PerformThinLTO=true, when we promote imported
-	// inter-module indirect calls. For that we perform indirect call promotion
-	// earlier in the pass pipeline, here before globalopt. Otherwise imported
-	// available_externally functions look unreferenced and are removed.
-// 	if (performThinLTO)
-// 		MPM.add(llvm::createPGOIndirectCallPromotionLegacyPass(true,
-// 			!PGOSampleUse.empty()));
-
-	// For SamplePGO in ThinLTO compile phase, we do not want to unroll loops
-	// as it will change the CFG too much to make the 2nd profile annotation
-	// in backend more difficult.
-// 	bool PrepareForThinLTOUsingPGOSampleProfile =
-// 		PrepareForThinLTO && !PGOSampleUse.empty();
-
-	bool disableUnrollLoops = false;
-	bool prepareForThinLTOUsingPGOSampleProfile = false;
-	if (prepareForThinLTOUsingPGOSampleProfile)
-		disableUnrollLoops = true;
-
-	// Infer attributes about declarations if possible.
-	MPM.add(llvm::createInferFunctionAttrsLegacyPass());
-
-	//addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
-
-	if (GetOptLevel(options.mOptLevel) > 2)
-		MPM.add(llvm::createCallSiteSplittingPass());
-
-	MPM.add(llvm::createIPSCCPPass());          // IP SCCP
-	MPM.add(llvm::createCalledValuePropagationPass());
-	MPM.add(llvm::createGlobalOptimizerPass()); // Optimize out global vars
-	// Promote any localized global vars.
-	MPM.add(llvm::createPromoteMemoryToRegisterPass());
-
-	MPM.add(llvm::createDeadArgEliminationPass()); // Dead argument elimination
-
-	AddInstructionCombiningPass(MPM, options); // Clean up after IPCP & DAE
-	//addExtensionsToPM(EP_Peephole, MPM);
-	MPM.add(llvm::createCFGSimplificationPass()); // Clean up after IPCP & DAE
-
-	// For SamplePGO in ThinLTO compile phase, we do not want to do indirect
-	// call promotion as it will change the CFG too much to make the 2nd
-	// profile annotation in backend more difficult.
-	// PGO instrumentation is added during the compile phase for ThinLTO, do
-	// not run it a second time
-// 	if (!performThinLTO && !prepareForThinLTOUsingPGOSampleProfile)
-// 		llvm::addPGOInstrPasses(MPM);
-
-	// We add a module alias analysis pass here. In part due to bugs in the
-	// analysis infrastructure this "works" in that the analysis stays alive
-	// for the entire SCC pass run below.
-	MPM.add(llvm::createGlobalsAAWrapperPass());
-
-	// Start of CallGraph SCC passes.
-	MPM.add(llvm::createPruneEHPass()); // Remove dead EH info
-	bool RunInliner = false;
-	if (Inliner) {
-		MPM.add(Inliner);
-		Inliner = nullptr;
-		RunInliner = true;
-	}
-
-	MPM.add(llvm::createPostOrderFunctionAttrsLegacyPass());
-	if (GetOptLevel(options.mOptLevel) > 2)
-		MPM.add(llvm::createArgumentPromotionPass()); // Scalarize uninlined fn args
-
-	//addExtensionsToPM(EP_CGSCCOptimizerLate, MPM);
-	AddFunctionSimplificationPasses(MPM, options);
-
-	// FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
-	// pass manager that we are specifically trying to avoid. To prevent this
-	// we must insert a no-op module pass to reset the pass manager.
-	MPM.add(llvm::createBarrierNoopPass());
-
-	if (options.mRunPartialInlining)
-		MPM.add(llvm::createPartialInliningPass());
-
-	if (GetOptLevel(options.mOptLevel) > 1 && !prepareForLTO && !prepareForThinLTO)
-		// Remove avail extern fns and globals definitions if we aren't
-		// compiling an object file for later LTO. For LTO we want to preserve
-		// these so they are eligible for inlining at link-time. Note if they
-		// are unreferenced they will be removed by GlobalDCE later, so
-		// this only impacts referenced available externally globals.
-		// Eventually they will be suppressed during codegen, but eliminating
-		// here enables more opportunity for GlobalDCE as it may make
-		// globals referenced by available external functions dead
-		// and saves running remaining passes on the eliminated functions.
-		MPM.add(llvm::createEliminateAvailableExternallyPass());
-
-	MPM.add(llvm::createReversePostOrderFunctionAttrsPass());
-
-	// The inliner performs some kind of dead code elimination as it goes,
-	// but there are cases that are not really caught by it. We might
-	// at some point consider teaching the inliner about them, but it
-	// is OK for now to run GlobalOpt + GlobalDCE in tandem as their
-	// benefits generally outweight the cost, making the whole pipeline
-	// faster.
-	if (RunInliner) {
-		MPM.add(llvm::createGlobalOptimizerPass());
-		MPM.add(llvm::createGlobalDCEPass());
-	}
-
-	// If we are planning to perform ThinLTO later, let's not bloat the code with
-	// unrolling/vectorization/... now. We'll first run the inliner + CGSCC passes
-	// during ThinLTO and perform the rest of the optimizations afterward.
-	if (prepareForThinLTO) {
-		// Ensure we perform any last passes, but do so before renaming anonymous
-		// globals in case the passes add any.
-		//addExtensionsToPM(EP_OptimizerLast, MPM);
-		MPM.add(llvm::createCanonicalizeAliasesPass());
-		// Rename anon globals to be able to export them in the summary.
-		MPM.add(llvm::createNameAnonGlobalPass());
-		return;
-	}
 
-	if (performThinLTO)
-		// Optimize globals now when performing ThinLTO, this enables more
-		// optimizations later.
-		MPM.add(llvm::createGlobalOptimizerPass());
-
-	// Scheduling LoopVersioningLICM when inlining is over, because after that
-	// we may see more accurate aliasing. Reason to run this late is that too
-	// early versioning may prevent further inlining due to increase of code
-	// size. By placing it just after inlining other optimizations which runs
-	// later might get benefit of no-alias assumption in clone loop.
-	if (options.mUseLoopVersioningLICM) {
-		MPM.add(llvm::createLoopVersioningLICMPass());    // Do LoopVersioningLICM
-		MPM.add(llvm::createLICMPass());                  // Hoist loop invariants
-	}
-
-	// We add a fresh GlobalsModRef run at this point. This is particularly
-	// useful as the above will have inlined, DCE'ed, and function-attr
-	// propagated everything. We should at this point have a reasonably minimal
-	// and richly annotated call graph. By computing aliasing and mod/ref
-	// information for all local globals here, the late loop passes and notably
-	// the vectorizer will be able to use them to help recognize vectorizable
-	// memory operations.
-	//
-	// Note that this relies on a bug in the pass manager which preserves
-	// a module analysis into a function pass pipeline (and throughout it) so
-	// long as the first function pass doesn't invalidate the module analysis.
-	// Thus both Float2Int and LoopRotate have to preserve AliasAnalysis for
-	// this to work. Fortunately, it is trivial to preserve AliasAnalysis
-	// (doing nothing preserves it as it is required to be conservatively
-	// correct in the face of IR changes).
-	MPM.add(llvm::createGlobalsAAWrapperPass());
-
-	MPM.add(llvm::createFloat2IntPass());
-
-	//addExtensionsToPM(EP_VectorizerStart, MPM);
-
-	// Re-rotate loops in all our loop nests. These may have fallout out of
-	// rotated form due to GVN or other transformations, and the vectorizer relies
-	// on the rotated form. Disable header duplication at -Oz.
-	MPM.add(llvm::createLoopRotatePass(options.mSizeLevel == 2 ? 0 : -1));
-
-	// Distribute loops to allow partial vectorization.  I.e. isolate dependences
-	// into separate loop that would otherwise inhibit vectorization.  This is
-	// currently only performed for loops marked with the metadata
-	// llvm.loop.distribute=true or when -enable-loop-distribute is specified.
-	MPM.add(llvm::createLoopDistributePass());
-
-	MPM.add(llvm::createLoopVectorizePass(options.mDisableUnrollLoops, !options.mLoopVectorize));
-
-	// Eliminate loads by forwarding stores from the previous iteration to loads
-	// of the current iteration.
-	MPM.add(llvm::createLoopLoadEliminationPass());
-
-	// FIXME: Because of #pragma vectorize enable, the passes below are always
-	// inserted in the pipeline, even when the vectorizer doesn't run (ex. when
-	// on -O1 and no #pragma is found). Would be good to have these two passes
-	// as function calls, so that we can only pass them when the vectorizer
-	// changed the code.
-	AddInstructionCombiningPass(MPM, options);
-	if (GetOptLevel(options.mOptLevel) > 1 && options.mExtraVectorizerPasses) {
-		// At higher optimization levels, try to clean up any runtime overlap and
-		// alignment checks inserted by the vectorizer. We want to track correllated
-		// runtime checks for two inner loops in the same outer loop, fold any
-		// common computations, hoist loop-invariant aspects out of any outer loop,
-		// and unswitch the runtime checks if possible. Once hoisted, we may have
-		// dead (or speculatable) control flows or more combining opportunities.
-		MPM.add(llvm::createEarlyCSEPass());
-		MPM.add(llvm::createCorrelatedValuePropagationPass());
-		AddInstructionCombiningPass(MPM, options);
-		MPM.add(llvm::createLICMPass());
-		MPM.add(llvm::createLoopUnswitchPass(options.mSizeLevel || GetOptLevel(options.mOptLevel) < 3, options.mDivergentTarget));
-		MPM.add(llvm::createCFGSimplificationPass());
-		AddInstructionCombiningPass(MPM, options);
-	}
+		bool IsThinLTOPostLink = false;
+
+#if 0
+		// If we reached here with a non-empty index file name, then the index
+		// file was empty and we are not performing ThinLTO backend compilation
+		// (used in testing in a distributed build environment).
+
+			bool IsThinLTOPostLink = !CodeGenOpts.ThinLTOIndexFile.empty();
+			// If so drop any the type test assume sequences inserted for whole program
+			// vtables so that codegen doesn't complain.
+			if (IsThinLTOPostLink)
+				PB.registerPipelineStartEPCallback(
+					[](ModulePassManager& MPM, OptimizationLevel Level) {
+						MPM.addPass(LowerTypeTestsPass(/*ExportSummary=*/nullptr,
+						/*ImportSummary=*/nullptr,
+						/*DropTypeTests=*/true));
+					});
+
+		// Register callbacks to schedule sanitizer passes at the appropriate part
+		// of the pipeline.
+			if (LangOpts.Sanitize.has(SanitizerKind::LocalBounds))
+				PB.registerScalarOptimizerLateEPCallback(
+					[](FunctionPassManager& FPM, OptimizationLevel Level) {
+						FPM.addPass(BoundsCheckingPass());
+					});
+#endif
 
-	// Cleanup after loop vectorization, etc. Simplification passes like CVP and
-	// GVN, loop transforms, and others have already run, so it's now better to
-	// convert to more optimized IR using more aggressive simplify CFG options.
-	// The extra sinking transform can create larger basic blocks, so do this
-	// before SLP vectorization.
-	MPM.add(llvm::createCFGSimplificationPass({ 1, true, true, false, false, true }));
+		// Don't add sanitizers if we are here from ThinLTO PostLink. That already
+		// done on PreLink stage.
+		if (!IsThinLTOPostLink) {
+			addSanitizers(targetTriple, mCodeGenOptions, PB);
+			addKCFIPass(targetTriple, mCodeGenOptions, PB);
+		}
+
+#if 0
+		if (std::optional<GCOVOptions> Options =
+			getGCOVOptions(CodeGenOpts, LangOpts))
+			PB.registerPipelineStartEPCallback(
+				[Options](ModulePassManager& MPM, OptimizationLevel Level) {
+					MPM.addPass(GCOVProfilerPass(*Options));
+				});
+		if (std::optional<InstrProfOptions> Options =
+			getInstrProfOptions(CodeGenOpts, LangOpts))
+			PB.registerPipelineStartEPCallback(
+				[Options](ModulePassManager& MPM, OptimizationLevel Level) {
+					MPM.addPass(InstrProfilingLoweringPass(*Options, false));
+				});
+
+		// TODO: Consider passing the MemoryProfileOutput to the pass builder via
+		// the PGOOptions, and set this up there.
+		if (!CodeGenOpts.MemoryProfileOutput.empty()) {
+			PB.registerOptimizerLastEPCallback(
+				[](ModulePassManager& MPM, OptimizationLevel Level) {
+					MPM.addPass(createModuleToFunctionPassAdaptor(MemProfilerPass()));
+					MPM.addPass(ModuleMemProfilerPass());
+				});
+		}		
+#endif
 
-	if (options.mRunSLPAfterLoopVectorization && options.mSLPVectorize) {
-		MPM.add(llvm::createSLPVectorizerPass()); // Vectorize parallel scalar chains.
-		if (GetOptLevel(options.mOptLevel) > 1 && options.mExtraVectorizerPasses) {
-			MPM.add(llvm::createEarlyCSEPass());
+		if (mCodeGenOptions.mLTOType == BfLTOType_Fat)
+		{
+			MPM.addPass(PB.buildFatLTODefaultPipeline(
+				Level, PrepareForThinLTO,
+				PrepareForThinLTO || shouldEmitRegularLTOSummary(targetTriple, mCodeGenOptions, PrepareForLTO)));
+		}
+		else if (PrepareForThinLTO) 
+		{
+			MPM.addPass(PB.buildThinLTOPreLinkDefaultPipeline(Level));
+		}
+		else if (PrepareForLTO) 
+		{
+			MPM.addPass(PB.buildLTOPreLinkDefaultPipeline(Level));
+		}
+		else 
+		{
+			MPM.addPass(PB.buildPerModuleDefaultPipeline(Level));
 		}
 	}
 
-	//addExtensionsToPM(EP_Peephole, MPM);
-	AddInstructionCombiningPass(MPM, options);
-
-	if (options.mEnableUnrollAndJam && !disableUnrollLoops) {
-		// Unroll and Jam. We do this before unroll but need to be in a separate
-		// loop pass manager in order for the outer loop to be processed by
-		// unroll and jam before the inner loop is unrolled.
-		MPM.add(llvm::createLoopUnrollAndJamPass(GetOptLevel(options.mOptLevel)));
-	}
-
-	MPM.add(llvm::createLoopUnrollPass(GetOptLevel(options.mOptLevel),
-		disableUnrollLoops)); // Unroll small loops
-
-	if (!disableUnrollLoops) {
-		// LoopUnroll may generate some redundency to cleanup.
-		AddInstructionCombiningPass(MPM, options);
-
-		// Runtime unrolling will introduce runtime check in loop prologue. If the
-		// unrolled loop is a inner loop, then the prologue will be inside the
-		// outer loop. LICM pass can help to promote the runtime check out if the
-		// checked value is loop invariant.
-		MPM.add(llvm::createLICMPass());
-	}
-
-	MPM.add(llvm::createWarnMissedTransformationsPass());
-
-	// After vectorization and unrolling, assume intrinsics may tell us more
-	// about pointer alignments.
-	MPM.add(llvm::createAlignmentFromAssumptionsPass());
-
-	// FIXME: We shouldn't bother with this anymore.
-	MPM.add(llvm::createStripDeadPrototypesPass()); // Get rid of dead prototypes
-
-	// GlobalOpt already deletes dead functions and globals, at -O2 try a
-	// late pass of GlobalDCE.  It is capable of deleting dead cycles.
-	if (GetOptLevel(options.mOptLevel) > 1) {
-		MPM.add(llvm::createGlobalDCEPass());         // Remove dead fns and globals.
-		MPM.add(llvm::createConstantMergePass());     // Merge dup global constants
-	}
-
-	if (options.mMergeFunctions)
-		MPM.add(llvm::createMergeFunctionsPass());
-
-	// LoopSink pass sinks instructions hoisted by LICM, which serves as a
-	// canonicalization pass that enables other optimizations. As a result,
-	// LoopSink pass needs to be a very late IR pass to avoid undoing LICM
-	// result too early.
-	MPM.add(llvm::createLoopSinkPass());
-	// Get rid of LCSSA nodes.
-	MPM.add(llvm::createInstSimplifyLegacyPass());
 
-	// This hoists/decomposes div/rem ops. It should run after other sink/hoist
-	// passes to avoid re-sinking, but before SimplifyCFG because it can allow
-	// flattening of blocks.
-	MPM.add(llvm::createDivRemPairsPass());
-
-	if (options.mEnableHotColdSplit)
-		MPM.add(llvm::createHotColdSplittingPass());
-
-	// LoopSink (and other loop passes since the last simplifyCFG) might have
-	// resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
-	MPM.add(llvm::createCFGSimplificationPass());
+	// Re-link against any bitcodes supplied via the -mlink-builtin-bitcode option
+	// Some optimizations may generate new function calls that would not have
+	// been linked pre-optimization (i.e. fused sincos calls generated by
+	// AMDGPULibCalls::fold_sincos.)
+	//TODO:
+// 	if (ClRelinkBuiltinBitcodePostop)
+// 		MPM.addPass(LinkInModulesPass(BC, false));
 
-	//addExtensionsToPM(EP_OptimizerLast, MPM);
+	// Add a verifier pass if requested. We don't have to do this if the action
+	// requires code generation because there will already be a verifier pass in
+	// the code-generation pipeline.
+	// Since we already added a verifier pass above, this
+	// might even not run the analysis, if previous passes caused no changes.
+// 	if (!actionRequiresCodeGen(Action) && CodeGenOpts.VerifyModule)
+// 		MPM.addPass(VerifierPass());
 
-	if (prepareForLTO) {
-		MPM.add(llvm::createCanonicalizeAliasesPass());
-		// Rename anon globals to be able to handle them in the summary
-		MPM.add(llvm::createNameAnonGlobalPass());
-	}*/
-}
-
-namespace
-{
-	struct BfPass : public llvm::MachineFunctionPass
+	//TODO:
+#if 0
+	if (Action == Backend_EmitBC || Action == Backend_EmitLL || CodeGenOpts.FatLTO)
 	{
-		static char ID;
-		BfPass() : llvm::MachineFunctionPass(ID) {}
-
-		bool runOnMachineFunction(llvm::MachineFunction &F) override
-		{
-			//errs() << "Hello: ";
-			//errs().write_escaped(F.getName()) << '\n';
-			return false;
+		if (CodeGenOpts.PrepareForThinLTO && !CodeGenOpts.DisableLLVMPasses) {
+			if (!TheModule->getModuleFlag("EnableSplitLTOUnit"))
+				TheModule->addModuleFlag(llvm::Module::Error, "EnableSplitLTOUnit",
+					CodeGenOpts.EnableSplitLTOUnit);
+			if (Action == Backend_EmitBC) {
+				if (!CodeGenOpts.ThinLinkBitcodeFile.empty()) {
+					ThinLinkOS = openOutputFile(CodeGenOpts.ThinLinkBitcodeFile);
+					if (!ThinLinkOS)
+						return;
+				}
+				MPM.addPass(ThinLTOBitcodeWriterPass(
+					*OS, ThinLinkOS ? &ThinLinkOS->os() : nullptr));
+			}
+			else if (Action == Backend_EmitLL) {
+				MPM.addPass(PrintModulePass(*OS, "", CodeGenOpts.EmitLLVMUseLists,
+					/*EmitLTOSummary=*/true));
+			}
+		}
+		else {
+			// Emit a module summary by default for Regular LTO except for ld64
+			// targets
+			bool EmitLTOSummary = shouldEmitRegularLTOSummary();
+			if (EmitLTOSummary) {
+				if (!TheModule->getModuleFlag("ThinLTO") && !CodeGenOpts.UnifiedLTO)
+					TheModule->addModuleFlag(llvm::Module::Error, "ThinLTO", uint32_t(0));
+				if (!TheModule->getModuleFlag("EnableSplitLTOUnit"))
+					TheModule->addModuleFlag(llvm::Module::Error, "EnableSplitLTOUnit",
+						uint32_t(1));
+			}
+			if (Action == Backend_EmitBC) {
+				MPM.addPass(BitcodeWriterPass(*OS, CodeGenOpts.EmitLLVMUseLists,
+					EmitLTOSummary));
+			}
+			else if (Action == Backend_EmitLL) {
+				MPM.addPass(PrintModulePass(*OS, "", CodeGenOpts.EmitLLVMUseLists,
+					EmitLTOSummary));
+			}
 		}
-	};
-}
-
-char BfPass::ID = 0;
-static llvm::RegisterPass<BfPass> sBfPass("BfPass", "Beef Pass", false, false);
 
-llvm::Expected<llvm::BitcodeModule> FindThinLTOModule(llvm::MemoryBufferRef MBRef)
-{
-	llvm::Expected<std::vector<llvm::BitcodeModule>> BMsOrErr = getBitcodeModuleList(MBRef);
-	if (!BMsOrErr)
-		return BMsOrErr.takeError();
-
-	// The bitcode file may contain multiple modules, we want the one that is
-	// marked as being the ThinLTO module.
-	for (llvm::BitcodeModule &BM : *BMsOrErr) {
-		llvm::Expected<llvm::BitcodeLTOInfo> LTOInfo = BM.getLTOInfo();
-		if (LTOInfo && LTOInfo->IsThinLTO)
-			return BM;
+		if (shouldEmitUnifiedLTOModueFlag())
+			TheModule->addModuleFlag(llvm::Module::Error, "UnifiedLTO", uint32_t(1));
 	}
+#endif
+
+	// Print a textual, '-passes=' compatible, representation of pipeline if
+	// requested.
+//	if (PrintPipelinePasses) {
+// 		MPM.printPipeline(outs(), [&PIC](StringRef ClassName) {
+// 			auto PassName = PIC.getPassNameForClassName(ClassName);
+// 			return PassName.empty() ? ClassName : PassName;
+// 			});
+// 		outs() << "\n";
+// 		return;
+// 	}
+// 
+// 	if (LangOpts.HIPStdPar && !LangOpts.CUDAIsDevice &&
+// 		LangOpts.HIPStdParInterposeAlloc)
+// 		MPM.addPass(HipStdParAllocationInterpositionPass());
 
-	return llvm::make_error<llvm::StringError>("Could not find module summary",
-		llvm::inconvertibleErrorCode());
+	// Now that we have all of the passes ready, run them.
+	{
+		//PrettyStackTraceString CrashInfo("Optimizer");
+		llvm::TimeTraceScope TimeScope("Optimizer");
+		MPM.run(*mLLVMModule, MAM);
+	}	
 }
 
 bool BfIRCodeGen::WriteObjectFile(const StringImpl& outFileName)
@@ -6183,6 +6037,7 @@ bool BfIRCodeGen::WriteObjectFile(const StringImpl& outFileName)
 	if (EC)
 		return false;
 	// Build up all of the passes that we want to do to the module.
+	//llvm::legacy::PassManager PM;
 	llvm::legacy::PassManager PM;
 
 	llvm::Triple theTriple = llvm::Triple(mLLVMModule->getTargetTriple());
@@ -6193,7 +6048,7 @@ bool BfIRCodeGen::WriteObjectFile(const StringImpl& outFileName)
 
 	// Add the target data from the target machine, if it exists, or the module.
 	//PM.add(new DataLayoutPass());
-	PopulateModulePassManager(PM, mCodeGenOptions);
+	RunOptimizationPipeline(theTriple);
 
 	llvm::raw_fd_ostream* outStream = NULL;
 	defer ( delete outStream; );
@@ -6401,6 +6256,6 @@ void BfIRCodeGen::StaticInit()
 	LLVMInitializeWebAssemblyTarget();
 	LLVMInitializeWebAssemblyTargetMC();
 	LLVMInitializeWebAssemblyAsmPrinter();
-	LLVMInitializeWebAssemblyAsmParser();
+	//LLVMInitializeWebAssemblyAsmParser();
 	LLVMInitializeWebAssemblyDisassembler();
 }

+ 3 - 0
IDEHelper/Compiler/BfIRCodeGen.h

@@ -21,6 +21,7 @@ namespace llvm
 	class Module;
 	class LLVMContext;
 	class TargetMachine;
+	class Triple;
 };
 
 NS_BF_BEGIN
@@ -208,6 +209,8 @@ public:
 	BfIRTypedValue GetAlignedPtr(const BfIRTypedValue& val);	
 	llvm::Value* DoCheckedIntrinsic(llvm::Intrinsic::ID intrin, llvm::Value* lhs, llvm::Value* rhs, bool useAsm);
 
+	void RunOptimizationPipeline(const llvm::Triple& targetTriple);
+
 public:
 	BfIRCodeGen();
 	~BfIRCodeGen();

+ 2 - 1
IDEHelper/Compiler/BfSystem.h

@@ -357,7 +357,8 @@ enum BfOptLevel
 enum BfLTOType
 {
 	BfLTOType_None = 0,
-	BfLTOType_Thin = 1
+	BfLTOType_Thin = 1,
+	BfLTOType_Fat = 2
 };
 
 enum BfCFLAAType