loop_fission.cpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514
  1. // Copyright (c) 2018 Google LLC.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. #include "source/opt/loop_fission.h"
  15. #include <set>
  16. #include "source/opt/register_pressure.h"
  17. // Implement loop fission with an optional parameter to split only
  18. // if the register pressure in a given loop meets a certain criteria. This is
  19. // controlled via the constructors of LoopFissionPass.
  20. //
  21. // 1 - Build a list of loops to be split, these are top level loops (loops
  22. // without child loops themselves) which meet the register pressure criteria, as
  23. // determined by the ShouldSplitLoop method of LoopFissionPass.
  24. //
  25. // 2 - For each loop in the list, group each instruction into a set of related
  26. // instructions by traversing each instructions users and operands recursively.
  27. // We stop if we encounter an instruction we have seen before or an instruction
  28. // which we don't consider relevant (i.e OpLoopMerge). We then group these
  29. // groups into two different sets, one for the first loop and one for the
  30. // second.
  31. //
  32. // 3 - We then run CanPerformSplit to check that it would be legal to split a
  33. // loop using those two sets. We check that we haven't altered the relative
  34. // order load/stores appear in the binary and that we aren't breaking any
  35. // dependency between load/stores by splitting them into two loops. We also
  36. // check that none of the OpBranch instructions are dependent on a load as we
  37. // leave control flow structure intact and move only instructions in the body so
  38. // we want to avoid any loads with side affects or aliasing.
  39. //
  40. // 4 - We then split the loop by calling SplitLoop. This function clones the
  41. // loop and attaches it to the preheader and connects the new loops merge block
  42. // to the current loop header block. We then use the two sets built in step 2 to
  43. // remove instructions from each loop. If an instruction appears in the first
  44. // set it is removed from the second loop and vice versa.
  45. //
  46. // 5 - If the multiple split passes flag is set we check if each of the loops
  47. // still meet the register pressure criteria. If they do then we add them to the
  48. // list of loops to be split (created in step one) to allow for loops to be
  49. // split multiple times.
  50. //
  51. namespace spvtools {
  52. namespace opt {
  53. class LoopFissionImpl {
  54. public:
  55. LoopFissionImpl(IRContext* context, Loop* loop)
  56. : context_(context), loop_(loop), load_used_in_condition_(false) {}
  57. // Group each instruction in the loop into sets of instructions related by
  58. // their usedef chains. An instruction which uses another will appear in the
  59. // same set. Then merge those sets into just two sets. Returns false if there
  60. // was one or less sets created.
  61. bool GroupInstructionsByUseDef();
  62. // Check if the sets built by GroupInstructionsByUseDef violate any data
  63. // dependence rules.
  64. bool CanPerformSplit();
  65. // Split the loop and return a pointer to the new loop.
  66. Loop* SplitLoop();
  67. // Checks if |inst| is safe to move. We can only move instructions which don't
  68. // have any side effects and OpLoads and OpStores.
  69. bool MovableInstruction(const Instruction& inst) const;
  70. private:
  71. // Traverse the def use chain of |inst| and add the users and uses of |inst|
  72. // which are in the same loop to the |returned_set|.
  73. void TraverseUseDef(Instruction* inst, std::set<Instruction*>* returned_set,
  74. bool ignore_phi_users = false, bool report_loads = false);
  75. // We group the instructions in the block into two different groups, the
  76. // instructions to be kept in the original loop and the ones to be cloned into
  77. // the new loop. As the cloned loop is attached to the preheader it will be
  78. // the first loop and the second loop will be the original.
  79. std::set<Instruction*> cloned_loop_instructions_;
  80. std::set<Instruction*> original_loop_instructions_;
  81. // We need a set of all the instructions to be seen so we can break any
  82. // recursion and also so we can ignore certain instructions by preemptively
  83. // adding them to this set.
  84. std::set<Instruction*> seen_instructions_;
  85. // A map of instructions to their relative position in the function.
  86. std::map<Instruction*, size_t> instruction_order_;
  87. IRContext* context_;
  88. Loop* loop_;
  89. // This is set to true by TraverseUseDef when traversing the instructions
  90. // related to the loop condition and any if conditions should any of those
  91. // instructions be a load.
  92. bool load_used_in_condition_;
  93. };
  94. bool LoopFissionImpl::MovableInstruction(const Instruction& inst) const {
  95. return inst.opcode() == spv::Op::OpLoad ||
  96. inst.opcode() == spv::Op::OpStore ||
  97. inst.opcode() == spv::Op::OpSelectionMerge ||
  98. inst.opcode() == spv::Op::OpPhi || inst.IsOpcodeCodeMotionSafe();
  99. }
  100. void LoopFissionImpl::TraverseUseDef(Instruction* inst,
  101. std::set<Instruction*>* returned_set,
  102. bool ignore_phi_users, bool report_loads) {
  103. assert(returned_set && "Set to be returned cannot be null.");
  104. analysis::DefUseManager* def_use = context_->get_def_use_mgr();
  105. std::set<Instruction*>& inst_set = *returned_set;
  106. // We create this functor to traverse the use def chain to build the
  107. // grouping of related instructions. The lambda captures the std::function
  108. // to allow it to recurse.
  109. std::function<void(Instruction*)> traverser_functor;
  110. traverser_functor = [this, def_use, &inst_set, &traverser_functor,
  111. ignore_phi_users, report_loads](Instruction* user) {
  112. // If we've seen the instruction before or it is not inside the loop end the
  113. // traversal.
  114. if (!user || seen_instructions_.count(user) != 0 ||
  115. !context_->get_instr_block(user) ||
  116. !loop_->IsInsideLoop(context_->get_instr_block(user))) {
  117. return;
  118. }
  119. // Don't include labels or loop merge instructions in the instruction sets.
  120. // Including them would mean we group instructions related only by using the
  121. // same labels (i.e phis). We already preempt the inclusion of
  122. // OpSelectionMerge by adding related instructions to the seen_instructions_
  123. // set.
  124. if (user->opcode() == spv::Op::OpLoopMerge ||
  125. user->opcode() == spv::Op::OpLabel)
  126. return;
  127. // If the |report_loads| flag is set, set the class field
  128. // load_used_in_condition_ to false. This is used to check that none of the
  129. // condition checks in the loop rely on loads.
  130. if (user->opcode() == spv::Op::OpLoad && report_loads) {
  131. load_used_in_condition_ = true;
  132. }
  133. // Add the instruction to the set of instructions already seen, this breaks
  134. // recursion and allows us to ignore certain instructions.
  135. seen_instructions_.insert(user);
  136. inst_set.insert(user);
  137. // Wrapper functor to traverse the operands of each instruction.
  138. auto traverse_operand = [&traverser_functor, def_use](const uint32_t* id) {
  139. traverser_functor(def_use->GetDef(*id));
  140. };
  141. user->ForEachInOperand(traverse_operand);
  142. // For the first traversal we want to ignore the users of the phi.
  143. if (ignore_phi_users && user->opcode() == spv::Op::OpPhi) return;
  144. // Traverse each user with this lambda.
  145. def_use->ForEachUser(user, traverser_functor);
  146. // Wrapper functor for the use traversal.
  147. auto traverse_use = [&traverser_functor](Instruction* use, uint32_t) {
  148. traverser_functor(use);
  149. };
  150. def_use->ForEachUse(user, traverse_use);
  151. };
  152. // We start the traversal of the use def graph by invoking the above
  153. // lambda with the |inst| parameter.
  154. traverser_functor(inst);
  155. }
  156. bool LoopFissionImpl::GroupInstructionsByUseDef() {
  157. std::vector<std::set<Instruction*>> sets{};
  158. // We want to ignore all the instructions stemming from the loop condition
  159. // instruction.
  160. BasicBlock* condition_block = loop_->FindConditionBlock();
  161. if (!condition_block) return false;
  162. Instruction* condition = &*condition_block->tail();
  163. // We iterate over the blocks via iterating over all the blocks in the
  164. // function, we do this so we are iterating in the same order which the blocks
  165. // appear in the binary.
  166. Function& function = *loop_->GetHeaderBlock()->GetParent();
  167. // Create a temporary set to ignore certain groups of instructions within the
  168. // loop. We don't want any instructions related to control flow to be removed
  169. // from either loop only instructions within the control flow bodies.
  170. std::set<Instruction*> instructions_to_ignore{};
  171. TraverseUseDef(condition, &instructions_to_ignore, true, true);
  172. // Traverse control flow instructions to ensure they are added to the
  173. // seen_instructions_ set and will be ignored when it it called with actual
  174. // sets.
  175. for (BasicBlock& block : function) {
  176. if (!loop_->IsInsideLoop(block.id())) continue;
  177. for (Instruction& inst : block) {
  178. // Ignore all instructions related to control flow.
  179. if (inst.opcode() == spv::Op::OpSelectionMerge || inst.IsBranch()) {
  180. TraverseUseDef(&inst, &instructions_to_ignore, true, true);
  181. }
  182. }
  183. }
  184. // Traverse the instructions and generate the sets, automatically ignoring any
  185. // instructions in instructions_to_ignore.
  186. for (BasicBlock& block : function) {
  187. if (!loop_->IsInsideLoop(block.id()) ||
  188. loop_->GetHeaderBlock()->id() == block.id())
  189. continue;
  190. for (Instruction& inst : block) {
  191. // Record the order that each load/store is seen.
  192. if (inst.opcode() == spv::Op::OpLoad ||
  193. inst.opcode() == spv::Op::OpStore) {
  194. instruction_order_[&inst] = instruction_order_.size();
  195. }
  196. // Ignore instructions already seen in a traversal.
  197. if (seen_instructions_.count(&inst) != 0) {
  198. continue;
  199. }
  200. // Build the set.
  201. std::set<Instruction*> inst_set{};
  202. TraverseUseDef(&inst, &inst_set);
  203. if (!inst_set.empty()) sets.push_back(std::move(inst_set));
  204. }
  205. }
  206. // If we have one or zero sets return false to indicate that due to
  207. // insufficient instructions we couldn't split the loop into two groups and
  208. // thus the loop can't be split any further.
  209. if (sets.size() < 2) {
  210. return false;
  211. }
  212. // Merge the loop sets into two different sets. In CanPerformSplit we will
  213. // validate that we don't break the relative ordering of loads/stores by doing
  214. // this.
  215. for (size_t index = 0; index < sets.size() / 2; ++index) {
  216. cloned_loop_instructions_.insert(sets[index].begin(), sets[index].end());
  217. }
  218. for (size_t index = sets.size() / 2; index < sets.size(); ++index) {
  219. original_loop_instructions_.insert(sets[index].begin(), sets[index].end());
  220. }
  221. return true;
  222. }
  223. bool LoopFissionImpl::CanPerformSplit() {
  224. // Return false if any of the condition instructions in the loop depend on a
  225. // load.
  226. if (load_used_in_condition_) {
  227. return false;
  228. }
  229. // Build a list of all parent loops of this loop. Loop dependence analysis
  230. // needs this structure.
  231. std::vector<const Loop*> loops;
  232. Loop* parent_loop = loop_;
  233. while (parent_loop) {
  234. loops.push_back(parent_loop);
  235. parent_loop = parent_loop->GetParent();
  236. }
  237. LoopDependenceAnalysis analysis{context_, loops};
  238. // A list of all the stores in the cloned loop.
  239. std::vector<Instruction*> set_one_stores{};
  240. // A list of all the loads in the cloned loop.
  241. std::vector<Instruction*> set_one_loads{};
  242. // Populate the above lists.
  243. for (Instruction* inst : cloned_loop_instructions_) {
  244. if (inst->opcode() == spv::Op::OpStore) {
  245. set_one_stores.push_back(inst);
  246. } else if (inst->opcode() == spv::Op::OpLoad) {
  247. set_one_loads.push_back(inst);
  248. }
  249. // If we find any instruction which we can't move (such as a barrier),
  250. // return false.
  251. if (!MovableInstruction(*inst)) return false;
  252. }
  253. // We need to calculate the depth of the loop to create the loop dependency
  254. // distance vectors.
  255. const size_t loop_depth = loop_->GetDepth();
  256. // Check the dependencies between loads in the cloned loop and stores in the
  257. // original and vice versa.
  258. for (Instruction* inst : original_loop_instructions_) {
  259. // If we find any instruction which we can't move (such as a barrier),
  260. // return false.
  261. if (!MovableInstruction(*inst)) return false;
  262. // Look at the dependency between the loads in the original and stores in
  263. // the cloned loops.
  264. if (inst->opcode() == spv::Op::OpLoad) {
  265. for (Instruction* store : set_one_stores) {
  266. DistanceVector vec{loop_depth};
  267. // If the store actually should appear after the load, return false.
  268. // This means the store has been placed in the wrong grouping.
  269. if (instruction_order_[store] > instruction_order_[inst]) {
  270. return false;
  271. }
  272. // If not independent check the distance vector.
  273. if (!analysis.GetDependence(store, inst, &vec)) {
  274. for (DistanceEntry& entry : vec.GetEntries()) {
  275. // A distance greater than zero means that the store in the cloned
  276. // loop has a dependency on the load in the original loop.
  277. if (entry.distance > 0) return false;
  278. }
  279. }
  280. }
  281. } else if (inst->opcode() == spv::Op::OpStore) {
  282. for (Instruction* load : set_one_loads) {
  283. DistanceVector vec{loop_depth};
  284. // If the load actually should appear after the store, return false.
  285. if (instruction_order_[load] > instruction_order_[inst]) {
  286. return false;
  287. }
  288. // If not independent check the distance vector.
  289. if (!analysis.GetDependence(inst, load, &vec)) {
  290. for (DistanceEntry& entry : vec.GetEntries()) {
  291. // A distance less than zero means the load in the cloned loop is
  292. // dependent on the store instruction in the original loop.
  293. if (entry.distance < 0) return false;
  294. }
  295. }
  296. }
  297. }
  298. }
  299. return true;
  300. }
  301. Loop* LoopFissionImpl::SplitLoop() {
  302. // Clone the loop.
  303. LoopUtils util{context_, loop_};
  304. LoopUtils::LoopCloningResult clone_results;
  305. Loop* cloned_loop = util.CloneAndAttachLoopToHeader(&clone_results);
  306. // Update the OpLoopMerge in the cloned loop.
  307. cloned_loop->UpdateLoopMergeInst();
  308. // Add the loop_ to the module.
  309. // TODO(1841): Handle failure to create pre-header.
  310. Function::iterator it =
  311. util.GetFunction()->FindBlock(loop_->GetOrCreatePreHeaderBlock()->id());
  312. util.GetFunction()->AddBasicBlocks(clone_results.cloned_bb_.begin(),
  313. clone_results.cloned_bb_.end(), ++it);
  314. loop_->SetPreHeaderBlock(cloned_loop->GetMergeBlock());
  315. std::vector<Instruction*> instructions_to_kill{};
  316. // Kill all the instructions which should appear in the cloned loop but not in
  317. // the original loop.
  318. for (uint32_t id : loop_->GetBlocks()) {
  319. BasicBlock* block = context_->cfg()->block(id);
  320. for (Instruction& inst : *block) {
  321. // If the instruction appears in the cloned loop instruction group, kill
  322. // it.
  323. if (cloned_loop_instructions_.count(&inst) == 1 &&
  324. original_loop_instructions_.count(&inst) == 0) {
  325. instructions_to_kill.push_back(&inst);
  326. if (inst.opcode() == spv::Op::OpPhi) {
  327. context_->ReplaceAllUsesWith(
  328. inst.result_id(), clone_results.value_map_[inst.result_id()]);
  329. }
  330. }
  331. }
  332. }
  333. // Kill all instructions which should appear in the original loop and not in
  334. // the cloned loop.
  335. for (uint32_t id : cloned_loop->GetBlocks()) {
  336. BasicBlock* block = context_->cfg()->block(id);
  337. for (Instruction& inst : *block) {
  338. Instruction* old_inst = clone_results.ptr_map_[&inst];
  339. // If the instruction belongs to the original loop instruction group, kill
  340. // it.
  341. if (cloned_loop_instructions_.count(old_inst) == 0 &&
  342. original_loop_instructions_.count(old_inst) == 1) {
  343. instructions_to_kill.push_back(&inst);
  344. }
  345. }
  346. }
  347. for (Instruction* i : instructions_to_kill) {
  348. context_->KillInst(i);
  349. }
  350. return cloned_loop;
  351. }
  352. LoopFissionPass::LoopFissionPass(const size_t register_threshold_to_split,
  353. bool split_multiple_times)
  354. : split_multiple_times_(split_multiple_times) {
  355. // Split if the number of registers in the loop exceeds
  356. // |register_threshold_to_split|.
  357. split_criteria_ =
  358. [register_threshold_to_split](
  359. const RegisterLiveness::RegionRegisterLiveness& liveness) {
  360. return liveness.used_registers_ > register_threshold_to_split;
  361. };
  362. }
  363. LoopFissionPass::LoopFissionPass() : split_multiple_times_(false) {
  364. // Split by default.
  365. split_criteria_ = [](const RegisterLiveness::RegionRegisterLiveness&) {
  366. return true;
  367. };
  368. }
  369. bool LoopFissionPass::ShouldSplitLoop(const Loop& loop, IRContext* c) {
  370. LivenessAnalysis* analysis = c->GetLivenessAnalysis();
  371. RegisterLiveness::RegionRegisterLiveness liveness{};
  372. Function* function = loop.GetHeaderBlock()->GetParent();
  373. analysis->Get(function)->ComputeLoopRegisterPressure(loop, &liveness);
  374. return split_criteria_(liveness);
  375. }
  376. Pass::Status LoopFissionPass::Process() {
  377. bool changed = false;
  378. for (Function& f : *context()->module()) {
  379. // We collect all the inner most loops in the function and run the loop
  380. // splitting util on each. The reason we do this is to allow us to iterate
  381. // over each, as creating new loops will invalidate the loop iterator.
  382. std::vector<Loop*> inner_most_loops{};
  383. LoopDescriptor& loop_descriptor = *context()->GetLoopDescriptor(&f);
  384. for (Loop& loop : loop_descriptor) {
  385. if (!loop.HasChildren() && ShouldSplitLoop(loop, context())) {
  386. inner_most_loops.push_back(&loop);
  387. }
  388. }
  389. // List of new loops which meet the criteria to be split again.
  390. std::vector<Loop*> new_loops_to_split{};
  391. while (!inner_most_loops.empty()) {
  392. for (Loop* loop : inner_most_loops) {
  393. LoopFissionImpl impl{context(), loop};
  394. // Group the instructions in the loop into two different sets of related
  395. // instructions. If we can't group the instructions into the two sets
  396. // then we can't split the loop any further.
  397. if (!impl.GroupInstructionsByUseDef()) {
  398. continue;
  399. }
  400. if (impl.CanPerformSplit()) {
  401. Loop* second_loop = impl.SplitLoop();
  402. changed = true;
  403. context()->InvalidateAnalysesExceptFor(
  404. IRContext::kAnalysisLoopAnalysis);
  405. // If the newly created loop meets the criteria to be split, split it
  406. // again.
  407. if (ShouldSplitLoop(*second_loop, context()))
  408. new_loops_to_split.push_back(second_loop);
  409. // If the original loop (now split) still meets the criteria to be
  410. // split, split it again.
  411. if (ShouldSplitLoop(*loop, context()))
  412. new_loops_to_split.push_back(loop);
  413. }
  414. }
  415. // If the split multiple times flag has been set add the new loops which
  416. // meet the splitting criteria into the list of loops to be split on the
  417. // next iteration.
  418. if (split_multiple_times_) {
  419. inner_most_loops = std::move(new_loops_to_split);
  420. new_loops_to_split = {};
  421. } else {
  422. break;
  423. }
  424. }
  425. }
  426. return changed ? Pass::Status::SuccessWithChange
  427. : Pass::Status::SuccessWithoutChange;
  428. }
  429. } // namespace opt
  430. } // namespace spvtools