parallel_for.h 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425
  1. /*
  2. Copyright (c) 2005-2020 Intel Corporation
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. #ifndef __TBB_parallel_for_H
  14. #define __TBB_parallel_for_H
  15. #define __TBB_parallel_for_H_include_area
  16. #include "internal/_warning_suppress_enable_notice.h"
  17. #include <new>
  18. #include "task.h"
  19. #include "partitioner.h"
  20. #include "blocked_range.h"
  21. #include "tbb_exception.h"
  22. #include "internal/_tbb_trace_impl.h"
  23. namespace tbb {
  24. namespace interface9 {
  25. //! @cond INTERNAL
  26. namespace internal {
  27. //! allocate right task with new parent
  28. void* allocate_sibling(task* start_for_task, size_t bytes);
  29. //! Task type used in parallel_for
  30. /** @ingroup algorithms */
  31. template<typename Range, typename Body, typename Partitioner>
  32. class start_for: public task {
  33. Range my_range;
  34. const Body my_body;
  35. typename Partitioner::task_partition_type my_partition;
  36. task* execute() __TBB_override;
  37. //! Update affinity info, if any.
  38. void note_affinity( affinity_id id ) __TBB_override {
  39. my_partition.note_affinity( id );
  40. }
  41. public:
  42. //! Constructor for root task.
  43. start_for( const Range& range, const Body& body, Partitioner& partitioner ) :
  44. my_range(range),
  45. my_body(body),
  46. my_partition(partitioner)
  47. {
  48. tbb::internal::fgt_algorithm(tbb::internal::PARALLEL_FOR_TASK, this, NULL);
  49. }
  50. //! Splitting constructor used to generate children.
  51. /** parent_ becomes left child. Newly constructed object is right child. */
  52. start_for( start_for& parent_, typename Partitioner::split_type& split_obj) :
  53. my_range(parent_.my_range, split_obj),
  54. my_body(parent_.my_body),
  55. my_partition(parent_.my_partition, split_obj)
  56. {
  57. my_partition.set_affinity(*this);
  58. tbb::internal::fgt_algorithm(tbb::internal::PARALLEL_FOR_TASK, this, (void *)&parent_);
  59. }
  60. //! Construct right child from the given range as response to the demand.
  61. /** parent_ remains left child. Newly constructed object is right child. */
  62. start_for( start_for& parent_, const Range& r, depth_t d ) :
  63. my_range(r),
  64. my_body(parent_.my_body),
  65. my_partition(parent_.my_partition, split())
  66. {
  67. my_partition.set_affinity(*this);
  68. my_partition.align_depth( d );
  69. tbb::internal::fgt_algorithm(tbb::internal::PARALLEL_FOR_TASK, this, (void *)&parent_);
  70. }
  71. static void run( const Range& range, const Body& body, Partitioner& partitioner ) {
  72. if( !range.empty() ) {
  73. #if !__TBB_TASK_GROUP_CONTEXT || TBB_JOIN_OUTER_TASK_GROUP
  74. start_for& a = *new(task::allocate_root()) start_for(range,body,partitioner);
  75. #else
  76. // Bound context prevents exceptions from body to affect nesting or sibling algorithms,
  77. // and allows users to handle exceptions safely by wrapping parallel_for in the try-block.
  78. task_group_context context(PARALLEL_FOR);
  79. start_for& a = *new(task::allocate_root(context)) start_for(range,body,partitioner);
  80. #endif /* __TBB_TASK_GROUP_CONTEXT && !TBB_JOIN_OUTER_TASK_GROUP */
  81. // REGION BEGIN
  82. fgt_begin_algorithm( tbb::internal::PARALLEL_FOR_TASK, (void*)&context );
  83. task::spawn_root_and_wait(a);
  84. fgt_end_algorithm( (void*)&context );
  85. // REGION END
  86. }
  87. }
  88. #if __TBB_TASK_GROUP_CONTEXT
  89. static void run( const Range& range, const Body& body, Partitioner& partitioner, task_group_context& context ) {
  90. if( !range.empty() ) {
  91. start_for& a = *new(task::allocate_root(context)) start_for(range,body,partitioner);
  92. // REGION BEGIN
  93. fgt_begin_algorithm( tbb::internal::PARALLEL_FOR_TASK, (void*)&context );
  94. task::spawn_root_and_wait(a);
  95. fgt_end_algorithm( (void*)&context );
  96. // END REGION
  97. }
  98. }
  99. #endif /* __TBB_TASK_GROUP_CONTEXT */
  100. //! Run body for range, serves as callback for partitioner
  101. void run_body( Range &r ) {
  102. fgt_alg_begin_body( tbb::internal::PARALLEL_FOR_TASK, (void *)const_cast<Body*>(&(this->my_body)), (void*)this );
  103. my_body( r );
  104. fgt_alg_end_body( (void *)const_cast<Body*>(&(this->my_body)) );
  105. }
  106. //! spawn right task, serves as callback for partitioner
  107. void offer_work(typename Partitioner::split_type& split_obj) {
  108. spawn( *new( allocate_sibling(static_cast<task*>(this), sizeof(start_for)) ) start_for(*this, split_obj) );
  109. }
  110. //! spawn right task, serves as callback for partitioner
  111. void offer_work(const Range& r, depth_t d = 0) {
  112. spawn( *new( allocate_sibling(static_cast<task*>(this), sizeof(start_for)) ) start_for(*this, r, d) );
  113. }
  114. };
  115. //! allocate right task with new parent
  116. // TODO: 'inline' here is to avoid multiple definition error but for sake of code size this should not be inlined
  117. inline void* allocate_sibling(task* start_for_task, size_t bytes) {
  118. task* parent_ptr = new( start_for_task->allocate_continuation() ) flag_task();
  119. start_for_task->set_parent(parent_ptr);
  120. parent_ptr->set_ref_count(2);
  121. return &parent_ptr->allocate_child().allocate(bytes);
  122. }
  123. //! execute task for parallel_for
  124. template<typename Range, typename Body, typename Partitioner>
  125. task* start_for<Range,Body,Partitioner>::execute() {
  126. my_partition.check_being_stolen( *this );
  127. my_partition.execute(*this, my_range);
  128. return NULL;
  129. }
  130. } // namespace internal
  131. //! @endcond
  132. } // namespace interfaceX
  133. //! @cond INTERNAL
  134. namespace internal {
  135. using interface9::internal::start_for;
  136. //! Calls the function with values from range [begin, end) with a step provided
  137. template<typename Function, typename Index>
  138. class parallel_for_body : internal::no_assign {
  139. const Function &my_func;
  140. const Index my_begin;
  141. const Index my_step;
  142. public:
  143. parallel_for_body( const Function& _func, Index& _begin, Index& _step )
  144. : my_func(_func), my_begin(_begin), my_step(_step) {}
  145. void operator()( const tbb::blocked_range<Index>& r ) const {
  146. // A set of local variables to help the compiler with vectorization of the following loop.
  147. Index b = r.begin();
  148. Index e = r.end();
  149. Index ms = my_step;
  150. Index k = my_begin + b*ms;
  151. #if __INTEL_COMPILER
  152. #pragma ivdep
  153. #if __TBB_ASSERT_ON_VECTORIZATION_FAILURE
  154. #pragma vector always assert
  155. #endif
  156. #endif
  157. for ( Index i = b; i < e; ++i, k += ms ) {
  158. my_func( k );
  159. }
  160. }
  161. };
  162. } // namespace internal
  163. //! @endcond
  164. // Requirements on Range concept are documented in blocked_range.h
  165. /** \page parallel_for_body_req Requirements on parallel_for body
  166. Class \c Body implementing the concept of parallel_for body must define:
  167. - \code Body::Body( const Body& ); \endcode Copy constructor
  168. - \code Body::~Body(); \endcode Destructor
  169. - \code void Body::operator()( Range& r ) const; \endcode Function call operator applying the body to range \c r.
  170. **/
  171. /** \name parallel_for
  172. See also requirements on \ref range_req "Range" and \ref parallel_for_body_req "parallel_for Body". **/
  173. //@{
  174. //! Parallel iteration over range with default partitioner.
  175. /** @ingroup algorithms **/
  176. template<typename Range, typename Body>
  177. void parallel_for( const Range& range, const Body& body ) {
  178. internal::start_for<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run(range,body,__TBB_DEFAULT_PARTITIONER());
  179. }
  180. //! Parallel iteration over range with simple partitioner.
  181. /** @ingroup algorithms **/
  182. template<typename Range, typename Body>
  183. void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner ) {
  184. internal::start_for<Range,Body,const simple_partitioner>::run(range,body,partitioner);
  185. }
  186. //! Parallel iteration over range with auto_partitioner.
  187. /** @ingroup algorithms **/
  188. template<typename Range, typename Body>
  189. void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner ) {
  190. internal::start_for<Range,Body,const auto_partitioner>::run(range,body,partitioner);
  191. }
  192. //! Parallel iteration over range with static_partitioner.
  193. /** @ingroup algorithms **/
  194. template<typename Range, typename Body>
  195. void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner ) {
  196. internal::start_for<Range,Body,const static_partitioner>::run(range,body,partitioner);
  197. }
  198. //! Parallel iteration over range with affinity_partitioner.
  199. /** @ingroup algorithms **/
  200. template<typename Range, typename Body>
  201. void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner ) {
  202. internal::start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner);
  203. }
  204. #if __TBB_TASK_GROUP_CONTEXT
  205. //! Parallel iteration over range with default partitioner and user-supplied context.
  206. /** @ingroup algorithms **/
  207. template<typename Range, typename Body>
  208. void parallel_for( const Range& range, const Body& body, task_group_context& context ) {
  209. internal::start_for<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run(range, body, __TBB_DEFAULT_PARTITIONER(), context);
  210. }
  211. //! Parallel iteration over range with simple partitioner and user-supplied context.
  212. /** @ingroup algorithms **/
  213. template<typename Range, typename Body>
  214. void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner, task_group_context& context ) {
  215. internal::start_for<Range,Body,const simple_partitioner>::run(range, body, partitioner, context);
  216. }
  217. //! Parallel iteration over range with auto_partitioner and user-supplied context.
  218. /** @ingroup algorithms **/
  219. template<typename Range, typename Body>
  220. void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner, task_group_context& context ) {
  221. internal::start_for<Range,Body,const auto_partitioner>::run(range, body, partitioner, context);
  222. }
  223. //! Parallel iteration over range with static_partitioner and user-supplied context.
  224. /** @ingroup algorithms **/
  225. template<typename Range, typename Body>
  226. void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner, task_group_context& context ) {
  227. internal::start_for<Range,Body,const static_partitioner>::run(range, body, partitioner, context);
  228. }
  229. //! Parallel iteration over range with affinity_partitioner and user-supplied context.
  230. /** @ingroup algorithms **/
  231. template<typename Range, typename Body>
  232. void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner, task_group_context& context ) {
  233. internal::start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner, context);
  234. }
  235. #endif /* __TBB_TASK_GROUP_CONTEXT */
  236. //@}
  237. namespace strict_ppl {
  238. //@{
  239. //! Implementation of parallel iteration over stepped range of integers with explicit step and partitioner
  240. template <typename Index, typename Function, typename Partitioner>
  241. void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner) {
  242. if (step <= 0 )
  243. internal::throw_exception(internal::eid_nonpositive_step); // throws std::invalid_argument
  244. else if (last > first) {
  245. // Above "else" avoids "potential divide by zero" warning on some platforms
  246. Index end = (last - first - Index(1)) / step + Index(1);
  247. tbb::blocked_range<Index> range(static_cast<Index>(0), end);
  248. internal::parallel_for_body<Function, Index> body(f, first, step);
  249. tbb::parallel_for(range, body, partitioner);
  250. }
  251. }
  252. //! Parallel iteration over a range of integers with a step provided and default partitioner
  253. template <typename Index, typename Function>
  254. void parallel_for(Index first, Index last, Index step, const Function& f) {
  255. parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner());
  256. }
  257. //! Parallel iteration over a range of integers with a step provided and simple partitioner
  258. template <typename Index, typename Function>
  259. void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner) {
  260. parallel_for_impl<Index,Function,const simple_partitioner>(first, last, step, f, partitioner);
  261. }
  262. //! Parallel iteration over a range of integers with a step provided and auto partitioner
  263. template <typename Index, typename Function>
  264. void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner) {
  265. parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, partitioner);
  266. }
  267. //! Parallel iteration over a range of integers with a step provided and static partitioner
  268. template <typename Index, typename Function>
  269. void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner) {
  270. parallel_for_impl<Index,Function,const static_partitioner>(first, last, step, f, partitioner);
  271. }
  272. //! Parallel iteration over a range of integers with a step provided and affinity partitioner
  273. template <typename Index, typename Function>
  274. void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner) {
  275. parallel_for_impl(first, last, step, f, partitioner);
  276. }
  277. //! Parallel iteration over a range of integers with a default step value and default partitioner
  278. template <typename Index, typename Function>
  279. void parallel_for(Index first, Index last, const Function& f) {
  280. parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner());
  281. }
  282. //! Parallel iteration over a range of integers with a default step value and simple partitioner
  283. template <typename Index, typename Function>
  284. void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner) {
  285. parallel_for_impl<Index,Function,const simple_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
  286. }
  287. //! Parallel iteration over a range of integers with a default step value and auto partitioner
  288. template <typename Index, typename Function>
  289. void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner) {
  290. parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
  291. }
  292. //! Parallel iteration over a range of integers with a default step value and static partitioner
  293. template <typename Index, typename Function>
  294. void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner) {
  295. parallel_for_impl<Index,Function,const static_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
  296. }
  297. //! Parallel iteration over a range of integers with a default step value and affinity partitioner
  298. template <typename Index, typename Function>
  299. void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner) {
  300. parallel_for_impl(first, last, static_cast<Index>(1), f, partitioner);
  301. }
  302. #if __TBB_TASK_GROUP_CONTEXT
  303. //! Implementation of parallel iteration over stepped range of integers with explicit step, task group context, and partitioner
  304. template <typename Index, typename Function, typename Partitioner>
  305. void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner, tbb::task_group_context &context) {
  306. if (step <= 0 )
  307. internal::throw_exception(internal::eid_nonpositive_step); // throws std::invalid_argument
  308. else if (last > first) {
  309. // Above "else" avoids "potential divide by zero" warning on some platforms
  310. Index end = (last - first - Index(1)) / step + Index(1);
  311. tbb::blocked_range<Index> range(static_cast<Index>(0), end);
  312. internal::parallel_for_body<Function, Index> body(f, first, step);
  313. tbb::parallel_for(range, body, partitioner, context);
  314. }
  315. }
  316. //! Parallel iteration over a range of integers with explicit step, task group context, and default partitioner
  317. template <typename Index, typename Function>
  318. void parallel_for(Index first, Index last, Index step, const Function& f, tbb::task_group_context &context) {
  319. parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner(), context);
  320. }
  321. //! Parallel iteration over a range of integers with explicit step, task group context, and simple partitioner
  322. template <typename Index, typename Function>
  323. void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner, tbb::task_group_context &context) {
  324. parallel_for_impl<Index,Function,const simple_partitioner>(first, last, step, f, partitioner, context);
  325. }
  326. //! Parallel iteration over a range of integers with explicit step, task group context, and auto partitioner
  327. template <typename Index, typename Function>
  328. void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner, tbb::task_group_context &context) {
  329. parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, partitioner, context);
  330. }
  331. //! Parallel iteration over a range of integers with explicit step, task group context, and static partitioner
  332. template <typename Index, typename Function>
  333. void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner, tbb::task_group_context &context) {
  334. parallel_for_impl<Index,Function,const static_partitioner>(first, last, step, f, partitioner, context);
  335. }
  336. //! Parallel iteration over a range of integers with explicit step, task group context, and affinity partitioner
  337. template <typename Index, typename Function>
  338. void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner, tbb::task_group_context &context) {
  339. parallel_for_impl(first, last, step, f, partitioner, context);
  340. }
  341. //! Parallel iteration over a range of integers with a default step value, explicit task group context, and default partitioner
  342. template <typename Index, typename Function>
  343. void parallel_for(Index first, Index last, const Function& f, tbb::task_group_context &context) {
  344. parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner(), context);
  345. }
  346. //! Parallel iteration over a range of integers with a default step value, explicit task group context, and simple partitioner
  347. template <typename Index, typename Function>
  348. void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner, tbb::task_group_context &context) {
  349. parallel_for_impl<Index,Function,const simple_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
  350. }
  351. //! Parallel iteration over a range of integers with a default step value, explicit task group context, and auto partitioner
  352. template <typename Index, typename Function>
  353. void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner, tbb::task_group_context &context) {
  354. parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
  355. }
  356. //! Parallel iteration over a range of integers with a default step value, explicit task group context, and static partitioner
  357. template <typename Index, typename Function>
  358. void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner, tbb::task_group_context &context) {
  359. parallel_for_impl<Index,Function,const static_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
  360. }
  361. //! Parallel iteration over a range of integers with a default step value, explicit task group context, and affinity_partitioner
  362. template <typename Index, typename Function>
  363. void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner, tbb::task_group_context &context) {
  364. parallel_for_impl(first, last, static_cast<Index>(1), f, partitioner, context);
  365. }
  366. #endif /* __TBB_TASK_GROUP_CONTEXT */
  367. //@}
  368. } // namespace strict_ppl
  369. using strict_ppl::parallel_for;
  370. } // namespace tbb
  371. #if TBB_PREVIEW_SERIAL_SUBSET
  372. #define __TBB_NORMAL_EXECUTION
  373. #include "../serial/tbb/parallel_for.h"
  374. #undef __TBB_NORMAL_EXECUTION
  375. #endif
  376. #include "internal/_warning_suppress_disable_notice.h"
  377. #undef __TBB_parallel_for_H_include_area
  378. #endif /* __TBB_parallel_for_H */