ppl.h
1 /***
2 * ==++==
3 *
4 * Copyright (c) Microsoft Corporation. All rights reserved.
5 *
6 * ==--==
7 * =+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
8 *
9 * ppl.h
10 *
11 * Parallel Patterns Library
12 *
13 * =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
14 ****/
15 
16 #pragma once
17 
18 #include <crtdefs.h>
19 #include <concrt.h>
20 #include <stdexcept>
21 #include <iterator>
22 #include <functional>
23 #include <memory>
24 #include <type_traits>
25 #include <algorithm>
26 
27 #include <pplconcrt.h>
28 
29 #define _PPL_H
30 
31 #pragma pack(push,_CRT_PACKING)
32 #pragma push_macro("new")
33 #undef new
34 
35 // Define the level of tracing to use
36 
37 #define _TRACE_LEVEL_INFORMATION 4
38 
43 
44 namespace Concurrency
45 {
46 namespace details
47 {
48  _CRTIMP2 size_t __cdecl _GetCombinableSize();
49 } // namespace details
50 
51 class structured_task_group;
52 class task_group;
53 
82 
83 template<typename _Function>
84 class task_handle : public ::Concurrency::details::_UnrealizedChore
 85 {
86 public:
99 
100  task_handle(const _Function& _Func) : _M_function(_Func)
101  {
102  m_pFunction = reinterpret_cast <TaskProc> (&::Concurrency::details::_UnrealizedChore::_InvokeBridge<task_handle>);
103  }
104 
108 
109  ~task_handle()
 110  {
111  //
112  // We only need to perform a liveness check if the client owns the lifetime of the handle. Doing this for runtime owned handles
113  // is not only unnecessary -- it is also dangerous.
114  //
115  if (!_GetRuntimeOwnsLifetime())
 116  {
 117  _CheckTaskCollection();
 118  }
119  }
120 
124 
125  void operator()() const
126  {
127  _M_function();
128  }
129 
130 private:
131 
132  friend class task_group;
133  friend class structured_task_group;
134 
135  // The function object invoked to perform the body of the task.
136  _Function _M_function;
137 
138  task_handle const & operator=(task_handle const&); // no assignment operator
139 
140 };
141 
162 
163 template <class _Function>
164 task_handle<_Function> make_task(const _Function& _Func)
165 {
166  return task_handle<_Function>(_Func);
167 }
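// --------------------------------------------------------------------------
// Usage sketch (illustrative only, not from the original header): make_task
// names the task_handle type for you when the functor is a lambda. The handles
// must stay alive until the owning group has finished waiting on them.
//
//     auto _T1 = Concurrency::make_task([] { /* work item 1 */ });
//     auto _T2 = Concurrency::make_task([] { /* work item 2 */ });
//
//     Concurrency::structured_task_group _Tg;
//     _Tg.run(_T1);
//     _Tg.run_and_wait(_T2);   // runs _T2 inline and also waits for _T1
// --------------------------------------------------------------------------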
168 
203 
204 class structured_task_group
 205 {
206 public:
207 
217 
218  structured_task_group()
 219  {
220  }
221 
234 
235  structured_task_group(cancellation_token _CancellationToken) :
 236  _M_task_collection(_CancellationToken._GetImpl() != NULL ? _CancellationToken._GetImpl() : Concurrency::details::_CancellationTokenState::_None())
237  {
238  }
239 
250 
251  ~structured_task_group()
 252  {
253  }
254 
282 
283  template<class _Function>
284  void run(task_handle<_Function>& _Task_handle)
285  {
286  _Task_handle._SetRuntimeOwnsLifetime(false);
287  _M_task_collection._Schedule(&_Task_handle);
288  }
289 
320 
321  template<class _Function>
322  void run(task_handle<_Function>& _Task_handle, location& _Placement)
323  {
324  _Task_handle._SetRuntimeOwnsLifetime(false);
325  _M_task_collection._Schedule(&_Task_handle, &_Placement);
326  }
327 
347 
348  task_group_status wait()
 349  {
350  //
351  // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
352  //
353  return static_cast<task_group_status>(_M_task_collection._Wait());
 354  }
355 
385 
386  template<class _Function>
387  task_group_status run_and_wait(task_handle<_Function>& _Task_handle)
 388  {
389  //
390  // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
391  //
392  return (task_group_status)_M_task_collection._RunAndWait(&_Task_handle);
393  }
394 
424 
425  template<class _Function>
426  task_group_status run_and_wait(const _Function& _Func)
427  {
428  //
429  // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
430  //
431  task_handle<_Function> _Task(_Func);
432  return run_and_wait(_Task);
 433  }
434 
442 
443  void cancel()
444  {
445  _M_task_collection._Cancel();
 446  }
447 
461 
462  bool is_canceling()
 463  {
 464  return _M_task_collection._IsCanceling();
 465  }
466 
467 private:
468 
469  // Disallow passing in an r-value for a task handle argument
470  template<class _Function> void run(task_handle<_Function>&& _Task_handle);
471 
472  // The underlying group of tasks as known to the runtime.
473  Concurrency::details::_StructuredTaskCollection _M_task_collection;
 474 };
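// --------------------------------------------------------------------------
// Usage sketch (illustrative only, not from the original header): a
// structured_task_group must be used from a single thread, and every
// task_handle passed to run() must outlive the wait. run_and_wait reports
// completion or cancellation through task_group_status.
//
//     Concurrency::structured_task_group _Stg;
//     auto _T = Concurrency::make_task([] { /* parallel work */ });
//     _Stg.run(_T);
//     Concurrency::task_group_status _Status =
//         _Stg.run_and_wait([] { /* work executed inline */ });
//     if (_Status == Concurrency::canceled) { /* handle cancellation */ }
// --------------------------------------------------------------------------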
475 
488 
489 class task_group
 490 {
491 public:
492 
502 
503  task_group()
 504  {
505  }
518 
519  task_group(cancellation_token _CancellationToken) :
520  _M_task_collection(_CancellationToken._GetImpl() != NULL ? _CancellationToken._GetImpl() : Concurrency::details::_CancellationTokenState::_None())
521  {
522  }
523 
534 
535  ~task_group()
 536  {
537  }
538 
570 
571  template<typename _Function>
572  void run(const _Function& _Func)
573  {
575  }
576 
611 
612  template<typename _Function>
613  void run(const _Function& _Func, location& _Placement)
614  {
616  }
617 
650 
651  template<typename _Function>
652  void run(task_handle<_Function>& _Task_handle)
653  {
654  _Task_handle._SetRuntimeOwnsLifetime(false);
655  _M_task_collection._Schedule(&_Task_handle);
656  }
657 
693 
694  template<typename _Function>
695  void run(task_handle<_Function>& _Task_handle, location& _Placement)
696  {
697  _Task_handle._SetRuntimeOwnsLifetime(false);
698  _M_task_collection._Schedule(&_Task_handle, &_Placement);
699  }
700 
717 
718  task_group_status wait()
 719  {
720  //
721  // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
722  //
723  return static_cast<task_group_status>(_M_task_collection._Wait());
724  }
725 
754 
755  template<class _Function>
756  task_group_status run_and_wait(task_handle<_Function>& _Task_handle)
 757  {
758  //
759  // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
760  //
761  _Task_handle._SetRuntimeOwnsLifetime(false);
762  return (task_group_status)_M_task_collection._RunAndWait(&_Task_handle);
763  }
764 
793 
794  template<class _Function>
795  task_group_status run_and_wait(const _Function& _Func)
796  {
797  //
798  // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
799  //
801  }
802 
810 
811  void cancel()
812  {
813  _M_task_collection._Cancel();
 814  }
815 
829 
830  bool is_canceling()
 831  {
 832  return _M_task_collection._IsCanceling();
 833  }
834 
835 private:
836 
837  // Disallow passing in an r-value for a task handle argument
838  template<class _Function> void run(task_handle<_Function>&& _Task_handle);
839 
840  // The underlying group of tasks as known to the runtime.
841  Concurrency::details::_TaskCollection _M_task_collection;
 842 };
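// --------------------------------------------------------------------------
// Usage sketch (illustrative only, not from the original header): unlike
// structured_task_group, a task_group may have tasks scheduled from multiple
// threads, and run() copies the functor to the heap so no task_handle needs to
// be kept alive by the caller.
//
//     Concurrency::task_group _Tg;
//     _Tg.run([] { /* first task  */ });
//     _Tg.run([] { /* second task */ });
//     if (_Tg.wait() == Concurrency::canceled)
//     {
//         // the group (or an ancestor) was canceled
//     }
// --------------------------------------------------------------------------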
843 
862 
863 template<typename _Function>
864 void run_with_cancellation_token(const _Function& _Func, cancellation_token _Ct)
865 {
866  structured_task_group _Stg(_Ct);
867  _Stg.run_and_wait(_Func);
868 }
869 
878 
879 inline void interruption_point()
880 {
881  structured_task_group _Stg;
 882  _Stg.wait();
883 }
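// --------------------------------------------------------------------------
// Usage sketch (illustrative only, not from the original header): pairing
// run_with_cancellation_token with interruption_point lets long-running work
// observe an external cancellation_token_source.
//
//     Concurrency::cancellation_token_source _Cts;
//     // ... _Cts.cancel() may be called from another thread ...
//     Concurrency::run_with_cancellation_token([]
//     {
//         for (int _I = 0; _I < 1000000; ++_I)
//         {
//             Concurrency::interruption_point(); // abandons remaining work if canceled
//             /* one unit of work */
//         }
//     }, _Cts.get_token());
// --------------------------------------------------------------------------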
884 
898 
900 
901 // Parallel Algorithms and Patterns
902 
903 // Helper function that implements parallel_invoke with two functions
904 // Used by parallel_for and parallel_for_each implementations
905 
906 template <typename _Function1, typename _Function2>
907 void _Parallel_invoke_impl(const _Function1& _Func1, const _Function2& _Func2)
908 {
909  structured_task_group _Task_group;
910 
911  task_handle<_Function1> _Task_handle1(_Func1);
912  _Task_group.run(_Task_handle1);
913 
914  // We inline the last item to prevent the unnecessary push/pop on the work queue.
915  task_handle<_Function2> _Task_handle2(_Func2);
916  _Task_group.run_and_wait(_Task_handle2);
917 }
918 
942 
943 template <typename _Function1, typename _Function2>
944 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2)
945 {
946  _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);
 947 
948  _Parallel_invoke_impl(_Func1, _Func2);
949 
950  _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
951 }
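// --------------------------------------------------------------------------
// Usage sketch (illustrative only, not from the original header):
// parallel_invoke runs the supplied functors potentially in parallel and
// returns when all of them have completed. _Compute_a/_Compute_b are
// hypothetical helpers.
//
//     int _A = 0, _B = 0;
//     Concurrency::parallel_invoke(
//         [&_A] { _A = _Compute_a(); },
//         [&_B] { _B = _Compute_b(); });
//     // both _A and _B are ready here
// --------------------------------------------------------------------------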
952 
982 
983 template <typename _Function1, typename _Function2, typename _Function3>
984 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3)
985 {
986  _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);
 987 
988  structured_task_group _Task_group;
989 
990  task_handle<_Function1> _Task_handle1(_Func1);
991  _Task_group.run(_Task_handle1);
992 
993  task_handle<_Function2> _Task_handle2(_Func2);
994  _Task_group.run(_Task_handle2);
995 
996  task_handle<_Function3> _Task_handle3(_Func3);
997  _Task_group.run_and_wait(_Task_handle3);
998 
999  _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
1000 }
1001 
1037 
1038 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4>
1039 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4)
1040 {
1041  _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);
1042 
1043  structured_task_group _Task_group;
1044 
1045  task_handle<_Function1> _Task_handle1(_Func1);
1046  _Task_group.run(_Task_handle1);
1047 
1048  task_handle<_Function2> _Task_handle2(_Func2);
1049  _Task_group.run(_Task_handle2);
1050 
1051  task_handle<_Function3> _Task_handle3(_Func3);
1052  _Task_group.run(_Task_handle3);
1053 
1054  task_handle<_Function4> _Task_handle4(_Func4);
1055  _Task_group.run_and_wait(_Task_handle4);
1056 
1057  _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
1058 }
1059 
1101 
1102 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5>
1103 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5)
1104 {
1105  _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);
1106 
1107  structured_task_group _Task_group;
1108 
1109  task_handle<_Function1> _Task_handle1(_Func1);
1110  _Task_group.run(_Task_handle1);
1111 
1112  task_handle<_Function2> _Task_handle2(_Func2);
1113  _Task_group.run(_Task_handle2);
1114 
1115  task_handle<_Function3> _Task_handle3(_Func3);
1116  _Task_group.run(_Task_handle3);
1117 
1118  task_handle<_Function4> _Task_handle4(_Func4);
1119  _Task_group.run(_Task_handle4);
1120 
1121  task_handle<_Function5> _Task_handle5(_Func5);
1122  _Task_group.run_and_wait(_Task_handle5);
1123 
1124  _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
1125 }
1126 
1174 
1175 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
1176  typename _Function6>
1177 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
1178  const _Function6& _Func6)
1179 {
1180  _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);
1181 
1182  structured_task_group _Task_group;
1183 
1184  task_handle<_Function1> _Task_handle1(_Func1);
1185  _Task_group.run(_Task_handle1);
1186 
1187  task_handle<_Function2> _Task_handle2(_Func2);
1188  _Task_group.run(_Task_handle2);
1189 
1190  task_handle<_Function3> _Task_handle3(_Func3);
1191  _Task_group.run(_Task_handle3);
1192 
1193  task_handle<_Function4> _Task_handle4(_Func4);
1194  _Task_group.run(_Task_handle4);
1195 
1196  task_handle<_Function5> _Task_handle5(_Func5);
1197  _Task_group.run(_Task_handle5);
1198 
1199  task_handle<_Function6> _Task_handle6(_Func6);
1200  _Task_group.run_and_wait(_Task_handle6);
1201 
1202  _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
1203 }
1204 
1258 
1259 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
1260  typename _Function6, typename _Function7>
1261 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
1262  const _Function6& _Func6, const _Function7& _Func7)
1263 {
1264  _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);
1265 
1266  structured_task_group _Task_group;
1267 
1268  task_handle<_Function1> _Task_handle1(_Func1);
1269  _Task_group.run(_Task_handle1);
1270 
1271  task_handle<_Function2> _Task_handle2(_Func2);
1272  _Task_group.run(_Task_handle2);
1273 
1274  task_handle<_Function3> _Task_handle3(_Func3);
1275  _Task_group.run(_Task_handle3);
1276 
1277  task_handle<_Function4> _Task_handle4(_Func4);
1278  _Task_group.run(_Task_handle4);
1279 
1280  task_handle<_Function5> _Task_handle5(_Func5);
1281  _Task_group.run(_Task_handle5);
1282 
1283  task_handle<_Function6> _Task_handle6(_Func6);
1284  _Task_group.run(_Task_handle6);
1285 
1286  task_handle<_Function7> _Task_handle7(_Func7);
1287  _Task_group.run_and_wait(_Task_handle7);
1288 
1289  _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
1290 }
1291 
1351 
1352 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
1353  typename _Function6, typename _Function7, typename _Function8>
1354 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
1355  const _Function6& _Func6, const _Function7& _Func7, const _Function8& _Func8)
1356 {
1357  _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);
1358 
1359  structured_task_group _Task_group;
1360 
1361  task_handle<_Function1> _Task_handle1(_Func1);
1362  _Task_group.run(_Task_handle1);
1363 
1364  task_handle<_Function2> _Task_handle2(_Func2);
1365  _Task_group.run(_Task_handle2);
1366 
1367  task_handle<_Function3> _Task_handle3(_Func3);
1368  _Task_group.run(_Task_handle3);
1369 
1370  task_handle<_Function4> _Task_handle4(_Func4);
1371  _Task_group.run(_Task_handle4);
1372 
1373  task_handle<_Function5> _Task_handle5(_Func5);
1374  _Task_group.run(_Task_handle5);
1375 
1376  task_handle<_Function6> _Task_handle6(_Func6);
1377  _Task_group.run(_Task_handle6);
1378 
1379  task_handle<_Function7> _Task_handle7(_Func7);
1380  _Task_group.run(_Task_handle7);
1381 
1382  task_handle<_Function8> _Task_handle8(_Func8);
1383  _Task_group.run_and_wait(_Task_handle8);
1384 
1385  _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
1386 }
1387 
1453 
1454 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
1455  typename _Function6, typename _Function7, typename _Function8, typename _Function9>
1456 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
1457  const _Function6& _Func6, const _Function7& _Func7, const _Function8& _Func8, const _Function9& _Func9)
1458 {
1459  _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);
1460 
1461  structured_task_group _Task_group;
1462 
1463  task_handle<_Function1> _Task_handle1(_Func1);
1464  _Task_group.run(_Task_handle1);
1465 
1466  task_handle<_Function2> _Task_handle2(_Func2);
1467  _Task_group.run(_Task_handle2);
1468 
1469  task_handle<_Function3> _Task_handle3(_Func3);
1470  _Task_group.run(_Task_handle3);
1471 
1472  task_handle<_Function4> _Task_handle4(_Func4);
1473  _Task_group.run(_Task_handle4);
1474 
1475  task_handle<_Function5> _Task_handle5(_Func5);
1476  _Task_group.run(_Task_handle5);
1477 
1478  task_handle<_Function6> _Task_handle6(_Func6);
1479  _Task_group.run(_Task_handle6);
1480 
1481  task_handle<_Function7> _Task_handle7(_Func7);
1482  _Task_group.run(_Task_handle7);
1483 
1484  task_handle<_Function8> _Task_handle8(_Func8);
1485  _Task_group.run(_Task_handle8);
1486 
1487  task_handle<_Function9> _Task_handle9(_Func9);
1488  _Task_group.run_and_wait(_Task_handle9);
1489 
1490  _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
1491 }
1492 
1564 
1565 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
1566  typename _Function6, typename _Function7, typename _Function8, typename _Function9, typename _Function10>
1567 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
1568  const _Function6& _Func6, const _Function7& _Func7, const _Function8& _Func8, const _Function9& _Func9, const _Function10& _Func10)
1569 {
1570  _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);
1571 
1572  structured_task_group _Task_group;
1573 
1574  task_handle<_Function1> _Task_handle1(_Func1);
1575  _Task_group.run(_Task_handle1);
1576 
1577  task_handle<_Function2> _Task_handle2(_Func2);
1578  _Task_group.run(_Task_handle2);
1579 
1580  task_handle<_Function3> _Task_handle3(_Func3);
1581  _Task_group.run(_Task_handle3);
1582 
1583  task_handle<_Function4> _Task_handle4(_Func4);
1584  _Task_group.run(_Task_handle4);
1585 
1586  task_handle<_Function5> _Task_handle5(_Func5);
1587  _Task_group.run(_Task_handle5);
1588 
1589  task_handle<_Function6> _Task_handle6(_Func6);
1590  _Task_group.run(_Task_handle6);
1591 
1592  task_handle<_Function7> _Task_handle7(_Func7);
1593  _Task_group.run(_Task_handle7);
1594 
1595  task_handle<_Function8> _Task_handle8(_Func8);
1596  _Task_group.run(_Task_handle8);
1597 
1598  task_handle<_Function9> _Task_handle9(_Func9);
1599  _Task_group.run(_Task_handle9);
1600 
1601  task_handle<_Function10> _Task_handle10(_Func10);
1602  _Task_group.run_and_wait(_Task_handle10);
1603 
1604  _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
1605 }
1606 
1612 
1613 class auto_partitioner
 1614 {
1615 public:
1619 
1621 
1625 
1627 
1628  template<class _Type>
1629  _Type _Get_num_chunks(_Type) const
 1630  {
 1631  return static_cast<_Type>(Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors());
 1632  }
1633 };
1634 
1639 
1640 class static_partitioner
 1641 {
1642 public:
1646 
1647  static_partitioner()
 1648  {
1649  }
1650 
1654 
1656 
1657  template<class _Type>
1658  _Type _Get_num_chunks(_Type) const
 1659  {
 1660  return static_cast<_Type>(Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors());
 1661  }
1662 };
1663 
1668 
1669 class simple_partitioner
 1670 {
1671 private:
1672  typedef unsigned long long _Size_type;
1673 
1674 public:
1681 
1682  explicit simple_partitioner(_Size_type _Chunk_size) : _M_chunk_size(_Chunk_size)
1683  {
1684  if (_Chunk_size == 0)
1685  {
1686  throw std::invalid_argument("_Chunk_size");
1687  }
1688  }
1689 
1693 
1695 
1696  template<class _Type>
1697  _Type _Get_num_chunks(_Type _Range_arg) const
1698  {
1699  static_assert(sizeof(_Type) <= sizeof(_Size_type), "Potential truncation of _Range_arg");
1700  _Size_type _Num_chunks = (static_cast<_Size_type>(_Range_arg) / _M_chunk_size);
1701 
1702  if (_Num_chunks == 0)
1703  {
1704  _Num_chunks = 1;
1705  }
1706 
1707  return static_cast<_Type>(_Num_chunks);
1708  }
1709 
1710 private:
1711 
1712  _Size_type _M_chunk_size;
1713 };
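// --------------------------------------------------------------------------
// Worked example (illustrative only, not from the original header): with
// simple_partitioner(100), _Get_num_chunks(1000) yields 1000 / 100 = 10
// chunks, while _Get_num_chunks(50) yields 0 and is clamped to 1, so a range
// smaller than the chunk size still executes as a single chunk.
// --------------------------------------------------------------------------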
1714 
1721 
1722 class affinity_partitioner
 1723 {
1724 public:
1725 
1729 
1730  affinity_partitioner() : _M_num_chunks(0), _M_pChunk_locations(NULL)
 1731  {
1732  }
1733 
1737 
1738  ~affinity_partitioner()
 1739  {
1740  delete [] _M_pChunk_locations;
1741  }
1742 
1743  location& _Get_chunk_location(unsigned int _ChunkIndex)
1744  {
1745  return _M_pChunk_locations[_ChunkIndex];
1746  }
1747 
1748  template<class _Type>
1749  _Type _Get_num_chunks(_Type)
 1750  {
1751  if (_M_num_chunks == 0)
1752  {
1753  _Initialize_locations(static_cast<unsigned int>(Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors()));
 1754  }
1755 
1756  return static_cast<_Type>(_M_num_chunks);
1757  }
1758 
1759 private:
1760  // The number of chunks the partitioner will record affinity for.
1761  unsigned int _M_num_chunks;
1762 
1763  // Array of remembered locations.
1764  location * _M_pChunk_locations;
 1765 
1766  void _Initialize_locations(unsigned int _Num_chunks)
1767  {
1768  _M_num_chunks = _Num_chunks;
1769  _M_pChunk_locations = new location[_Num_chunks];
1770  }
1771 };
1772 
1773 // Helper methods for scheduling and executing parallel tasks
1774 
1775 // Disable C4180: qualifier applied to function type has no meaning; ignored
1776 // Warning fires for passing Foo function pointer to parallel_for instead of &Foo.
1777 #pragma warning(push)
1778 #pragma warning(disable: 4180)
1779 // Disable C6263: using _alloca in a loop; this can quickly overflow stack
1780 #pragma warning(disable: 6263)
1781 
1782 // Template class that invokes user function on a parallel_for_each
1783 
1784 template <typename _Random_iterator, typename _Index_type, typename _Function, bool _Is_iterator>
1785 class _Parallel_chunk_helper_invoke
 1786 {
1787 public:
1788  static void __cdecl _Invoke(const _Random_iterator& _First, _Index_type& _Index, const _Function& _Func)
1789  {
1790  _Func(_First[_Index]);
1791  }
1792 };
1793 
1794 // Template specialized class that invokes user function on a parallel_for
1795 
1796 template <typename _Random_iterator, typename _Index_type, typename _Function>
1797 class _Parallel_chunk_helper_invoke<_Random_iterator, _Index_type, _Function, false>
1798 {
1799 public:
1800  static void __cdecl _Invoke(const _Random_iterator& _First, _Index_type& _Index, const _Function& _Func)
1801  {
1802  _Func(static_cast<_Random_iterator>(_First + _Index));
1803  }
1804 };
1805 
1806 // Represents a range of iteration
1807 
1808 template<typename _Index_type>
1809 class _Range
1810 {
1811 public:
1812 
1813  // Construct an object for the range [_Current_iteration, _Last_iteration)
1814  _Range(_Index_type _Current_iteration, _Index_type _Last_iteration) :
1815  _M_current(_Current_iteration), _M_last(_Last_iteration)
1816  {
1817  // On creation, the range shall have at least 1 iteration.
1819  }
1820 
1821  // Send a portion of the range to the helper
1822  void _Send_range(_Range<_Index_type> * _Helper_range)
1823  {
1824  // If there are no iterations other than the current one left until finish, there is no help
1825  // needed. Set the pointer to a special value that helper will understand and continue
1826  // doing the work.
1827  _Index_type _Remaining_iterations = _Number_of_iterations();
1828  if (_Remaining_iterations > 1)
1829  {
1830  // Compute the two pieces of the work range: one for the worker and one for helper class.
1831  _M_last_iteration = _M_current_iteration + _Remaining_iterations / 2;
1832 
1833  // There needs to be at least 1 iteration left because the current iteration cannot be sent.
1835  }
1836 
1837  // This is also a signal for the helper that a range has been sent to it.
1838  _Helper_range->_M_current_iteration = _M_last_iteration;
1839  }
1840 
1841  // Steal the entire range and give it to the helper
1842  void _Steal_range(_Range<_Index_type> * _Helper_range)
1843  {
1844  // We allow stealing only from a range that has at least 1 iteration
1846 
1847  _Index_type _Current_iter = _M_current_iteration;
1848 
1849  _Helper_range->_M_current_iteration = _Current_iter + 1;
1850  _Helper_range->_M_last_iteration = _M_last_iteration;
1851 
1852  _M_last_iteration = _Current_iter + 1;
1853  }
1854 
1855  // Returns the number of iterations in this range
1856  _Index_type _Number_of_iterations() const
1857  {
1858  return (_M_last_iteration - _M_current_iteration);
1859  }
1860 
1861  // Returns the current iteration in the range
1862  _Index_type _Get_current_iteration() const
1863  {
1864  return _M_current;
1865  }
1866 
1867  // Sets the current iteration in the range
1868  void _Set_current_iteration(const _Index_type _I)
1869  {
1870  _M_current = _I;
1871  }
1872 
1873  __declspec(property(get=_Get_current_iteration, put=_Set_current_iteration)) _Index_type _M_current_iteration;
1874 
1875  // Returns the last iteration in the range
1876  _Index_type _Get_last_iteration() const
1877  {
1878  return _M_last;
1879  }
1880 
1881  // Sets the last iteration in the range
1882  void _Set_last_iteration(const _Index_type _I)
1883  {
1884  _M_last = _I;
1885  }
1886 
1887  __declspec(property(get=_Get_last_iteration, put=_Set_last_iteration)) _Index_type _M_last_iteration;
1888 
1889 private:
1890 
1891  // These members are volatile because they are updated by the helper
1892  // and used by the worker.
1893  volatile _Index_type _M_current;
1894  volatile _Index_type _M_last;
1895 };
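// --------------------------------------------------------------------------
// Worked example (illustrative only, not from the original header): if a
// worker's range is currently [2, 8) (6 iterations remain), _Send_range moves
// the worker's last iteration to 2 + 6 / 2 = 5 and publishes 5 as the helper's
// first iteration, so the worker keeps [2, 5) and the helper takes [5, 8).
// _Steal_range instead leaves the worker only its current iteration and hands
// everything after it to the helper.
// --------------------------------------------------------------------------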
1896 
1897 // A proxy for the worker responsible for maintaining communication with the helper
1898 
1899 template<typename _Index_type>
1900 class _Worker_proxy
 1901 {
1902 public:
1903  _Worker_proxy(_Worker_proxy *_PParent_worker = NULL) :
1904  _M_pHelper_range(NULL), _M_pParent_worker(_PParent_worker), _M_pWorker_range(NULL), _M_completion_count(0), _M_stop_iterating(0)
1905  {
1907  }
1908 
1909  ~_Worker_proxy()
 1910  {
1911  // Make the check to avoid doing extra work in the non-exceptional cases
1912  if (_M_completion_count != _Tree_Complete)
1913  {
1914  // On exception, notify our parent so it breaks out of its loop.
1915  _Propagate_cancel();
1916 
1917  // On exception, we need to set _M_completion_count to ensure that the helper breaks out of its spin wait.
1918  _Set_done();
1919  }
1920  }
1921 
1922  // Obtain a range from the worker
1923  bool _Receive_range(_Range<_Index_type> * _Helper_range)
1924  {
1925  // If the worker already finished, then there is no work left for the helper
1926  if (_M_completion_count)
1927  {
1928  return false;
1929  }
1930 
1931  _CONCRT_ASSERT(_Helper_range != NULL);
1932 
1933  // There are two special values for _M_current_iteration that are not valid: one is the
1934  // initial value of the working class which it will never share, and the other is
1935  // the last exclusive iteration of the working class, which has no work to be done.
1936  // We use the former value so that we can understand worker's response.
1937  _Index_type _Cached_first_iteration = _Helper_range->_M_current_iteration;
1938 
1939  // The following operation is not done via an interlocked operation because it does not have to be.
1940  // Helper lazily registers that it would like to help the worker, but it allows for some
1941  // time to elapse before that information has made it over to the worker. The idea
1942  // is not to disturb the worker if it is not necessary. It is possible to add interlocked
1943  // operation in the future if the time spent in the busy wait loop is too big.
1944  _CONCRT_ASSERT(_M_pHelper_range == NULL);
1945  _M_pHelper_range = _Helper_range;
1946 
1948 
1949  // If the worker is done, it will flush the store buffer and signal the helper by
1950  // changing _M_current_iteration in the helper's range.
1951  while ((_Helper_range->_M_current_iteration == _Cached_first_iteration) && !_M_completion_count)
1952  {
1953  if ((_M_pWorker_range != NULL) && _M_context._IsSynchronouslyBlocked())
1954  {
1955  // Attempt to steal the entire range from the worker if it is synchronously blocked.
1956 
1957  // Make sure that worker makes no forward progress while helper is attempting to
1958  // steal its range. If worker does get unblocked, simply back off in the helper.
1959  // Note that there could be another helper running if a range has already been
1960  // sent to us.
1961  long _Stop_iterating = _InterlockedIncrement(&_M_stop_iterating);
1962  _CONCRT_ASSERT(_Stop_iterating > 0);
1963 
1964  // We need to make a local copy as the pointer could be changed by the worker.
1965  _Range<_Index_type> * _Worker_range = _M_pWorker_range;
1966 
1967  // The order of comparison needs to be preserved. If the parent is blocked, then
1968  // it cannot send a range (because _M_stop_iterating is already set). If it sent a range
1969  // before being synchronously blocked, then we are no longer the helper. Refrain
1970  // from intrusively stealing the range.
1971  if ((_Worker_range != NULL) && _M_context._IsSynchronouslyBlocked()
1972  && (_Helper_range->_M_current_iteration == _Cached_first_iteration) && !_M_completion_count)
1973  {
1974  _CONCRT_ASSERT(_M_pHelper_range == _Helper_range);
1975 
1976  _M_pHelper_range = NULL;
1977  _Worker_range->_Steal_range(_Helper_range);
1978 
1979  _CONCRT_ASSERT(_Helper_range->_M_current_iteration != _Cached_first_iteration);
1980  }
1981 
1982  // At this point, worker is either:
1983  //
1984  // a) no longer blocked so range will come to the helper naturally, or
1985  // b) out of iterations because helper stole all of it
1986  _Stop_iterating = _InterlockedDecrement(&_M_stop_iterating);
1987  _CONCRT_ASSERT(_Stop_iterating >= 0);
1988  }
1989  else
1990  {
1991  // If there is no work received in a full spin, then start yielding the context
1992  spinWait._SpinOnce();
1993  }
1994  }
1995 
1996  // If the initial iteration is the same as the original first iteration then the
1997  // worker class is sending the signal that it does not need any help.
1998  if (_Helper_range->_M_current_iteration == _Cached_first_iteration)
1999  {
2000  return false;
2001  }
2002 
2003  return (_Helper_range->_Number_of_iterations() > 0);
2004  }
2005 
2006  // Send a portion of our range and notify the helper.
2007  bool _Send_range(_Range<_Index_type> * _Worker_range)
2008  {
2009  // Worker range shall not be available for stealing at this time.
2010  _CONCRT_ASSERT(_M_pWorker_range == NULL);
2011 
2012  // Helper shall be registered.
2013  _CONCRT_ASSERT(_M_pHelper_range != NULL);
2014 
2015  // Send the range
2016  _Worker_range->_Send_range(_M_pHelper_range);
2017 
2018  // Notify the helper. The fence ensures that the prior updates are visible.
2019  _InterlockedExchangePointer(reinterpret_cast<void * volatile *>(&_M_pHelper_range), NULL);
2020 
2021  // The current iteration should still be left
2022  _CONCRT_ASSERT(_Worker_range->_Number_of_iterations() >= 1);
2023 
2024  // Indicate if we need another helper
2025  return (_Worker_range->_Number_of_iterations() > 1);
2026  }
2027 
2028  // Let the helper know that it is ok to intrusively steal range from the worker by publishing the
2029  // remaining range.
2030  void _Enable_intrusive_steal(_Range<_Index_type> * _Worker_range)
 2031  {
2032  _M_pWorker_range = _Worker_range;
2033  }
2034 
2035  // Prevent the helper from intrusively stealing range from the worker
2036  void _Disable_intrusive_steal()
 2037  {
2038  _M_pWorker_range = NULL;
2039  _Wait_on_intrusive_steal();
2040  }
2041 
2042  bool _Is_helper_registered()
 2043  {
2044  return (_M_pHelper_range != NULL);
2045  }
2046 
2047  bool _Is_done()
2048  {
2049  return (_M_completion_count != 0);
2050  }
2051 
2052  void _Set_done()
2053  {
2054  // Let the helper know that this class is done with work and flush the store buffer. This operation
2055  // ensures that any buffered store to helper range in _Send_range is flushed and
2056  // available in _Receive_range (so there will be no lost ranges).
2057  _InterlockedExchange(&_M_completion_count, 1);
2058  }
2059 
2060  void _Set_tree_done()
 2061  {
2062  // Make sure that **WE** know when our destructor hits that the entire tree is complete.
2063  _M_completion_count = _Tree_Complete;
2064  }
2065 
2066  bool _Is_beacon_signaled()
 2067  {
2068  return _M_beacon._Is_signaled();
2069  }
2070 
2071  bool _Verify_beacon_cancellation()
 2072  {
2073  return _M_beacon._Confirm_cancel();
2074  }
2075 
2076 private:
2077 
2078  // Spin wait for any intrusive steal that is in progress.
2079  void _Wait_on_intrusive_steal()
 2080  {
2081  // This code is used to synchronize with helper in case of worker cooperative blocking.
2082  if (_M_stop_iterating != 0)
2083  {
2085 
2086  while (_M_stop_iterating != 0)
2087  {
2088  spinWait._SpinOnce();
2089  }
2090  }
2091  }
2092 
2093  void _NotifyCancel()
 2094  {
2095  _M_beacon._Raise();
2096  }
2097 
2098  void _Propagate_cancel()
 2099  {
2100  if (_M_pParent_worker != NULL)
2101  {
2102  _M_pParent_worker->_NotifyCancel();
2103  }
2104  }
2105 
2106  // Constant indicating sub-tree completion
2107  static const long _Tree_Complete = 2;
2108 
2109  // Read in the loop
2110  _Range<_Index_type> * volatile _M_pHelper_range;
 2111 
2112  // Read at the end of the loop
2113  _Worker_proxy * _M_pParent_worker;
2114 
2115  // Written rarely
2118 
2119  volatile long _M_completion_count;
2120 
2121  // Written to in the loop
2122  _Range<_Index_type> * volatile _M_pWorker_range;
 2123  volatile long _M_stop_iterating;
2124 
2125  _Worker_proxy const & operator=(_Worker_proxy const&); // no assignment operator
2126 
2127 };
2128 
2129 // parallel_for -- Performs parallel iteration over a range of indices from _First to _Last,
2130 // excluding _Last. The order in which each iteration is executed is unspecified and non-deterministic.
2131 
2132 // Closure (binding) classes for invoking parallel_for and parallel_for_each, with chunks
2133 
2134 // A dynamically rebalancing closure class used for packaging parallel_for or parallel_for_each for invocation in chunks.
2135 // If some tasks finish earlier than others, helper tasks get executed which ensures further distribution of work.
2136 
2137 template <typename _Random_iterator, typename _Index_type, typename _Function, typename _Partitioner, bool _Is_iterator>
2138 class _Parallel_chunk_helper
 2139 {
2140 public:
2141  _Parallel_chunk_helper(_Index_type, const _Random_iterator& _First, _Index_type _First_iteration, _Index_type _Last_iteration, const _Index_type& _Step,
2142  const _Function& _Func, const _Partitioner&, _Worker_proxy<_Index_type> * const _Parent_data = NULL) :
2143  _M_first(_First), _M_first_iteration(_First_iteration), _M_last_iteration(_Last_iteration), _M_step(_Step), _M_function(_Func),
2144  _M_parent_worker(_Parent_data)
2145  {
2146  // Empty constructor because members are already assigned
2147  }
2148 
2149  // Constructor overload that accepts a range
2150  _Parallel_chunk_helper(const _Random_iterator& _First, const _Index_type& _Step, const _Function& _Func,
2151  const _Range<_Index_type>& _Worker_range, _Worker_proxy<_Index_type> * const _Parent_data = NULL) :
2152  _M_first(_First), _M_first_iteration(_Worker_range._M_current_iteration), _M_last_iteration(_Worker_range._M_last_iteration), _M_step(_Step), _M_function(_Func),
2153  _M_parent_worker(_Parent_data)
2154  {
2155  // Empty constructor because members are already assigned
2156  }
2157 
2158  // The main helper function which iterates over the given collection and invokes user function on every iteration.
2159  // Function is marked as const even though it does mutate some of its members (those are declared as mutable). This is done
2160  // in order to easily communicate between a worker and a helper instance, without holding references to many local variables.
2161  // However, this function does not mutate any state that is visible to anyone outside of this class, nor would that be
2162  // possible due to the implicit copy of the functor that happens when a new task_handle is created.
2163  void operator()() const
2164  {
2165  _Range<_Index_type> _Worker_range(_M_first_iteration, _M_last_iteration);
2166 
2167  // This class has two modes: worker and helper. The originally split chunk is always a
2168  // worker, while any subsequent class spawned from this class is in the helper
2169  // mode, which is signified using a link to the worker class through _M_pOwning_worker
2170  // handle. So, it will wait for work to be dished out by the working class while in helper mode.
2171  if (_M_parent_worker != NULL && !_M_parent_worker->_Receive_range(&_Worker_range))
2172  {
2173  // If the worker class rejected the help, simply return
2174  return;
2175  }
2176 
2177  // Keep the secondary, scaled, loop index for quick indexing into the data structure
2178  _Index_type _Current_iteration = _Worker_range._M_current_iteration;
2179  _Index_type _Scaled_index = _Current_iteration * _M_step;
2180 
2181  // If there is only one iteration to be executed there is no need to initialize any
2182  // helper classes (work is indivisible).
2183  if (_Worker_range._Number_of_iterations() == 1)
2184  {
2185  // Execute one iteration
2186  _Parallel_chunk_helper_invoke<_Random_iterator, _Index_type, _Function, _Is_iterator>::_Invoke(_M_first, _Scaled_index, _M_function);
 2187  return;
2188  }
2189 
2190  // If the execution reaches this point it means that this class now has a chunk of work
2191  // that it needs to get done, so it has transitioned into the worker mode.
2192  structured_task_group _Helper_group;
2193 
2194  // Initialize fields that are needed in the helper
2195  _Worker_proxy<_Index_type> _Worker(_M_parent_worker);
2196 
2197  // Instantiate a helper class for this working class and put it on the work queue.
2198  // If some thread is idle it will be able to steal the helper and help this class
2199  // finish its work by stealing a piece of the work range.
2200  task_handle<_Parallel_chunk_helper> _Helper_task(_Parallel_chunk_helper(_M_first, _M_step, _M_function, _Worker_range, &_Worker));
2201 
2202  _Helper_group.run(_Helper_task);
2203 
2205 
2206  // Normally, for a cancellation semantic in cooperation with the helper, we would run_and_wait the below code on the Helper_group. Unfortunately,
2207  // the capture by reference of things which must be shared (_Worker, and so forth) will cause the loop below to add additional indirection
2208  // instructions. The loop below *MUST* be as tight as possible with the defined semantics. Instead, we will manually notify our parent if the
2209  // worker's destructor runs without hitting the bottom of our chunk. This is done through notification on the beacon.
2210 
2211  for (_Index_type _I = _Current_iteration; _I < _Worker_range._M_last_iteration; (_I++, _Worker_range._M_current_iteration =_I, _Scaled_index += _M_step))
2212  {
2213  if (_Worker._Is_beacon_signaled())
2214  {
2215  // Either a parent task group is canceled or one of the other iterations
2216  // threw an exception. Abort the remaining iterations
2217  //
2218  // Note that this could be a false positive that we must verify.
2219  if (_Worker._Is_done() || _Worker._Verify_beacon_cancellation())
2220  {
2221  break;
2222  }
2223  }
2224 
2225  if (_Worker._Is_helper_registered())
2226  {
2227  // The helper class (there can only be one) registered to help this class with the work.
2228  // Thus, figure out if this class needs help and split the range among the two classes.
2229 
2230  if (_Worker._Send_range(&_Worker_range))
2231  {
2232  // Construct every new instance of a helper class on the stack because it is beneficial to use
2233  // a structured task group where the class itself is responsible for task handle's lifetime.
2235 
2236  new(_Helper_subtask) task_handle<_Parallel_chunk_helper>
2237  (_Parallel_chunk_helper(_M_first, _M_step, _M_function, _Worker_range, &_Worker));
2238 
2239  // If _Send_range returns true, that means that there is still some non-trivial
2240  // work to be done, so this class will potentially need another helper.
2241  _Helper_group.run(*_Helper_subtask);
2242  }
2243  }
2244 
2245  // Allow intrusive stealing by the helper
2246  _Worker._Enable_intrusive_steal(&_Worker_range);
2247 
2248  // Execute one iteration: the element is at scaled index away from the first element.
2249  _Parallel_chunk_helper_invoke<_Random_iterator, _Index_type, _Function, _Is_iterator>::_Invoke(_M_first, _Scaled_index, _M_function);
 2250 
2251  // Helper shall not steal a range after this call
2252  _Worker._Disable_intrusive_steal();
2253  }
2254 
2255  // Indicate that the worker is done with its iterations.
2256  _Worker._Set_done();
2257 
2258  // Wait for all worker/helper iterations to finish
2259  _Helper_group.wait();
2260 
2261  // Make sure that we've signaled that the tree is complete. This is used to detect any exception out of either _Parallel_chunk_helper_invoke or
2262  // _Helper_group.wait() above as a cancellation of the loop which must propagate upwards because we do not wrap the loop body in run_and_wait.
2263  _Worker._Set_tree_done();
2264  }
2265 
2266 private:
2267 
2268  const _Random_iterator& _M_first;
2269  const _Index_type& _M_step;
2270  const _Function& _M_function;
2271 
2272  const _Index_type _M_first_iteration;
2273  const _Index_type _M_last_iteration;
2274 
2275  _Worker_proxy<_Index_type> * const _M_parent_worker;
 2276 
2277  _Parallel_chunk_helper const & operator=(_Parallel_chunk_helper const&); // no assignment operator
2278 };
2279 
2280 template <typename _Random_iterator, typename _Index_type, typename _Function, typename _Partitioner, bool _Is_iterator>
2281 class _Parallel_fixed_chunk_helper
 2282 {
2283 public:
2284  _Parallel_fixed_chunk_helper(_Index_type, const _Random_iterator& _First, _Index_type _First_iteration,
2285  _Index_type _Last_iteration, const _Index_type& _Step, const _Function& _Func, const _Partitioner&) :
2286  _M_first(_First), _M_first_iteration(_First_iteration), _M_last_iteration(_Last_iteration), _M_step(_Step), _M_function(_Func)
2287  {
2288  // Empty constructor because members are already assigned
2289  }
2290 
2291  void operator()() const
2292  {
2293  // Keep the secondary, scaled, loop index for quick indexing into the data structure
2294  _Index_type _Scaled_index = _M_first_iteration * _M_step;
2295 
2296  for (_Index_type _I = _M_first_iteration; _I < _M_last_iteration; (_I++, _Scaled_index += _M_step))
2297  {
2298  // Execute one iteration: the element is at scaled index away from the first element.
2299  _Parallel_chunk_helper_invoke<_Random_iterator, _Index_type, _Function, _Is_iterator>::_Invoke(_M_first, _Scaled_index, _M_function);
 2300  }
2301  }
2302 private:
2303 
2304  const _Random_iterator& _M_first;
2305  const _Index_type& _M_step;
2306  const _Function& _M_function;
2307 
2308  const _Index_type _M_first_iteration;
2309  const _Index_type _M_last_iteration;
2310 
2311  _Parallel_fixed_chunk_helper const & operator=(_Parallel_fixed_chunk_helper const&); // no assignment operator
2312 };
2313 
2314 template <typename _Random_iterator, typename _Index_type, typename _Function, bool _Is_iterator>
2315 class _Parallel_localized_chunk_helper
 2316 {
2317 public:
2319 
2320  _Parallel_localized_chunk_helper(_Index_type _Chunk_index, const _Random_iterator& _First, _Index_type _First_iteration, _Index_type _Last_iteration, const _Index_type& _Step,
2321  const _Function& _Func, affinity_partitioner& _Part) :
2322  _M_fixed_helper(_Chunk_index, _First, _First_iteration, _Last_iteration, _Step, _Func, static_partitioner()),
2323  _M_chunk_location(_Part._Get_chunk_location(static_cast<unsigned int>(_Chunk_index)))
2324  {
2325  // Empty constructor because members are already assigned
2326  }
2327 
2328  // Override the operator() in the base class. Note that this is not a virtual override.
2329  void operator()() const
2330  {
2331  // Check here if location needs to be saved.
2332  if (_M_chunk_location._Is_system())
2333  {
2334  _M_chunk_location = location::current();
2335  }
2336 
2337  _M_fixed_helper();
2338  }
2339 private:
2340 
2343  _Parallel_localized_chunk_helper const & operator=(_Parallel_localized_chunk_helper const&); // no assignment operator
2344 };
2345 
2346 #pragma warning(pop)
2347 
2348 template <typename _Worker_class, typename _Index_type, typename Partitioner>
2349 void _Parallel_chunk_task_group_run(structured_task_group& _Task_group,
 2350  task_handle<_Worker_class>* _Chunk_helpers,
2351  const Partitioner&,
2352  _Index_type _I)
2353 {
2354  _Task_group.run(_Chunk_helpers[_I]);
2355 }
2356 
2357 template <typename _Worker_class, typename _Index_type>
2358 void _Parallel_chunk_task_group_run(structured_task_group& _Task_group,
 2359  task_handle<_Worker_class>* _Chunk_helpers,
2360  affinity_partitioner& _Part,
2361  _Index_type _I)
2362 {
2363  _Task_group.run(_Chunk_helpers[_I], _Part._Get_chunk_location(static_cast<unsigned int>(_I)));
2364 }
2365 
2366 
2367 // Helper functions that implement parallel_for
2368 
2369 template <typename _Worker_class, typename _Random_iterator, typename _Index_type, typename _Function, typename _Partitioner>
2370 void _Parallel_chunk_impl(const _Random_iterator& _First, _Index_type _Range_arg, const _Index_type& _Step, const _Function& _Func, _Partitioner&& _Part)
2371 {
2372  _CONCRT_ASSERT(_Range_arg > 1);
2373  _CONCRT_ASSERT(_Step > 0);
2374 
2375  _Index_type _Num_iterations = (_Step == 1) ? _Range_arg : (((_Range_arg - 1) / _Step) + 1);
2376  _CONCRT_ASSERT(_Num_iterations > 1);
2377 
2378  _Index_type _Num_chunks = _Part._Get_num_chunks(_Num_iterations);
2379  _CONCRT_ASSERT(_Num_chunks > 0);
2380 
2381  // Allocate memory on the stack for task_handles to ensure everything is properly structured.
2383  task_handle<_Worker_class> * _Chunk_helpers = _Holder._InitOnRawMalloca(_malloca(sizeof(task_handle<_Worker_class>) * static_cast<size_t>(_Num_chunks)));
2384 
2385  structured_task_group _Task_group;
2386 
2387  _Index_type _Iterations_per_chunk = _Num_iterations / _Num_chunks;
2388  _Index_type _Remaining_iterations = _Num_iterations % _Num_chunks;
2389 
2390  // If there are fewer iterations than desired chunks, set the chunk number
2391  // to be the number of iterations.
2392  if (_Iterations_per_chunk == 0)
2393  {
2394  _Num_chunks = _Remaining_iterations;
2395  }
2396 
2397  _Index_type _Work_size = 0;
2398  _Index_type _Start_iteration = 0;
2399  _Index_type _I;
2400 
2401  // Split the available work in chunks
2402  for (_I = 0; _I < _Num_chunks - 1; _I++)
2403  {
2404  if (_Remaining_iterations > 0)
2405  {
2406  // Iterations are not divided evenly, so add 1 remainder iteration each time
2407  _Work_size = _Iterations_per_chunk + 1;
2408  _Remaining_iterations--;
2409  }
2410  else
2411  {
2412  _Work_size = _Iterations_per_chunk;
2413  }
2414 
2415  // New up a task_handle "in-place", in the array preallocated on the stack
2416  new(&_Chunk_helpers[_I]) task_handle<_Worker_class>(_Worker_class(_I, _First, _Start_iteration, _Start_iteration + _Work_size, _Step, _Func, std::forward<_Partitioner>(_Part)));
2418 
2419  // Run each of the chunk tasks in parallel
2420  _Parallel_chunk_task_group_run(_Task_group, _Chunk_helpers, std::forward<_Partitioner>(_Part), _I);
2421 
2422  // Prepare for the next iteration
2423  _Start_iteration += _Work_size;
2424  }
2425 
2426  // Because this is the last chunk, the work size might be different
2427  _CONCRT_ASSERT((_Remaining_iterations == 0) || ((_Iterations_per_chunk == 0) && (_Remaining_iterations == 1)));
2428  _Work_size = _Num_iterations - _Start_iteration;
2429 
2430  // New up a task_handle "in-place", in the array preallocated on the stack
2431  new(&_Chunk_helpers[_I]) task_handle<_Worker_class>(_Worker_class(_I, _First, _Start_iteration, _Start_iteration + _Work_size, _Step, _Func, std::forward<_Partitioner>(_Part)));
2433 
2434  _Task_group.run_and_wait(_Chunk_helpers[_I]);
2435 }
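// --------------------------------------------------------------------------
// Worked example (illustrative only, not from the original header): with 10
// iterations and 4 chunks, _Iterations_per_chunk = 2 and
// _Remaining_iterations = 2, so the first two chunks get 3 iterations each and
// the last two get 2: [0,3), [3,6), [6,8), [8,10). The final chunk is run with
// run_and_wait so the calling context participates in the work.
// --------------------------------------------------------------------------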
2436 
2437 template <typename _Worker_class, typename _Random_iterator, typename _Index_type, typename _Function>
2438 void _Parallel_chunk_impl(const _Random_iterator& _First, _Index_type _Range_arg, const _Index_type& _Step, const _Function& _Func)
2439 {
2440  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, auto_partitioner());
2441 }
2442 
2443 // Helper for the parallel for API with the default dynamic partitioner which implements range-stealing to balance load.
2444 template <typename _Index_type, typename _Diff_type, typename _Function>
2445 void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, const auto_partitioner& _Part)
2446 {
2447  typedef _Parallel_chunk_helper<_Index_type, _Diff_type, _Function, auto_partitioner, false> _Worker_class;
 2448  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2449 }
2450 
2451 // Helper for the parallel_for API with a static partitioner - creates a fixed number of chunks up front with no range-stealing enabled.
2452 template <typename _Index_type, typename _Diff_type, typename _Function>
2453 void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, const static_partitioner& _Part)
2454 {
2455  typedef _Parallel_fixed_chunk_helper<_Index_type, _Diff_type, _Function, static_partitioner, false> _Worker_class;
 2456  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2457 }
2458 
2459 // Helper for the parallel_for API with a simple partitioner - creates a fixed number of chunks up front with no range-stealing enabled.
2460 template <typename _Index_type, typename _Diff_type, typename _Function>
2461 void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, const simple_partitioner& _Part)
2462 {
2463  typedef _Parallel_fixed_chunk_helper<_Index_type, _Diff_type, _Function, simple_partitioner, false> _Worker_class;
 2464  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2465 }
2466 
2467 // Helper for the parallel_for API with an affinity partitioner - creates a fixed number of chunks up front with no range-stealing enabled. Subsequent
 2468 // calls to parallel_for with the same affinity partitioner (passed in as a non-const reference) are scheduled to the same location they previously ran on.
2469 template <typename _Index_type, typename _Diff_type, typename _Function>
2470 void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, affinity_partitioner& _Part)
2471 {
2472  typedef _Parallel_localized_chunk_helper<_Index_type, _Diff_type, _Function, false> _Worker_class;
 2473  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2474 }
2475 
2476 template <typename _Index_type, typename _Function, typename _Partitioner>
2477 void _Parallel_for_impl(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func, _Partitioner&& _Part)
2478 {
2479  // The step argument must be 1 or greater; otherwise it is an invalid argument
2480  if (_Step < 1)
2481  {
2482  throw std::invalid_argument("_Step");
2483  }
2484 
2485  // If there are no elements in this range we just return
2486  if (_First >= _Last)
2487  {
2488  return;
2489  }
2490 
2491  // Compute the difference type based on the arguments and avoid signed overflow for int, long, and long long
2492  typedef typename std::tr1::conditional<std::tr1::is_same<_Index_type, int>::value, unsigned int,
2493  typename std::tr1::conditional<std::tr1::is_same<_Index_type, long>::value, unsigned long,
2494  typename std::tr1::conditional<std::tr1::is_same<_Index_type, long long>::value, unsigned long long, decltype(_Last - _First)
2495  >::type
2496  >::type
2497  >::type _Diff_type;
2498 
2499  _Diff_type _Range_size = _Diff_type(_Last) - _Diff_type(_First);
2500  _Diff_type _Diff_step = _Step;
2501 
2502  if (_Range_size <= _Diff_step)
2503  {
2504  _Func(_First);
2505  }
2506  else
2507  {
2508  _Parallel_for_partitioned_impl<_Index_type, _Diff_type, _Function>(_First, _Range_size, _Step, _Func, std::forward<_Partitioner>(_Part));
2509  }
2510 }
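// --------------------------------------------------------------------------
// Worked example (illustrative only, not from the original header): the
// _Diff_type selection above avoids signed overflow. For _Index_type == int
// with _First = -2000000000 and _Last = 2000000000, (_Last - _First) overflows
// a signed int, but _Diff_type is unsigned int, so
// _Diff_type(_Last) - _Diff_type(_First) == 4000000000 is representable.
// --------------------------------------------------------------------------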
2511 
2512 template <typename _Index_type, typename _Function>
2513 void _Parallel_for_impl(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func)
2514 {
2515  _Parallel_for_impl(_First, _Last, _Step, _Func, auto_partitioner());
2516 }
2517 
2557 
2558 template <typename _Index_type, typename _Function, typename _Partitioner>
2559 void parallel_for(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func, _Partitioner&& _Part)
2560 {
2561  _Trace_ppl_function(PPLParallelForEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);
 2562  _Parallel_for_impl(_First, _Last, _Step, _Func, std::forward<_Partitioner>(_Part));
 2563  _Trace_ppl_function(PPLParallelForEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
 2564 }
2565 
2593 
2594 template <typename _Index_type, typename _Function>
2595 void parallel_for(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func)
2596 {
2597  parallel_for(_First, _Last, _Step, _Func, auto_partitioner());
2598 }
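// --------------------------------------------------------------------------
// Usage sketch (illustrative only, not from the original header): iterate an
// index range in parallel; the iteration order is unspecified. _Results is a
// hypothetical pre-sized array.
//
//     Concurrency::parallel_for(0, 1000, [&_Results](int _I)
//     {
//         _Results[_I] = _I * _I;
//     });
//
//     // With an explicit step of 2, only even indices are visited:
//     Concurrency::parallel_for(0, 1000, 2, [&_Results](int _I) { _Results[_I] = 0; });
// --------------------------------------------------------------------------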
2599 
2632 
2633 template <typename _Index_type, typename _Function>
2634 void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, const auto_partitioner& _Part = auto_partitioner())
2635 {
2636  parallel_for(_First, _Last, _Index_type(1), _Func, _Part);
2637 }
2638 
2671 
2672 template <typename _Index_type, typename _Function>
2673 void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, const static_partitioner& _Part)
2674 {
2675  parallel_for(_First, _Last, _Index_type(1), _Func, _Part);
2676 }
2677 
2710 
2711 template <typename _Index_type, typename _Function>
2712 void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, const simple_partitioner& _Part)
2713 {
2714  parallel_for(_First, _Last, _Index_type(1), _Func, _Part);
2715 }
2716 
2749 
2750 template <typename _Index_type, typename _Function>
2751 void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, affinity_partitioner& _Part)
2752 {
2753  parallel_for(_First, _Last, _Index_type(1), _Func, _Part);
2754 }
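// --------------------------------------------------------------------------
// Usage sketch (illustrative only, not from the original header): choosing a
// partitioner. static_partitioner and simple_partitioner are passed by const
// reference, while an affinity_partitioner must be a named, non-const object
// so it can remember chunk locations across repeated calls over the same data.
// _Data is a hypothetical array.
//
//     Concurrency::affinity_partitioner _Ap;
//     for (int _Pass = 0; _Pass < 10; ++_Pass)
//     {
//         Concurrency::parallel_for(0, 100000,
//             [&_Data](int _I) { _Data[_I] += 1; },
//             _Ap);                                   // reuses cache-warm locations
//     }
//
//     Concurrency::parallel_for(0, 100000,
//         [&_Data](int _I) { _Data[_I] *= 2; },
//         Concurrency::simple_partitioner(4096));     // fixed 4096-iteration chunks
// --------------------------------------------------------------------------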
2755 
2756 // parallel_for_each -- This function will iterate over all elements in the iterator's range.
2757 
2758 // Closure (binding) classes for invoking parallel_for_each recursively
2759 
2760 // A closure class used for packaging chunk of elements in parallel_for_each for parallel invocation
2761 
2762 // Forward iterator for_each using unstructured task group
2763 
2764 // Disable C4180: qualifier applied to function type has no meaning; ignored
2765 // Warning fires for passing Foo function pointer to parallel_for instead of &Foo.
2766 #pragma warning(push)
2767 #pragma warning(disable: 4180)
2768 
2769 template <typename _Forward_iterator, typename _Function, unsigned int _Chunk_size>
2771 {
2772 public:
2773  typedef typename std::iterator_traits<_Forward_iterator>::value_type _Value_type;
2774  static const unsigned int _Size = _Chunk_size;
2775 
2776  _Parallel_for_each_helper(_Forward_iterator& _First, const _Forward_iterator& _Last, const _Function& _Func) :
2777  _M_function(_Func), _M_len(0)
2778  {
2779  static_assert(std::is_lvalue_reference<decltype(*_First)>::value, "lvalue required for forward iterator operator *");
2780  // Add a batch of work items to this functor's array
2781  for (unsigned int _Index=0; (_Index < _Size) && (_First != _Last); _Index++)
2782  {
2783  _M_element[_M_len++] = &(*_First++);
2784  }
2785  }
2786 
2787  void operator()() const
2788  {
2789  // Invoke parallel_for on the batched up array of elements
2790  _Parallel_for_impl(0U, _M_len, 1U,
2791  [this] (unsigned int _Index)
2792  {
2793  _M_function(*(_M_element[_Index]));
2794  }
2795  );
2796  }
2797 
2798 private:
2799 
2800  const _Function& _M_function;
2801  typename std::iterator_traits<_Forward_iterator>::pointer _M_element[_Size];
2802  unsigned int _M_len;
2803 
2804  _Parallel_for_each_helper const & operator=(_Parallel_for_each_helper const&); // no assignment operator
2805 };
2806 
2807 #pragma warning(pop)
2808 
2809 // Helper functions that implement parallel_for_each
2810 
2811 template <typename _Forward_iterator, typename _Function>
2812 void _Parallel_for_each_chunk(_Forward_iterator& _First, const _Forward_iterator& _Last, const _Function& _Func, task_group& _Task_group)
2813 {
2814  // The chunk size selection depends more on the internal implementation of parallel_for than
2815  // on the actual input. Also, it does not have to be dynamically computed, but it helps
2816  // parallel_for if it is a power of 2 (easy to divide).
2817  const unsigned int _Chunk_size = 1024;
2818 
2819  // This functor will be copied on the heap and will execute the chunk in parallel
2820  _Parallel_for_each_helper<_Forward_iterator, _Function, _Chunk_size> _Functor(_First, _Last, _Func);
 2821 
2822  // Because this is an unstructured task group, running the task will make a copy of the necessary data
2823  // on the heap, ensuring that it is available at the time of execution.
2824  _Task_group.run(_Functor);
2825 }
2826 
2827 template <typename _Forward_iterator, typename _Function>
2828 void _Parallel_for_each_forward_impl(_Forward_iterator& _First, const _Forward_iterator& _Last, const _Function& _Func, task_group& _Task_group)
2829 {
2830  _Parallel_for_each_chunk(_First, _Last, _Func, _Task_group);
2831 
2832  // If there is a tail, push the tail
2833  if (_First != _Last)
2834  {
2835  _Task_group.run(
2836  [&_First, &_Last, &_Func, &_Task_group]
2837  {
2838  Concurrency::_Parallel_for_each_forward_impl(_First, _Last, _Func, _Task_group);
2839  }
2840  );
2841  }
2842 }
2843 
2844 template <typename _Forward_iterator, typename _Function>
2845 void _Parallel_for_each_impl(_Forward_iterator _First, const _Forward_iterator& _Last, const _Function& _Func, const auto_partitioner&, std::forward_iterator_tag)
2846 {
2847  // Because this is a forward iterator, it is difficult to validate that _First comes before _Last, so
2848  // it is up to the user to provide a valid range.
2849  if (_First != _Last)
2850  {
2851  task_group _Task_group;
2852 
2853  _Parallel_for_each_forward_impl(_First, _Last, _Func, _Task_group);
2854 
2855  _Task_group.wait();
2856  }
2857 }
2858 
2859 template <typename _Random_iterator, typename _Index_type, typename _Function>
2860 void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, const auto_partitioner& _Part)
2861 {
2862  typedef _Parallel_chunk_helper<_Random_iterator, _Index_type, _Function, auto_partitioner, true> _Worker_class;
 2863  // Use the same function that schedules work for parallel for
2864  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2865 }
2866 
2867 template <typename _Random_iterator, typename _Index_type, typename _Function>
2868 void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, const static_partitioner& _Part)
2869 {
2870  typedef _Parallel_fixed_chunk_helper<_Random_iterator, _Index_type, _Function, static_partitioner, true> _Worker_class;
 2871  // Use the same function that schedules work for parallel for
2872  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2873 }
2874 
2875 template <typename _Random_iterator, typename _Index_type, typename _Function>
2876 void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, const simple_partitioner& _Part)
2877 {
2878  typedef _Parallel_fixed_chunk_helper<_Random_iterator, _Index_type, _Function, simple_partitioner, true> _Worker_class;
 2879  // Use the same function that schedules work for parallel for
2880  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2881 }
2882 
2883 template <typename _Random_iterator, typename _Index_type, typename _Function>
2884 void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, affinity_partitioner& _Part)
2885 {
2886  typedef _Parallel_localized_chunk_helper<_Random_iterator, _Index_type, _Function, true> _Worker_class;
 2887  // Use the same function that schedules work for parallel for
2888  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2889 }
2890 
2891 template <typename _Random_iterator, typename _Function, typename _Partitioner>
2892 void _Parallel_for_each_impl(const _Random_iterator& _First, const _Random_iterator& _Last, const _Function& _Func, _Partitioner&& _Part, std::random_access_iterator_tag)
2893 {
2894  typedef typename std::iterator_traits<_Random_iterator>::difference_type _Index_type;
2895 
2896  // Exit early if there is nothing in the collection
2897  if (_First >= _Last)
2898  {
2899  return;
2900  }
2901 
2902  _Index_type _Range_size = _Last - _First;
2903 
2904  if (_Range_size == 1)
2905  {
2906  _Func(*_First);
2907  }
2908  else
2909  {
2910  _Index_type _Step = 1;
2911 
2912  _Parallel_for_each_partitioned_impl(_First, _Range_size, _Step, _Func, std::forward<_Partitioner>(_Part));
2913  }
2914 }
2915 
2943 
2944 template <typename _Iterator, typename _Function>
2945 void parallel_for_each(_Iterator _First, _Iterator _Last, const _Function& _Func)
2946 {
2947  parallel_for_each(_First, _Last, _Func, auto_partitioner());
2948 }
2949 
2986 
2987 template <typename _Iterator, typename _Function, typename _Partitioner>
2988 void parallel_for_each(_Iterator _First, _Iterator _Last, const _Function& _Func, _Partitioner&& _Part)
2989 {
2990  _Trace_ppl_function(PPLParallelForeachEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);
2991  _Parallel_for_each_impl(_First, _Last, _Func, std::forward<_Partitioner>(_Part), std::_Iter_cat(_First));
2992  _Trace_ppl_function(PPLParallelForeachEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);
2993 }
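// Example usage (illustrative sketch only, not part of ppl.h): doubling every element of a
// std::vector with parallel_for_each, first through the default auto_partitioner overload and
// then through the explicit partitioner overload above. The vector name "_Data" is a placeholder.
//
//     #include <ppl.h>
//     #include <vector>
//
//     std::vector<int> _Data(10000, 1);
//
//     // Default (auto_partitioner) overload
//     Concurrency::parallel_for_each(_Data.begin(), _Data.end(), [](int& _E) { _E *= 2; });
//
//     // Explicit partitioner overload
//     Concurrency::parallel_for_each(_Data.begin(), _Data.end(), [](int& _E) { _E *= 2; },
//                                    Concurrency::static_partitioner());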
2994 
2995 // Disable C4180: qualifier applied to function type has no meaning; ignored
2996 // Warning fires for passing Foo function pointer to parallel_for instead of &Foo.
2997 #pragma warning(push)
2998 #pragma warning(disable: 4180)
2999 
3038 
3039 template<typename _Forward_iterator>
3040 inline typename std::iterator_traits<_Forward_iterator>::value_type parallel_reduce(
3041  _Forward_iterator _Begin, _Forward_iterator _End, const typename std::iterator_traits<_Forward_iterator>::value_type &_Identity)
3042 {
3043  return parallel_reduce(_Begin, _End, _Identity, std::plus<typename std::iterator_traits<_Forward_iterator>::value_type>());
3044 }
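// Example usage (illustrative sketch only, not part of ppl.h): summing a std::vector with the
// identity/std::plus overload above; the identity value 0 is the neutral element of addition.
//
//     std::vector<int> _Values(1000, 3);
//     int _Sum = Concurrency::parallel_reduce(_Values.begin(), _Values.end(), 0);   // _Sum == 3000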
3045 
3092 
3093 template<typename _Forward_iterator, typename _Sym_reduce_fun>
3094 inline typename std::iterator_traits<_Forward_iterator>::value_type parallel_reduce(_Forward_iterator _Begin, _Forward_iterator _End,
3095  const typename std::iterator_traits<_Forward_iterator>::value_type &_Identity, _Sym_reduce_fun _Sym_fun)
3096 {
3097  typedef typename std::remove_cv<typename std::iterator_traits<_Forward_iterator>::value_type>::type _Reduce_type;
3098 
3099  return parallel_reduce(_Begin, _End, _Identity,
3100  [_Sym_fun](_Forward_iterator _Begin, _Forward_iterator _End, _Reduce_type _Init)->_Reduce_type
3101  {
3102  while (_Begin != _End)
3103  {
3104  _Init = _Sym_fun(_Init, *_Begin++);
3105  }
3106 
3107  return _Init;
3108  },
3109  _Sym_fun);
3110 }
3111 
3112 template <typename _Reduce_type, typename _Sub_function, typename _Combinable_type>
3113 struct _Reduce_functor_helper;
3114 
3115 template<typename _Ty, typename _Sym_fun>
3116 class _Order_combinable;
3117 
3175 
3176 template<typename _Reduce_type, typename _Forward_iterator, typename _Range_reduce_fun, typename _Sym_reduce_fun>
3177 inline _Reduce_type parallel_reduce(_Forward_iterator _Begin, _Forward_iterator _End, const _Reduce_type& _Identity,
3178  const _Range_reduce_fun &_Range_fun, const _Sym_reduce_fun &_Sym_fun)
3179 {
3180  typedef typename std::iterator_traits<_Forward_iterator>::value_type _Value_type;
3181 
3182  static_assert(!std::tr1::is_same<typename std::iterator_traits<_Forward_iterator>::iterator_category, std::input_iterator_tag>::value
3183  && !std::tr1::is_same<typename std::iterator_traits<_Forward_iterator>::iterator_category, std::output_iterator_tag>::value,
3184  "iterator can not be input_iterator or output_iterator.");
3185 
3186  return _Parallel_reduce_impl(_Begin, _End,
3187  _Reduce_functor_helper<_Reduce_type, _Range_reduce_fun,
3188  _Order_combinable<_Reduce_type, _Sym_reduce_fun>>(_Identity, _Range_fun, _Order_combinable<_Reduce_type, _Sym_reduce_fun>(_Sym_fun)),
3189  typename std::iterator_traits<_Forward_iterator>::iterator_category());
3190 }
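// Example usage (illustrative sketch only, not part of ppl.h): computing the maximum of a
// std::vector with the range-reduction overload above, in the same style that
// _Parallel_integer_sort_asc uses it further down in this file. Requires <climits> for INT_MIN.
//
//     std::vector<int> _V = { 7, 3, 42, 18 };
//     int _Max = Concurrency::parallel_reduce(_V.begin(), _V.end(), INT_MIN,
//         [](std::vector<int>::iterator _B, std::vector<int>::iterator _E, int _Init) -> int
//         {
//             for (; _B != _E; ++_B) { if (_Init < *_B) _Init = *_B; }   // reduce one chunk
//             return _Init;
//         },
//         [](int _A, int _B) { return _A < _B ? _B : _A; });             // combine two chunks; _Max == 42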
3191 
3192 // Ordered serial combinable object
3193 template<typename _Ty, typename _Sym_fun>
3194 class _Order_combinable
3195 {
3196 public:
3197  // Each bucket is only written once, so only limited contention will be caused
3198  struct _Bucket
3199  {
3200  // Allocate enough space in the Bucket to hold a value
3201  char _Value[(sizeof(_Ty) / sizeof(char))];
3202  _Bucket * _Next;
3203 
3204  _Bucket(_Bucket *_N)
3205  {
3206  _Next = _N;
3207  }
3208 
3209  void _Insert(_Bucket *_Item)
3210  {
3211  // No need to lock, only one thread will insert
3212  _Item->_Next = _Next;
3213  _Next = _Item;
3214  }
3215 
3216  // Construct value in bucket
3217  void _Put(const _Ty &_Cur)
3218  {
3219  new(reinterpret_cast<_Ty *>(&_Value)) _Ty(_Cur);
3220  }
3221  };
3222 
3223 private:
3224  const _Sym_fun &_M_fun;
3225  size_t _M_number;
3226  _Bucket *_M_root;
3227  _Order_combinable &operator =(const _Order_combinable &other);
3228 
3229 public:
3230  _Bucket *_Construct(_Bucket *_Par = 0)
3231  {
3232  _Bucket * _Ret = static_cast<_Bucket *>(Concurrency::Alloc(sizeof(_Bucket)));
3233  return new(_Ret)_Bucket(_Par);
3234  }
3235 
3236  _Order_combinable(const _Sym_fun &_Fun): _M_fun(_Fun)
3237  {
3238  _M_root = 0;
3239  _M_number = 0;
3240  }
3241 
3242  ~_Order_combinable()
3243  {
3244  while (_M_root)
3245  {
3246  _Bucket *_Cur = _M_root;
3247  _M_root = _M_root->_Next;
3248  reinterpret_cast<_Ty &>(_Cur->_Value).~_Ty();
3249  Concurrency::Free(_Cur);
3250  }
3251  }
3252 
3253  // Serially combine and release the list, return result
3254  _Ty _Serial_combine_release()
3255  {
3256  _Ty _Ret(reinterpret_cast<_Ty &>(_M_root->_Value));
3257  _Bucket *_Cur = _M_root;
3258  _M_root = _M_root->_Next;
3259 
3260  while (_M_root)
3261  {
3262  reinterpret_cast<_Ty &>(_Cur->_Value).~_Ty();
3263  Concurrency::Free(_Cur);
3264  _Cur = _M_root;
3265  _Ret = _M_fun(reinterpret_cast <_Ty &> (_Cur->_Value), _Ret);
3266  _M_root = _M_root->_Next;
3267  }
3268 
3269  reinterpret_cast<_Ty &>(_Cur->_Value).~_Ty();
3270  Concurrency::Free(_Cur);
3271 
3272  return _Ret;
3273  }
3274 
3275  // Allocate a bucket and push it back onto the list
3276  _Bucket *_Unsafe_push_back()
3277  {
3278  return _M_root = _Construct(_M_root);
3279  }
3280 };
3281 
3282 // Implementation for the parallel reduce
3283 template <typename _Forward_iterator, typename _Function>
3284 typename _Function::_Reduce_type _Parallel_reduce_impl(_Forward_iterator _First, const _Forward_iterator& _Last, const _Function& _Func,
3285  std::forward_iterator_tag)
3286 {
3287  // Because this is a forward iterator, it is difficult to validate that _First comes before _Last, so
3288  // it is up to the user to provide a valid range.
3289  if (_First != _Last)
3290  {
3291  task_group _Task_group;
3292  _Parallel_reduce_forward_executor(_First, _Last, _Func, _Task_group);
3293  _Task_group.wait();
3294  return _Func._Combinable._Serial_combine_release();
3295  }
3296  else
3297  {
3298  return _Func._Identity_value;
3299  }
3300 }
3301 
3302 template<typename _Forward_iterator, typename _Functor>
3303 class _Parallel_reduce_fixed_worker;
3304 
3305 template <typename _Worker, typename _Random_iterator, typename _Function>
3306 void _Parallel_reduce_random_executor(_Random_iterator _Begin, _Random_iterator _End, const _Function& _Fun);
3307 
3308 template <typename _Random_iterator, typename _Function>
3309 typename _Function::_Reduce_type _Parallel_reduce_impl(_Random_iterator _First, _Random_iterator _Last, const _Function& _Func,
3310  std::random_access_iterator_tag)
3311 {
3312  typedef _Parallel_reduce_fixed_worker<_Random_iterator, _Function> _Worker_class;
3313 
3314  // Special case for 0, 1 element
3315  if (_First >= _Last)
3316  {
3317  return _Func._Identity_value;
3318  }
3319  // Directly compute if size is too small
3320  else if (_Last - _First <= 1)
3321  {
3322  _Worker_class(_First, _Last, _Func)();
3323  return _Func._Combinable._Serial_combine_release();
3324  }
3325  else
3326  {
3327  // Use a fixed, ordered chunk partition to schedule the work
3328  _Parallel_reduce_random_executor<_Worker_class>(_First, _Last, _Func);
3329  return _Func._Combinable._Serial_combine_release();
3330  }
3331 }
3332 
3333 // Helper struct that assembles all the functors
3334 template <typename _Reduce_type, typename _Sub_function, typename _Combinable_type>
3335 struct _Reduce_functor_helper
3336 {
3337  const _Sub_function &_Sub_fun;
3338  const _Reduce_type &_Identity_value;
3339 
3340  mutable _Combinable_type &_Combinable;
3341 
3343  typedef typename _Combinable_type::_Bucket Bucket_type;
3344 
3345  _Reduce_functor_helper(const _Reduce_type &_Identity, const _Sub_function &_Sub_fun, _Combinable_type &&comb):
3346  _Sub_fun(_Sub_fun), _Combinable(comb), _Identity_value(_Identity)
3347  {
3348  }
3349 
3350 private:
3351  _Reduce_functor_helper &operator =(const _Reduce_functor_helper &other);
3352 };
3353 
3354 // All the code below is the worker without range stealing
3355 template<typename _Forward_iterator, typename _Functor>
3356 class _Parallel_reduce_fixed_worker
3357 {
3358 public:
3359  // The bucket allocation order will depend on the worker construction order
3360  _Parallel_reduce_fixed_worker(_Forward_iterator _Begin, _Forward_iterator _End, const _Functor &_Fun):
3361  _M_begin(_Begin), _M_end(_End), _M_fun(_Fun), _M_bucket(_M_fun._Combinable._Unsafe_push_back())
3362  {
3363  }
3364 
3365  void operator ()() const
3366  {
3367  _M_bucket->_Put(_M_fun._Sub_fun(_M_begin, _M_end, _M_fun._Identity_value));
3368  }
3369 
3370 private:
3371  const _Functor &_M_fun;
3372  const _Forward_iterator _M_begin, _M_end;
3373  typename _Functor::Bucket_type * const _M_bucket;
3375 };
3376 
3377 // The parallel worker executor for random access iterators
3378 // It divides the range into a fixed number of chunks
3379 // Almost the same as the fixed parallel_for, except that it keeps the chunk dividing order
3380 template <typename _Worker, typename _Random_iterator, typename _Function>
3381 void _Parallel_reduce_random_executor(_Random_iterator _Begin, _Random_iterator _End, const _Function& _Fun)
3382 {
3383  size_t _Cpu_num = static_cast<size_t>(Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors()), _Size = _End - _Begin;
3384 
3384 
3385  structured_task_group _Tg;
3386  Concurrency::details::_MallocaArrayHolder<task_handle<_Worker>> _Holder;
3387  task_handle<_Worker> *_Tasks = _Holder._InitOnRawMalloca(_malloca(sizeof(task_handle<_Worker>) * (_Cpu_num - 1)));
3388 
3389  size_t _Begin_index = 0;
3390  size_t _Step = _Size / _Cpu_num;
3391  size_t _NumRemaining = _Size - _Step * _Cpu_num;
3392 
3393  for(size_t _I = 0; _I < _Cpu_num - 1; _I++)
3394  {
3395  size_t _Next = _Begin_index + _Step;
3396 
3397  // Add remaining to each chunk
3398  if (_NumRemaining)
3399  {
3400  --_NumRemaining;
3401  ++_Next;
3402  }
3403 
3404  // New up a task_handle "in-place", in the array preallocated on the stack
3405  new (_Tasks + _I) task_handle<_Worker>(_Worker(_Begin + _Begin_index, _Begin + _Next, _Fun));
3406  _Holder._IncrementConstructedElemsCount();
3407 
3408  // Run each of the chunk _Tasks in parallel
3409  _Tg.run(_Tasks[_I]);
3410  _Begin_index = _Next;
3411  }
3412 
3413  task_handle<_Worker> _Tail(_Worker(_Begin + _Begin_index, _End, _Fun));
3414  _Tg.run_and_wait(_Tail);
3415 }
3416 
3417 // The parallel worker executor for forward iterators
3418 // Divide chunks on the fly
3419 template <typename _Forward_iterator, typename _Function, int _Default_worker_size, int _Default_chunk_size>
3420 struct _Parallel_reduce_forward_executor_helper
3421 {
3422  typedef _Parallel_reduce_fixed_worker<_Forward_iterator, _Function> _Worker_class;
3423  mutable std::auto_ptr<task_handle<_Worker_class>> _Workers;
3424  int _Worker_size;
3425 
3426  _Parallel_reduce_forward_executor_helper(_Forward_iterator &_First, _Forward_iterator _Last, const _Function& _Func):
3427  _Workers(static_cast<task_handle<_Worker_class> *>(Concurrency::Alloc(sizeof(task_handle<_Worker_class>) * _Default_worker_size)))
3428  {
3429  _Worker_size = 0;
3430  while (_Worker_size < _Default_worker_size && _First != _Last)
3431  {
3432  // Copy the range _Head
3433  _Forward_iterator _Head = _First;
3434 
3435  // Read from forward iterator
3436  for (size_t _I = 0; _I < _Default_chunk_size && _First != _Last; ++_I, ++_First)
3437  {
3438  // Body is empty
3439  }
3440 
3441  // _First will be the end of current chunk
3442  new (_Workers.get() + _Worker_size++) task_handle<_Worker_class>(_Worker_class(_Head, _First, _Func));
3443  }
3444  }
3445 
3446  _Parallel_reduce_forward_executor_helper(const _Parallel_reduce_forward_executor_helper &_Other):
3447  _Workers(_Other._Workers), _Worker_size(_Other._Worker_size)
3448  {
3449  }
3450 
3451  void operator ()() const
3452  {
3453  structured_task_group _Tg;
3454  for(int _I = 0; _I < _Worker_size; _I++)
3455  {
3456  _Tg.run(_Workers.get()[_I]);
3457  }
3458  _Tg.wait();
3459  }
3460 
3461  ~_Parallel_reduce_forward_executor_helper()
3462  {
3463  if (_Workers.get())
3464  {
3465  for (int _I = 0; _I < _Worker_size; _I++)
3466  {
3467  _Workers.get()[_I].~task_handle<_Worker_class>();
3468  }
3469  Concurrency::Free(_Workers.release());
3470  }
3471  }
3472 };
3473 
3474 template <typename _Forward_iterator, typename _Function>
3475 void _Parallel_reduce_forward_executor(_Forward_iterator _First, _Forward_iterator _Last, const _Function& _Func, task_group& _Task_group)
3476 {
3477  const static int _Internal_worker_number = 1024, _Default_chunk_size = 512;
3478  typedef _Parallel_reduce_fixed_worker<_Forward_iterator, _Function> _Worker_class;
3479 
3480  structured_task_group _Worker_group;
3481  Concurrency::details::_MallocaArrayHolder<task_handle<_Worker_class>> _Holder;
3482  task_handle<_Worker_class>* _Workers = _Holder._InitOnRawMalloca(_malloca(_Internal_worker_number * sizeof(task_handle<_Worker_class>)));
3483 
3484  // Start execution first
3485  int _Index = 0;
3486  while (_Index < _Internal_worker_number && _First != _Last)
3487  {
3488  // Copy the range _Head
3489  _Forward_iterator _Head = _First;
3490 
3491  // Read from forward iterator
3492  for (size_t _I = 0; _I < _Default_chunk_size && _First != _Last; ++_I, ++_First)
3493  {
3494  // Body is empty
3495  };
3496 
3497  // Create a new task, _First is range _End
3498  new (_Workers + _Index) task_handle<_Worker_class>(_Worker_class(_Head, _First, _Func));
3499  _Holder._IncrementConstructedElemsCount();
3500  _Worker_group.run(_Workers[_Index]);
3501  ++_Index;
3502  }
3503 
3504  // Divide and run whatever is left of the range
3505  while (_First != _Last)
3506  {
3507  _Task_group.run(_Parallel_reduce_forward_executor_helper<_Forward_iterator, _Function, _Internal_worker_number, _Default_chunk_size>(_First, _Last, _Func));
3508  }
3509 
3510  _Worker_group.wait();
3511 }
3512 
3513 #pragma warning(pop)
3514 
3515 
3516 // Disable C4180: qualifier applied to function type has no meaning; ignored
3517 // Warning fires for passing Foo function pointer to parallel_for instead of &Foo.
3518 #pragma warning(push)
3519 #pragma warning(disable: 4180)
3520 
3521 //
3522 // Dispatch the execution and handle the condition that all of the iterators are random access
3523 //
3524 template<typename _Any_input_traits, typename _Any_output_traits>
3525 struct _Unary_transform_impl_helper
3526 {
3527  template<typename _Input_iterator, typename _Output_iterator, typename _Unary_operator>
3528  static void _Parallel_transform_unary_impl(_Input_iterator _Begin, _Input_iterator _End, _Output_iterator& _Result, const _Unary_operator& _Unary_op, const auto_partitioner&)
3529  {
3530  task_group _Tg;
3531  _Parallel_transform_unary_impl2(_Begin, _End, _Result, _Unary_op, _Tg);
3532  _Tg.wait();
3533  }
3534 };
3535 
3536 template<>
3537 struct _Unary_transform_impl_helper<std::random_access_iterator_tag, std::random_access_iterator_tag>
3538 {
3539  template<typename _Random_input_iterator, typename _Random_output_iterator, typename _Unary_operator, typename _Partitioner>
3540  static void _Parallel_transform_unary_impl(_Random_input_iterator _Begin, _Random_input_iterator _End,
3541  _Random_output_iterator& _Result, const _Unary_operator& _Unary_op, _Partitioner&& _Part)
3542  {
3543  if (_Begin < _End)
3544  {
3545  Concurrency::_Parallel_for_impl(static_cast<size_t>(0), static_cast<size_t>(_End - _Begin), static_cast<size_t>(1),
3546  [_Begin, &_Result, &_Unary_op](size_t _Index)
3547  {
3548  _Result[_Index] = _Unary_op(_Begin[_Index]);
3549  },
3550  std::forward<_Partitioner>(_Part));
3551  _Result += _End - _Begin;
3552  }
3553  }
3554 };
3555 
3556 template<typename _Any_input_traits1, typename _Any_input_traits2, typename _Any_output_traits>
3557 struct _Binary_transform_impl_helper
3558 {
3559 
3560  template<typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>
3561  static void _Parallel_transform_binary_impl(_Input_iterator1 _Begin1, _Input_iterator1 _End1, _Input_iterator2 _Begin2,
3562  _Output_iterator& _Result, const _Binary_operator& _Binary_op, const auto_partitioner&)
3563  {
3564  task_group _Tg;
3565  _Parallel_transform_binary_impl2(_Begin1, _End1, _Begin2, _Result, _Binary_op, _Tg);
3566  _Tg.wait();
3567  }
3568 };
3569 
3570 template<>
3571 struct _Binary_transform_impl_helper<std::random_access_iterator_tag, std::random_access_iterator_tag, std::random_access_iterator_tag>
3572 {
3573  template<typename _Random_input_iterator1, typename _Random_input_iterator2, typename _Random_output_iterator, typename _Binary_operator, typename _Partitioner>
3574  static void _Parallel_transform_binary_impl(_Random_input_iterator1 _Begin1, _Random_input_iterator1 _End1,
3575  _Random_input_iterator2 _Begin2, _Random_output_iterator& _Result, const _Binary_operator& _Binary_op, _Partitioner&& _Part)
3576  {
3577  if (_Begin1 < _End1)
3578  {
3579  Concurrency::_Parallel_for_impl(static_cast<size_t>(0), static_cast<size_t>(_End1 - _Begin1), static_cast<size_t>(1),
3580  [_Begin1, _Begin2, &_Result, &_Binary_op](size_t _Index)
3581  {
3582  _Result[_Index] = _Binary_op(_Begin1[_Index], _Begin2[_Index]);
3583  },
3584  std::forward<_Partitioner>(_Part));
3585  _Result += _End1 - _Begin1;
3586  }
3587  }
3588 };
3589 
3590 //
3591 // The implementation for when at least one of the iterators is a forward iterator
3592 //
3593 template <typename _Forward_iterator, typename _Iterator_kind>
3594 class _Iterator_helper
3595 {
3596 public:
3597  static const size_t _Size = 1024;
3598  typedef typename std::iterator_traits<_Forward_iterator>::value_type value_type;
3599 
3600  _Iterator_helper()
3601  {
3602  static_assert(!std::is_same<_Iterator_kind, std::input_iterator_tag>::value
3603  && !std::is_same<_Iterator_kind, std::output_iterator_tag>::value,
3604  "iterator can not be input_iterator or output_iterator.");
3605  }
3606 
3607  size_t _Populate(_Forward_iterator& _First, _Forward_iterator _Last)
3608  {
3609  size_t _Length = 0;
3610  static_assert(std::is_lvalue_reference<decltype(*_First)>::value, "lvalue required for forward iterator operator *");
3611 
3612  for (size_t _Index=0; (_Index < _Size) && (_First != _Last); _Index++)
3613  {
3614  // We only support l-value here, so it's safe
3615  _M_element_array[_Length++] = &(*_First++);
3616  }
3617 
3618  return _Length;
3619  }
3620 
3621  void _Populate(_Forward_iterator& _First, size_t _Length)
3622  {
3623  for (size_t _Index=0; _Index < _Length; _Index++)
3624  {
3625  _M_element_array[_Index] = &(*_First++);
3626  }
3627  }
3628 
3629  void _Store(const value_type& _Elem, size_t _Index) const
3630  {
3631  *(_M_element_array[_Index]) = _Elem;
3632  }
3633 
3634  typename std::iterator_traits<_Forward_iterator>::reference _Load(size_t _Index) const
3635  {
3636  return *(_M_element_array[_Index]);
3637  }
3638 
3639 private:
3640  typename std::iterator_traits<_Forward_iterator>::pointer _M_element_array[_Size];
3641 };
3642 
3643 template <typename _Random_iterator>
3644 class _Iterator_helper<_Random_iterator, std::random_access_iterator_tag>
3645 {
3646 public:
3647  static const size_t _Size = 1024;
3648  typedef typename std::iterator_traits<_Random_iterator>::value_type value_type;
3649 
3650  _Iterator_helper()
3651  {
3652  }
3653 
3654  size_t _Populate(_Random_iterator& _First, _Random_iterator _Last)
3655  {
3656  typename std::iterator_traits<_Random_iterator>::difference_type _Range_size = _Last - _First;
3657  typename std::iterator_traits<_Random_iterator>::difference_type _Sized = _Size;
3658  _M_first = _First;
3659 
3660  if (_Range_size > _Sized)
3661  {
3662  _First += _Size;
3663  return _Size;
3664  }
3665  else
3666  {
3667  _First += _Range_size;
3668  return static_cast<size_t>(_Range_size);
3669  }
3670  }
3671 
3672  void _Populate(_Random_iterator& _First, size_t _Length)
3673  {
3674  _M_first = _First;
3675  _First += _Length;
3676  }
3677 
3678  void _Store(const value_type& _Elem, size_t _Index) const
3679  {
3680  _M_first[_Index] = _Elem;
3681  }
3682 
3683  typename std::iterator_traits<_Random_iterator>::reference _Load(size_t _Index) const
3684  {
3685  // We only support l-value here
3686  return _M_first[_Index];
3687  }
3688 
3689 private:
3690  _Random_iterator _M_first;
3691 };
3692 
3693 template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>
3694 class _Parallel_transform_binary_helper
3695 {
3696 public:
3697  _Parallel_transform_binary_helper(_Input_iterator1& _First1, _Input_iterator1 _Last1, _Input_iterator2& _First2,
3698  _Output_iterator& _Result, const _Binary_operator& _Binary_op) :
3699  _M_binary_op(_Binary_op), _M_len(0)
3700  {
3701  _M_len = _M_input_helper1._Populate(_First1, _Last1);
3702  _M_input_helper2._Populate(_First2, _M_len);
3703  _M_output_helper._Populate(_Result, _M_len);
3704  }
3705 
3706  void operator()() const
3707  {
3708  // Invoke parallel_for on the batched up array of elements
3709  Concurrency::_Parallel_for_impl(static_cast<size_t>(0), _M_len, static_cast<size_t>(1),
3710  [this] (size_t _Index)
3711  {
3712  _M_output_helper._Store(_M_binary_op(_M_input_helper1._Load(_Index), _M_input_helper2._Load(_Index)), _Index);
3713  });
3714  }
3715 
3716 private:
3717 
3718  _Iterator_helper<_Input_iterator1, typename std::iterator_traits<_Input_iterator1>::iterator_category> _M_input_helper1;
3719  _Iterator_helper<_Input_iterator2, typename std::iterator_traits<_Input_iterator2>::iterator_category> _M_input_helper2;
3720  _Iterator_helper<_Output_iterator, typename std::iterator_traits<_Output_iterator>::iterator_category> _M_output_helper;
3721  const _Binary_operator& _M_binary_op;
3722  size_t _M_len;
3723 
3724  _Parallel_transform_binary_helper const & operator=(_Parallel_transform_binary_helper const&); // no assignment operator
3725 };
3726 
3727 template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>
3728 void _Parallel_transform_binary_impl2(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Input_iterator2 _First2, _Output_iterator &_Result,
3729  const _Binary_operator& _Binary_op, task_group& _Tg)
3730 {
3731  // This functor will be copied on the heap and will execute the chunk in parallel
3732  {
3733  _Parallel_transform_binary_helper<_Input_iterator1, _Input_iterator2, _Output_iterator, _Binary_operator> functor(_First1, _Last1, _First2, _Result, _Binary_op);
3734  _Tg.run(functor);
3735  }
3736 
3737  // If there is a tail, push the tail
3738  if (_First1 != _Last1)
3739  {
3740  _Tg.run(
3741  [=, &_Result, &_Binary_op, &_Tg]
3742  {
3743  _Parallel_transform_binary_impl2(_First1, _Last1, _First2, _Result, _Binary_op, _Tg);
3744  });
3745  }
3746 }
3747 
3748 template <typename _Input_iterator, typename _Output_iterator, typename _Unary_operator>
3749 class _Parallel_transform_unary_helper
3750 {
3751 public:
3752  _Parallel_transform_unary_helper(_Input_iterator& _First, _Input_iterator _Last, _Output_iterator &_Result, const _Unary_operator& _Unary_op) :
3753  _M_unary_op(_Unary_op), _M_len(0)
3754  {
3755  _M_len = _M_input_helper._Populate(_First, _Last);
3756  _M_output_helper._Populate(_Result, _M_len);
3757  }
3758 
3759  void operator()() const
3760  {
3761  // Invoke parallel_for on the batched up array of elements
3762  Concurrency::_Parallel_for_impl(static_cast<size_t>(0), _M_len, static_cast<size_t>(1),
3763  [this] (size_t _Index)
3764  {
3765  _M_output_helper._Store(_M_unary_op(_M_input_helper._Load(_Index)), _Index);
3766  });
3767  }
3768 
3769 private:
3770 
3771  _Iterator_helper<_Input_iterator, typename std::iterator_traits<_Input_iterator>::iterator_category> _M_input_helper;
3772  _Iterator_helper<_Output_iterator, typename std::iterator_traits<_Output_iterator>::iterator_category> _M_output_helper;
3773  const _Unary_operator& _M_unary_op;
3774  size_t _M_len;
3775 
3776  _Parallel_transform_unary_helper const & operator=(_Parallel_transform_unary_helper const&); // no assignment operator
3777 };
3778 
3779 template <typename _Input_iterator, typename _Output_iterator, typename _Unary_operator>
3780 void _Parallel_transform_unary_impl2(_Input_iterator _First, _Input_iterator _Last, _Output_iterator &_Result,
3781  const _Unary_operator& _Unary_op, task_group& _Tg)
3782 {
3783  // This functor will be copied on the heap and will execute the chunk in parallel
3784  {
3785  _Parallel_transform_unary_helper<_Input_iterator, _Output_iterator, _Unary_operator> functor(_First, _Last, _Result, _Unary_op);
3786  _Tg.run(functor);
3787  }
3788 
3789  // If there is a tail, push the tail
3790  if (_First != _Last)
3791  {
3792  _Tg.run(
3793  [=, &_Result, &_Unary_op, &_Tg]
3794  {
3795  _Parallel_transform_unary_impl2(_First, _Last, _Result, _Unary_op, _Tg);
3796  });
3797  }
3798 }
3799 
3800 template <typename _Input_iterator, typename _Output_iterator, typename _Unary_operator, typename _Partitioner>
3801 _Output_iterator _Parallel_transform_unary_impl(_Input_iterator _First, _Input_iterator _Last, _Output_iterator _Result, const _Unary_operator& _Unary_op, _Partitioner&& _Part)
3802 {
3803  typedef typename std::iterator_traits<_Input_iterator>::iterator_category _Input_iterator_type;
3804  typedef typename std::iterator_traits<_Output_iterator>::iterator_category _Output_iterator_type;
3805 
3806  if (_First != _Last)
3807  {
3808  _Unary_transform_impl_helper<_Input_iterator_type, _Output_iterator_type>
3809  ::_Parallel_transform_unary_impl(_First, _Last, _Result, _Unary_op, std::forward<_Partitioner>(_Part));
3810  }
3811 
3812  return _Result;
3813 }
3814 
3865 
3866 template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>
3867 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, const auto_partitioner& _Part = auto_partitioner())
3868 {
3869  return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);
3870 }
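// Example usage (illustrative sketch only, not part of ppl.h): squaring each element of an
// input vector into an output vector with the unary parallel_transform overload above.
//
//     std::vector<int> _In(100, 5), _Out(100);
//     Concurrency::parallel_transform(_In.begin(), _In.end(), _Out.begin(),
//                                     [](int _E) { return _E * _E; });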
3871 
3922 
3923 template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>
3924 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, const static_partitioner& _Part)
3925 {
3926  return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);
3927 }
3928 
3979 
3980 template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>
3981 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, const simple_partitioner& _Part)
3982 {
3983  return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);
3984 }
3985 
4036 
4037 template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>
4038 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, affinity_partitioner& _Part)
4039 {
4040  return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);
4041 }
4042 
4099 
4100 template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator, typename _Partitioner>
4101 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Input_iterator2 _First2,
4102  _Output_iterator _Result, const _Binary_operator& _Binary_op, _Partitioner&& _Part)
4103 {
4104  typedef typename std::iterator_traits<_Input_iterator1>::iterator_category _Input_iterator_type1;
4105  typedef typename std::iterator_traits<_Input_iterator2>::iterator_category _Input_iterator_type2;
4106  typedef typename std::iterator_traits<_Output_iterator>::iterator_category _Output_iterator_type;
4107 
4108  if (_First1 != _Last1)
4109  {
4110  _Binary_transform_impl_helper<_Input_iterator_type1, _Input_iterator_type2, _Output_iterator_type>
4111  ::_Parallel_transform_binary_impl(_First1, _Last1, _First2, _Result, _Binary_op, std::forward<_Partitioner>(_Part));
4112  }
4113 
4114  return _Result;
4115 }
4116 
4164 
4165 template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>
4166 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Input_iterator2 _First2,
4167  _Output_iterator _Result, const _Binary_operator& _Binary_op)
4168 {
4169  return parallel_transform(_First1, _Last1, _First2, _Result, _Binary_op, auto_partitioner());
4170 }
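// Example usage (illustrative sketch only, not part of ppl.h): element-wise addition of two
// vectors into a third with the binary parallel_transform overload above.
//
//     std::vector<int> _A(100, 1), _B(100, 2), _C(100);
//     Concurrency::parallel_transform(_A.begin(), _A.end(), _B.begin(), _C.begin(),
//                                     [](int _X, int _Y) { return _X + _Y; });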
4171 
4172 #pragma warning(pop)
4173 
4174 #pragma warning(push)
4175 // Objects allocated on the heap may not be aligned to 64 bytes
4176 #pragma warning(disable: 4316)
4177 
4191 
4192 template<typename _Ty>
4193 class combinable
4194 {
4195 private:
4196 
4197 // Disable warning C4324: structure was padded due to __declspec(align())
4198 // This padding is expected and necessary.
4199 #pragma warning(push)
4200 #pragma warning(disable: 4324)
4202  struct _Node
4203  {
4204  unsigned long _M_key;
4205  _Ty _M_value;
4206  _Node* _M_chain;
4207 
4208  _Node(unsigned long _Key, _Ty _InitialValue)
4209  : _M_key(_Key), _M_value(_InitialValue), _M_chain(NULL)
4210  {
4211  }
4212  };
4213 #pragma warning(pop)
4214 
4215  static _Ty _DefaultInit()
4216  {
4217  return _Ty();
4218  }
4219 
4220 public:
4231 
4233  : _M_fnInitialize(_DefaultInit)
4234  {
4235  _InitNew();
4236  }
4237 
4255 
4256  template <typename _Function>
4257  explicit combinable(_Function _FnInitialize)
4258  : _M_fnInitialize(_FnInitialize)
4259  {
4260  _InitNew();
4261  }
4262 
4276 
4277  combinable(const combinable& _Copy)
4278  : _M_size(_Copy._M_size), _M_fnInitialize(_Copy._M_fnInitialize)
4279  {
4280  _InitCopy(_Copy);
4281  }
4282 
4292 
4293  combinable& operator=(const combinable& _Copy)
4294  {
4295  clear();
4296  delete [] _M_buckets;
4297  _M_fnInitialize = _Copy._M_fnInitialize;
4298  _M_size = _Copy._M_size;
4299  _InitCopy(_Copy);
4300 
4301  return *this;
4302  }
4303 
4307 
4308  ~combinable()
4309  {
4310  clear();
4311  delete [] _M_buckets;
4312  }
4313 
4321 
4322  _Ty& local()
4323  {
4325  size_t _Index;
4326  _Node* _ExistingNode = _FindLocalItem(_Key, &_Index);
4327  if (_ExistingNode == NULL)
4328  {
4329  _ExistingNode = _AddLocalItem(_Key, _Index);
4330  }
4331 
4332  _CONCRT_ASSERT(_ExistingNode != NULL);
4333  return _ExistingNode->_M_value;
4334  }
4335 
4348 
4349  _Ty& local(bool& _Exists)
4350  {
4352  size_t _Index;
4353  _Node* _ExistingNode = _FindLocalItem(_Key, &_Index);
4354  if (_ExistingNode == NULL)
4355  {
4356  _Exists = false;
4357  _ExistingNode = _AddLocalItem(_Key, _Index);
4358  }
4359  else
4360  {
4361  _Exists = true;
4362  }
4363 
4364  _CONCRT_ASSERT(_ExistingNode != NULL);
4365  return _ExistingNode->_M_value;
4366  }
4367 
4371 
4372  void clear()
4373  {
4374  for (size_t _Index = 0; _Index < _M_size; ++_Index)
4375  {
4376  _Node* _CurrentNode = _M_buckets[_Index];
4377  while (_CurrentNode != NULL)
4378  {
4379  _Node* _NextNode = _CurrentNode->_M_chain;
4380  delete _CurrentNode;
4381  _CurrentNode = _NextNode;
4382  }
4383  }
4384  memset((void*)_M_buckets, 0, _M_size * sizeof _M_buckets[0]);
4385  }
4386 
4401 
4402  template<typename _Function>
4403  _Ty combine(_Function _FnCombine) const
4404  {
4405  _Node* _CurrentNode = NULL;
4406  size_t _Index;
4407 
4408  // Look for the first value in the set, and use (a copy of) that as the result.
4409  // This eliminates a single call (of unknown cost) to _M_fnInitialize.
4410  for (_Index = 0; _Index < _M_size; ++_Index)
4411  {
4412  _CurrentNode = _M_buckets[_Index];
4413  if (_CurrentNode != NULL)
4414  {
4415  break;
4416  }
4417  }
4418 
4419  // No values... return the initializer value.
4420  if (_CurrentNode == NULL)
4421  {
4422  return _M_fnInitialize();
4423  }
4424 
4425  // Accumulate the rest of the items in the current bucket.
4426  _Ty _Result = _CurrentNode->_M_value;
4427  for (_CurrentNode = _CurrentNode->_M_chain; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)
4428  {
4429  _Result = _FnCombine(_Result, _CurrentNode->_M_value);
4430  }
4431 
4432  // Accumulate values from the rest of the buckets.
4433  _CONCRT_ASSERT(_Index < _M_size);
4434  for (++_Index; _Index < _M_size; ++_Index)
4435  {
4436  for (_CurrentNode = _M_buckets[_Index]; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)
4437  {
4438  _Result = _FnCombine(_Result, _CurrentNode->_M_value);
4439  }
4440  }
4441 
4442  return _Result;
4443  }
4444 
4457 
4458  template<typename _Function>
4459  void combine_each(_Function _FnCombine) const
4460  {
4461  for (size_t _Index = 0; _Index < _M_size; ++_Index)
4462  {
4463  for (_Node* _CurrentNode = _M_buckets[_Index]; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)
4464  {
4465  _FnCombine(_CurrentNode->_M_value);
4466  }
4467  }
4468  }
4469 
4470 private:
4471  void _InitNew()
4472  {
4473  _M_size = Concurrency::details::_GetCombinableSize();
4474  _M_buckets = new _Node*[_M_size];
4475  memset((void*)_M_buckets, 0, _M_size * sizeof _M_buckets[0]);
4476  }
4477 
4478  void _InitCopy(const combinable& _Copy)
4479  {
4480  _M_buckets = new _Node*[_M_size];
4481  for (size_t _Index = 0; _Index < _M_size; ++_Index)
4482  {
4483  _M_buckets[_Index] = NULL;
4484  for (_Node* _CurrentNode = _Copy._M_buckets[_Index]; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)
4485  {
4486  _Node* _NewNode = new _Node(_CurrentNode->_M_key, _CurrentNode->_M_value);
4487  _NewNode->_M_chain = _M_buckets[_Index];
4488  _M_buckets[_Index] = _NewNode;
4489  }
4490  }
4491  }
4492 
4493  _Node* _FindLocalItem(unsigned long _Key, size_t* _PIndex)
4494  {
4495  _CONCRT_ASSERT(_PIndex != NULL);
4496 
4497  *_PIndex = _Key % _M_size;
4498 
4499  // Search at this index for an existing value.
4500  _Node* _CurrentNode = _M_buckets[*_PIndex];
4501  while (_CurrentNode != NULL)
4502  {
4503  if (_CurrentNode->_M_key == _Key)
4504  {
4505  return _CurrentNode;
4506  }
4507 
4508  _CurrentNode = _CurrentNode->_M_chain;
4509  }
4510 
4511  return NULL;
4512  }
4513 
4514  _Node* _AddLocalItem(unsigned long _Key, size_t _Index)
4515  {
4516  _Node* _NewNode = new _Node(_Key, _M_fnInitialize());
4517  _Node* _TopNode;
4518  do
4519  {
4520  _TopNode = _M_buckets[_Index];
4521  _NewNode->_M_chain = _TopNode;
4522  } while (_InterlockedCompareExchangePointer(reinterpret_cast<void * volatile *>(&_M_buckets[_Index]), _NewNode, _TopNode) != _TopNode);
4523 
4524  return _NewNode;
4525  }
4526 
4527 private:
4528  _Node *volatile * _M_buckets;
4529  size_t _M_size;
4530  std::tr1::function<_Ty ()> _M_fnInitialize;
4531 };
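// Example usage (illustrative sketch only, not part of ppl.h): accumulating a sum into
// thread-local storage with combinable inside a parallel_for, then merging the per-thread
// results with combine. Requires <functional> for std::plus.
//
//     Concurrency::combinable<int> _Partial;
//     Concurrency::parallel_for(0, 10000, [&_Partial](int _I)
//     {
//         _Partial.local() += _I;                          // contention-free per-thread update
//     });
//     int _Total = _Partial.combine(std::plus<int>());     // merge all thread-local values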
4532 
4533 #pragma warning(pop) // C4316
4534 
4535 #pragma push_macro("_MAX_NUM_TASKS_PER_CORE")
4536 #pragma push_macro("_FINE_GRAIN_CHUNK_SIZE")
4537 #pragma push_macro("_SORT_MAX_RECURSION_DEPTH")
4538 
4539 // This number is used to control dynamic task splitting
4540 // The ideal chunk (task) division is that the number of cores is equal to the number of tasks, but it will
4541 // perform very poorly when tasks are not balanced. The simple solution is to allocate more tasks than number
4542 // of cores. _MAX_NUM_TASKS_PER_CORE provides a maximum number of tasks that will be allocated per core.
4543 // If this number is too small, the load balancing problem will affect efficiency very seriously, especially
4544 // when the compare operation is expensive.
4545 //
4546 // Note that this number is a maximum number -- the dynamic partition system will reduce the number of partitions
4547 // per core based on the dynamic load. If all cores are very busy, the number of partitions will shrink to
4548 // reduce the scheduler overhead.
4549 //
4550 // Initially, the total number of task (chunk) partitions, "_Div_num", will be: number of cores * _MAX_NUM_TASKS_PER_CORE.
4551 // The _Div_num will be divided by 2 after each task splitting. There are two special numbers for _Div_num:
4552 // 1. When _Div_num reaches the point that _Div_num < _MAX_NUM_TASKS_PER_CORE, it means we have split more tasks than cores.
4553 // 2. When _Div_num reaches the point that _Div_num <= 1, it means stop splitting more tasks and begin sorting serially.
4554 #define _MAX_NUM_TASKS_PER_CORE 1024
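// For example (illustrative arithmetic only), on an 8-core machine the initial division count
// would be _Div_num = 8 * _MAX_NUM_TASKS_PER_CORE = 8192; since _Div_num is halved on every
// split, the serial cutoff (_Div_num <= 1) is reached after roughly log2(8192) = 13 levels of
// splitting. The actual value depends on the number of virtual processors the scheduler reports.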
4555 
4556 // This number is mainly used to control the sampling and dynamic task splitting strategies.
4557 // If the user configurable minimal divisible chunk size (default is 2048) is smaller than FINE_GRAIN_CHUNK_SIZE,
4558 // the random sampling algorithm for quicksort will enter fine-grained mode, and take a strategy that reduces the sampling
4559 // overhead. Also, the dynamic task splitting will enter fine-grained mode, which will split as many tasks as possible.
4560 #define _FINE_GRAIN_CHUNK_SIZE 512
4561 
4562 // This is the maximum depth to which quicksort will recurse. If we allow it to recurse deeper, a stack overflow may occur.
4563 #define _SORT_MAX_RECURSION_DEPTH 64
4564 
4565 template<typename _Random_iterator, typename _Function>
4566 inline size_t _Median_of_three(const _Random_iterator &_Begin, size_t _A, size_t _B, size_t _C, const _Function &_Func, bool &_Potentially_equal)
4567 {
4568  _Potentially_equal = false;
4569  if (_Func(_Begin[_A], _Begin[_B]))
4570  {
4571  if (_Func(_Begin[_A], _Begin[_C]))
4572  {
4573  return _Func(_Begin[_B], _Begin[_C]) ? _B : _C;
4574  }
4575  else
4576  {
4577  return _A;
4578  }
4579  }
4580  else
4581  {
4582  if (_Func(_Begin[_B], _Begin[_C]))
4583  {
4584  return _Func(_Begin[_A], _Begin[_C]) ? _A : _C;
4585  }
4586  else
4587  {
4588  _Potentially_equal = true;
4589  return _B;
4590  }
4591  }
4592 }
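// Worked example (illustrative only): with a less-than comparator and element values
// _Begin[_A] == 5, _Begin[_B] == 2, _Begin[_C] == 8, the first test (5 < 2) fails, the
// second (2 < 8) succeeds, and the function returns _A because 5 < 8; that is, the index of
// the median value 5. _Potentially_equal stays false on this path.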
4593 
4594 template<typename _Random_iterator, typename _Function>
4595 inline size_t _Median_of_nine(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, bool &_Potentially_equal)
4596 {
4597  size_t _Offset = _Size / 8;
4598  size_t _A = _Median_of_three(_Begin, 0, _Offset, _Offset * 2, _Func, _Potentially_equal),
4599  _B = _Median_of_three(_Begin, _Offset * 3, _Offset * 4, _Offset * 5, _Func, _Potentially_equal),
4600  _C = _Median_of_three(_Begin, _Offset * 6, _Offset * 7, _Size - 1, _Func, _Potentially_equal);
4601  _B = _Median_of_three(_Begin, _A, _B, _C, _Func, _Potentially_equal);
4602 
4603  if (_Potentially_equal)
4604  {
4605  _Potentially_equal = !_Func(_Begin[_C], _Begin[_A]);
4606  }
4607 
4608  return _B;
4609 }
4610 
4611 // _Potentially_equal means that potentially all the values in the buffer are equal to the pivot value
4612 template<typename _Random_iterator, typename _Function>
4613 inline size_t _Select_median_pivot(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, const size_t _Chunk_size, bool &_Potentially_equal)
4614 {
4615  // Based on the chunk size, apply a different sampling optimization
4616  if (_Chunk_size < _FINE_GRAIN_CHUNK_SIZE && _Size <= std::max<size_t>(_Chunk_size * 4, static_cast<size_t>(15)))
4617  {
4618  bool _Never_care_equal;
4619  return _Median_of_three(_Begin, 0, _Size / 2, _Size - 1, _Func, _Never_care_equal);
4620  }
4621  else
4622  {
4623  return _Median_of_nine(_Begin, _Size, _Func, _Potentially_equal);
4624  }
4625 }
4626 
4627 // Find, by binary search, a middle point in each of the two sorted arrays so that the total number of elements on the left of the two middle
4628 // points is equal to the total number of elements on the right, and every element on the left part is smaller than every element on the right part.
4629 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
4630 size_t _Search_mid_point(const _Random_iterator &_Begin1, size_t &_Len1, const _Random_buffer_iterator &_Begin2, size_t &_Len2, const _Function &_Func)
4631 {
4632  size_t _Len = (_Len1 + _Len2) / 2, _Index1 = 0, _Index2 = 0;
4633 
4634  while (_Index1 < _Len1 && _Index2 < _Len2)
4635  {
4636  size_t _Mid1 = (_Index1 + _Len1) / 2, _Mid2 = (_Index2 + _Len2) / 2;
4637  if (_Func(_Begin1[_Mid1], _Begin2[_Mid2]))
4638  {
4639  if (_Mid1 + _Mid2 < _Len)
4640  {
4641  _Index1 = _Mid1 + 1;
4642  }
4643  else
4644  {
4645  _Len2 = _Mid2;
4646  }
4647  }
4648  else
4649  {
4650  if (_Mid1 + _Mid2 < _Len)
4651  {
4652  _Index2 = _Mid2 + 1;
4653  }
4654  else
4655  {
4656  _Len1 = _Mid1;
4657  }
4658  }
4659  }
4660 
4661  if (_Index1 == _Len1)
4662  {
4663  _Len2 = _Len - _Len1;
4664  }
4665  else
4666  {
4667  _Len1 = _Len - _Len2;
4668  }
4669 
4670  return _Len;
4671 }
4672 
4673 // "move" operation is applied between buffers
4674 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Random_output_iterator, typename _Function>
4675 void _Merge_chunks(_Random_iterator _Begin1, const _Random_iterator &_End1, _Random_buffer_iterator _Begin2, const _Random_buffer_iterator &_End2,
4676  _Random_output_iterator _Output, const _Function &_Func)
4677 {
4678  while (_Begin1 != _End1 && _Begin2 != _End2)
4679  {
4680  if (_Func(*_Begin1, *_Begin2))
4681  {
4682  *_Output++ = std::move(*_Begin1++);
4683  }
4684  else
4685  {
4686  *_Output++ = std::move(*_Begin2++);
4687  }
4688  }
4689 
4690  if (_Begin1 != _End1)
4691  {
4692  std::_Move(_Begin1, _End1, _Output);
4693  }
4694  else if (_Begin2 != _End2)
4695  {
4696  std::_Move(_Begin2, _End2, _Output);
4697  }
4698 }
4699 
4700 // _Div_num threads (tasks) merge two chunks in parallel. _Div_num should be a power of 2; if not, the largest power of 2 that is
4701 // smaller than _Div_num will be used
4702 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Random_output_iterator, typename _Function>
4703 void _Parallel_merge(_Random_iterator _Begin1, size_t _Len1, _Random_buffer_iterator _Begin2, size_t _Len2, _Random_output_iterator _Output,
4704  const _Function &_Func, size_t _Div_num)
4705 {
4706  // Turn to serial merge or continue splitting chunks based on "_Div_num"
4707  if (_Div_num <= 1 || (_Len1 <= 1 && _Len2 <= 1))
4708  {
4709  _Merge_chunks(_Begin1, _Begin1 + _Len1, _Begin2, _Begin2 + _Len2, _Output, _Func);
4710  }
4711  else
4712  {
4713  size_t _Mid_len1 = _Len1, _Mid_len2 = _Len2;
4714  size_t _Mid = _Search_mid_point(_Begin1, _Mid_len1, _Begin2, _Mid_len2, _Func);
4715 
4716  structured_task_group _Tg;
4717  auto _Handle = make_task([&]
4718  {
4719  _Parallel_merge(_Begin1, _Mid_len1, _Begin2, _Mid_len2, _Output, _Func, _Div_num / 2);
4720  });
4721  _Tg.run(_Handle);
4722 
4723  _Parallel_merge(_Begin1 + _Mid_len1, _Len1 - _Mid_len1, _Begin2 + _Mid_len2, _Len2 - _Mid_len2, _Output + _Mid, _Func, _Div_num / 2);
4724 
4725  _Tg.wait();
4726  }
4727 }
4728 
4729 // Return current sorting byte from key
4730 template<typename _Ty, typename _Function>
4731 inline size_t _Radix_key(const _Ty& _Val, size_t _Radix, _Function _Proj_func)
4732 {
4733  return static_cast<size_t>(_Proj_func(_Val) >> static_cast<int>(8 * _Radix) & 255);
4734 }
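// Worked example (illustrative only): with an identity projection function,
// _Radix_key(0x12345678, 1, _Proj) evaluates to (0x12345678 >> 8) & 255 == 0x56,
// i.e. the second least significant byte of the key.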
4735 
4736 // One pass of radix sort
4737 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
4738 void _Integer_radix_pass(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output, size_t _Radix, _Function _Proj_func)
4739 {
4740  if (!_Size)
4741  {
4742  return;
4743  }
4744 
4745  size_t _Pos[256] = {0};
4746 
4747  for (size_t _I = 0; _I < _Size; _I++)
4748  {
4749  ++_Pos[_Radix_key(_Begin[_I], _Radix, _Proj_func)];
4750  }
4751 
4752  for (size_t _I = 1; _I < 256; _I++)
4753  {
4754  _Pos[_I] += _Pos[_I - 1];
4755  }
4756 
4757  // _Size > 0
4758  for (size_t _I = _Size - 1; _I != 0; _I--)
4759  {
4760  _Output[--_Pos[_Radix_key(_Begin[_I], _Radix, _Proj_func)]] = std::move(_Begin[_I]);
4761  }
4762 
4763  _Output[--_Pos[_Radix_key(_Begin[0], _Radix, _Proj_func)]] = std::move(_Begin[0]);
4764 }
4765 
4766 // Serial least-significant-byte radix sort; it sorts based on the low-order bytes up to byte "_Radix"
4767 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
4768 void _Integer_radix_sort(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output,
4769  size_t _Radix, _Function _Proj_func, size_t _Deep = 0)
4770 {
4771  size_t _Cur_radix = 0;
4772  if (_Size == 0)
4773  {
4774  return;
4775  }
4776 
4777  while (_Cur_radix < _Radix)
4778  {
4779  _Integer_radix_pass(_Begin, _Size, _Output, _Cur_radix++, _Proj_func);
4780  _Integer_radix_pass(_Output, _Size, _Begin, _Cur_radix++, _Proj_func);
4781  }
4782 
4783  if (_Cur_radix == _Radix)
4784  {
4785  _Integer_radix_pass(_Begin, _Size, _Output, _Cur_radix++, _Proj_func);
4786  }
4787 
4788  // If an odd number of rounds was performed, move the result back to the input buffer
4789  if (_Deep + _Radix + 1 & 1)
4790  {
4791  if (_Radix + 1 & 1)
4792  {
4793  std::_Move(_Output, _Output + _Size, _Begin);
4794  }
4795  else
4796  {
4797  std::_Move(_Begin, _Begin + _Size, _Output);
4798  }
4799  }
4800 }
4801 
4802 // Parallel most-significant-byte _Radix sort.
4803 // In the end, it will turn to serial least-significant-byte radix sort
4804 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
4805 void _Parallel_integer_radix_sort(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output,
4806  size_t _Radix, _Function _Proj_func, const size_t _Chunk_size, size_t _Deep = 0)
4807 {
4808  // If the chunk _Size is too small, then turn to serial least-significant-byte radix sort
4809  if (_Size <= _Chunk_size || _Radix < 1)
4810  {
4811  return _Integer_radix_sort(_Begin, _Size, _Output, _Radix, _Proj_func, _Deep);
4812  }
4813 
4814  size_t _Threads_num = Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors();
4815  size_t _Buffer_size = sizeof(size_t) * 256 * _Threads_num;
4816  size_t _Step = _Size / _Threads_num;
4817  size_t _Remain = _Size % _Threads_num;
4818 
4819  Concurrency::details::_MallocaArrayHolder<size_t [256]> _Holder;
4820  size_t (*_Chunks)[256] = _Holder._InitOnRawMalloca(_malloca(_Buffer_size));
4821 
4822  memset(_Chunks, 0, _Buffer_size);
4823 
4824  // Our purpose is to map unsorted data in buffer "_Begin" to buffer "_Output" so that all elements who have the same
4825  // byte value in the "_Radix" position will be grouped together in the buffer "_Output"
4826  //
4827  // Serial version:
4828  // To understand this algorithm, first consider a serial version. In following example, we treat 1 digit as 1 byte, so we have a
4829  // total of 10 elements for each digit instead of 256 elements in each byte. Let's suppose "_Radix" == 1 (right most is 0), and:
4830  //
4831  // begin: [ 32 | 62 | 21 | 43 | 55 | 43 | 23 | 44 ]
4832  //
4833  // We want to divide the output buffer "_Output" into 10 chunks, and each element in the "_Begin" buffer should be mapped into
4834  // the proper destination chunk based on its current digit (byte) indicated by "_Radix"
4835  //
4836  // Because "_Radix" == 1, after a pass of this function, the chunks in the "_Output" should look like:
4837  //
4838  // buffer: [ | | 21 23 | 32 | 43 43 44 | 55 | 62 | | | ]
4839  // 0 1 2 3 4 5 6 7 8 9
4840  //
4841  // The difficulty is determining where to insert values into the "_Output" to get the above result. The way to get the
4842  // start position of each chunk of the buffer is:
4843  // 1. Count the number of elements for each chunk (in above example, chunk0 is 0, chunk1 is 0, chunk2 is 2, chunk3 is 1 ...
4844  // 2. Make a partial sum for these chunks (in the above example, we will get chunk0=chunk0=0, chunk1=chunk0+chunk1=0,
4845  // chunk2=chunk0+chunk1+chunk2=2, chunk3=chunk0+chunk1+chunk2+chunk3=3)
4846  //
4847  // After these steps, we will get the end position of each chunk in the "_Output". The begin position of each chunk will be the end
4848  // point of last chunk (begin point is close but the end point is open). After that, we can scan the original array again and directly
4849  // put elements from original buffer "_Begin" into specified chunk on buffer "_Output".
4850  // Finally, we invoke _parallel_integer_radix_sort in parallel for each chunk and sort them in parallel based on the next digit (byte).
4851  // Because this is a STABLE sort algorithm, if two numbers have the same key value on this byte (digit), their original order should be kept.
4852  //
4853  // Parallel version:
4854  // Almost the same as the serial version, the differences are:
4855  // 1. The count for each chunk is executed in parallel, and each thread will count one segment of the input buffer "_Begin".
4856  // The count result will be separately stored in their own chunk size counting arrays so we have a total of threads-number
4857  // of chunk count arrays.
4858  // For example, we may have chunk00, chunk01, ..., chunk09 for first thread, chunk10, chunk11, ..., chunk19 for second thread, ...
4859  // 2. The partial sum should be executed across these chunk counting arrays that belong to different threads, instead of just
4860  // making a partial sum in one counting array.
4861  // This is because we need to put values from different segments into one final buffer, and the absolute buffer position for
4862  // each chunkXX is needed.
4863  // 3. Make a parallel scan for original buffer again, and move numbers in parallel into the corresponding chunk on each buffer based
4864  // on these threads' chunk size counters.
4865 
4866  // Count in parallel and separately save their local results without reducing
4867  Concurrency::parallel_for(static_cast<size_t>(0), _Threads_num, [=](size_t _Index)
4868  {
4869  size_t _Beg_index, _End_index;
4870 
4871  // Calculate the segment position
4872  if (_Index < _Remain)
4873  {
4874  _Beg_index = _Index * (_Step + 1);
4875  _End_index = _Beg_index + (_Step + 1);
4876  }
4877  else
4878  {
4879  _Beg_index = _Remain * (_Step + 1) + (_Index - _Remain) * _Step;
4880  _End_index = _Beg_index + _Step;
4881  }
4882 
4883  // Do a counting
4884  while (_Beg_index != _End_index)
4885  {
4886  ++_Chunks[_Index][_Radix_key(_Begin[_Beg_index++], _Radix, _Proj_func)];
4887  }
4888  });
4889 
4890  int _Index = -1, _Count = 0;
4891 
4892  // Partial sum cross different threads' chunk counters
4893  for (int _I = 0; _I < 256; _I++)
4894  {
4895  size_t _Last = _I ? _Chunks[_Threads_num - 1][_I - 1] : 0;
4896  _Chunks[0][_I] += _Last;
4897 
4898  for (size_t _J = 1; _J < _Threads_num; _J++)
4899  {
4900  _Chunks[_J][_I] += _Chunks[_J - 1][_I];
4901  }
4902 
4903  // "_Chunks[_Threads_num - 1][_I] - _Last" will get the global _Size for chunk _I(including all threads local _Size for chunk _I)
4904  // this will chunk whether the chunk _I is empty or not. If it's not empty, it will be recorded.
4905  if (_Chunks[_Threads_num - 1][_I] - _Last)
4906  {
4907  ++_Count;
4908  _Index = _I;
4909  }
4910  }
4911 
4912  // If there is more than 1 chunk that has content, then continue the original algorithm
4913  if (_Count > 1)
4914  {
4915  // Move the elements in parallel into each chunk
4916  Concurrency::parallel_for(static_cast<size_t>(0), _Threads_num, [=](size_t _Index)
4917  {
4918  size_t _Beg_index, _End_index;
4919 
4920  // Calculate the segment position
4921  if (_Index < _Remain)
4922  {
4923  _Beg_index = _Index * (_Step + 1);
4924  _End_index = _Beg_index + (_Step + 1);
4925  }
4926  else
4927  {
4928  _Beg_index = _Remain * (_Step + 1) + (_Index - _Remain) * _Step;
4929  _End_index = _Beg_index + _Step;
4930  }
4931 
4932  // Do a move operation to directly put each value into its destination chunk
4933  // Chunk pointer is moved after each put operation.
4934  if (_Beg_index != _End_index--)
4935  {
4936  while (_Beg_index != _End_index)
4937  {
4938  _Output[--_Chunks[_Index][_Radix_key(_Begin[_End_index], _Radix, _Proj_func)]] = std::move(_Begin[_End_index]);
4939  --_End_index;
4940  }
4941  _Output[--_Chunks[_Index][_Radix_key(_Begin[_End_index], _Radix, _Proj_func)]] = std::move(_Begin[_End_index]);
4942  }
4943  });
4944 
4945  // Invoke _parallel_integer_radix_sort in parallel for each chunk
4946  Concurrency::parallel_for(static_cast<size_t>(0), static_cast<size_t>(256), [=](size_t _Index)
4947  {
4948  if (_Index < 256 - 1)
4949  {
4950  _Parallel_integer_radix_sort(_Output + _Chunks[0][_Index], _Chunks[0][_Index + 1] - _Chunks[0][_Index],
4951  _Begin + _Chunks[0][_Index], _Radix - 1, _Proj_func, _Chunk_size, _Deep + 1);
4952  }
4953  else
4954  {
4955  _Parallel_integer_radix_sort(_Output + _Chunks[0][_Index], _Size - _Chunks[0][_Index],
4956  _Begin + _Chunks[0][_Index], _Radix - 1, _Proj_func, _Chunk_size, _Deep + 1);
4957  }
4958  });
4959  }
4960  else
4961  {
4962  // Only one chunk has content
4963  // A special optimization is applied because one chunk means all numbers have the same value on this particular byte (digit).
4964  // Because we cannot sort them any further (they are all equal at this point), directly call _parallel_integer_radix_sort to
4965  // sort the next byte (digit)
4966  _Parallel_integer_radix_sort(_Begin, _Size, _Output, _Radix - 1, _Proj_func, _Chunk_size, _Deep);
4967  }
4968 }
4969 
4970 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
4971 void _Parallel_integer_sort_asc(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output,
4972  _Function _Proj_func, const size_t _Chunk_size)
4973 {
4974  typedef typename std::iterator_traits<_Random_iterator>::value_type _Value_type;
4975  // The key type of the radix sort; this must be an "unsigned integer-like" type, that is, it needs to support:
4976  // operator>> (int), operator>>= (int), operator& (int), operator <, operator size_t ()
4977  typedef typename std::remove_const<typename std::remove_reference<decltype(_Proj_func(*_Begin))>::type>::type _Integer_type;
4978 
4979  // Find out the max value, which will be used to determine the highest differing byte (the radix position)
4980  _Integer_type _Max_val = Concurrency::parallel_reduce(_Begin, _Begin + _Size, _Proj_func(*_Begin),
4981  [=](_Random_iterator _Begin, _Random_iterator _End, _Integer_type _Init) -> _Integer_type
4982  {
4983  while (_Begin != _End)
4984  {
4985  _Integer_type _Ret = _Proj_func(*_Begin++);
4986  if (_Init < _Ret)
4987  {
4988  _Init = _Ret;
4989  }
4990  }
4991 
4992  return _Init;
4993  }, [](const _Integer_type &a, const _Integer_type &b) -> const _Integer_type& {return (a < b)? b : a;});
4994  size_t _Radix = 0;
4995 
4996  // Find out highest differing byte
4997  while (_Max_val >>= 8)
4998  {
4999  ++_Radix;
5000  }
5001 
5002  _Parallel_integer_radix_sort(_Begin, _Size, _Output, _Radix, _Proj_func, _Chunk_size);
5003 }
5004 
5005 template<typename _Random_iterator, typename _Function>
5006 void _Parallel_quicksort_impl(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, size_t _Div_num, const size_t _Chunk_size, int _Depth)
5007 {
5008  if (_Depth >= _SORT_MAX_RECURSION_DEPTH || _Size <= _Chunk_size || _Size <= static_cast<size_t>(3) || _Chunk_size >= _FINE_GRAIN_CHUNK_SIZE && _Div_num <= 1)
5009  {
5010  return std::sort(_Begin, _Begin + _Size, _Func);
5011  }
5012 
5013  // Determine whether we need to do a three-way quick sort
5014  // We benefit from a three-way split if there are a lot of elements that are EQUAL to the median value;
5015  // the _Select_median_pivot function will test for this redundant density by sampling
5016  bool _Is_three_way_split = false;
5017  size_t _Mid_index = _Select_median_pivot(_Begin, _Size, _Func, _Chunk_size, _Is_three_way_split);
5018 
5019  // Move the median value to the _Begin position.
5020  if (_Mid_index)
5021  {
5022  std::swap(*_Begin, _Begin[_Mid_index]);
5023  }
5024  size_t _I = 1, _J = _Size - 1;
5025 
5026  // Three-way or two-way partition
5027  // _Div_num < _MAX_NUM_TASKS_PER_CORE is checked to make sure it will never do three-way split before splitting enough tasks
5028  if (_Is_three_way_split && _Div_num < _MAX_NUM_TASKS_PER_CORE)
5029  {
5030  while (_Func(*_Begin, _Begin[_J]))
5031  {
5032  --_J;
5033  }
5034 
5035  while (_Func(_Begin[_I], *_Begin))
5036  {
5037  ++_I;
5038  }
5039 
5040  // Starting from this point, the left side of _I will be less than the median value, the right side of _J will be greater than the median value,
5041  // and the middle part will be equal to the median. _K is used to scan between _I and _J
5042  size_t _K = _J;
5043  while (_I <= _K)
5044  {
5045  if (_Func(_Begin[_K], *_Begin))
5046  {
5047  std::swap(_Begin[_I++], _Begin[_K]);
5048  }
5049  else
5050  {
5051  --_K;
5052  }
5053 
5054  while (_Func(*_Begin, _Begin[_K]))
5055  {
5056  std::swap(_Begin[_K--], _Begin[_J--]);
5057  }
5058  }
5059 
5060  ++_J;
5061  }
5062  else
5063  {
5064  while (_I <= _J)
5065  {
5066  // Will stop before _Begin
5067  while (_Func(*_Begin, _Begin[_J]))
5068  {
5069  --_J;
5070  }
5071 
5072  // There must be another element equal or greater than *_Begin
5073  while (_Func(_Begin[_I], *_Begin))
5074  {
5075  ++_I;
5076  }
5077 
5078  if (_I < _J)
5079  {
5080  std::swap(_Begin[_I++], _Begin[_J--]);
5081  }
5082  else
5083  {
5084  break;
5085  }
5086  }
5087 
5088  _I = ++_J;
5089  }
5090 
5091  std::swap(*_Begin, _Begin[--_I]);
5092 
5093  structured_task_group _Tg;
5094  volatile size_t _Next_div = _Div_num / 2;
5095  auto _Handle = make_task([&]
5096  {
5097  _Parallel_quicksort_impl(_Begin + _J, _Size - _J, _Func, _Next_div, _Chunk_size, _Depth+1);
5098  });
5099  _Tg.run(_Handle);
5100 
5101  _Parallel_quicksort_impl(_Begin, _I, _Func, _Next_div, _Chunk_size, _Depth+1);
5102 
5103  // If at this point, the work hasn't been scheduled, then slow down creating new tasks
5104  if (_Div_num < _MAX_NUM_TASKS_PER_CORE)
5105  {
5106  _Next_div /= 2;
5107  }
5108 
5109  _Tg.wait();
5110 }
5111 
5112 // This function will be called to sort the elements in the "_Begin" buffer. However, we can't tell whether the result will end up in buffer
5113 // "_Begin", or buffer "_Output" when it returned. The return value is designed to indicate which buffer holds the sorted result.
5114 // Return true if the merge result is in the "_Begin" buffer; return false if the result is in the "_Output" buffer.
5115 // We can't always put the result into one assigned buffer because that may cause frequent buffer copies at return time.
5116 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
5117 inline bool _Parallel_buffered_sort_impl(const _Random_iterator &_Begin, size_t _Size, _Random_buffer_iterator _Output, const _Function &_Func,
5118  int _Div_num, const size_t _Chunk_size)
5119 {
5120  static_assert(std::is_same<typename std::iterator_traits<_Random_iterator>::value_type, typename std::iterator_traits<_Random_buffer_iterator>::value_type>::value,
5121  "same value type expected");
5122 
5123  if (_Div_num <= 1 || _Size <= _Chunk_size)
5124  {
5125  _Parallel_quicksort_impl(_Begin, _Size, _Func, _MAX_NUM_TASKS_PER_CORE, _Chunk_size, 0);
5126 
5127  // In case _Size <= _Chunk_size happened BEFORE the planned stop time (when _Div_num == 1) we need to calculate how many turns of
5128  // binary divisions are left. If there are an odd number of turns left, then the buffer move is necessary to make sure the final
5129  // merge result will be in the original input array.
5130  int _Left_div_turns = 0;
5131  while (_Div_num >>= 1)
5132  {
5133  _Left_div_turns++;
5134  }
5135 
5136  if (_Left_div_turns & 1)
5137  {
5138  std::move(_Begin, _Begin + _Size, _Output);
5139  return true;
5140  }
5141  else
5142  {
5143  return false;
5144  }
5145  }
5146  else
5147  {
5148  size_t _Mid = _Size / 2;
5149  structured_task_group _Tg;
5150 
5151  auto _Handle = make_task([&, _Chunk_size]
5152  {
5153  _Parallel_buffered_sort_impl(_Begin, _Mid, _Output, _Func, _Div_num / 2, _Chunk_size);
5154  });
5155  _Tg.run(_Handle);
5156 
5157  bool _Is_buffer_swap = _Parallel_buffered_sort_impl(_Begin + _Mid, _Size - _Mid, _Output + _Mid, _Func, _Div_num / 2, _Chunk_size);
5158 
5159  _Tg.wait();
5160 
5161  if (_Is_buffer_swap)
5162  {
5163  _Parallel_merge(_Output, _Mid, _Output + _Mid, _Size - _Mid, _Begin, _Func, _Div_num);
5164  }
5165  else
5166  {
5167  _Parallel_merge(_Begin, _Mid, _Begin + _Mid, _Size - _Mid, _Output, _Func, _Div_num);
5168  }
5169 
5170  return !_Is_buffer_swap;
5171  }
5172 }
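The parity rule used in the base case above can be checked in isolation. A minimal sketch of a hypothetical helper (not part of the header) that mirrors the _Left_div_turns computation and reports whether a chunk that bottoms out with a given division count must be moved to the auxiliary output buffer:

    // Counts the halvings remaining for a given division count; an odd count means the
    // sorted chunk must start in the auxiliary buffer so that, after that many merge
    // passes, the final result lands back in the original input buffer.
    inline bool _Must_move_to_output(int _Div_num)
    {
        int _Left_div_turns = 0;
        while (_Div_num >>= 1)
        {
            ++_Left_div_turns;
        }
        return (_Left_div_turns & 1) != 0;   // e.g. 16 -> 4 turns -> false, 8 -> 3 turns -> true
    }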
5173 
5174 // Disable the warning about a constant value in a conditional expression.
5175 // This is by design; it lets the compiler optimize away the trivial constructor.
5176 #pragma warning (push)
5177 #pragma warning (disable: 4127)
5178 
5179 // Allocate and construct a buffer
5180 template<typename _Allocator>
5181 inline typename _Allocator::pointer _Construct_buffer(size_t _N, _Allocator &_Alloc)
5182 {
5183  typename _Allocator::pointer _P = _Alloc.allocate(_N);
5184 
5185  // If the objects being sorted have trivial default constructors, they do not need to be
5186  // constructed here. This can benefit performance.
5187  if (!std::has_trivial_default_constructor<typename _Allocator::value_type>::value)
5188  {
5189  for (size_t _I = 0; _I < _N; _I++)
5190  {
5191  // Objects being sorted must have a default constructor
5192  typename _Allocator::value_type _T;
5193  _Alloc.construct(_P + _I, std::forward<typename _Allocator::value_type>(_T));
5194  }
5195  }
5196 
5197  return _P;
5198 }
5199 
5200 // Destroy and deallocate a buffer
5201 template<typename _Allocator>
5202 inline void _Destroy_buffer(typename _Allocator::pointer _P, size_t _N, _Allocator &_Alloc)
5203 {
5204  // If the objects being sorted have trivial destructors, they do not need to be
5205  // destroyed here. This can benefit performance.
5206  if (!std::has_trivial_destructor<typename _Allocator::value_type>::value)
5207  {
5208  for (size_t _I = 0; _I < _N; _I++)
5209  {
5210  _Alloc.destroy(_P + _I);
5211  }
5212  }
5213 
5214  _Alloc.deallocate(_P, _N);
5215 }
5216 
5217 //
5218 // Exception safe RAII wrapper for the allocated buffers
5219 //
5220 
5221 template<typename _Allocator>
5222 class _AllocatedBufferHolder
5223 {
5224 public:
5225  _AllocatedBufferHolder(size_t _Size, const _Allocator & _Alloc): _M_alloc(_Alloc)
5226  {
5227  _M_size = _Size;
5228  _M_buffer = _Construct_buffer(_Size, _M_alloc);
5229  }
5230 
5231  ~_AllocatedBufferHolder()
5232  {
5233  _Destroy_buffer(_M_buffer, _M_size, _M_alloc);
5234  }
5235 
5236  typename _Allocator::pointer _Get_buffer()
5237  {
5238  return _M_buffer;
5239  }
5240 
5241 private:
5242  size_t _M_size;
5243  _Allocator _M_alloc;
5244  typename _Allocator::pointer _M_buffer;
5245 };
5246 
5247 
5248 #pragma warning (pop)
5249 
5271 
5272 template<typename _Random_iterator>
5273 inline void parallel_sort(const _Random_iterator &_Begin, const _Random_iterator &_End)
5274 {
5275  parallel_sort(_Begin, _End, std::less<typename std::iterator_traits<_Random_iterator>::value_type>());
5276 }
5277 
5310 
5311 template<typename _Random_iterator, typename _Function>
5312 inline void parallel_sort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)
5313 {
5314  _CONCRT_ASSERT(_Chunk_size > 0);
5315 
5316  // Check for cancellation before the algorithm starts.
5317  interruption_point();
5318 
5319  size_t _Size = _End - _Begin;
5320  size_t _Core_num = ::Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors();
5321 
5322  if (_Size <= _Chunk_size || _Core_num < 2)
5323  {
5324  return std::sort(_Begin, _End, _Func);
5325  }
5326 
5327  _Parallel_quicksort_impl(_Begin, _Size, _Func, _Core_num * _MAX_NUM_TASKS_PER_CORE, _Chunk_size, 0);
5328 }
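A minimal usage sketch of parallel_sort; the container contents and the comparator are illustrative and not part of the header.

    #include <ppl.h>
    #include <vector>

    int main()
    {
        std::vector<int> values = { 9, 2, 7, 4, 1, 8 };

        // Default ordering (operator<).
        concurrency::parallel_sort(values.begin(), values.end());

        // Custom ordering via a comparator; _Chunk_size is left at its default.
        concurrency::parallel_sort(values.begin(), values.end(),
                                   [](int left, int right) { return left > right; });
        return 0;
    }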
5329 
5355 
5356 template<typename _Random_iterator>
5357 inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End)
5358 {
5359  parallel_buffered_sort<std::allocator<typename std::iterator_traits<_Random_iterator>::value_type>>(_Begin, _End,
5360  std::less<typename std::iterator_traits<_Random_iterator>::value_type>());
5361 }
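A minimal usage sketch of parallel_buffered_sort, which allocates a temporary buffer of the same size as the input range; the data shown is illustrative only.

    #include <ppl.h>
    #include <vector>

    int main()
    {
        std::vector<double> samples = { 3.5, 0.25, 2.0, 1.75 };
        concurrency::parallel_buffered_sort(samples.begin(), samples.end());
        return 0;
    }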
5362 
5391 
5392 template<typename _Allocator, typename _Random_iterator>
5393 inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End)
5394 {
5395  parallel_buffered_sort<_Allocator>(_Begin, _End,
5396  std::less<typename std::iterator_traits<_Random_iterator>::value_type>());
5397 }
5398 
5430 
5431 template<typename _Allocator, typename _Random_iterator>
5432 inline void parallel_buffered_sort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End)
5433 {
5434  parallel_buffered_sort<_Allocator>(_Alloc, _Begin, _End, std::less<typename std::iterator_traits<_Random_iterator>::value_type>());
5435 }
5436 
5473 
5474 template<typename _Random_iterator, typename _Function>
5475 inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)
5476 {
5477  parallel_buffered_sort<std::allocator<typename std::iterator_traits<_Random_iterator>::value_type>>(_Begin, _End, _Func, _Chunk_size);
5478 }
5479 
5519 
5520 template<typename _Allocator, typename _Random_iterator, typename _Function>
5521 inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)
5522 {
5523  _Allocator _Alloc;
5524  return parallel_buffered_sort<_Allocator, _Random_iterator, _Function>(_Alloc, _Begin, _End, _Func, _Chunk_size);
5525 }
5526 
5569 
5570 template<typename _Allocator, typename _Random_iterator, typename _Function>
5571 inline void parallel_buffered_sort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)
5572 {
5573  _CONCRT_ASSERT(_Chunk_size > 0);
5574 
5575  // Check for cancellation before the algorithm starts.
5576  interruption_point();
5577 
5578  size_t _Size = _End - _Begin;
5579  size_t _Core_num = ::Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors();
5580 
5581  if (_Size <= _Chunk_size || _Core_num < 2)
5582  {
5583  return std::sort(_Begin, _End, _Func);
5584  }
5585  const static size_t CORE_NUM_MASK = 0x55555555;
5586 
5587  _AllocatedBufferHolder<_Allocator> _Holder(_Size, _Alloc);
5588 
5589  // Prevent cancellation from happening during the algorithm in case it leaves the buffers in an unknown state.
5590  run_with_cancellation_token([&]() {
5591  // This buffered sort algorithm will divide chunks and apply parallel quicksort on each chunk. In the end, it will
5592  // apply parallel merge to these sorted chunks.
5593  //
5594  // We need to decide on the number of chunks to divide the input buffer into. If we divide it into n chunks, log(n)
5595  // merges will be needed to get the final sorted result. In this algorithm, we have two buffers for each merge
5596  // operation, let's say buffer A and B. Buffer A is the original input array, buffer B is the additional allocated
5597  // buffer. Each turn's merge will put the merge result into the other buffer; for example, if we decided to split
5598  // into 8 chunks in buffer A at the very beginning, after one pass of merging there will be 4 chunks in buffer B.
5599  // If we apply one more pass of merging, there will be 2 chunks in buffer A again.
5600  //
5601  // The problem is that we want the final merge pass to put the result back in buffer A, so that we don't need
5602  // one extra copy to put the sorted data back to buffer A.
5603  // To make sure the final result is in buffer A (original input array), we need an even number of merge passes,
5604  // which means log(n) must be an even number. Thus n must be of the form power(2, even number). For example, when the
5605  // even number is 2, n is power(2, 2) = 4; when the even number is 4, n is power(2, 4) = 16. When we divide into one of
5606  // these chunk counts, the final merge result will be in the original input array. Now we need to decide the chunk (split)
5607  // number based on this property and the number of cores.
5608  //
5609  // We want to get a chunk (split) number close to the number of cores (or a little more than the number of cores),
5610  // and it also needs to satisfy the above property. For an 8-core machine, the best chunk number is 16, because it's
5611  // the smallest number that satisfies the above property and is bigger than the core count (so that we can utilize all
5612  // cores; a little more than the core count is fine, since we need to split into more tasks anyway).
5613  //
5614  // In this algorithm, we will make this alignment by bit operations (it's easy and clear). For a binary representation,
5615  // all the numbers that satisfy power(2, even number) will be 1, 100, 10000, 1000000, 100000000 ...
5616  // After OR-ing these numbers together, we will get a mask (... 0101 0101 0101) which is all possible combinations of
5617  // power(2, even number). We use _Core_num & CORE_NUM_MASK | _Core_num << 1 & CORE_NUM_MASK, a bit-wise operation to align
5618  // _Core_num's highest bit into a power(2, even number).
5619  //
5620  // This means that if _Core_num = 8, the highest bit in binary is bin(1000), which is not power(2, even number). After this
5621  // bit-wise operation, it will align to bin(10000) = 16, which is power(2, even number). If _Core_num = 16, after
5622  // alignment it still returns 16. The trick is to make sure the highest bit of _Core_num aligns to a "1" bit of the
5623  // mask bin(... 0101 0101 0101). We don't care about the bits of the aligned result other than the highest bit, because they
5624  // will be ignored in the function.
5625  _Parallel_buffered_sort_impl(_Begin, _Size, stdext::make_unchecked_array_iterator(_Holder._Get_buffer()),
5626  _Func, _Core_num & CORE_NUM_MASK | _Core_num << 1 & CORE_NUM_MASK, _Chunk_size);
5627  }, cancellation_token::none());
5628 
5629 }
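The chunk-count alignment described in the comment above can be verified for a few sample core counts. This is a standalone sketch (the sample values are illustrative); it evaluates the same mask expression used in the header.

    #include <cstdio>

    int main()
    {
        const unsigned int CORE_NUM_MASK = 0x55555555;   // bits at even positions: 1, 4, 16, 64, ...
        const unsigned int cores[] = { 4, 6, 8, 12, 16 };
        for (unsigned int core_num : cores)
        {
            // Same expression as in the header; only the highest set bit of the result matters.
            unsigned int aligned = core_num & CORE_NUM_MASK | core_num << 1 & CORE_NUM_MASK;
            std::printf("%u -> %u\n", core_num, aligned);   // 4 -> 4, 6 -> 4, 8 -> 16, 12 -> 20, 16 -> 16
        }
        return 0;
    }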
5630 
5631 #pragma warning(push)
5632 #pragma warning (disable: 4127)
5633 //
5634 // This is a default function used for parallel_radixsort which will return just the value.
5635 // It also performs compile-time checks to ensure that the data type is integral.
5636 //
5637 template <typename _DataType>
5638 struct _Radix_sort_default_function
5639 {
5640  size_t operator()(const _DataType& val) const
5641  {
5642  // An instance of this function object returns the value unchanged if the type _DataType is one of the integral types; otherwise it
5643  // fails a static assertion at compile time.
5644  // An integral type is one of: bool, char, unsigned char, signed char, wchar_t, short, unsigned short, int, unsigned int, long,
5645  // and unsigned long.
5646  // In addition, with compilers that provide them, an integral type can be one of long long, unsigned long long, __int64, and
5647  // unsigned __int64
5648  static_assert(std::is_integral<_DataType>::value,
5649  "Type should be integral to use default radix function. For more information on integral types, please refer to http://msdn.microsoft.com/en-us/library/bb983099.aspx.");
5650  static_assert((sizeof(_DataType) <= sizeof(size_t)), "Passed Type is bigger than size_t.");
5651 
5652  if (std::is_unsigned<_DataType>::value)
5653  {
5654  return val;
5655  }
5656  else
5657  {
5658  // The default function needs to take the signed integer-like representation and map it to an unsigned one. The
5659  // following code takes the midpoint of the unsigned representable range, (SIZE_MAX/2)+1, and does an unsigned
5660  // add of the value. Thus, it maps the [signed_min, signed_max] range onto the [0, unsigned_max] range.
5661  return (((SIZE_MAX/2) + 1) + static_cast<size_t>(val));
5662  }
5663  }
5664 };
5665 #pragma warning (pop)
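The signed-to-unsigned key mapping above preserves ordering. A small sketch, assuming a 64-bit size_t; the helper name is hypothetical and only mirrors the expression used in the default function.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Mirrors the default radix key mapping for a signed value: adding (SIZE_MAX/2)+1
    // shifts the signed range so that ordering is preserved as an unsigned key.
    inline std::size_t radix_key_from_signed(int val)
    {
        return ((SIZE_MAX / 2) + 1) + static_cast<std::size_t>(val);
    }

    int main()
    {
        // -2 maps below 0, which maps below 3: the relative order of the keys
        // matches the relative order of the original signed values.
        std::printf("%zu < %zu < %zu\n",
                    radix_key_from_signed(-2),
                    radix_key_from_signed(0),
                    radix_key_from_signed(3));
        return 0;
    }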
5666 
5691 
5692 template<typename _Random_iterator>
5693 inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End)
5694 {
5695  typedef typename std::iterator_traits<_Random_iterator>::value_type _DataType;
5696 
5697  _Radix_sort_default_function<_DataType> _Proj_func;
5698 
5699  parallel_radixsort<std::allocator<_DataType>>(_Begin, _End, _Proj_func, 256 * 256);
5700 }
5701 
5729 
5730 template<typename _Allocator, typename _Random_iterator>
5731 inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End)
5732 {
5733  _Allocator _Alloc;
5734  return parallel_radixsort<_Allocator, _Random_iterator>(_Alloc, _Begin, _End);
5735 }
5736 
5767 
5768 template<typename _Allocator, typename _Random_iterator>
5769 inline void parallel_radixsort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End)
5770 {
5771  typedef typename std::iterator_traits<_Random_iterator>::value_type _DataType;
5772 
5773  _Radix_sort_default_function<_DataType> _Proj_func;
5774 
5775  parallel_radixsort<_Allocator>(_Alloc, _Begin, _End, _Proj_func);
5776 }
5777 
5811 
5812 template<typename _Random_iterator, typename _Function>
5813 inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Proj_func, const size_t _Chunk_size = 256 * 256)
5814 {
5815  parallel_radixsort<std::allocator<typename std::iterator_traits<_Random_iterator>::value_type>>(
5816  _Begin, _End, _Proj_func, _Chunk_size);
5817 }
5818 
5855 
5856 template<typename _Allocator, typename _Random_iterator, typename _Function>
5857 inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Proj_func, const size_t _Chunk_size = 256 * 256)
5858 {
5859  _Allocator _Alloc;
5860  return parallel_radixsort<_Allocator, _Random_iterator, _Function>(_Alloc, _Begin, _End, _Proj_func, _Chunk_size);
5861 }
5862 
5902 
5903 template<typename _Allocator, typename _Random_iterator, typename _Function>
5904 inline void parallel_radixsort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Proj_func, const size_t _Chunk_size = 256 * 256)
5905 {
5906  _CONCRT_ASSERT(_Chunk_size > 0);
5907 
5908  // Check for cancellation before the algorithm starts.
5909  interruption_point();
5910 
5911  size_t _Size = _End - _Begin;
5912 
5913  // If _Size <= 1, no more sorting needs to be done.
5914  if (_Size <= 1)
5915  {
5916  return;
5917  }
5918 
5919  _AllocatedBufferHolder<_Allocator> _Holder(_Size, _Alloc);
5920 
5921  // Prevent cancellation from happening during the algorithm in case it leaves the buffers in an unknown state.
5922  run_with_cancellation_token([&]() {
5923  _Parallel_integer_sort_asc(_Begin, _Size, stdext::make_unchecked_array_iterator(_Holder._Get_buffer()), _Proj_func, _Chunk_size);
5924  }, cancellation_token::none());
5925 }
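A minimal usage sketch of parallel_radixsort with a custom projection function; the record type and data are illustrative and not part of the header.

    #include <ppl.h>
    #include <string>
    #include <vector>

    struct record
    {
        std::string name;
        unsigned int id;
    };

    int main()
    {
        std::vector<record> records = { { "b", 42 }, { "a", 7 }, { "c", 19 } };

        // The projection function maps each element to the unsigned key to sort by.
        concurrency::parallel_radixsort(records.begin(), records.end(),
                                        [](const record& r) -> size_t { return r.id; });
        return 0;
    }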
5926 
5927 #pragma pop_macro("_SORT_MAX_RECURSION_DEPTH")
5928 #pragma pop_macro("_MAX_NUM_TASKS_PER_CORE")
5929 #pragma pop_macro("_FINE_GRAIN_CHUNK_SIZE")
5930 }
5931 
5932 namespace concurrency = Concurrency;
5933 
5934 #pragma pop_macro("new")
5935 #pragma pack(pop)