STLdoc
STLdocumentation
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
ppl.h
Go to the documentation of this file.
1 /***
2 * ==++==
3 *
4 * Copyright (c) Microsoft Corporation. All rights reserved.
5 *
6 * ==--==
7 * =+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
8 *
9 * ppl.h
10 *
11 * Parallel Patterns Library
12 *
13 * =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
14 ****/
15 
16 #pragma once
17 
18 #include <crtdefs.h>
19 #include <concrt.h>
20 #include <stdexcept>
21 #include <iterator>
22 #include <functional>
23 #include <memory>
24 #include <type_traits>
25 #include <algorithm>
26 #include <malloc.h>
27 
28 #include <pplwin.h>
29 
30 #define _PPL_H
31 
32 #pragma pack(push,_CRT_PACKING)
33 #pragma push_macro("new")
34 #undef new
35 
36 // Define the level of tracing to use
37 
38 #define _TRACE_LEVEL_INFORMATION 4
39 
44 
45 namespace Concurrency
46 {
47 namespace details
48 {
49  _CONCRTIMP size_t __cdecl _GetCombinableSize();
50 } // namespace details
51 
52 class structured_task_group;
53 class task_group;
54 
83 
84 template<typename _Function>
86 {
87 public:
100 
101  task_handle(const _Function& _Func) : _M_function(_Func)
102  {
103  m_pFunction = reinterpret_cast <TaskProc> (&::Concurrency::details::_UnrealizedChore::_InvokeBridge<task_handle>);
104  }
105 
109 
111  {
112  //
113  // We only need to perform a liveness check if the client owns the lifetime of the handle. Doing this for runtime owned handles
114  // is not only unnecessary -- it is also dangerous.
115  //
117  {
119  }
120  }
121 
125 
126  void operator()() const
127  {
128  _M_function();
129  }
130 
131 private:
132 
133  friend class task_group;
134  friend class structured_task_group;
135 
136  // The function object invoked to perform the body of the task.
137  _Function _M_function;
138 
139  task_handle const & operator=(task_handle const&); // no assignment operator
140 
141 };
142 
163 
164 template <class _Function>
165 task_handle<_Function> make_task(const _Function& _Func)
166 {
167  return task_handle<_Function>(_Func);
168 }
169 
204 
206 {
207 public:
208 
218 
220  {
221  }
222 
235 
237  _M_task_collection(_CancellationToken._GetImpl() != NULL ? _CancellationToken._GetImpl() : ::Concurrency::details::_CancellationTokenState::_None())
238  {
239  }
240 
251 
253  {
254  }
255 
283 
284  template<class _Function>
285  void run(task_handle<_Function>& _Task_handle)
286  {
287  _Task_handle._SetRuntimeOwnsLifetime(false);
288  _M_task_collection._Schedule(&_Task_handle);
289  }
290 
321 
322  template<class _Function>
323  void run(task_handle<_Function>& _Task_handle, location& _Placement)
324  {
325  _Task_handle._SetRuntimeOwnsLifetime(false);
326  _M_task_collection._Schedule(&_Task_handle, &_Placement);
327  }
328 
348 
350  {
351  //
352  // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
353  //
355  }
356 
386 
387  template<class _Function>
389  {
390  //
391  // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
392  //
393  return (task_group_status)_M_task_collection._RunAndWait(&_Task_handle);
394  }
395 
425 
426  template<class _Function>
427  task_group_status run_and_wait(const _Function& _Func)
428  {
429  //
430  // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
431  //
432  task_handle<_Function> _Task(_Func);
434  }
435 
443 
444  void cancel()
445  {
447  }
448 
462 
464  {
466  }
467 
468 private:
469 
470  // Disallow passing in an r-value for a task handle argument
471  template<class _Function> void run(task_handle<_Function>&& _Task_handle);
472 
473  // The underlying group of tasks as known to the runtime.
475 };
476 
489 
491 {
492 public:
493 
503 
505  {
506  }
519 
520  task_group(cancellation_token _CancellationToken) :
521  _M_task_collection(_CancellationToken._GetImpl() != NULL ? _CancellationToken._GetImpl() : ::Concurrency::details::_CancellationTokenState::_None())
522  {
523  }
524 
535 
537  {
538  }
539 
571 
572  template<typename _Function>
573  void run(const _Function& _Func)
574  {
576  }
577 
612 
613  template<typename _Function>
614  void run(const _Function& _Func, location& _Placement)
615  {
617  }
618 
651 
652  template<typename _Function>
653  void run(task_handle<_Function>& _Task_handle)
654  {
655  _Task_handle._SetRuntimeOwnsLifetime(false);
656  _M_task_collection._Schedule(&_Task_handle);
657  }
658 
694 
695  template<typename _Function>
696  void run(task_handle<_Function>& _Task_handle, location& _Placement)
697  {
698  _Task_handle._SetRuntimeOwnsLifetime(false);
699  _M_task_collection._Schedule(&_Task_handle, &_Placement);
700  }
701 
718 
720  {
721  //
722  // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
723  //
724  return static_cast<task_group_status>(_M_task_collection._Wait());
725  }
726 
755 
756  template<class _Function>
758  {
759  //
760  // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
761  //
762  _Task_handle._SetRuntimeOwnsLifetime(false);
763  return (task_group_status)_M_task_collection._RunAndWait(&_Task_handle);
764  }
765 
794 
795  template<class _Function>
796  task_group_status run_and_wait(const _Function& _Func)
797  {
798  //
799  // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
800  //
802  }
803 
811 
812  void cancel()
813  {
815  }
816 
830 
832  {
834  }
835 
836 private:
837 
838  // Disallow passing in an r-value for a task handle argument
839  template<class _Function> void run(task_handle<_Function>&& _Task_handle);
840 
841  // The underlying group of tasks as known to the runtime.
843 };
844 
863 
864 template<typename _Function>
865 void run_with_cancellation_token(const _Function& _Func, cancellation_token _Ct)
866 {
867  structured_task_group _Stg(_Ct);
868  _Stg.run_and_wait(_Func);
869 }
870 
879 
880 inline void interruption_point()
881 {
883  _Stg.wait();
884 }
885 
899 
901 
902 // Parallel Algorithms and Patterns
903 
904 // Helper function that implements parallel_invoke with two functions
905 // Used by parallel_for and parallel_for_each implementations
906 
907 template <typename _Function1, typename _Function2>
908 void _Parallel_invoke_impl(const _Function1& _Func1, const _Function2& _Func2)
909 {
910  structured_task_group _Task_group;
911 
912  task_handle<_Function1> _Task_handle1(_Func1);
913  _Task_group.run(_Task_handle1);
914 
915  // We inline the last item to prevent the unnecessary push/pop on the work queue.
916  task_handle<_Function2> _Task_handle2(_Func2);
917  _Task_group.run_and_wait(_Task_handle2);
918 }
919 
943 
944 template <typename _Function1, typename _Function2>
945 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2)
946 {
948 
949  _Parallel_invoke_impl(_Func1, _Func2);
950 
952 }
953 
983 
984 template <typename _Function1, typename _Function2, typename _Function3>
985 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3)
986 {
988 
989  structured_task_group _Task_group;
990 
991  task_handle<_Function1> _Task_handle1(_Func1);
992  _Task_group.run(_Task_handle1);
993 
994  task_handle<_Function2> _Task_handle2(_Func2);
995  _Task_group.run(_Task_handle2);
996 
997  task_handle<_Function3> _Task_handle3(_Func3);
998  _Task_group.run_and_wait(_Task_handle3);
999 
1001 }
1002 
1038 
1039 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4>
1040 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4)
1041 {
1043 
1044  structured_task_group _Task_group;
1045 
1046  task_handle<_Function1> _Task_handle1(_Func1);
1047  _Task_group.run(_Task_handle1);
1048 
1049  task_handle<_Function2> _Task_handle2(_Func2);
1050  _Task_group.run(_Task_handle2);
1051 
1052  task_handle<_Function3> _Task_handle3(_Func3);
1053  _Task_group.run(_Task_handle3);
1054 
1055  task_handle<_Function4> _Task_handle4(_Func4);
1056  _Task_group.run_and_wait(_Task_handle4);
1057 
1059 }
1060 
1102 
1103 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5>
1104 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5)
1105 {
1107 
1108  structured_task_group _Task_group;
1109 
1110  task_handle<_Function1> _Task_handle1(_Func1);
1111  _Task_group.run(_Task_handle1);
1112 
1113  task_handle<_Function2> _Task_handle2(_Func2);
1114  _Task_group.run(_Task_handle2);
1115 
1116  task_handle<_Function3> _Task_handle3(_Func3);
1117  _Task_group.run(_Task_handle3);
1118 
1119  task_handle<_Function4> _Task_handle4(_Func4);
1120  _Task_group.run(_Task_handle4);
1121 
1122  task_handle<_Function5> _Task_handle5(_Func5);
1123  _Task_group.run_and_wait(_Task_handle5);
1124 
1126 }
1127 
1175 
1176 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
1177  typename _Function6>
1178 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
1179  const _Function6& _Func6)
1180 {
1182 
1183  structured_task_group _Task_group;
1184 
1185  task_handle<_Function1> _Task_handle1(_Func1);
1186  _Task_group.run(_Task_handle1);
1187 
1188  task_handle<_Function2> _Task_handle2(_Func2);
1189  _Task_group.run(_Task_handle2);
1190 
1191  task_handle<_Function3> _Task_handle3(_Func3);
1192  _Task_group.run(_Task_handle3);
1193 
1194  task_handle<_Function4> _Task_handle4(_Func4);
1195  _Task_group.run(_Task_handle4);
1196 
1197  task_handle<_Function5> _Task_handle5(_Func5);
1198  _Task_group.run(_Task_handle5);
1199 
1200  task_handle<_Function6> _Task_handle6(_Func6);
1201  _Task_group.run_and_wait(_Task_handle6);
1202 
1204 }
1205 
1259 
1260 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
1261  typename _Function6, typename _Function7>
1262 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
1263  const _Function6& _Func6, const _Function7& _Func7)
1264 {
1266 
1267  structured_task_group _Task_group;
1268 
1269  task_handle<_Function1> _Task_handle1(_Func1);
1270  _Task_group.run(_Task_handle1);
1271 
1272  task_handle<_Function2> _Task_handle2(_Func2);
1273  _Task_group.run(_Task_handle2);
1274 
1275  task_handle<_Function3> _Task_handle3(_Func3);
1276  _Task_group.run(_Task_handle3);
1277 
1278  task_handle<_Function4> _Task_handle4(_Func4);
1279  _Task_group.run(_Task_handle4);
1280 
1281  task_handle<_Function5> _Task_handle5(_Func5);
1282  _Task_group.run(_Task_handle5);
1283 
1284  task_handle<_Function6> _Task_handle6(_Func6);
1285  _Task_group.run(_Task_handle6);
1286 
1287  task_handle<_Function7> _Task_handle7(_Func7);
1288  _Task_group.run_and_wait(_Task_handle7);
1289 
1291 }
1292 
1352 
1353 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
1354  typename _Function6, typename _Function7, typename _Function8>
1355 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
1356  const _Function6& _Func6, const _Function7& _Func7, const _Function8& _Func8)
1357 {
1359 
1360  structured_task_group _Task_group;
1361 
1362  task_handle<_Function1> _Task_handle1(_Func1);
1363  _Task_group.run(_Task_handle1);
1364 
1365  task_handle<_Function2> _Task_handle2(_Func2);
1366  _Task_group.run(_Task_handle2);
1367 
1368  task_handle<_Function3> _Task_handle3(_Func3);
1369  _Task_group.run(_Task_handle3);
1370 
1371  task_handle<_Function4> _Task_handle4(_Func4);
1372  _Task_group.run(_Task_handle4);
1373 
1374  task_handle<_Function5> _Task_handle5(_Func5);
1375  _Task_group.run(_Task_handle5);
1376 
1377  task_handle<_Function6> _Task_handle6(_Func6);
1378  _Task_group.run(_Task_handle6);
1379 
1380  task_handle<_Function7> _Task_handle7(_Func7);
1381  _Task_group.run(_Task_handle7);
1382 
1383  task_handle<_Function8> _Task_handle8(_Func8);
1384  _Task_group.run_and_wait(_Task_handle8);
1385 
1387 }
1388 
1454 
1455 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
1456  typename _Function6, typename _Function7, typename _Function8, typename _Function9>
1457 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
1458  const _Function6& _Func6, const _Function7& _Func7, const _Function8& _Func8, const _Function9& _Func9)
1459 {
1461 
1462  structured_task_group _Task_group;
1463 
1464  task_handle<_Function1> _Task_handle1(_Func1);
1465  _Task_group.run(_Task_handle1);
1466 
1467  task_handle<_Function2> _Task_handle2(_Func2);
1468  _Task_group.run(_Task_handle2);
1469 
1470  task_handle<_Function3> _Task_handle3(_Func3);
1471  _Task_group.run(_Task_handle3);
1472 
1473  task_handle<_Function4> _Task_handle4(_Func4);
1474  _Task_group.run(_Task_handle4);
1475 
1476  task_handle<_Function5> _Task_handle5(_Func5);
1477  _Task_group.run(_Task_handle5);
1478 
1479  task_handle<_Function6> _Task_handle6(_Func6);
1480  _Task_group.run(_Task_handle6);
1481 
1482  task_handle<_Function7> _Task_handle7(_Func7);
1483  _Task_group.run(_Task_handle7);
1484 
1485  task_handle<_Function8> _Task_handle8(_Func8);
1486  _Task_group.run(_Task_handle8);
1487 
1488  task_handle<_Function9> _Task_handle9(_Func9);
1489  _Task_group.run_and_wait(_Task_handle9);
1490 
1492 }
1493 
1565 
1566 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
1567  typename _Function6, typename _Function7, typename _Function8, typename _Function9, typename _Function10>
1568 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
1569  const _Function6& _Func6, const _Function7& _Func7, const _Function8& _Func8, const _Function9& _Func9, const _Function10& _Func10)
1570 {
1572 
1573  structured_task_group _Task_group;
1574 
1575  task_handle<_Function1> _Task_handle1(_Func1);
1576  _Task_group.run(_Task_handle1);
1577 
1578  task_handle<_Function2> _Task_handle2(_Func2);
1579  _Task_group.run(_Task_handle2);
1580 
1581  task_handle<_Function3> _Task_handle3(_Func3);
1582  _Task_group.run(_Task_handle3);
1583 
1584  task_handle<_Function4> _Task_handle4(_Func4);
1585  _Task_group.run(_Task_handle4);
1586 
1587  task_handle<_Function5> _Task_handle5(_Func5);
1588  _Task_group.run(_Task_handle5);
1589 
1590  task_handle<_Function6> _Task_handle6(_Func6);
1591  _Task_group.run(_Task_handle6);
1592 
1593  task_handle<_Function7> _Task_handle7(_Func7);
1594  _Task_group.run(_Task_handle7);
1595 
1596  task_handle<_Function8> _Task_handle8(_Func8);
1597  _Task_group.run(_Task_handle8);
1598 
1599  task_handle<_Function9> _Task_handle9(_Func9);
1600  _Task_group.run(_Task_handle9);
1601 
1602  task_handle<_Function10> _Task_handle10(_Func10);
1603  _Task_group.run_and_wait(_Task_handle10);
1604 
1606 }
1607 
1613 
1615 {
1616 public:
1620 
1622 
1626 
1628 
1629  template<class _Type>
1630  _Type _Get_num_chunks(_Type ) const
1631  {
1633  }
1634 };
1635 
1640 
1642 {
1643 public:
1647 
1649  {
1650  }
1651 
1655 
1657 
1658  template<class _Type>
1659  _Type _Get_num_chunks(_Type ) const
1660  {
1662  }
1663 };
1664 
1669 
1671 {
1672 private:
1673  typedef unsigned long long _Size_type;
1674 
1675 public:
1682 
1683  explicit simple_partitioner(_Size_type _Chunk_size) : _M_chunk_size(_Chunk_size)
1684  {
1685  if (_Chunk_size == 0)
1686  {
1687  throw std::invalid_argument("_Chunk_size");
1688  }
1689  }
1690 
1694 
1696 
1697  template<class _Type>
1698  _Type _Get_num_chunks(_Type _Range_arg) const
1699  {
1700  static_assert(sizeof(_Type) <= sizeof(_Size_type), "Potential truncation of _Range_arg");
1701  _Size_type _Num_chunks = (static_cast<_Size_type>(_Range_arg) / _M_chunk_size);
1702 
1703  if (_Num_chunks == 0)
1704  {
1705  _Num_chunks = 1;
1706  }
1707 
1708  return static_cast<_Type>(_Num_chunks);
1709  }
1710 
1711 private:
1712 
1713  _Size_type _M_chunk_size;
1714 };
1715 
1722 
1724 {
1725 public:
1726 
1730 
1732  {
1733  }
1734 
1738 
1740  {
1741  delete [] _M_pChunk_locations;
1742  }
1743 
1744  location& _Get_chunk_location(unsigned int _ChunkIndex)
1745  {
1746  return _M_pChunk_locations[_ChunkIndex];
1747  }
1748 
1749  template<class _Type>
1750  _Type _Get_num_chunks(_Type )
1751  {
1752  if (_M_num_chunks == 0)
1753  {
1756  }
1757 
1758  return static_cast<_Type>(_M_num_chunks);
1759  }
1760 
1761 private:
1762  // The number of chunks the partitioner will record affinity for.
1763  unsigned int _M_num_chunks;
1764 
1765  // Array of remembered locations.
1767 };
1768 
1769 // Helper methods for scheduling and executing parallel tasks
1770 
1771 // Disable C4180: qualifier applied to function type has no meaning; ignored
1772 // Warning fires for passing Foo function pointer to parallel_for instead of &Foo.
1773 #pragma warning(push)
1774 #pragma warning(disable: 4180)
1775 // Disable C6263: using _alloca in a loop; this can quickly overflow stack
1776 #pragma warning(disable: 6263)
1777 
1778 // Template class that invokes user function on a parallel_for_each
1779 
1780 template <typename _Random_iterator, typename _Index_type, typename _Function, bool _Is_iterator>
1782 {
1783 public:
1784  static void __cdecl _Invoke(const _Random_iterator& _First, _Index_type& _Index, const _Function& _Func)
1785  {
1786  _Func(_First[_Index]);
1787  }
1788 };
1789 
1790 // Template specialized class that invokes user function on a parallel_for
1791 
1792 template <typename _Random_iterator, typename _Index_type, typename _Function>
1793 class _Parallel_chunk_helper_invoke<_Random_iterator, _Index_type, _Function, false>
1794 {
1795 public:
1796  static void __cdecl _Invoke(const _Random_iterator& _First, _Index_type& _Index, const _Function& _Func)
1797  {
1798  _Func(static_cast<_Random_iterator>(_First + _Index));
1799  }
1800 };
1801 
1802 // Represents a range of iteration
1803 
1804 template<typename _Index_type>
1805 class _Range
1806 {
1807 public:
1808 
1809  // Construct an object for the range [_Current_iteration, _Last_iteration)
1810  _Range(_Index_type _Current_iteration, _Index_type _Last_iteration) :
1811  _M_current(_Current_iteration), _M_last(_Last_iteration)
1812  {
1813  // On creation, the range shall have at least 1 iteration.
1815  }
1816 
1817  // Send a portion of the range to the helper
1818  void _Send_range(_Range<_Index_type> * _Helper_range)
1819  {
1820  // If there are no iterations other than the current one left until finish, there is no help
1821  // needed. Set the pointer to a special value that helper will understand and continue
1822  // doing the work.
1823  _Index_type _Remaining_iterations = _Number_of_iterations();
1824  if (_Remaining_iterations > 1)
1825  {
1826  // Compute the two pieces of the work range: one for the worker and one for helper class.
1827  _M_last_iteration = _M_current_iteration + _Remaining_iterations / 2;
1828 
1829  // There needs to be at least 1 iteration left because the current iteration cannot be sent.
1831  }
1832 
1833  // This is also a signal for the helper that a range has been sent to it.
1834  _Helper_range->_M_current_iteration = _M_last_iteration;
1835  }
1836 
1837  // Steal the entire range and give it to the helper
1838  void _Steal_range(_Range<_Index_type> * _Helper_range)
1839  {
1840  // We allow stealing only from a range that has at least 1 iteration
1842 
1843  _Index_type _Current_iter = _M_current_iteration;
1844 
1845  _Helper_range->_M_current_iteration = _Current_iter + 1;
1846  _Helper_range->_M_last_iteration = _M_last_iteration;
1847 
1848  _M_last_iteration = _Current_iter + 1;
1849  }
1850 
1851  // Returns the number of iterations in this range
1852  _Index_type _Number_of_iterations() const
1853  {
1854  return (_M_last_iteration - _M_current_iteration);
1855  }
1856 
1857  // Returns the current iteration in the range
1858  _Index_type _Get_current_iteration() const
1859  {
1860  return _M_current;
1861  }
1862 
1863  // Sets the current iteration in the range
1864  void _Set_current_iteration(const _Index_type _I)
1865  {
1866  _M_current = _I;
1867  }
1868 
1869  __declspec(property(get=_Get_current_iteration, put=_Set_current_iteration)) _Index_type _M_current_iteration;
1870 
1871  // Returns the last iteration in the range
1872  _Index_type _Get_last_iteration() const
1873  {
1874  return _M_last;
1875  }
1876 
1877  // Sets the last iteration in the range
1878  void _Set_last_iteration(const _Index_type _I)
1879  {
1880  _M_last = _I;
1881  }
1882 
1883  __declspec(property(get=_Get_last_iteration, put=_Set_last_iteration)) _Index_type _M_last_iteration;
1884 
1885 private:
1886 
1887  // These members are volatile because they are updated by the helper
1888  // and used by the worker.
1889  volatile _Index_type _M_current;
1890  volatile _Index_type _M_last;
1891 };
1892 
1893 // A proxy for the worker responsible for maintaining communication with the helper
1894 
1895 template<typename _Index_type>
1897 {
1898 public:
1899  _Worker_proxy(_Worker_proxy *_PParent_worker = NULL) :
1900  _M_pHelper_range(NULL), _M_pParent_worker(_PParent_worker), _M_pWorker_range(NULL), _M_completion_count(0), _M_stop_iterating(0)
1901  {
1903  }
1904 
1906  {
1907  // Make the check to avoid doing extra work in the non-exceptional cases
1908  if (_M_completion_count != _Tree_Complete)
1909  {
1910  // On exception, notify our parent so it breaks out of its loop.
1911  _Propagate_cancel();
1912 
1913  // On exception, we need to set _M_completion_count to ensure that the helper breaks out of its spin wait.
1914  _Set_done();
1915  }
1916  }
1917 
1918  // Obtain a range from the worker
1919  bool _Receive_range(_Range<_Index_type> * _Helper_range)
1920  {
1921  // If the worker already finished, then there is no work left for the helper
1922  if (_M_completion_count)
1923  {
1924  return false;
1925  }
1926 
1927  _CONCRT_ASSERT(_Helper_range != NULL);
1928 
1929  // There are two special values for _M_current_iteration that are not valid: one is the
1930  // initial value of the working class which it will never share, and the other is
1931  // the last exclusive iteration of the working class, which has no work to be done.
1932  // We use the former value so that we can understand worker's response.
1933  _Index_type _Cached_first_iteration = _Helper_range->_M_current_iteration;
1934 
1935  // Following operation is not done via interlocked operation because it does not have to.
1936  // Helper lazily registers that it would like to help the worker, but it allows for some
1937  // time to elapse before that information has made it over to the worker. The idea
1938  // is not to disturb the worker if it is not necessary. It is possible to add interlocked
1939  // operation in the future if the time spent in the busy wait loop is too big.
1940  _CONCRT_ASSERT(_M_pHelper_range == NULL);
1941  _M_pHelper_range = _Helper_range;
1942 
1944 
1945  // If the worker is done, it will flush the store buffer and signal the helper by
1946  // changing _M_current_iteration in the helper's range.
1947  while ((_Helper_range->_M_current_iteration == _Cached_first_iteration) && !_M_completion_count)
1948  {
1949  if ((_M_pWorker_range != NULL) && _M_context._IsSynchronouslyBlocked())
1950  {
1951  // Attempt to steal the entire range from the worker if it is synchronously blocked.
1952 
1953  // Make sure that worker makes no forward progress while helper is attempting to
1954  // steal its range. If worker does get unblocked, simply back off in the helper.
1955  // Note that there could be another helper running if a range has already been
1956  // sent to us.
1957  long _Stop_iterating = _InterlockedIncrement(&_M_stop_iterating);
1958  _CONCRT_ASSERT(_Stop_iterating > 0);
1959 
1960  // We need to make a local copy as the pointer could be changed by the worker.
1961  _Range<_Index_type> * _Worker_range = _M_pWorker_range;
1962 
1963  // The order of comparison needs to be preserved. If the parent is blocked, then
1964  // it cannot send a range (because _M_stop_iterating is already set). If it sent a range
1965  // before being synchronously blocked, then we are no longer the helper. Refrain
1966  // from intrusively stealing the range.
1967  if ((_Worker_range != NULL) && _M_context._IsSynchronouslyBlocked()
1968  && (_Helper_range->_M_current_iteration == _Cached_first_iteration) && !_M_completion_count)
1969  {
1970  _CONCRT_ASSERT(_M_pHelper_range == _Helper_range);
1971 
1972  _M_pHelper_range = NULL;
1973  _Worker_range->_Steal_range(_Helper_range);
1974 
1975  _CONCRT_ASSERT(_Helper_range->_M_current_iteration != _Cached_first_iteration);
1976  }
1977 
1978  // At this point, worker is either:
1979  //
1980  // a) no longer blocked so range will come to the helper naturally, or
1981  // b) out of iterations because helper stole all of it
1982  _Stop_iterating = _InterlockedDecrement(&_M_stop_iterating);
1983  _CONCRT_ASSERT(_Stop_iterating >= 0);
1984  }
1985  else
1986  {
1987  // If there is no work received in a full spin, then start yielding the context
1988  spinWait._SpinOnce();
1989  }
1990  }
1991 
1992  // If the initial iteration is the same as the original first iteration then the
1993  // worker class is sending the signal that it does not need any help.
1994  if (_Helper_range->_M_current_iteration == _Cached_first_iteration)
1995  {
1996  return false;
1997  }
1998 
1999  return (_Helper_range->_Number_of_iterations() > 0);
2000  }
2001 
2002  // Send a portion of our range and notify the helper.
2003  bool _Send_range(_Range<_Index_type> * _Worker_range)
2004  {
2005  // Worker range shall not be available for stealing at this time.
2006  _CONCRT_ASSERT(_M_pWorker_range == NULL);
2007 
2008  // Helper shall be registered.
2009  _CONCRT_ASSERT(_M_pHelper_range != NULL);
2010 
2011  // Send the range
2012  _Worker_range->_Send_range(_M_pHelper_range);
2013 
2014  // Notify the helper. The fence ensures that the prior updates are visible.
2015  _InterlockedExchangePointer(reinterpret_cast<void * volatile *>(&_M_pHelper_range), NULL);
2016 
2017  // The current iteration should still be left
2018  _CONCRT_ASSERT(_Worker_range->_Number_of_iterations() >= 1);
2019 
2020  // Indicate if we need another helper
2021  return (_Worker_range->_Number_of_iterations() > 1);
2022  }
2023 
2024  // Let the helper know that it is ok to intrusively steal range from the worker by publishing the
2025  // remaining range.
2027  {
2028  _M_pWorker_range = _Worker_range;
2029  }
2030 
2031  // Prevent the helper from intrusively stealing range from the worker
2033  {
2034  _M_pWorker_range = NULL;
2035  _Wait_on_intrusive_steal();
2036  }
2037 
2039  {
2040  return (_M_pHelper_range != NULL);
2041  }
2042 
2043  bool _Is_done()
2044  {
2045  return (_M_completion_count != 0);
2046  }
2047 
2048  void _Set_done()
2049  {
2050  // Let the helper know that this class is done with work and flush the store buffer. This operation
2051  // ensures that any buffered store to helper range in _Send_range is flushed and
2052  // available in _Receive_range (so there will be no lost ranges).
2053  _InterlockedExchange(&_M_completion_count, 1);
2054  }
2055 
2057  {
2058  // Make sure that **WE** know when our destructor hits that the entire tree is complete.
2059  _M_completion_count = _Tree_Complete;
2060  }
2061 
2063  {
2064  return _M_beacon._Is_signaled();
2065  }
2066 
2068  {
2069  return _M_beacon._Confirm_cancel();
2070  }
2071 
2072 private:
2073 
2074  // Spin wait for any intrusive steal that is in progress.
2076  {
2077  // This code is used to synchronize with helper in case of worker cooperative blocking.
2078  if (_M_stop_iterating != 0)
2079  {
2081 
2082  while (_M_stop_iterating != 0)
2083  {
2084  spinWait._SpinOnce();
2085  }
2086  }
2087  }
2088 
2090  {
2091  _M_beacon._Raise();
2092  }
2093 
2095  {
2096  if (_M_pParent_worker != NULL)
2097  {
2098  _M_pParent_worker->_NotifyCancel();
2099  }
2100  }
2101 
2102  // Constant indicating sub-tree completion
2103  static const long _Tree_Complete = 2;
2104 
2105  // Read in the loop
2107 
2108  // Read at the end of the loop
2109  _Worker_proxy * _M_pParent_worker;
2110 
2111  // Written rarely
2114 
2115  volatile long _M_completion_count;
2116 
2117  // Written to in the loop
2119  volatile long _M_stop_iterating;
2120 
2121  _Worker_proxy const & operator=(_Worker_proxy const&); // no assignment operator
2122 
2123 };
2124 
2125 // parallel_for -- Performs parallel iteration over a range of indices from _First to _Last,
2126 // excluding _Last. The order in which each iteration is executed is unspecified and non-deterministic.
2127 
2128 // Closure (binding) classes for invoking parallel_for and parallel_for_each, with chunks
2129 
2130 // A dynamically rebalancing closure class used for packaging parallel_for or parallel_for_each for invocation in chunks.
2131 // If some tasks finish earlier than others, helper tasks get executed which ensures further distribution of work.
2132 
2133 template <typename _Random_iterator, typename _Index_type, typename _Function, typename _Partitioner, bool _Is_iterator>
2135 {
2136 public:
2137  _Parallel_chunk_helper(_Index_type, const _Random_iterator& _First, _Index_type _First_iteration, _Index_type _Last_iteration, const _Index_type& _Step,
2138  const _Function& _Func, const _Partitioner&, _Worker_proxy<_Index_type> * const _Parent_data = NULL) :
2139  _M_first(_First), _M_first_iteration(_First_iteration), _M_last_iteration(_Last_iteration), _M_step(_Step), _M_function(_Func),
2140  _M_parent_worker(_Parent_data)
2141  {
2142  // Empty constructor because members are already assigned
2143  }
2144 
2145  // Constructor overload that accepts a range
2146  _Parallel_chunk_helper(const _Random_iterator& _First, const _Index_type& _Step, const _Function& _Func,
2147  const _Range<_Index_type>& _Worker_range, _Worker_proxy<_Index_type> * const _Parent_data = NULL) :
2148  _M_first(_First), _M_first_iteration(_Worker_range._M_current_iteration), _M_last_iteration(_Worker_range._M_last_iteration), _M_step(_Step), _M_function(_Func),
2149  _M_parent_worker(_Parent_data)
2150  {
2151  // Empty constructor because members are already assigned
2152  }
2153 
2154  // The main helper function which iterates over the given collection and invokes user function on every iteration.
2155  // Function is marked as const even though it does mutate some of its members (those are declared as mutable). This is done
2156  // in order to easily communicate between a worker and a helper instance, without holding references to many local variables.
2157  // However, this function does not mutate any state that is visible to anyone outside of this class, nor would that be
2158  // possible due to the implicit copy of the functor that happens when a new task_handle is created.
2159  void operator()() const
2160  {
2161  _Range<_Index_type> _Worker_range(_M_first_iteration, _M_last_iteration);
2162 
2163  // This class has two modes: worker and helper. The originally split chunk is always a
2164  // worker, while any subsequent class spawned from this class is in the helper
2165  // mode, which is signified using a link to the worker class through _M_pOwning_worker
2166  // handle. So, it will wait for work to be dished out by the working class while in helper mode.
2167  if (_M_parent_worker != NULL && !_M_parent_worker->_Receive_range(&_Worker_range))
2168  {
2169  // If the worker class rejected the help, simply return
2170  return;
2171  }
2172 
2173  // Keep the secondary, scaled, loop index for quick indexing into the data structure
2174  _Index_type _Current_iteration = _Worker_range._M_current_iteration;
2175  _Index_type _Scaled_index = _Current_iteration * _M_step;
2176 
2177  // If there is only one iteration to be executed there is no need to initialize any
2178  // helper classes (work is indivisible).
2179  if (_Worker_range._Number_of_iterations() == 1)
2180  {
2181  // Execute one iteration
2183  return;
2184  }
2185 
2186  // If the execution reaches this point it means that this class now has a chunk of work
2187  // that it needs to get done, so it has transitioned into the worker mode.
2188  structured_task_group _Helper_group;
2189 
2190  // Initialize fields that are needed in the helper
2191  _Worker_proxy<_Index_type> _Worker(_M_parent_worker);
2192 
2193  // Instantiate a helper class for this working class and put it on the work queue.
2194  // If some thread is idle it will be able to steal the helper and help this class
2195  // finish its work by stealing a piece of the work range.
2196  task_handle<_Parallel_chunk_helper> _Helper_task(_Parallel_chunk_helper(_M_first, _M_step, _M_function, _Worker_range, &_Worker));
2197 
2198  _Helper_group.run(_Helper_task);
2199 
2201 
2202  // Normally, for a cancellation semantic in cooperation with the helper, we would run_and_wait the below code on the Helper_group. Unfortunately,
2203  // the capture by reference of things which must be shared (_Worker, and so forth) will cause the loop below to add additional indirection
2204  // instructions. The loop below *MUST* be as tight as possible with the defined semantics. Instead, we will manually notify our parent if the
2205  // worker's destructor runs without hitting the bottom of our chunk. This is done through notification on the beacon.
2206 
2207  for (_Index_type _I = _Current_iteration; _I < _Worker_range._M_last_iteration; (_I++, _Worker_range._M_current_iteration =_I, _Scaled_index += _M_step))
2208  {
2209  if (_Worker._Is_beacon_signaled())
2210  {
2211  // Either a parent task group is canceled or one of the other iterations
2212  // threw an exception. Abort the remaining iterations
2213  //
2214  // Note that this could be a false positive that we must verify.
2215  if (_Worker._Is_done() || _Worker._Verify_beacon_cancellation())
2216  {
2217  break;
2218  }
2219  }
2220 
2221  if (_Worker._Is_helper_registered())
2222  {
2223  // The helper class (there can only be one) registered to help this class with the work.
2224  // Thus, figure out if this class needs help and split the range among the two classes.
2225 
2226  if (_Worker._Send_range(&_Worker_range))
2227  {
2228  // Construct every new instance of a helper class on the stack because it is beneficial to use
2229  // a structured task group where the class itself is responsible for task handle's lifetime.
2230  task_handle<_Parallel_chunk_helper> * _Helper_subtask = _Holder._AddRawMallocaNode(_malloca(_Holder._GetAllocationSize()));
2231 
2232  new(_Helper_subtask) task_handle<_Parallel_chunk_helper>
2233  (_Parallel_chunk_helper(_M_first, _M_step, _M_function, _Worker_range, &_Worker));
2234 
2235  // If _Send_range returns true, that means that there is still some non-trivial
2236  // work to be done, so this class will potentially need another helper.
2237  _Helper_group.run(*_Helper_subtask);
2238  }
2239  }
2240 
2241  // Allow intrusive stealing by the helper
2242  _Worker._Enable_intrusive_steal(&_Worker_range);
2243 
2244  // Execute one iteration: the element is at scaled index away from the first element.
2246 
2247  // Helper shall not steal a range after this call
2248  _Worker._Disable_intrusive_steal();
2249  }
2250 
2251  // Indicate that the worker is done with its iterations.
2252  _Worker._Set_done();
2253 
2254  // Wait for all worker/helper iterations to finish
2255  _Helper_group.wait();
2256 
2257  // Make sure that we've signaled that the tree is complete. This is used to detect any exception out of either _Parallel_chunk_helper_invoke or
2258  // _Helper_group.wait() above as a cancellation of the loop which must propagate upwards because we do not wrap the loop body in run_and_wait.
2259  _Worker._Set_tree_done();
2260  }
2261 
2262 private:
2263 
2264  const _Random_iterator& _M_first;
2265  const _Index_type& _M_step;
2266  const _Function& _M_function;
2267 
2268  const _Index_type _M_first_iteration;
2269  const _Index_type _M_last_iteration;
2270 
2272 
2273  _Parallel_chunk_helper const & operator=(_Parallel_chunk_helper const&); // no assignment operator
2274 };
2275 
2276 template <typename _Random_iterator, typename _Index_type, typename _Function, typename _Partitioner, bool _Is_iterator>
2278 {
2279 public:
2280  _Parallel_fixed_chunk_helper(_Index_type, const _Random_iterator& _First, _Index_type _First_iteration,
2281  _Index_type _Last_iteration, const _Index_type& _Step, const _Function& _Func, const _Partitioner&) :
2282  _M_first(_First), _M_first_iteration(_First_iteration), _M_last_iteration(_Last_iteration), _M_step(_Step), _M_function(_Func)
2283  {
2284  // Empty constructor because members are already assigned
2285  }
2286 
2287  void operator()() const
2288  {
2289  // Keep the secondary, scaled, loop index for quick indexing into the data structure
2290  _Index_type _Scaled_index = _M_first_iteration * _M_step;
2291 
2292  for (_Index_type _I = _M_first_iteration; _I < _M_last_iteration; (_I++, _Scaled_index += _M_step))
2293  {
2294  // Execute one iteration: the element is at scaled index away from the first element.
2296  }
2297  }
2298 private:
2299 
2300  const _Random_iterator& _M_first;
2301  const _Index_type& _M_step;
2302  const _Function& _M_function;
2303 
2304  const _Index_type _M_first_iteration;
2305  const _Index_type _M_last_iteration;
2306 
2307  _Parallel_fixed_chunk_helper const & operator=(_Parallel_fixed_chunk_helper const&); // no assignment operator
2308 };
2309 
2310 template <typename _Random_iterator, typename _Index_type, typename _Function, bool _Is_iterator>
2312 {
2313 public:
2315 
2316  _Parallel_localized_chunk_helper(_Index_type _Chunk_index, const _Random_iterator& _First, _Index_type _First_iteration, _Index_type _Last_iteration, const _Index_type& _Step,
2317  const _Function& _Func, affinity_partitioner& _Part) :
2318  _M_fixed_helper(_Chunk_index, _First, _First_iteration, _Last_iteration, _Step, _Func, static_partitioner()),
2319  _M_chunk_location(_Part._Get_chunk_location(static_cast<unsigned int>(_Chunk_index)))
2320  {
2321  // Empty constructor because members are already assigned
2322  }
2323 
2324  // Override the operator() in the base class. Note that this is not a virtual override.
2325  void operator()() const
2326  {
2327  // Check here if location needs to be saved.
2328  if (_M_chunk_location._Is_system())
2329  {
2330  _M_chunk_location = location::current();
2331  }
2332 
2333  _M_fixed_helper();
2334  }
2335 private:
2336 
2339  _Parallel_localized_chunk_helper const & operator=(_Parallel_localized_chunk_helper const&); // no assignment operator
2340 };
2341 
2342 #pragma warning(pop)
2343 
2344 template <typename _Worker_class, typename _Index_type, typename Partitioner>
2346  task_handle<_Worker_class>* _Chunk_helpers,
2347  const Partitioner&,
2348  _Index_type _I)
2349 {
2350  _Task_group.run(_Chunk_helpers[_I]);
2351 }
2352 
2353 template <typename _Worker_class, typename _Index_type>
2355  task_handle<_Worker_class>* _Chunk_helpers,
2356  affinity_partitioner& _Part,
2357  _Index_type _I)
2358 {
2359  _Task_group.run(_Chunk_helpers[_I], _Part._Get_chunk_location(static_cast<unsigned int>(_I)));
2360 }
2361 
2362 
2363 // Helper functions that implement parallel_for
2364 
2365 template <typename _Worker_class, typename _Random_iterator, typename _Index_type, typename _Function, typename _Partitioner>
2366 void _Parallel_chunk_impl(const _Random_iterator& _First, _Index_type _Range_arg, const _Index_type& _Step, const _Function& _Func, _Partitioner&& _Part)
2367 {
2368  _CONCRT_ASSERT(_Range_arg > 1);
2369  _CONCRT_ASSERT(_Step > 0);
2370 
2371  _Index_type _Num_iterations = (_Step == 1) ? _Range_arg : (((_Range_arg - 1) / _Step) + 1);
2372  _CONCRT_ASSERT(_Num_iterations > 1);
2373 
2374  _Index_type _Num_chunks = _Part._Get_num_chunks(_Num_iterations);
2375  _CONCRT_ASSERT(_Num_chunks > 0);
2376 
2377  // Allocate memory on the stack for task_handles to ensure everything is properly structured.
2379  task_handle<_Worker_class> * _Chunk_helpers = _Holder._InitOnRawMalloca(_malloca(sizeof(task_handle<_Worker_class>) * static_cast<size_t>(_Num_chunks)));
2380 
2381  structured_task_group _Task_group;
2382 
2383  _Index_type _Iterations_per_chunk = _Num_iterations / _Num_chunks;
2384  _Index_type _Remaining_iterations = _Num_iterations % _Num_chunks;
2385 
2386  // If there are less iterations than desired chunks, set the chunk number
2387  // to be the number of iterations.
2388  if (_Iterations_per_chunk == 0)
2389  {
2390  _Num_chunks = _Remaining_iterations;
2391  }
2392 
2393  _Index_type _Work_size = 0;
2394  _Index_type _Start_iteration = 0;
2395  _Index_type _I;
2396 
2397  // Split the available work in chunks
2398  for (_I = 0; _I < _Num_chunks - 1; _I++)
2399  {
2400  if (_Remaining_iterations > 0)
2401  {
2402  // Iterations are not divided evenly, so add 1 remainder iteration each time
2403  _Work_size = _Iterations_per_chunk + 1;
2404  _Remaining_iterations--;
2405  }
2406  else
2407  {
2408  _Work_size = _Iterations_per_chunk;
2409  }
2410 
2411  // New up a task_handle "in-place", in the array preallocated on the stack
2412  new(&_Chunk_helpers[_I]) task_handle<_Worker_class>(_Worker_class(_I, _First, _Start_iteration, _Start_iteration + _Work_size, _Step, _Func, std::forward<_Partitioner>(_Part)));
2414 
2415  // Run each of the chunk tasks in parallel
2416  _Parallel_chunk_task_group_run(_Task_group, _Chunk_helpers, std::forward<_Partitioner>(_Part), _I);
2417 
2418  // Prepare for the next iteration
2419  _Start_iteration += _Work_size;
2420  }
2421 
2422  // Because this is the last iteration, then work size might be different
2423  _CONCRT_ASSERT((_Remaining_iterations == 0) || ((_Iterations_per_chunk == 0) && (_Remaining_iterations == 1)));
2424  _Work_size = _Num_iterations - _Start_iteration;
2425 
2426  // New up a task_handle "in-place", in the array preallocated on the stack
2427  new(&_Chunk_helpers[_I]) task_handle<_Worker_class>(_Worker_class(_I, _First, _Start_iteration, _Start_iteration + _Work_size, _Step, _Func, std::forward<_Partitioner>(_Part)));
2429 
2430  _Task_group.run_and_wait(_Chunk_helpers[_I]);
2431 }
2432 
2433 template <typename _Worker_class, typename _Random_iterator, typename _Index_type, typename _Function>
2434 void _Parallel_chunk_impl(const _Random_iterator& _First, _Index_type _Range_arg, const _Index_type& _Step, const _Function& _Func)
2435 {
2436  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, auto_partitioner());
2437 }
2438 
2439 // Helper for the parallel for API with the default dynamic partitioner which implements range-stealing to balance load.
2440 template <typename _Index_type, typename _Diff_type, typename _Function>
2441 void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, const auto_partitioner& _Part)
2442 {
2444  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2445 }
2446 
2447 // Helper for the parallel_for API with a static partitioner - creates a fixed number of chunks up front with no range-stealing enabled.
2448 template <typename _Index_type, typename _Diff_type, typename _Function>
2449 void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, const static_partitioner& _Part)
2450 {
2452  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2453 }
2454 
2455 // Helper for the parallel_for API with a simple partitioner - creates a fixed number of chunks up front with no range-stealing enabled.
2456 template <typename _Index_type, typename _Diff_type, typename _Function>
2457 void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, const simple_partitioner& _Part)
2458 {
2460  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2461 }
2462 
2463 // Helper for the parallel_for API with an affinity partitioner - creates a fixed number of chunks up front with no range-stealing enabled. subsequent
2464 // calls to parallel_for with the same affinity partitioner (pass in as a non-const reference) are scheduled to the same location they previously ran on
2465 template <typename _Index_type, typename _Diff_type, typename _Function>
2466 void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, affinity_partitioner& _Part)
2467 {
2469  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2470 }
2471 
2472 template <typename _Index_type, typename _Function, typename _Partitioner>
2473 void _Parallel_for_impl(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func, _Partitioner&& _Part)
2474 {
2475  // The step argument must be 1 or greater; otherwise it is an invalid argument
2476  if (_Step < 1)
2477  {
2478  throw std::invalid_argument("_Step");
2479  }
2480 
2481  // If there are no elements in this range we just return
2482  if (_First >= _Last)
2483  {
2484  return;
2485  }
2486 
2487  // Compute the difference type based on the arguments and avoid signed overflow for int, long, and long long
2488  typedef typename std::conditional<std::is_same<_Index_type, int>::value, unsigned int,
2489  typename std::conditional<std::is_same<_Index_type, long>::value, unsigned long,
2490  typename std::conditional<std::is_same<_Index_type, long long>::value, unsigned long long, decltype(_Last - _First)
2491  >::type
2492  >::type
2493  >::type _Diff_type;
2494 
2495  _Diff_type _Range_size = _Diff_type(_Last) - _Diff_type(_First);
2496  _Diff_type _Diff_step = _Step;
2497 
2498  if (_Range_size <= _Diff_step)
2499  {
2500  _Func(_First);
2501  }
2502  else
2503  {
2504  _Parallel_for_partitioned_impl<_Index_type, _Diff_type, _Function>(_First, _Range_size, _Step, _Func, std::forward<_Partitioner>(_Part));
2505  }
2506 }
2507 
2508 template <typename _Index_type, typename _Function>
2509 void _Parallel_for_impl(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func)
2510 {
2511  _Parallel_for_impl(_First, _Last, _Step, _Func, auto_partitioner());
2512 }
2513 
2553 
2554 template <typename _Index_type, typename _Function, typename _Partitioner>
2555 void parallel_for(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func, _Partitioner&& _Part)
2556 {
2558  _Parallel_for_impl(_First, _Last, _Step, _Func, std::forward<_Partitioner>(_Part));
2560 }
2561 
2589 
2590 template <typename _Index_type, typename _Function>
2591 void parallel_for(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func)
2592 {
2593  parallel_for(_First, _Last, _Step, _Func, auto_partitioner());
2594 }
2595 
2628 
2629 template <typename _Index_type, typename _Function>
2630 void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, const auto_partitioner& _Part = auto_partitioner())
2631 {
2632  parallel_for(_First, _Last, _Index_type(1), _Func, _Part);
2633 }
2634 
2667 
2668 template <typename _Index_type, typename _Function>
2669 void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, const static_partitioner& _Part)
2670 {
2671  parallel_for(_First, _Last, _Index_type(1), _Func, _Part);
2672 }
2673 
2706 
2707 template <typename _Index_type, typename _Function>
2708 void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, const simple_partitioner& _Part)
2709 {
2710  parallel_for(_First, _Last, _Index_type(1), _Func, _Part);
2711 }
2712 
2745 
2746 template <typename _Index_type, typename _Function>
2747 void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, affinity_partitioner& _Part)
2748 {
2749  parallel_for(_First, _Last, _Index_type(1), _Func, _Part);
2750 }
2751 
2752 // parallel_for_each -- This function will iterate over all elements in the iterator's range.
2753 
2754 // Closure (binding) classes for invoking parallel_for_each recursively
2755 
2756 // A closure class used for packaging chunk of elements in parallel_for_each for parallel invocation
2757 
2758 // Forward iterator for_each using unstructured task group
2759 
2760 // Disable C4180: qualifier applied to function type has no meaning; ignored
2761 // Warning fires for passing Foo function pointer to parallel_for instead of &Foo.
2762 #pragma warning(push)
2763 #pragma warning(disable: 4180)
2764 
2765 template <typename _Forward_iterator, typename _Function, unsigned int _Chunk_size>
2767 {
2768 public:
2769  typedef typename std::iterator_traits<_Forward_iterator>::value_type _Value_type;
2770  static const unsigned int _Size = _Chunk_size;
2771 
2772  _Parallel_for_each_helper(_Forward_iterator& _First, const _Forward_iterator& _Last, const _Function& _Func) :
2773  _M_function(_Func), _M_len(0)
2774  {
2775  static_assert(std::is_lvalue_reference<decltype(*_First)>::value, "lvalue required for forward iterator operator *");
2776  // Add a batch of work items to this functor's array
2777  for (unsigned int _Index=0; (_Index < _Size) && (_First != _Last); _Index++)
2778  {
2779  _M_element[_M_len++] = &(*_First++);
2780  }
2781  }
2782 
2783  void operator()() const
2784  {
2785  // Invoke parallel_for on the batched up array of elements
2786  _Parallel_for_impl(0U, _M_len, 1U,
2787  [this] (unsigned int _Index)
2788  {
2789  _M_function(*(_M_element[_Index]));
2790  }
2791  );
2792  }
2793 
2794 private:
2795 
2796  const _Function& _M_function;
2797  typename std::iterator_traits<_Forward_iterator>::pointer _M_element[_Size];
2798  unsigned int _M_len;
2799 
2800  _Parallel_for_each_helper const & operator=(_Parallel_for_each_helper const&); // no assignment operator
2801 };
2802 
2803 #pragma warning(pop)
2804 
2805 // Helper functions that implement parallel_for_each
2806 
2807 template <typename _Forward_iterator, typename _Function>
2808 void _Parallel_for_each_chunk(_Forward_iterator& _First, const _Forward_iterator& _Last, const _Function& _Func, task_group& _Task_group)
2809 {
2810  // The chunk size selection depends more on the internal implementation of parallel_for than
2811  // on the actual input. Also, it does not have to be dynamically computed, but it helps
2812  // parallel_for if it is a power of 2 (easy to divide).
2813  const unsigned int _Chunk_size = 1024;
2814 
2815  // This functor will be copied on the heap and will execute the chunk in parallel
2817 
2818  // Because this is an unstructured task group, running the task will make a copy of the necessary data
2819  // on the heap, ensuring that it is available at the time of execution.
2820  _Task_group.run(_Functor);
2821 }
2822 
2823 template <typename _Forward_iterator, typename _Function>
2824 void _Parallel_for_each_forward_impl(_Forward_iterator& _First, const _Forward_iterator& _Last, const _Function& _Func, task_group& _Task_group)
2825 {
2826  _Parallel_for_each_chunk(_First, _Last, _Func, _Task_group);
2827 
2828  // If there is a tail, push the tail
2829  if (_First != _Last)
2830  {
2831  _Task_group.run(
2832  [&_First, &_Last, &_Func, &_Task_group]
2833  {
2834  ::Concurrency::_Parallel_for_each_forward_impl(_First, _Last, _Func, _Task_group);
2835  }
2836  );
2837  }
2838 }
2839 
2840 template <typename _Forward_iterator, typename _Function>
2841 void _Parallel_for_each_impl(_Forward_iterator _First, const _Forward_iterator& _Last, const _Function& _Func, const auto_partitioner&, std::forward_iterator_tag)
2842 {
2843  // Because this is a forward iterator, it is difficult to validate that _First comes before _Last, so
2844  // it is up to the user to provide valid range.
2845  if (_First != _Last)
2846  {
2847  task_group _Task_group;
2848 
2849  _Parallel_for_each_forward_impl(_First, _Last, _Func, _Task_group);
2850 
2851  _Task_group.wait();
2852  }
2853 }
2854 
2855 template <typename _Random_iterator, typename _Index_type, typename _Function>
2856 void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, const auto_partitioner& _Part)
2857 {
2859  // Use the same function that schedules work for parallel for
2860  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2861 }
2862 
2863 template <typename _Random_iterator, typename _Index_type, typename _Function>
2864 void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, const static_partitioner& _Part)
2865 {
2867  // Use the same function that schedules work for parallel for
2868  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2869 }
2870 
2871 template <typename _Random_iterator, typename _Index_type, typename _Function>
2872 void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, const simple_partitioner& _Part)
2873 {
2875  // Use the same function that schedules work for parallel for
2876  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2877 }
2878 
2879 template <typename _Random_iterator, typename _Index_type, typename _Function>
2880 void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, affinity_partitioner& _Part)
2881 {
2883  // Use the same function that schedules work for parallel for
2884  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2885 }
2886 
2887 template <typename _Random_iterator, typename _Function, typename _Partitioner>
2888 void _Parallel_for_each_impl(const _Random_iterator& _First, const _Random_iterator& _Last, const _Function& _Func, _Partitioner&& _Part, std::random_access_iterator_tag)
2889 {
2890  typedef typename std::iterator_traits<_Random_iterator>::difference_type _Index_type;
2891 
2892  // Exit early if there is nothing in the collection
2893  if (_First >= _Last)
2894  {
2895  return;
2896  }
2897 
2898  _Index_type _Range_size = _Last - _First;
2899 
2900  if (_Range_size == 1)
2901  {
2902  _Func(*_First);
2903  }
2904  else
2905  {
2906  _Index_type _Step = 1;
2907 
2908  _Parallel_for_each_partitioned_impl(_First, _Range_size, _Step, _Func, std::forward<_Partitioner>(_Part));
2909  }
2910 }
2911 
2939 
2940 template <typename _Iterator, typename _Function>
2941 void parallel_for_each(_Iterator _First, _Iterator _Last, const _Function& _Func)
2942 {
2943  parallel_for_each(_First, _Last, _Func, auto_partitioner());
2944 }
2945 
2982 
2983 template <typename _Iterator, typename _Function, typename _Partitioner>
2984 void parallel_for_each(_Iterator _First, _Iterator _Last, const _Function& _Func, _Partitioner&& _Part)
2985 {
2987  _Parallel_for_each_impl(_First, _Last, _Func, std::forward<_Partitioner>(_Part), typename std::iterator_traits<_Iterator>::iterator_category());
2989 }
2990 
2991 // Disable C4180: qualifier applied to function type has no meaning; ignored
2992 // Warning fires for passing Foo function pointer to parallel_for instead of &Foo.
2993 #pragma warning(push)
2994 #pragma warning(disable: 4180)
2995 
2996 // Helper function assemble all functors
2997 template <typename _Reduce_type, typename _Sub_function, typename _Combinable_type>
2999 {
3000  const _Sub_function &_Sub_fun;
3002 
3003  _Combinable_type &_Combinable;
3004 
3006  typedef typename _Combinable_type::_Bucket _Bucket_type;
3007 
3008  _Reduce_functor_helper(const _Reduce_type &_Identity, const _Sub_function &_Sub_fun, _Combinable_type &&_Comb):
3009  _Sub_fun(_Sub_fun), _Combinable(_Comb), _Identity_value(_Identity)
3010  {
3011  }
3012 
3013 private:
3014  _Reduce_functor_helper &operator =(const _Reduce_functor_helper &_Other);
3015 };
3016 
3017 // Ordered serial combinable object
3018 template<typename _Ty, typename _Sym_fun>
3019 class _Order_combinable
3020 {
3021 public:
3022  // Each bucket is written only once, so only limited contention is caused
3023  struct _Bucket
3024  {
3025  // Allocate enough space in the Bucket to hold a value
3026  char _Value[(sizeof(_Ty) / sizeof(char))];
3027  _Bucket * _Next;
3028 
3029  _Bucket(_Bucket *_N)
3030  {
3031  _Next = _N;
3032  }
3033 
3034  void _Insert(_Bucket *_Item)
3035  {
3036  // No need to lock, only one thread will insert
3037  _Item->_Next = _Next;
3038  _Next = _Item;
3039  }
3040 
3041  // Construct value in bucket
3042  void _Put(const _Ty &_Cur)
3043  {
3044  new(reinterpret_cast<_Ty *>(&_Value)) _Ty(_Cur);
3045  }
3046  };
3047 
3048 private:
3049  const _Sym_fun &_M_fun;
3050  size_t _M_number;
3051  _Bucket *_M_root;
3052  _Order_combinable &operator =(const _Order_combinable &_Other);
3053 
3054 public:
3055  _Bucket *_Construct(_Bucket *_Par = 0)
3056  {
3057  _Bucket * _Ret = static_cast<_Bucket *>(::Concurrency::Alloc(sizeof(_Bucket)));
3058  return new(_Ret)_Bucket(_Par);
3059  }
3060 
3061  _Order_combinable(const _Sym_fun &_Fun): _M_fun(_Fun)
3062  {
3063  _M_root = 0;
3064  _M_number = 0;
3065  }
3066 
3067  ~_Order_combinable()
3068  {
3069  while (_M_root)
3070  {
3071  _Bucket *_Cur = _M_root;
3072  _M_root = _M_root->_Next;
3073  reinterpret_cast<_Ty &>(_Cur->_Value).~_Ty();
3074  ::Concurrency::Free(_Cur);
3075  }
3076  }
3077 
3078  // Serially combine and release the list, return result
3079  _Ty _Serial_combine_release()
3080  {
3081  _Ty _Ret(reinterpret_cast<_Ty &>(_M_root->_Value));
3082  _Bucket *_Cur = _M_root;
3083  _M_root = _M_root->_Next;
3084 
3085  while (_M_root)
3086  {
3087  reinterpret_cast<_Ty &>(_Cur->_Value).~_Ty();
3088  ::Concurrency::Free(_Cur);
3089  _Cur = _M_root;
3090  _Ret = _M_fun(reinterpret_cast <_Ty &> (_Cur->_Value), _Ret);
3091  _M_root = _M_root->_Next;
3092  }
3093 
3094  reinterpret_cast<_Ty &>(_Cur->_Value).~_Ty();
3095  ::Concurrency::Free(_Cur);
3096 
3097  return _Ret;
3098  }
3099 
3100  // allocate a bucket and push back to the list
3101  _Bucket *_Unsafe_push_back()
3102  {
3103  return _M_root = _Construct(_M_root);
3104  }
3105 };
3106 
3164 
3165 template<typename _Reduce_type, typename _Forward_iterator, typename _Range_reduce_fun, typename _Sym_reduce_fun>
3166 inline _Reduce_type parallel_reduce(_Forward_iterator _Begin, _Forward_iterator _End, const _Reduce_type& _Identity,
3167  const _Range_reduce_fun &_Range_fun, const _Sym_reduce_fun &_Sym_fun)
3168 {
3169  typedef typename std::iterator_traits<_Forward_iterator>::value_type _Value_type;
3170 
3171  static_assert(!std::is_same<typename std::iterator_traits<_Forward_iterator>::iterator_category, std::input_iterator_tag>::value
3172  && !std::is_same<typename std::iterator_traits<_Forward_iterator>::iterator_category, std::output_iterator_tag>::value,
3173  "iterator can not be input_iterator or output_iterator.");
3174 
3175  return _Parallel_reduce_impl(_Begin, _End,
3176  _Reduce_functor_helper<_Reduce_type, _Range_reduce_fun,
3177  _Order_combinable<_Reduce_type, _Sym_reduce_fun>>(_Identity, _Range_fun, _Order_combinable<_Reduce_type, _Sym_reduce_fun>(_Sym_fun)),
3178  typename std::iterator_traits<_Forward_iterator>::iterator_category());
3179 }
3180 
3227 
3228 template<typename _Forward_iterator, typename _Sym_reduce_fun>
3229 inline typename std::iterator_traits<_Forward_iterator>::value_type parallel_reduce(_Forward_iterator _Begin, _Forward_iterator _End,
3230  const typename std::iterator_traits<_Forward_iterator>::value_type &_Identity, _Sym_reduce_fun _Sym_fun)
3231 {
3232  typedef typename std::remove_cv<typename std::iterator_traits<_Forward_iterator>::value_type>::type _Reduce_type;
3233 
3234  return parallel_reduce(_Begin, _End, _Identity,
3235  [_Sym_fun](_Forward_iterator _Begin, _Forward_iterator _End, _Reduce_type _Init)->_Reduce_type
3236  {
3237  while (_Begin != _End)
3238  {
3239  _Init = _Sym_fun(_Init, *_Begin++);
3240  }
3241 
3242  return _Init;
3243  },
3244  _Sym_fun);
3245 }
3246 
3285 
3286 template<typename _Forward_iterator>
3287 inline typename std::iterator_traits<_Forward_iterator>::value_type parallel_reduce(
3288  _Forward_iterator _Begin, _Forward_iterator _End, const typename std::iterator_traits<_Forward_iterator>::value_type &_Identity)
3289 {
3290  return parallel_reduce(_Begin, _End, _Identity, std::plus<typename std::iterator_traits<_Forward_iterator>::value_type>());
3291 }
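// [Illustrative usage sketch - not part of the original header]
// The three parallel_reduce overloads above, from least to most explicit (assumes <vector>,
// <climits> and "using namespace Concurrency;"):
//
//     std::vector<int> _V(1000, 1);
//
//     // 1. Identity only: combines with std::plus.
//     int _Sum = parallel_reduce(_V.begin(), _V.end(), 0);
//
//     // 2. Identity + symmetric combine function.
//     int _Max = parallel_reduce(_V.begin(), _V.end(), INT_MIN,
//         [](int _A, int _B) { return _A > _B ? _A : _B; });
//
//     // 3. Identity + range reduction + symmetric combine (different result type).
//     size_t _Total = parallel_reduce(_V.begin(), _V.end(), size_t(0),
//         [](std::vector<int>::iterator _F, std::vector<int>::iterator _L, size_t _Init) {
//             for (; _F != _L; ++_F) _Init += *_F;
//             return _Init;
//         },
//         std::plus<size_t>());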
3292 
3293 // Implementation for the parallel reduce
3294 template <typename _Forward_iterator, typename _Function>
3295 typename _Function::_Reduce_type _Parallel_reduce_impl(_Forward_iterator _First, const _Forward_iterator& _Last, const _Function& _Func,
3296  std::forward_iterator_tag)
3297 {
3298  // Because this is a forward iterator, it is difficult to validate that _First comes before _Last, so
3299  // it is up to the user to provide valid range.
3300  if (_First != _Last)
3301  {
3302  task_group _Task_group;
3303  _Parallel_reduce_forward_executor(_First, _Last, _Func, _Task_group);
3304  _Task_group.wait();
3305  return _Func._Combinable._Serial_combine_release();
3306  }
3307  else
3308  {
3309  return _Func._Identity_value;
3310  }
3311 }
3312 
3313 // All the code below is the worker without range stealing
3314 template<typename _Forward_iterator, typename _Functor>
3315 class _Parallel_reduce_fixed_worker
3316 {
3317 public:
3318  // The bucket allocation order will depend on the worker construction order
3319  _Parallel_reduce_fixed_worker(_Forward_iterator _Begin, _Forward_iterator _End, const _Functor &_Fun):
3320  _M_begin(_Begin), _M_end(_End), _M_fun(_Fun), _M_bucket(_M_fun._Combinable._Unsafe_push_back())
3321  {
3322  }
3323 
3324  void operator ()() const
3325  {
3326  _M_bucket->_Put(_M_fun._Sub_fun(_M_begin, _M_end, _M_fun._Identity_value));
3327  }
3328 
3329 private:
3330  const _Functor &_M_fun;
3331  const _Forward_iterator _M_begin, _M_end;
3332  typename _Functor::_Bucket_type * const _M_bucket;
3334 };
3335 
3336 template <typename _Worker, typename _Random_iterator, typename _Function>
3337 void _Parallel_reduce_random_executor(_Random_iterator _Begin, _Random_iterator _End, const _Function& _Fun);
3338 
3339 template <typename _Random_iterator, typename _Function>
3340 typename _Function::_Reduce_type _Parallel_reduce_impl(_Random_iterator _First, _Random_iterator _Last, const _Function& _Func,
3341  std::random_access_iterator_tag)
3342 {
3343  typedef _Parallel_reduce_fixed_worker<_Random_iterator, _Function> _Worker_class;
3344 
3345  // Special case for 0, 1 element
3346  if (_First >= _Last)
3347  {
3348  return _Func._Identity_value;
3349  }
3350  // Directly compute if size is too small
3351  else if (_Last - _First <= 1)
3352  {
3353  _Worker_class(_First, _Last, _Func)();
3354  return _Func._Combinable._Serial_combine_release();
3355  }
3356  else
3357  {
3358  // Use fixed ordered chunk partition to schedule works
3359  _Parallel_reduce_random_executor<_Worker_class>(_First, _Last, _Func);
3360  return _Func._Combinable._Serial_combine_release();
3361  }
3362 }
3363 
3364 // The parallel worker executor for random access iterators
3365 // It divides the range into a fixed number of chunks,
3366 // almost the same as the fixed parallel for, except that it keeps the chunk dividing order
3367 template <typename _Worker, typename _Random_iterator, typename _Function>
3368 void _Parallel_reduce_random_executor(_Random_iterator _Begin, _Random_iterator _End, const _Function& _Fun)
3369 {
3370  size_t _Cpu_num = static_cast<size_t>(::Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors()), _Size = _End - _Begin;
3371 
3372  structured_task_group _Tg;
3373  _MallocaArrayHolder<task_handle<_Worker>> _Holder;
3374  task_handle<_Worker> *_Tasks = _Holder._InitOnRawMalloca(_malloca(sizeof(task_handle<_Worker>) * (_Cpu_num - 1)));
3375 
3376  size_t _Begin_index = 0;
3377  size_t _Step = _Size / _Cpu_num;
3378  size_t _NumRemaining = _Size - _Step * _Cpu_num;
3379 
3380  for(size_t _I = 0; _I < _Cpu_num - 1; _I++)
3381  {
3382  size_t _Next = _Begin_index + _Step;
3383 
3384  // Add remaining to each chunk
3385  if (_NumRemaining)
3386  {
3387  --_NumRemaining;
3388  ++_Next;
3389  }
3390 
3391  // New up a task_handle "in-place", in the array preallocated on the stack
3392  new (_Tasks + _I) task_handle<_Worker>(_Worker(_Begin + _Begin_index, _Begin + _Next, _Fun));
3393  _Holder._IncrementConstructedElemsCount();
3394 
3395  // Run each of the chunk _Tasks in parallel
3396  _Tg.run(_Tasks[_I]);
3397  _Begin_index = _Next;
3398  }
3399 
3400  task_handle<_Worker> _Tail(_Worker(_Begin + _Begin_index, _End, _Fun));
3401  _Tg.run_and_wait(_Tail);
3402 }
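// [Illustrative note - not part of the original header]
// Chunk sizing in _Parallel_reduce_random_executor: with _Size == 10 and _Cpu_num == 4,
// _Step == 2 and _NumRemaining == 2, so the first two chunks absorb one extra element each.
// The scheduled ranges are [0,3), [3,6), [6,8), and the tail [8,10) runs on the calling
// context via run_and_wait.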
3403 
3404 // The parallel worker executor for forward iterators
3405 // Divide chunks on the fly
3406 template <typename _Forward_iterator, typename _Function, int _Default_worker_size, int _Default_chunk_size>
3407 struct _Parallel_reduce_forward_executor_helper
3408 {
3409  typedef _Parallel_reduce_fixed_worker<_Forward_iterator, _Function> _Worker_class;
3410  mutable std::auto_ptr<task_handle<_Worker_class>> _Workers;
3411  int _Worker_size;
3412 
3413  _Parallel_reduce_forward_executor_helper(_Forward_iterator &_First, _Forward_iterator _Last, const _Function& _Func):
3414  _Workers(static_cast<task_handle<_Worker_class> *>(::Concurrency::Alloc(sizeof(task_handle<_Worker_class>) * _Default_worker_size)))
3415  {
3416  _Worker_size = 0;
3417  while (_Worker_size < _Default_worker_size && _First != _Last)
3418  {
3419  // Copy the range _Head
3420  _Forward_iterator _Head = _First;
3421 
3422  // Read from forward iterator
3423  for (size_t _I = 0; _I < _Default_chunk_size && _First != _Last; ++_I, ++_First)
3424  {
3425  // Body is empty
3426  }
3427 
3428  // _First will be the end of current chunk
3429  new (_Workers.get() + _Worker_size++) task_handle<_Worker_class>(_Worker_class(_Head, _First, _Func));
3430  }
3431  }
3432 
3434  _Workers(_Other._Workers), _Worker_size(_Other._Worker_size)
3435  {
3436  }
3437 
3438  void operator ()() const
3439  {
3440  structured_task_group _Tg;
3441  for(int _I = 0; _I < _Worker_size; _I++)
3442  {
3443  _Tg.run(_Workers.get()[_I]);
3444  }
3445  _Tg.wait();
3446  }
3447 
3448  ~_Parallel_reduce_forward_executor_helper()
3449  {
3450  if (_Workers.get())
3451  {
3452  for (int _I = 0; _I < _Worker_size; _I++)
3453  {
3454  _Workers.get()[_I].~task_handle<_Worker_class>();
3455  }
3456  ::Concurrency::Free(_Workers.release());
3457  }
3458  }
3459 };
3460 
3461 template <typename _Forward_iterator, typename _Function>
3462 void _Parallel_reduce_forward_executor(_Forward_iterator _First, _Forward_iterator _Last, const _Function& _Func, task_group& _Task_group)
3463 {
3464  const static int _Internal_worker_number = 1024, _Default_chunk_size = 512;
3465  typedef _Parallel_reduce_fixed_worker<_Forward_iterator, _Function> _Worker_class;
3466 
3467  structured_task_group _Worker_group;
3468  _MallocaArrayHolder<task_handle<_Worker_class>> _Holder;
3469  task_handle<_Worker_class>* _Workers = _Holder._InitOnRawMalloca(_malloca(_Internal_worker_number * sizeof(task_handle<_Worker_class>)));
3470 
3471  // Start execution first
3472  int _Index = 0;
3473  while (_Index < _Internal_worker_number && _First != _Last)
3474  {
3475  // Copy the range _Head
3476  _Forward_iterator _Head = _First;
3477 
3478  // Read from forward iterator
3479  for (size_t _I = 0; _I < _Default_chunk_size && _First != _Last; ++_I, ++_First)
3480  {
3481  // Body is empty
3482  };
3483 
3484  // Create a new task, _First is range _End
3485  new (_Workers + _Index) task_handle<_Worker_class>(_Worker_class(_Head, _First, _Func));
3486  _Holder._IncrementConstructedElemsCount();
3487  _Worker_group.run(_Workers[_Index]);
3488  ++_Index;
3489  }
3490 
3491  // Divide and append the left
3492  while (_First != _Last)
3493  {
3494  _Task_group.run(_Parallel_reduce_forward_executor_helper<_Forward_iterator, _Function, _Internal_worker_number, _Default_chunk_size>(_First, _Last, _Func));
3495  }
3496 
3497  _Worker_group.wait();
3498 }
3499 
3500 #pragma warning(pop)
3501 
3502 
3503 // Disable C4180: qualifier applied to function type has no meaning; ignored
3504 // Warning fires for passing Foo function pointer to parallel_for instead of &Foo.
3505 #pragma warning(push)
3506 #pragma warning(disable: 4180)
3507 
3508 //
3509 // Dispatch the execution and handle the condition that all of the iterators are random access
3510 //
3511 template<typename _Any_input_traits, typename _Any_output_traits>
3512 struct _Unary_transform_impl_helper
3513 {
3514  template<typename _Input_iterator, typename _Output_iterator, typename _Unary_operator>
3515  static void _Parallel_transform_unary_impl(_Input_iterator _Begin, _Input_iterator _End, _Output_iterator& _Result, const _Unary_operator& _Unary_op, const auto_partitioner&)
3516  {
3517  task_group _Tg;
3518  _Parallel_transform_unary_impl2(_Begin, _End, _Result, _Unary_op, _Tg);
3519  _Tg.wait();
3520  }
3521 };
3522 
3523 template<>
3524 struct _Unary_transform_impl_helper<std::random_access_iterator_tag, std::random_access_iterator_tag>
3525 {
3526  template<typename _Random_input_iterator, typename _Random_output_iterator, typename _Unary_operator, typename _Partitioner>
3527  static void _Parallel_transform_unary_impl(_Random_input_iterator _Begin, _Random_input_iterator _End,
3528  _Random_output_iterator& _Result, const _Unary_operator& _Unary_op, _Partitioner&& _Part)
3529  {
3530  if (_Begin < _End)
3531  {
3532  ::Concurrency::_Parallel_for_impl(static_cast<size_t>(0), static_cast<size_t>(_End - _Begin), static_cast<size_t>(1),
3533  [_Begin, &_Result, &_Unary_op](size_t _Index)
3534  {
3535  _Result[_Index] = _Unary_op(_Begin[_Index]);
3536  },
3537  std::forward<_Partitioner>(_Part));
3538  _Result += _End - _Begin;
3539  }
3540  }
3541 };
3542 
3543 template<typename _Any_input_traits1, typename _Any_input_traits2, typename _Any_output_traits>
3544 struct _Binary_transform_impl_helper
3545 {
3546 
3547  template<typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>
3548  static void _Parallel_transform_binary_impl(_Input_iterator1 _Begin1, _Input_iterator1 _End1, _Input_iterator2 _Begin2,
3549  _Output_iterator& _Result, const _Binary_operator& _Binary_op, const auto_partitioner&)
3550  {
3551  task_group _Tg;
3552  _Parallel_transform_binary_impl2(_Begin1, _End1, _Begin2, _Result, _Binary_op, _Tg);
3553  _Tg.wait();
3554  }
3555 };
3556 
3557 template<>
3558 struct _Binary_transform_impl_helper<std::random_access_iterator_tag, std::random_access_iterator_tag, std::random_access_iterator_tag>
3559 {
3560  template<typename _Random_input_iterator1, typename _Random_input_iterator2, typename _Random_output_iterator, typename _Binary_operator, typename _Partitioner>
3561  static void _Parallel_transform_binary_impl(_Random_input_iterator1 _Begin1, _Random_input_iterator1 _End1,
3562  _Random_input_iterator2 _Begin2, _Random_output_iterator& _Result, const _Binary_operator& _Binary_op, _Partitioner&& _Part)
3563  {
3564  if (_Begin1 < _End1)
3565  {
3566  ::Concurrency::_Parallel_for_impl(static_cast<size_t>(0), static_cast<size_t>(_End1 - _Begin1), static_cast<size_t>(1),
3567  [_Begin1, _Begin2, &_Result, &_Binary_op](size_t _Index)
3568  {
3569  _Result[_Index] = _Binary_op(_Begin1[_Index], _Begin2[_Index]);
3570  },
3571  std::forward<_Partitioner>(_Part));
3572  _Result += _End1 - _Begin1;
3573  }
3574  }
3575 };
3576 
3577 //
3578 // The implementation for at least one of the iterator is forward iterator
3579 //
3580 template <typename _Forward_iterator, typename _Iterator_kind>
3581 class _Iterator_helper
3582 {
3583 public:
3584  static const size_t _Size = 1024;
3585  typedef typename std::iterator_traits<_Forward_iterator>::value_type value_type;
3586 
3587  _Iterator_helper()
3588  {
3589  static_assert(!std::is_same<_Iterator_kind, std::input_iterator_tag>::value
3590  && !std::is_same<_Iterator_kind, std::output_iterator_tag>::value,
3591  "iterator can not be input_iterator or output_iterator.");
3592  }
3593 
3594  size_t _Populate(_Forward_iterator& _First, _Forward_iterator _Last)
3595  {
3596  size_t _Length = 0;
3597  static_assert(std::is_lvalue_reference<decltype(*_First)>::value, "lvalue required for forward iterator operator *");
3598 
3599  for (size_t _Index=0; (_Index < _Size) && (_First != _Last); _Index++)
3600  {
3601  // We only support l-value here, so it's safe
3602  _M_element_array[_Length++] = &(*_First++);
3603  }
3604 
3605  return _Length;
3606  }
3607 
3608  void _Populate(_Forward_iterator& _First, size_t _Length)
3609  {
3610  for (size_t _Index=0; _Index < _Length; _Index++)
3611  {
3612  _M_element_array[_Index] = &(*_First++);
3613  }
3614  }
3615 
3616  void _Store(const value_type& _Elem, size_t _Index) const
3617  {
3618  *(_M_element_array[_Index]) = _Elem;
3619  }
3620 
3621  typename std::iterator_traits<_Forward_iterator>::reference _Load(size_t _Index) const
3622  {
3623  return *(_M_element_array[_Index]);
3624  }
3625 
3626 private:
3627  typename std::iterator_traits<_Forward_iterator>::pointer _M_element_array[_Size];
3628 };
3629 
3630 template <typename _Random_iterator>
3631 class _Iterator_helper<_Random_iterator, std::random_access_iterator_tag>
3632 {
3633 public:
3634  static const size_t _Size = 1024;
3635  typedef typename std::iterator_traits<_Random_iterator>::value_type value_type;
3636 
3637  _Iterator_helper()
3638  {
3639  }
3640 
3641  size_t _Populate(_Random_iterator& _First, _Random_iterator _Last)
3642  {
3643  typename std::iterator_traits<_Random_iterator>::difference_type _Range_size = _Last - _First;
3644  typename std::iterator_traits<_Random_iterator>::difference_type _Sized = _Size;
3645  _M_first = _First;
3646 
3647  if (_Range_size > _Sized)
3648  {
3649  _First += _Size;
3650  return _Size;
3651  }
3652  else
3653  {
3654  _First += _Range_size;
3655  return static_cast<size_t>(_Range_size);
3656  }
3657  }
3658 
3659  void _Populate(_Random_iterator& _First, size_t _Length)
3660  {
3661  _M_first = _First;
3662  _First += _Length;
3663  }
3664 
3665  void _Store(const value_type& _Elem, size_t _Index) const
3666  {
3667  _M_first[_Index] = _Elem;
3668  }
3669 
3670  typename std::iterator_traits<_Random_iterator>::reference _Load(size_t _Index) const
3671  {
3672  // We only support l-value here
3673  return _M_first[_Index];
3674  }
3675 
3676 private:
3677  _Random_iterator _M_first;
3678 };
3679 
3680 template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>
3681 class _Parallel_transform_binary_helper
3682 {
3683 public:
3684  _Parallel_transform_binary_helper(_Input_iterator1& _First1, _Input_iterator1 _Last1, _Input_iterator2& _First2,
3685  _Output_iterator& _Result, const _Binary_operator& _Binary_op) :
3686  _M_binary_op(_Binary_op), _M_len(0)
3687  {
3688  _M_len = _M_input_helper1._Populate(_First1, _Last1);
3689  _M_input_helper2._Populate(_First2, _M_len);
3690  _M_output_helper._Populate(_Result, _M_len);
3691  }
3692 
3693  void operator()() const
3694  {
3695  // Invoke parallel_for on the batched up array of elements
3696  ::Concurrency::_Parallel_for_impl(static_cast<size_t>(0), _M_len, static_cast<size_t>(1),
3697  [this] (size_t _Index)
3698  {
3699  _M_output_helper._Store(_M_binary_op(_M_input_helper1._Load(_Index), _M_input_helper2._Load(_Index)), _Index);
3700  });
3701  }
3702 
3703 private:
3704 
3705  _Iterator_helper<_Input_iterator1, typename std::iterator_traits<_Input_iterator1>::iterator_category> _M_input_helper1;
3706  _Iterator_helper<_Input_iterator2, typename std::iterator_traits<_Input_iterator2>::iterator_category> _M_input_helper2;
3707  _Iterator_helper<_Output_iterator, typename std::iterator_traits<_Output_iterator>::iterator_category> _M_output_helper;
3708  const _Binary_operator& _M_binary_op;
3709  size_t _M_len;
3710 
3711  _Parallel_transform_binary_helper const & operator=(_Parallel_transform_binary_helper const&); // no assignment operator
3712 };
3713 
3714 template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>
3715 void _Parallel_transform_binary_impl2(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Input_iterator2 _First2, _Output_iterator &_Result,
3716  const _Binary_operator& _Binary_op, task_group& _Tg)
3717 {
3718  // This functor will be copied on the heap and will execute the chunk in parallel
3719  {
3720  _Parallel_transform_binary_helper<_Input_iterator1, _Input_iterator2, _Output_iterator, _Binary_operator> _Functor(_First1, _Last1, _First2, _Result, _Binary_op);
3721  _Tg.run(_Functor);
3722  }
3723 
3724  // If there is a tail, push the tail
3725  if (_First1 != _Last1)
3726  {
3727  _Tg.run(
3728  [=, &_Result, &_Binary_op, &_Tg]
3729  {
3730  _Parallel_transform_binary_impl2(_First1, _Last1, _First2, _Result, _Binary_op, _Tg);
3731  });
3732  }
3733 }
3734 
3735 template <typename _Input_iterator, typename _Output_iterator, typename _Unary_operator>
3736 class _Parallel_transform_unary_helper
3737 {
3738 public:
3739  _Parallel_transform_unary_helper(_Input_iterator& _First, _Input_iterator _Last, _Output_iterator &_Result, const _Unary_operator& _Unary_op) :
3740  _M_unary_op(_Unary_op), _M_len(0)
3741  {
3742  _M_len = _M_input_helper._Populate(_First, _Last);
3743  _M_output_helper._Populate(_Result, _M_len);
3744  }
3745 
3746  void operator()() const
3747  {
3748  // Invoke parallel_for on the batched up array of elements
3749  ::Concurrency::_Parallel_for_impl(static_cast<size_t>(0), _M_len, static_cast<size_t>(1),
3750  [this] (size_t _Index)
3751  {
3752  _M_output_helper._Store(_M_unary_op(_M_input_helper._Load(_Index)), _Index);
3753  });
3754  }
3755 
3756 private:
3757 
3758  _Iterator_helper<_Input_iterator, typename std::iterator_traits<_Input_iterator>::iterator_category> _M_input_helper;
3759  _Iterator_helper<_Output_iterator, typename std::iterator_traits<_Output_iterator>::iterator_category> _M_output_helper;
3760  const _Unary_operator& _M_unary_op;
3761  size_t _M_len;
3762 
3763  _Parallel_transform_unary_helper const & operator=(_Parallel_transform_unary_helper const&); // no assignment operator
3764 };
3765 
3766 template <typename _Input_iterator, typename _Output_iterator, typename _Unary_operator>
3767 void _Parallel_transform_unary_impl2(_Input_iterator _First, _Input_iterator _Last, _Output_iterator &_Result,
3768  const _Unary_operator& _Unary_op, task_group& _Tg)
3769 {
3770  // This functor will be copied on the heap and will execute the chunk in parallel
3771  {
3772  _Parallel_transform_unary_helper<_Input_iterator, _Output_iterator, _Unary_operator> _Functor(_First, _Last, _Result, _Unary_op);
3773  _Tg.run(_Functor);
3774  }
3775 
3776  // If there is a tail, push the tail
3777  if (_First != _Last)
3778  {
3779  _Tg.run(
3780  [=, &_Result, &_Unary_op, &_Tg]
3781  {
3782  _Parallel_transform_unary_impl2(_First, _Last, _Result, _Unary_op, _Tg);
3783  });
3784  }
3785 }
3786 
3787 template <typename _Input_iterator, typename _Output_iterator, typename _Unary_operator, typename _Partitioner>
3788 _Output_iterator _Parallel_transform_unary_impl(_Input_iterator _First, _Input_iterator _Last, _Output_iterator _Result, const _Unary_operator& _Unary_op, _Partitioner&& _Part)
3789 {
3790  typedef typename std::iterator_traits<_Input_iterator>::iterator_category _Input_iterator_type;
3791  typedef typename std::iterator_traits<_Output_iterator>::iterator_category _Output_iterator_type;
3792 
3793  if (_First != _Last)
3794  {
3795  _Unary_transform_impl_helper<_Input_iterator_type, _Output_iterator_type>
3796  ::_Parallel_transform_unary_impl(_First, _Last, _Result, _Unary_op, std::forward<_Partitioner>(_Part));
3797  }
3798 
3799  return _Result;
3800 }
3801 
3852 
3853 template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>
3854 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, const auto_partitioner& _Part = auto_partitioner())
3855 {
3856  return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);
3857 }
3858 
3909 
3910 template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>
3911 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, const static_partitioner& _Part)
3912 {
3913  return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);
3914 }
3915 
3966 
3967 template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>
3968 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, const simple_partitioner& _Part)
3969 {
3970  return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);
3971 }
3972 
4023 
4024 template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>
4025 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, affinity_partitioner& _Part)
4026 {
4027  return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);
4028 }
4029 
4086 
4087 template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator, typename _Partitioner>
4088 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Input_iterator2 _First2,
4089  _Output_iterator _Result, const _Binary_operator& _Binary_op, _Partitioner&& _Part)
4090 {
4091  typedef typename std::iterator_traits<_Input_iterator1>::iterator_category _Input_iterator_type1;
4092  typedef typename std::iterator_traits<_Input_iterator2>::iterator_category _Input_iterator_type2;
4093  typedef typename std::iterator_traits<_Output_iterator>::iterator_category _Output_iterator_type;
4094 
4095  if (_First1 != _Last1)
4096  {
4097  _Binary_transform_impl_helper<_Input_iterator_type1, _Input_iterator_type2, _Output_iterator_type>
4098  ::_Parallel_transform_binary_impl(_First1, _Last1, _First2, _Result, _Binary_op, std::forward<_Partitioner>(_Part));
4099  }
4100 
4101  return _Result;
4102 }
4103 
4151 
4152 template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>
4153 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Input_iterator2 _First2,
4154  _Output_iterator _Result, const _Binary_operator& _Binary_op)
4155 {
4156  return parallel_transform(_First1, _Last1, _First2, _Result, _Binary_op, auto_partitioner());
4157 }
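// [Illustrative usage sketch - not part of the original header]
// Unary and binary parallel_transform (assumes <vector> and "using namespace Concurrency;"):
//
//     std::vector<int> _In1(1000, 2), _In2(1000, 3), _Out(1000);
//
//     // Unary: _Out[i] = _In1[i] * _In1[i]
//     parallel_transform(_In1.begin(), _In1.end(), _Out.begin(),
//         [](int _X) { return _X * _X; });
//
//     // Binary: _Out[i] = _In1[i] + _In2[i]
//     parallel_transform(_In1.begin(), _In1.end(), _In2.begin(), _Out.begin(),
//         [](int _X, int _Y) { return _X + _Y; });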
4158 
4159 #pragma warning(pop)
4160 
4161 #pragma warning(push)
4162 // Objects allocated on the heap may not be aligned to 64 bytes
4163 #pragma warning(disable: 4316)
4164 
4178 
4179 template<typename _Ty>
4180 class combinable
4181 {
4182 private:
4183 
4184 // Disable warning C4324: structure was padded due to __declspec(align())
4185 // This padding is expected and necessary.
4186 #pragma warning(push)
4187 #pragma warning(disable: 4324)
4189  struct _Node
4190  {
4191  unsigned long _M_key;
4192  _Ty _M_value;
4193  _Node* _M_chain;
4194 
4195  _Node(unsigned long _Key, _Ty _InitialValue)
4196  : _M_key(_Key), _M_value(_InitialValue), _M_chain(NULL)
4197  {
4198  }
4199  };
4200 #pragma warning(pop)
4201 
4202  static _Ty _DefaultInit()
4203  {
4204  return _Ty();
4205  }
4206 
4207 public:
4218 
4219  combinable()
4220  : _M_fnInitialize(_DefaultInit)
4221  {
4222  _InitNew();
4223  }
4224 
4242 
4243  template <typename _Function>
4244  explicit combinable(_Function _FnInitialize)
4245  : _M_fnInitialize(_FnInitialize)
4246  {
4247  _InitNew();
4248  }
4249 
4263 
4264  combinable(const combinable& _Copy)
4265  : _M_size(_Copy._M_size), _M_fnInitialize(_Copy._M_fnInitialize)
4266  {
4267  _InitCopy(_Copy);
4268  }
4269 
4279 
4280  combinable& operator=(const combinable& _Copy)
4281  {
4282  clear();
4283  delete [] _M_buckets;
4284  _M_fnInitialize = _Copy._M_fnInitialize;
4285  _M_size = _Copy._M_size;
4286  _InitCopy(_Copy);
4287 
4288  return *this;
4289  }
4290 
4294 
4295  ~combinable()
4296  {
4297  clear();
4298  delete [] _M_buckets;
4299  }
4300 
4308 
4309  _Ty& local()
4310  {
4312  size_t _Index;
4313  _Node* _ExistingNode = _FindLocalItem(_Key, &_Index);
4314  if (_ExistingNode == NULL)
4315  {
4316  _ExistingNode = _AddLocalItem(_Key, _Index);
4317  }
4318 
4319  _CONCRT_ASSERT(_ExistingNode != NULL);
4320  return _ExistingNode->_M_value;
4321  }
4322 
4335 
4336  _Ty& local(bool& _Exists)
4337  {
4339  size_t _Index;
4340  _Node* _ExistingNode = _FindLocalItem(_Key, &_Index);
4341  if (_ExistingNode == NULL)
4342  {
4343  _Exists = false;
4344  _ExistingNode = _AddLocalItem(_Key, _Index);
4345  }
4346  else
4347  {
4348  _Exists = true;
4349  }
4350 
4351  _CONCRT_ASSERT(_ExistingNode != NULL);
4352  return _ExistingNode->_M_value;
4353  }
4354 
4358 
4359  void clear()
4360  {
4361  for (size_t _Index = 0; _Index < _M_size; ++_Index)
4362  {
4363  _Node* _CurrentNode = _M_buckets[_Index];
4364  while (_CurrentNode != NULL)
4365  {
4366  _Node* _NextNode = _CurrentNode->_M_chain;
4367  delete _CurrentNode;
4368  _CurrentNode = _NextNode;
4369  }
4370  }
4371  memset((void*)_M_buckets, 0, _M_size * sizeof _M_buckets[0]);
4372  }
4373 
4388 
4389  template<typename _Function>
4390  _Ty combine(_Function _FnCombine) const
4391  {
4392  _Node* _CurrentNode = NULL;
4393  size_t _Index;
4394 
4395  // Look for the first value in the set, and use (a copy of) that as the result.
4396  // This eliminates a single call (of unknown cost) to _M_fnInitialize.
4397  for (_Index = 0; _Index < _M_size; ++_Index)
4398  {
4399  _CurrentNode = _M_buckets[_Index];
4400  if (_CurrentNode != NULL)
4401  {
4402  break;
4403  }
4404  }
4405 
4406  // No values... return the initializer value.
4407  if (_CurrentNode == NULL)
4408  {
4409  return _M_fnInitialize();
4410  }
4411 
4412  // Accumulate the rest of the items in the current bucket.
4413  _Ty _Result = _CurrentNode->_M_value;
4414  for (_CurrentNode = _CurrentNode->_M_chain; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)
4415  {
4416  _Result = _FnCombine(_Result, _CurrentNode->_M_value);
4417  }
4418 
4419  // Accumulate values from the rest of the buckets.
4420  _CONCRT_ASSERT(_Index < _M_size);
4421  for (++_Index; _Index < _M_size; ++_Index)
4422  {
4423  for (_CurrentNode = _M_buckets[_Index]; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)
4424  {
4425  _Result = _FnCombine(_Result, _CurrentNode->_M_value);
4426  }
4427  }
4428 
4429  return _Result;
4430  }
4431 
4444 
4445  template<typename _Function>
4446  void combine_each(_Function _FnCombine) const
4447  {
4448  for (size_t _Index = 0; _Index < _M_size; ++_Index)
4449  {
4450  for (_Node* _CurrentNode = _M_buckets[_Index]; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)
4451  {
4452  _FnCombine(_CurrentNode->_M_value);
4453  }
4454  }
4455  }
4456 
4457 private:
4458  void _InitNew()
4459  {
4460  _M_size = ::Concurrency::details::_GetCombinableSize();
4461  _M_buckets = new _Node*[_M_size];
4462  memset((void*)_M_buckets, 0, _M_size * sizeof _M_buckets[0]);
4463  }
4464 
4465  void _InitCopy(const combinable& _Copy)
4466  {
4467  _M_buckets = new _Node*[_M_size];
4468  for (size_t _Index = 0; _Index < _M_size; ++_Index)
4469  {
4470  _M_buckets[_Index] = NULL;
4471  for (_Node* _CurrentNode = _Copy._M_buckets[_Index]; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)
4472  {
4473  _Node* _NewNode = new _Node(_CurrentNode->_M_key, _CurrentNode->_M_value);
4474  _NewNode->_M_chain = _M_buckets[_Index];
4475  _M_buckets[_Index] = _NewNode;
4476  }
4477  }
4478  }
4479 
4480  _Node* _FindLocalItem(unsigned long _Key, size_t* _PIndex)
4481  {
4482  _CONCRT_ASSERT(_PIndex != NULL);
4483 
4484  *_PIndex = _Key % _M_size;
4485 
4486  // Search at this index for an existing value.
4487  _Node* _CurrentNode = _M_buckets[*_PIndex];
4488  while (_CurrentNode != NULL)
4489  {
4490  if (_CurrentNode->_M_key == _Key)
4491  {
4492  return _CurrentNode;
4493  }
4494 
4495  _CurrentNode = _CurrentNode->_M_chain;
4496  }
4497 
4498  return NULL;
4499  }
4500 
4501  _Node* _AddLocalItem(unsigned long _Key, size_t _Index)
4502  {
4503  _Node* _NewNode = new _Node(_Key, _M_fnInitialize());
4504  _Node* _TopNode;
4505  do
4506  {
4507  _TopNode = _M_buckets[_Index];
4508  _NewNode->_M_chain = _TopNode;
4509  } while (_InterlockedCompareExchangePointer(reinterpret_cast<void * volatile *>(&_M_buckets[_Index]), _NewNode, _TopNode) != _TopNode);
4510 
4511  return _NewNode;
4512  }
4513 
4514 private:
4515  _Node *volatile * _M_buckets;
4516  size_t _M_size;
4517  std::function<_Ty ()> _M_fnInitialize;
4518 };
4519 
4520 #pragma warning(pop) // C4316
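// [Illustrative usage sketch - not part of the original header]
// Typical combinable<> pattern: accumulate into a thread-local copy inside a parallel loop,
// then merge once at the end (assumes "using namespace Concurrency;"):
//
//     combinable<long long> _Partial([] { return 0LL; });
//     parallel_for(0, 100000, [&](int _I)
//     {
//         _Partial.local() += _I;    // lock-free after the first access on each thread
//     });
//     long long _Sum = _Partial.combine(std::plus<long long>());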
4521 
4522 #pragma push_macro("_MAX_NUM_TASKS_PER_CORE")
4523 #pragma push_macro("_FINE_GRAIN_CHUNK_SIZE")
4524 #pragma push_macro("_SORT_MAX_RECURSION_DEPTH")
4525 
4526 // This number is used to control dynamic task splitting
4527 // The ideal chunk (task) division is that the number of cores is equal to the number of tasks, but it will
4528 // perform very poorly when tasks are not balanced. The simple solution is to allocate more tasks than number
4529 // of cores. _MAX_NUM_TASKS_PER_CORE provides a maximum number of tasks that will be allocated per core.
4530 // If this number is too small, the load balancing problem will affect efficiency very seriously, especially
4531 // when the compare operation is expensive.
4532 //
4533 // Note that this number is a maximum number -- the dynamic partition system will reduce the number of partitions
4534 // per core based on the dynamic load. If all cores are very busy, the number of partitions will shrink to
4535 // reduce the scheduler overhead.
4536 //
4537 // Initially, the total tasks(chunks) number of partitions "_Div_num" will be: core number * _MAX_NUM_TASKS_PER_CORE.
4538 // The _Div_num will be divided by 2 after each task splitting. There are two special numbers for _Div_num:
4539 // 1. When _Div_num reaches the point that _Div_num < _MAX_NUM_TASKS_PER_CORE, it means we have split more tasks than cores.
4540 // 2. When _Div_num reaches the point that _Div_num <= 1, it means stop splitting more tasks and begin sorting serially.
4541 #define _MAX_NUM_TASKS_PER_CORE 1024
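// [Illustrative note - not part of the original header]
// Example of the splitting schedule on a 4-core machine: _Div_num starts at
// 4 * _MAX_NUM_TASKS_PER_CORE = 4096 and is halved on every split, giving 4096, 2048, 1024, 512, ...
// Once it drops below _MAX_NUM_TASKS_PER_CORE (1024) there are already more tasks than cores,
// and once it reaches 1 the remaining range is sorted serially.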
4542 
4543 // This number is mainly used to control the sampling and dynamic task splitting strategies.
4544 // If the user configurable minimal divisible chunk size (default is 2048) is smaller than FINE_GRAIN_CHUNK_SIZE,
4545 // the random sampling algorithm for quicksort will enter fine-grained mode, and take a strategy that reduces the sampling
4546 // overhead. Also, the dynamic task splitting will enter fine-grained mode, which will split as many tasks as possible.
4547 #define _FINE_GRAIN_CHUNK_SIZE 512
4548 
4549 // This is the maximum depth that the quicksort will be called recursively. If we allow too far, a stack overflow may occur.
4550 #define _SORT_MAX_RECURSION_DEPTH 64
4551 
4552 template<typename _Random_iterator, typename _Function>
4553 inline size_t _Median_of_three(const _Random_iterator &_Begin, size_t _A, size_t _B, size_t _C, const _Function &_Func, bool &_Potentially_equal)
4554 {
4555  _Potentially_equal = false;
4556  if (_Func(_Begin[_A], _Begin[_B]))
4557  {
4558  if (_Func(_Begin[_A], _Begin[_C]))
4559  {
4560  return _Func(_Begin[_B], _Begin[_C]) ? _B : _C;
4561  }
4562  else
4563  {
4564  return _A;
4565  }
4566  }
4567  else
4568  {
4569  if (_Func(_Begin[_B], _Begin[_C]))
4570  {
4571  return _Func(_Begin[_A], _Begin[_C]) ? _A : _C;
4572  }
4573  else
4574  {
4575  _Potentially_equal = true;
4576  return _B;
4577  }
4578  }
4579 }
4580 
4581 template<typename _Random_iterator, typename _Function>
4582 inline size_t _Median_of_nine(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, bool &_Potentially_equal)
4583 {
4584  size_t _Offset = _Size / 8;
4585  size_t _A = _Median_of_three(_Begin, 0, _Offset, _Offset * 2, _Func, _Potentially_equal),
4586  _B = _Median_of_three(_Begin, _Offset * 3, _Offset * 4, _Offset * 5, _Func, _Potentially_equal),
4587  _C = _Median_of_three(_Begin, _Offset * 6, _Offset * 7, _Size - 1, _Func, _Potentially_equal);
4588  _B = _Median_of_three(_Begin, _A, _B, _C, _Func, _Potentially_equal);
4589 
4590  if (_Potentially_equal)
4591  {
4592  _Potentially_equal = !_Func(_Begin[_C], _Begin[_A]);
4593  }
4594 
4595  return _B;
4596 }
4597 
4598 // _Potentially_equal means that potentially all the values in the buffer are equal to the pivot value
4599 template<typename _Random_iterator, typename _Function>
4600 inline size_t _Select_median_pivot(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, const size_t _Chunk_size, bool &_Potentially_equal)
4601 {
4602  // Based on the chunk size, apply a different sampling optimization
4603  if (_Chunk_size < _FINE_GRAIN_CHUNK_SIZE && _Size <= std::max<size_t>(_Chunk_size * 4, static_cast<size_t>(15)))
4604  {
4605  bool _Never_care_equal;
4606  return _Median_of_three(_Begin, 0, _Size / 2, _Size - 1, _Func, _Never_care_equal);
4607  }
4608  else
4609  {
4610  return _Median_of_nine(_Begin, _Size, _Func, _Potentially_equal);
4611  }
4612 }
4613 
4614 // Find two middle points in the two sorted arrays by binary search, so that the total number of elements to the left of the two middle points equals
4615 // the total number of elements to their right, and every element on the left is smaller than every element on the right.
4616 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
4617 size_t _Search_mid_point(const _Random_iterator &_Begin1, size_t &_Len1, const _Random_buffer_iterator &_Begin2, size_t &_Len2, const _Function &_Func)
4618 {
4619  size_t _Len = (_Len1 + _Len2) / 2, _Index1 = 0, _Index2 = 0;
4620 
4621  while (_Index1 < _Len1 && _Index2 < _Len2)
4622  {
4623  size_t _Mid1 = (_Index1 + _Len1) / 2, _Mid2 = (_Index2 + _Len2) / 2;
4624  if (_Func(_Begin1[_Mid1], _Begin2[_Mid2]))
4625  {
4626  if (_Mid1 + _Mid2 < _Len)
4627  {
4628  _Index1 = _Mid1 + 1;
4629  }
4630  else
4631  {
4632  _Len2 = _Mid2;
4633  }
4634  }
4635  else
4636  {
4637  if (_Mid1 + _Mid2 < _Len)
4638  {
4639  _Index2 = _Mid2 + 1;
4640  }
4641  else
4642  {
4643  _Len1 = _Mid1;
4644  }
4645  }
4646  }
4647 
4648  if (_Index1 == _Len1)
4649  {
4650  _Len2 = _Len - _Len1;
4651  }
4652  else
4653  {
4654  _Len1 = _Len - _Len2;
4655  }
4656 
4657  return _Len;
4658 }
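// [Illustrative note - not part of the original header]
// For example, merging _Begin1 = [1 3 5 7] (_Len1 = 4) with _Begin2 = [2 4 6 8] (_Len2 = 4):
// _Search_mid_point returns _Len = 4 and trims _Len1 and _Len2 to 2 and 2, so the left halves
// {1 3} and {2 4} hold the 4 smallest elements and can be merged independently of the right halves.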
4659 
4660 // "move" operation is applied between buffers
4661 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Random_output_iterator, typename _Function>
4662 void _Merge_chunks(_Random_iterator _Begin1, const _Random_iterator &_End1, _Random_buffer_iterator _Begin2, const _Random_buffer_iterator &_End2,
4663  _Random_output_iterator _Output, const _Function &_Func)
4664 {
4665  while (_Begin1 != _End1 && _Begin2 != _End2)
4666  {
4667  if (_Func(*_Begin1, *_Begin2))
4668  {
4669  *_Output++ = std::move(*_Begin1++);
4670  }
4671  else
4672  {
4673  *_Output++ = std::move(*_Begin2++);
4674  }
4675  }
4676 
4677  if (_Begin1 != _End1)
4678  {
4679  std::_Move_no_deprecate(_Begin1, _End1, _Output);
4680  }
4681  else if (_Begin2 != _End2)
4682  {
4683  std::_Move_no_deprecate(_Begin2, _End2, _Output);
4684  }
4685 }
4686 
4687 // _Div_num of threads(tasks) merge two chunks in parallel, _Div_num should be power of 2, if not, the largest power of 2 that is
4688 // smaller than _Div_num will be used
4689 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Random_output_iterator, typename _Function>
4690 void _Parallel_merge(_Random_iterator _Begin1, size_t _Len1, _Random_buffer_iterator _Begin2, size_t _Len2, _Random_output_iterator _Output,
4691  const _Function &_Func, size_t _Div_num)
4692 {
4693  // Turn to serial merge or continue splitting chunks based on "_Div_num"
4694  if (_Div_num <= 1 || (_Len1 <= 1 && _Len2 <= 1))
4695  {
4696  _Merge_chunks(_Begin1, _Begin1 + _Len1, _Begin2, _Begin2 + _Len2, _Output, _Func);
4697  }
4698  else
4699  {
4700  size_t _Mid_len1 = _Len1, _Mid_len2 = _Len2;
4701  size_t _Mid = _Search_mid_point(_Begin1, _Mid_len1, _Begin2, _Mid_len2, _Func);
4702 
4702 
4703  structured_task_group _Tg;
4704  auto _Handle = make_task([&]
4705  {
4706  _Parallel_merge(_Begin1, _Mid_len1, _Begin2, _Mid_len2, _Output, _Func, _Div_num / 2);
4707  });
4708  _Tg.run(_Handle);
4709 
4710  _Parallel_merge(_Begin1 + _Mid_len1, _Len1 - _Mid_len1, _Begin2 + _Mid_len2, _Len2 - _Mid_len2, _Output + _Mid, _Func, _Div_num / 2);
4711 
4712  _Tg.wait();
4713  }
4714 }
4715 
4716 // Return current sorting byte from key
4717 template<typename _Ty, typename _Function>
4718 inline size_t _Radix_key(const _Ty& _Val, size_t _Radix, _Function _Proj_func)
4719 {
4720  return static_cast<size_t>(_Proj_func(_Val) >> static_cast<int>(8 * _Radix) & 255);
4721 }
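// [Illustrative note - not part of the original header]
// _Radix_key extracts the _Radix-th least significant byte of the projected key; for example,
// with an identity projection, _Radix_key(0x00C3A51F, 1, _Proj) == 0xA5.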
4722 
4723 // One pass of radix sort
4724 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
4725 void _Integer_radix_pass(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output, size_t _Radix, _Function _Proj_func)
4726 {
4727  if (!_Size)
4728  {
4729  return;
4730  }
4731 
4732  size_t _Pos[256] = {0};
4733 
4734  for (size_t _I = 0; _I < _Size; _I++)
4735  {
4736  ++_Pos[_Radix_key(_Begin[_I], _Radix, _Proj_func)];
4737  }
4738 
4739  for (size_t _I = 1; _I < 256; _I++)
4740  {
4741  _Pos[_I] += _Pos[_I - 1];
4742  }
4743 
4744  // _Size > 0
4745  for (size_t _I = _Size - 1; _I != 0; _I--)
4746  {
4747  _Output[--_Pos[_Radix_key(_Begin[_I], _Radix, _Proj_func)]] = std::move(_Begin[_I]);
4748  }
4749 
4750  _Output[--_Pos[_Radix_key(_Begin[0], _Radix, _Proj_func)]] = std::move(_Begin[0]);
4751 }
4752 
4753 // Serial least-significant-byte radix sort; it sorts based on the last "_Radix" bytes of the key
4754 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
4755 void _Integer_radix_sort(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output,
4756  size_t _Radix, _Function _Proj_func, size_t _Deep = 0)
4757 {
4758  size_t _Cur_radix = 0;
4759  if (_Size == 0)
4760  {
4761  return;
4762  }
4763 
4764  while (_Cur_radix < _Radix)
4765  {
4766  _Integer_radix_pass(_Begin, _Size, _Output, _Cur_radix++, _Proj_func);
4767  _Integer_radix_pass(_Output, _Size, _Begin, _Cur_radix++, _Proj_func);
4768  }
4769 
4770  if (_Cur_radix == _Radix)
4771  {
4772  _Integer_radix_pass(_Begin, _Size, _Output, _Cur_radix++, _Proj_func);
4773  }
4774 
4775  // If an odd number of passes was performed, move the result back to the input buffer
4776  if (_Deep + _Radix + 1 & 1)
4777  {
4778  if (_Radix + 1 & 1)
4779  {
4780  std::_Move_no_deprecate(_Output, _Output + _Size, _Begin);
4781  }
4782  else
4783  {
4784  std::_Move_no_deprecate(_Begin, _Begin + _Size, _Output);
4785  }
4786  }
4787 }
4788 
4789 // Parallel most-significant-byte _Radix sort.
4790 // In the end, it will turn to serial least-significant-byte radix sort
4791 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
4792 void _Parallel_integer_radix_sort(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output,
4793  size_t _Radix, _Function _Proj_func, const size_t _Chunk_size, size_t _Deep = 0)
4794 {
4795  // If the chunk _Size is too small, then turn to serial least-significant-byte radix sort
4796  if (_Size <= _Chunk_size || _Radix < 1)
4797  {
4798  return _Integer_radix_sort(_Begin, _Size, _Output, _Radix, _Proj_func, _Deep);
4799  }
4800 
4801  size_t _Threads_num = ::Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors();
4802  size_t _Buffer_size = sizeof(size_t) * 256 * _Threads_num;
4803  size_t _Step = _Size / _Threads_num;
4804  size_t _Remain = _Size % _Threads_num;
4805 
4806  _MallocaArrayHolder<size_t [256]> _Holder;
4807  size_t (*_Chunks)[256] = _Holder._InitOnRawMalloca(_malloca(_Buffer_size));
4808 
4809  memset(_Chunks, 0, _Buffer_size);
4810 
4811  // Our purpose is to map unsorted data in buffer "_Begin" to buffer "_Output" so that all elements who have the same
4812  // byte value in the "_Radix" position will be grouped together in the buffer "_Output"
4813  //
4814  // Serial version:
4815  // To understand this algorithm, first consider a serial version. In the following example, we treat 1 decimal digit as 1 byte, so we have a
4816  // total of 10 possible values for each digit instead of 256 for each byte. Let's suppose "_Radix" == 1 (rightmost is 0), and:
4817  //
4818  // begin: [ 32 | 62 | 21 | 43 | 55 | 43 | 23 | 44 ]
4819  //
4820  // We want to divide the output buffer "_Output" into 10 chunks, and each element in the "_Begin" buffer should be mapped into
4821  // the proper destination chunk based on its current digit (byte) indicated by "_Radix"
4822  //
4823  // Because "_Radix" == 1, after a pass of this function, the chunks in the "_Output" should look like:
4824  //
4825  // buffer: [ | | 21 23 | 32 | 43 43 44 | 55 | 62 | | | ]
4826  // 0 1 2 3 4 5 6 7 8 9
4827  //
4828  // The difficulty is determining where to insert values into the "_Output" to get the above result. The way to get the
4829  // start position of each chunk of the buffer is:
4830  // 1. Count the number of elements in each chunk (in the above example, chunk0 is 0, chunk1 is 0, chunk2 is 2, chunk3 is 1, ...).
4831  // 2. Make a partial sum over these chunks (in the above example, we get chunk0 = 0, chunk1 = chunk0 + chunk1 = 0,
4832  // chunk2 = chunk0 + chunk1 + chunk2 = 2, chunk3 = chunk0 + chunk1 + chunk2 + chunk3 = 3, ...).
4833  //
4834  // After these steps, we will get the end position of each chunk in the "_Output". The begin position of each chunk will be the end
4835  // point of the previous chunk (the begin point is closed but the end point is open). After that, we can scan the original array again and directly
4836  // put elements from original buffer "_Begin" into specified chunk on buffer "_Output".
4837  // Finally, we invoke _Parallel_integer_radix_sort in parallel for each chunk and sort them in parallel based on the next digit (byte).
4838  // Because this is a STABLE sort algorithm, if two numbers have the same key value on this byte (digit), their original order is kept.
4839  //
4840  // Parallel version:
4841  // Almost the same as the serial version, the differences are:
4842  // 1. The count for each chunk is executed in parallel, and each thread will count one segment of the input buffer "_Begin".
4843  // The count result will be separately stored in their own chunk size counting arrays so we have a total of threads-number
4844  // of chunk count arrays.
4845  // For example, we may have chunk00, chunk01, ..., chunk09 for first thread, chunk10, chunk11, ..., chunk19 for second thread, ...
4846  // 2. The partial sum should be executed across these chunk counting arrays that belong to different threads, instead of just
4847  // making a partial sum in one counting array.
4848  // This is because we need to put values from different segments into one final buffer, and the absolute buffer position for
4849  // each chunkXX is needed.
4850  // 3. Make a parallel scan for original buffer again, and move numbers in parallel into the corresponding chunk on each buffer based
4851  // on these threads' chunk size counters.
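//  [Illustrative note - not part of the original header]
//  Continuing the example above with "_Radix" == 1 (digit values 3 6 2 4 5 4 2 4):
//  the per-digit counts are chunk2 = 2, chunk3 = 1, chunk4 = 3, chunk5 = 1, chunk6 = 1, and the
//  partial sums give end positions 2, 3, 6, 7, 8, which is exactly where 21/23, 32, 43/43/44,
//  55 and 62 land in the picture above.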
4852 
4853  // Count in parallel and separately save their local results without reducing
4854  ::Concurrency::parallel_for(static_cast<size_t>(0), _Threads_num, [=](size_t _Index)
4855  {
4856  size_t _Beg_index, _End_index;
4857 
4858  // Calculate the segment position
4859  if (_Index < _Remain)
4860  {
4861  _Beg_index = _Index * (_Step + 1);
4862  _End_index = _Beg_index + (_Step + 1);
4863  }
4864  else
4865  {
4866  _Beg_index = _Remain * (_Step + 1) + (_Index - _Remain) * _Step;
4867  _End_index = _Beg_index + _Step;
4868  }
4869 
4870  // Do a counting
4871  while (_Beg_index != _End_index)
4872  {
4873  ++_Chunks[_Index][_Radix_key(_Begin[_Beg_index++], _Radix, _Proj_func)];
4874  }
4875  });
4876 
4877  int _Index = -1, _Count = 0;
4878 
4879  // Partial sum cross different threads' chunk counters
4880  for (int _I = 0; _I < 256; _I++)
4881  {
4882  size_t _Last = _I ? _Chunks[_Threads_num - 1][_I - 1] : 0;
4883  _Chunks[0][_I] += _Last;
4884 
4885  for (size_t _J = 1; _J < _Threads_num; _J++)
4886  {
4887  _Chunks[_J][_I] += _Chunks[_J - 1][_I];
4888  }
4889 
4890  // "_Chunks[_Threads_num - 1][_I] - _Last" will get the global _Size for chunk _I(including all threads local _Size for chunk _I)
4891  // this will chunk whether the chunk _I is empty or not. If it's not empty, it will be recorded.
4892  if (_Chunks[_Threads_num - 1][_I] - _Last)
4893  {
4894  ++_Count;
4895  _Index = _I;
4896  }
4897  }
4898 
4899  // If there is more than 1 chunk that has content, then continue the original algorithm
4900  if (_Count > 1)
4901  {
4902  // Move the elements in parallel into each chunk
4903  ::Concurrency::parallel_for(static_cast<size_t>(0), _Threads_num, [=](size_t _Index)
4904  {
4905  size_t _Beg_index, _End_index;
4906 
4907  // Calculate the segment position
4908  if (_Index < _Remain)
4909  {
4910  _Beg_index = _Index * (_Step + 1);
4911  _End_index = _Beg_index + (_Step + 1);
4912  }
4913  else
4914  {
4915  _Beg_index = _Remain * (_Step + 1) + (_Index - _Remain) * _Step;
4916  _End_index = _Beg_index + _Step;
4917  }
4918 
4919  // Do a move operation to directly put each value into its destination chunk
4920  // Chunk pointer is moved after each put operation.
4921  if (_Beg_index != _End_index--)
4922  {
4923  while (_Beg_index != _End_index)
4924  {
4925  _Output[--_Chunks[_Index][_Radix_key(_Begin[_End_index], _Radix, _Proj_func)]] = std::move(_Begin[_End_index]);
4926  --_End_index;
4927  }
4928  _Output[--_Chunks[_Index][_Radix_key(_Begin[_End_index], _Radix, _Proj_func)]] = std::move(_Begin[_End_index]);
4929  }
4930  });
4931 
4932  // Invoke _parallel_integer_radix_sort in parallel for each chunk
4933  ::Concurrency::parallel_for(static_cast<size_t>(0), static_cast<size_t>(256), [=](size_t _Index)
4934  {
4935  if (_Index < 256 - 1)
4936  {
4937  _Parallel_integer_radix_sort(_Output + _Chunks[0][_Index], _Chunks[0][_Index + 1] - _Chunks[0][_Index],
4938  _Begin + _Chunks[0][_Index], _Radix - 1, _Proj_func, _Chunk_size, _Deep + 1);
4939  }
4940  else
4941  {
4942  _Parallel_integer_radix_sort(_Output + _Chunks[0][_Index], _Size - _Chunks[0][_Index],
4943  _Begin + _Chunks[0][_Index], _Radix - 1, _Proj_func, _Chunk_size, _Deep + 1);
4944  }
4945  });
4946  }
4947  else
4948  {
4949  // Only one chunk has content
4950  // A special optimization is applied because a single chunk means all numbers have the same value on this particular byte (digit).
4951  // Because we cannot sort on it at all (they are all equal at this point), directly call _Parallel_integer_radix_sort to
4952  // sort the next byte (digit)
4953  _Parallel_integer_radix_sort(_Begin, _Size, _Output, _Radix - 1, _Proj_func, _Chunk_size, _Deep);
4954  }
4955 }
4956 
4957 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
4958 void _Parallel_integer_sort_asc(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output,
4959  _Function _Proj_func, const size_t _Chunk_size)
4960 {
4961  typedef typename std::iterator_traits<_Random_iterator>::value_type _Value_type;
4962  // The key type of the radix sort, this must be an "unsigned integer-like" type, that is, it needs support:
4963  // operator>> (int), operator>>= (int), operator& (int), operator <, operator size_t ()
4964  typedef typename std::remove_const<typename std::remove_reference<decltype(_Proj_func(*_Begin))>::type>::type _Integer_type;
4965 
4966  // Find out the max value, which will be used to determine the highest differing byte (the radix position)
4967  _Integer_type _Max_val = ::Concurrency::parallel_reduce(_Begin, _Begin + _Size, _Proj_func(*_Begin),
4968  [=](_Random_iterator _Begin, _Random_iterator _End, _Integer_type _Init) -> _Integer_type
4969  {
4970  while (_Begin != _End)
4971  {
4972  _Integer_type _Ret = _Proj_func(*_Begin++);
4973  if (_Init < _Ret)
4974  {
4975  _Init = _Ret;
4976  }
4977  }
4978 
4979  return _Init;
4980  }, [](const _Integer_type &_A, const _Integer_type &_B) -> const _Integer_type& {return (_A < _B)? _B : _A;});
4981  size_t _Radix = 0;
4982 
4983  // Find out highest differing byte
4984  while (_Max_val >>= 8)
4985  {
4986  ++_Radix;
4987  }
4988 
4989  _Parallel_integer_radix_sort(_Begin, _Size, _Output, _Radix, _Proj_func, _Chunk_size);
4990 }
4991 
4992 template<typename _Random_iterator, typename _Function>
4993 void _Parallel_quicksort_impl(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, size_t _Div_num, const size_t _Chunk_size, int _Depth)
4994 {
4995  if (_Depth >= _SORT_MAX_RECURSION_DEPTH || _Size <= _Chunk_size || _Size <= static_cast<size_t>(3) || _Chunk_size >= _FINE_GRAIN_CHUNK_SIZE && _Div_num <= 1)
4996  {
4997  return std::sort(_Begin, _Begin + _Size, _Func);
4998  }
4999 
5000  // Determine whether we need to do a three-way quick sort
5001  // We benefit from three-way merge if there are a lot of elements that are EQUAL to the median value,
5002  // _Select_median_pivot function will test redundant density by sampling
5003  bool _Is_three_way_split = false;
5004  size_t _Mid_index = _Select_median_pivot(_Begin, _Size, _Func, _Chunk_size, _Is_three_way_split);
5005 
5006  // Move the median value to the _Begin position.
5007  if (_Mid_index)
5008  {
5009  std::swap(*_Begin, _Begin[_Mid_index]);
5010  }
5011  size_t _I = 1, _J = _Size - 1;
5012 
5013  // Three-way or two-way partition
5014  // _Div_num < _MAX_NUM_TASKS_PER_CORE is checked to make sure it will never do three-way split before splitting enough tasks
5015  if (_Is_three_way_split && _Div_num < _MAX_NUM_TASKS_PER_CORE)
5016  {
5017  while (_Func(*_Begin, _Begin[_J]))
5018  {
5019  --_J;
5020  }
5021 
5022  while (_Func(_Begin[_I], *_Begin))
5023  {
5024  ++_I;
5025  }
5026 
5027  // Starting from this point, everything left of _I will be less than the median value, everything right of _J will be greater than the median value,
5028  // and the middle part will be equal to the median. _K is used to scan between _I and _J
5029  size_t _K = _J;
5030  while (_I <= _K)
5031  {
5032  if (_Func(_Begin[_K], *_Begin))
5033  {
5034  std::swap(_Begin[_I++], _Begin[_K]);
5035  }
5036  else
5037  {
5038  --_K;
5039  }
5040 
5041  while (_Func(*_Begin, _Begin[_K]))
5042  {
5043  std::swap(_Begin[_K--], _Begin[_J--]);
5044  }
5045  }
5046 
5047  ++_J;
5048  }
5049  else
5050  {
5051  while (_I <= _J)
5052  {
5053  // Will stop before _Begin
5054  while (_Func(*_Begin, _Begin[_J]))
5055  {
5056  --_J;
5057  }
5058 
5059  // There must be another element equal or greater than *_Begin
5060  while (_Func(_Begin[_I], *_Begin))
5061  {
5062  ++_I;
5063  }
5064 
5065  if (_I < _J)
5066  {
5067  std::swap(_Begin[_I++], _Begin[_J--]);
5068  }
5069  else
5070  {
5071  break;
5072  }
5073  }
5074 
5075  _I = ++_J;
5076  }
5077 
5078  std::swap(*_Begin, _Begin[--_I]);
5079 
5080  structured_task_group _Tg;
5081  volatile size_t _Next_div = _Div_num / 2;
5082  auto _Handle = make_task([&]
5083  {
5084  _Parallel_quicksort_impl(_Begin + _J, _Size - _J, _Func, _Next_div, _Chunk_size, _Depth+1);
5085  });
5086  _Tg.run(_Handle);
5087 
5088  _Parallel_quicksort_impl(_Begin, _I, _Func, _Next_div, _Chunk_size, _Depth+1);
5089 
5090  // If at this point, the work hasn't been scheduled, then slow down creating new tasks
5091  if (_Div_num < _MAX_NUM_TASKS_PER_CORE)
5092  {
5093  _Next_div /= 2;
5094  }
5095 
5096  _Tg.wait();
5097 }
5098 
5099 // This function will be called to sort the elements in the "_Begin" buffer. However, we can't tell whether the result will end up in buffer
5100 // "_Begin", or buffer "_Output" when it returned. The return value is designed to indicate which buffer holds the sorted result.
5101 // Return true if the merge result is in the "_Begin" buffer; return false if the result is in the "_Output" buffer.
5102 // We can't always put the result into one assigned buffer because that may cause frequent buffer copies at return time.
5103 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
5104 inline bool _Parallel_buffered_sort_impl(const _Random_iterator &_Begin, size_t _Size, _Random_buffer_iterator _Output, const _Function &_Func,
5105  int _Div_num, const size_t _Chunk_size)
5106 {
5107  static_assert(std::is_same<typename std::iterator_traits<_Random_iterator>::value_type, typename std::iterator_traits<_Random_buffer_iterator>::value_type>::value,
5108  "same value type expected");
5109 
5110  if (_Div_num <= 1 || _Size <= _Chunk_size)
5111  {
5112  _Parallel_quicksort_impl(_Begin, _Size, _Func, _MAX_NUM_TASKS_PER_CORE, _Chunk_size, 0);
5113 
5114  // In case _Size <= _Chunk_size happened BEFORE the planned stop time (when _Div_num == 1) we need to calculate how many turns of
5115  // binary divisions are left. If there are an odd number of turns left, then the buffer move is necessary to make sure the final
5116  // merge result will be in the original input array.
5117  int _Left_div_turns = 0;
5118  while (_Div_num >>= 1)
5119  {
5120  _Left_div_turns++;
5121  }
5122 
5123  if (_Left_div_turns & 1)
5124  {
5125  std::move(_Begin, _Begin + _Size, _Output);
5126  return true;
5127  }
5128  else
5129  {
5130  return false;
5131  }
5132  }
5133  else
5134  {
5135  size_t _Mid = _Size / 2;
5136  structured_task_group _Tg;
5137 
5138  auto _Handle = make_task([&, _Chunk_size]
5139  {
5140  _Parallel_buffered_sort_impl(_Begin, _Mid, _Output, _Func, _Div_num / 2, _Chunk_size);
5141  });
5142  _Tg.run(_Handle);
5143 
5144  bool _Is_buffer_swap = _Parallel_buffered_sort_impl(_Begin + _Mid, _Size - _Mid, _Output + _Mid, _Func, _Div_num / 2, _Chunk_size);
5145 
5146  _Tg.wait();
5147 
5148  if (_Is_buffer_swap)
5149  {
5150  _Parallel_merge(_Output, _Mid, _Output + _Mid, _Size - _Mid, _Begin, _Func, _Div_num);
5151  }
5152  else
5153  {
5154  _Parallel_merge(_Begin, _Mid, _Begin + _Mid, _Size - _Mid, _Output, _Func, _Div_num);
5155  }
5156 
5157  return !_Is_buffer_swap;
5158  }
5159 }
5160 
5161 // Disable the warning about a constant value in a conditional expression.
5162 // This is by design; it lets the compiler optimize away the branch when the constructor is trivial.
5163 #pragma warning (push)
5164 #pragma warning (disable: 4127)
5165 
5166 // Allocate and construct a buffer
5167 template<typename _Allocator>
5168 inline typename _Allocator::pointer _Construct_buffer(size_t _N, _Allocator &_Alloc)
5169 {
5170  typename _Allocator::pointer _P = _Alloc.allocate(_N);
5171 
5172  // If the objects being sorted have trivial default constructors, they do not need to be
5173  // constructed here. This can benefit performance.
5174  if (!std::is_trivially_default_constructible<typename _Allocator::value_type>::value)
5175  {
5176  for (size_t _I = 0; _I < _N; _I++)
5177  {
5178  // Objects being sorted must have a default constructor
5179  typename _Allocator::value_type _T;
5180  _Alloc.construct(_P + _I, std::forward<typename _Allocator::value_type>(_T));
5181  }
5182  }
5183 
5184  return _P;
5185 }
5186 
5187 // Destroy and deallocate a buffer
5188 template<typename _Allocator>
5189 inline void _Destroy_buffer(typename _Allocator::pointer _P, size_t _N, _Allocator &_Alloc)
5190 {
5191  // If the objects being sorted have trivial destructors, they do not need to be
5192  // destructed here. This can benefit performance.
5193  if (!std::is_trivially_destructible<typename _Allocator::value_type>::value)
5194  {
5195  for (size_t _I = 0; _I < _N; _I++)
5196  {
5197  _Alloc.destroy(_P + _I);
5198  }
5199  }
5200 
5201  _Alloc.deallocate(_P, _N);
5202 }
5203 
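The is_trivially_default_constructible / is_trivially_destructible checks above are what let the construction and destruction loops be skipped for plain value types. A minimal illustration, using two hypothetical element types invented for this example:

    #include <type_traits>

    struct _Plain_record { int _Key; double _Weight; };                   // trivially constructible and destructible
    struct _Tracked_record { _Tracked_record() : _Key(0) {} int _Key; };  // user-provided constructor: not trivial

    // For _Plain_record both loops above would be skipped entirely; for _Tracked_record every
    // element of the scratch buffer would be constructed (and later destroyed) one by one.
    static_assert(std::is_trivially_default_constructible<_Plain_record>::value, "construction loop skipped");
    static_assert(!std::is_trivially_default_constructible<_Tracked_record>::value, "construction loop runs");
    static_assert(std::is_trivially_destructible<_Plain_record>::value, "destruction loop skipped");

    int main() {}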
5204 //
5205 // Exception safe RAII wrapper for the allocated buffers
5206 //
5207 
5208 template<typename _Allocator>
5209 class _AllocatedBufferHolder
5210 {
5211 public:
5212  _AllocatedBufferHolder(size_t _Size, const _Allocator & _Alloc): _M_alloc(_Alloc)
5213  {
5214  _M_size = _Size;
5215  _M_buffer = _Construct_buffer(_Size, _M_alloc);
5216  }
5217 
5218  ~_AllocatedBufferHolder()
5219  {
5220  _Destroy_buffer(_M_buffer, _M_size, _M_alloc);
5221  }
5222 
5223  typename _Allocator::pointer _Get_buffer()
5224  {
5225  return _M_buffer;
5226  }
5227 
5228 private:
5229  size_t _M_size;
5230  _Allocator _M_alloc;
5231  typename _Allocator::pointer _M_buffer;
5232 };
5233 
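The holder above exists so that the scratch buffer is released on every exit path, including exceptions thrown while sorting. The sketch below mirrors that behavior with a hypothetical standalone guard; _Scratch_buffer_guard is invented for this example, is not the ppl.h class, and skips the element-construction step for brevity.

    #include <cstddef>
    #include <memory>
    #include <stdexcept>

    template<typename _Allocator>
    class _Scratch_buffer_guard
    {
    public:
        _Scratch_buffer_guard(size_t _Size, const _Allocator &_Alloc) : _M_size(_Size), _M_alloc(_Alloc)
        {
            _M_buffer = _M_alloc.allocate(_M_size);
        }

        ~_Scratch_buffer_guard()
        {
            _M_alloc.deallocate(_M_buffer, _M_size);   // runs even when an exception unwinds the stack
        }

        typename _Allocator::pointer _Get_buffer() { return _M_buffer; }

    private:
        size_t _M_size;
        _Allocator _M_alloc;
        typename _Allocator::pointer _M_buffer;
    };

    int main()
    {
        std::allocator<int> _Alloc;
        try
        {
            _Scratch_buffer_guard<std::allocator<int>> _Guard(1024, _Alloc);
            _Guard._Get_buffer()[0] = 42;
            throw std::runtime_error("simulated failure"); // the guard still frees the buffer
        }
        catch (const std::runtime_error &)
        {
        }
    }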
5234 
5235 #pragma warning (pop)
5236 
5269 
5270 template<typename _Random_iterator, typename _Function>
5271 inline void parallel_sort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)
5272 {
5273  _CONCRT_ASSERT(_Chunk_size > 0);
5274 
5275  // Check for cancellation before the algorithm starts.
5276  interruption_point();
5277 
5278  size_t _Size = _End - _Begin;
5279  size_t _Core_num = ::Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors();
5280 
5281  if (_Size <= _Chunk_size || _Core_num < 2)
5282  {
5283  return std::sort(_Begin, _End, _Func);
5284  }
5285 
5286  _Parallel_quicksort_impl(_Begin, _Size, _Func, _Core_num * _MAX_NUM_TASKS_PER_CORE, _Chunk_size, 0);
5287 }
5288 
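A minimal usage sketch for the overload above (illustrative, not part of ppl.h); it assumes the concurrency namespace alias defined at the end of this header.

    #include <ppl.h>
    #include <functional>
    #include <vector>

    int main()
    {
        std::vector<int> _Values = { 7, 3, 9, 1, 4, 8 };

        // Sort in descending order. The chunk size is left at its default of 2048, so an
        // input this small simply falls back to std::sort on the calling thread.
        concurrency::parallel_sort(_Values.begin(), _Values.end(), std::greater<int>());
    }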
5310 
5311 template<typename _Random_iterator>
5312 inline void parallel_sort(const _Random_iterator &_Begin, const _Random_iterator &_End)
5313 {
5314  parallel_sort(_Begin, _End, std::less<typename std::iterator_traits<_Random_iterator>::value_type>());
5315 }
5316 
5359 
5360 template<typename _Allocator, typename _Random_iterator, typename _Function>
5361 inline void parallel_buffered_sort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)
5362 {
5363  _CONCRT_ASSERT(_Chunk_size > 0);
5364 
5365  // Check cancellation before the algorithm starts.
5366  interruption_point();
5367 
5368  size_t _Size = _End - _Begin;
5369  size_t _Core_num = ::Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors();
5370 
5371  if (_Size <= _Chunk_size || _Core_num < 2)
5372  {
5373  return std::sort(_Begin, _End, _Func);
5374  }
5375  const static size_t _CORE_NUM_MASK = 0x55555555;
5376 
5377  _AllocatedBufferHolder<_Allocator> _Holder(_Size, _Alloc);
5378 
5379  // Prevent cancellation from happening during the algorithm, in case it leaves the buffers in an unknown state.
5380  run_with_cancellation_token([=, &_Holder]() {
5381  // This buffered sort algorithm will divide chunks and apply parallel quicksort on each chunk. In the end, it will
5382  // apply parallel merge to these sorted chunks.
5383  //
5384  // We need to decide on the number of chunks to divide the input buffer into. If we divide it into n chunks, log(n)
5385  // merges will be needed to get the final sorted result. In this algorithm, we have two buffers for each merge
5386  // operation, let's say buffer A and B. Buffer A is the original input array, buffer B is the additional allocated
5387  // buffer. Each turn's merge will put the merge result into the other buffer; for example, if we decided to split
5388  // into 8 chunks in buffer A at the very beginning, then after one pass of merging, there will be 4 chunks in buffer B.
5389  // If we apply one more pass of merging, there will be 2 chunks in buffer A again.
5390  //
5391  // The problem is that we want the final merge pass to put the result back in buffer A, so that we don't need
5392  // one extra copy to put the sorted data back to buffer A.
5393  // To make sure the final result is in buffer A (original input array), we need an even number of merge passes,
5394  // which means log(n) must be an even number. Thus n must be of the form power(2, even number). For example, when the
5395  // even number is 2, n is power(2, 2) = 4; when the even number is 4, n is power(2, 4) = 16. When we divide chunks
5396  // into these numbers, the final merge result will be in the original input array. Now we need to decide the chunk(split)
5397  // number based on this property and the number of cores.
5398  //
5399  // We want to get a chunk (split) number close to the core number (or a little more than the number of cores),
5400  // and it also needs to satisfy the above property. For an 8-core machine, the best chunk number should be 16, because it's
5401  // the smallest number that satisfies the above property and is bigger than the core number (so that we can utilize all
5402  // cores; a little more than the core number is OK, since we need to split into more tasks anyway).
5403  //
5404  // In this algorithm, we will make this alignment by bit operations (it's easy and clear). For a binary representation,
5405  // all the numbers that satisfy power(2, even number) will be 1, 100, 10000, 1000000, 100000000 ...
5406  // After OR-ing these numbers together, we will get a mask (... 0101 0101 0101) which is all possible combinations of
5407  // power(2, even number). We use _Core_num & _CORE_NUM_MASK | _Core_num << 1 & _CORE_NUM_MASK, a bit-wise operation to align
5408  // _Core_num's highest bit into a power(2, even number).
5409  //
5410  // It means if _Core_num = 8, the highest bit in binary is bin(1000) which is not power(2, even number). After this
5411  // bit-wise operation, it will align to bin(10000) = 16 which is power(2, even number). If the _Core_num = 16, after
5412  // alignment it still returns 16. The trick is to make sure the highest bit of _Core_num will align to the "1" bit of the
5413  // mask bin(... 0101 0101 0101). We don't care about the other bits of the aligned result except the highest bit, because they
5414  // will be ignored in the function.
5415  _Parallel_buffered_sort_impl(_Begin, _Size, stdext::make_unchecked_array_iterator(_Holder._Get_buffer()),
5416  _Func, _Core_num & _CORE_NUM_MASK | _Core_num << 1 & _CORE_NUM_MASK, _Chunk_size);
5417  }, cancellation_token::none());
5418 
5419 }
5420 
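The bit trick described in the comment above can be checked in isolation. The following standalone snippet (not part of ppl.h) evaluates the same expression, written with explicit parentheses, for a few core counts; only the highest set bit of the result matters, because the implementation keeps halving the division factor until it reaches 1.

    #include <cstdio>

    int main()
    {
        // The OR of every power(2, even number): bits 0, 2, 4, ... are set.
        const unsigned int _CORE_NUM_MASK = 0x55555555;
        const unsigned int _Cores[] = { 2, 6, 8, 16 };

        for (unsigned int _Core_num : _Cores)
        {
            unsigned int _Div_num = (_Core_num & _CORE_NUM_MASK) | ((_Core_num << 1) & _CORE_NUM_MASK);
            std::printf("%u cores -> division factor %u\n", _Core_num, _Div_num);
        }
        // Prints 4, 4, 16 and 16: 8 cores align up to 16, while 16 is already power(2, even number).
    }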
5460 
5461 template<typename _Allocator, typename _Random_iterator, typename _Function>
5462 inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)
5463 {
5464  _Allocator _Alloc;
5465  return parallel_buffered_sort<_Allocator, _Random_iterator, _Function>(_Alloc, _Begin, _End, _Func, _Chunk_size);
5466 }
5467 
5504 
5505 template<typename _Random_iterator, typename _Function>
5506 inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)
5507 {
5508  parallel_buffered_sort<std::allocator<typename std::iterator_traits<_Random_iterator>::value_type>>(_Begin, _End, _Func, _Chunk_size);
5509 }
5510 
5536 
5537 template<typename _Random_iterator>
5538 inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End)
5539 {
5540  parallel_buffered_sort<std::allocator<typename std::iterator_traits<_Random_iterator>::value_type>>(_Begin, _End,
5541  std::less<typename std::iterator_traits<_Random_iterator>::value_type>());
5542 }
5543 
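A usage sketch for the buffered variants above (illustrative, not part of ppl.h). It picks the overload that takes the allocator type as an explicit template argument; std::allocator is also what the other overloads default to.

    #include <ppl.h>
    #include <functional>
    #include <memory>
    #include <string>
    #include <vector>

    int main()
    {
        std::vector<std::string> _Names = { "delta", "alpha", "charlie", "bravo" };

        // The O(n) scratch buffer used for merging is obtained from the given allocator type.
        concurrency::parallel_buffered_sort<std::allocator<std::string>>(
            _Names.begin(), _Names.end(), std::less<std::string>());
    }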
5572 
5573 template<typename _Allocator, typename _Random_iterator>
5574 inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End)
5575 {
5576  parallel_buffered_sort<_Allocator>(_Begin, _End,
5577  std::less<typename std::iterator_traits<_Random_iterator>::value_type>());
5578 }
5579 
5611 
5612 template<typename _Allocator, typename _Random_iterator>
5613 inline void parallel_buffered_sort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End)
5614 {
5615  parallel_buffered_sort<_Allocator>(_Alloc, _Begin, _End, std::less<typename std::iterator_traits<_Random_iterator>::value_type>());
5616 }
5617 
5618 #pragma warning(push)
5619 #pragma warning (disable: 4127)
5620 //
5621 // This is a default function used for parallel_radixsort which will return just the value.
5622 // It also performs compile-time checks to ensure that the data type is integral.
5623 //
5624 template <typename _DataType>
5625 struct _Radix_sort_default_function
5626 {
5627  size_t operator()(const _DataType& _Val) const
5628  {
5629  // The function object returns the value unchanged if the type _DataType is one of the integral types; otherwise, it
5630  // statically asserts.
5631  // An integral type is one of: bool, char, unsigned char, signed char, wchar_t, short, unsigned short, int, unsigned int, long,
5632  // and unsigned long.
5633  // In addition, with compilers that provide them, an integral type can be one of long long, unsigned long long, __int64, and
5634  // unsigned __int64
5635  static_assert(std::is_integral<_DataType>::value,
5636  "Type should be integral to use default radix function. For more information on integral types, please refer to https://msdn.microsoft.com/en-us/library/bb983099.aspx.");
5637  static_assert((sizeof(_DataType) <= sizeof(size_t)), "Passed Type is bigger than size_t.");
5638 
5639  if (std::is_unsigned<_DataType>::value)
5640  {
5641  return _Val;
5642  }
5643  else
5644  {
5645  // The default function needs to take the signed integer-like representation and map it to an unsigned one. The
5646  // following code takes the midpoint of the unsigned representable range, (SIZE_MAX/2) + 1, and does an unsigned
5647  // add of the value. Thus, it maps the [signed_min, signed_max] range onto the [0, unsigned_max] range.
5648  return (((SIZE_MAX/2) + 1) + static_cast<size_t>(_Val));
5649  }
5650  }
5651 };
5652 #pragma warning (pop)
5653 
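The offset arithmetic in the signed branch above can be verified with a small standalone helper. _Signed_key below is a hypothetical copy of that expression, written only for this example; the assertions confirm that the mapping preserves ordering when the keys are later compared as unsigned radix digits.

    #include <cstddef>
    #include <cstdint>

    constexpr size_t _Signed_key(int _Val)
    {
        // Same arithmetic as the signed branch above: shift the signed range by (SIZE_MAX/2) + 1.
        return ((SIZE_MAX / 2) + 1) + static_cast<size_t>(_Val);
    }

    static_assert(_Signed_key(-5) < _Signed_key(-1), "negative values keep their order");
    static_assert(_Signed_key(-1) < _Signed_key(0), "negative values sort before zero");
    static_assert(_Signed_key(0) < _Signed_key(7), "non-negative values keep their order");

    int main() {}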
5678 
5679 template<typename _Random_iterator>
5680 inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End)
5681 {
5682  typedef typename std::iterator_traits<_Random_iterator>::value_type _DataType;
5683 
5684  _Radix_sort_default_function<_DataType> _Proj_func;
5685 
5686  parallel_radixsort<std::allocator<_DataType>>(_Begin, _End, _Proj_func, 256 * 256);
5687 }
5688 
5719 
5720 template<typename _Allocator, typename _Random_iterator>
5721 inline void parallel_radixsort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End)
5722 {
5723  typedef typename std::iterator_traits<_Random_iterator>::value_type _DataType;
5724 
5725  _Radix_sort_default_function<_DataType> _Proj_func;
5726 
5727  parallel_radixsort<_Allocator>(_Alloc, _Begin, _End, _Proj_func);
5728 }
5729 
5757 
5758 template<typename _Allocator, typename _Random_iterator>
5759 inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End)
5760 {
5761  _Allocator _Alloc;
5762  return parallel_radixsort<_Allocator, _Random_iterator>(_Alloc, _Begin, _End);
5763 }
5764 
5804 
5805 template<typename _Allocator, typename _Random_iterator, typename _Function>
5806 inline void parallel_radixsort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Proj_func, const size_t _Chunk_size = 256 * 256)
5807 {
5808  _CONCRT_ASSERT(_Chunk_size > 0);
5809 
5810  // Check for cancellation before the algorithm starts.
5811  interruption_point();
5812 
5813  size_t _Size = _End - _Begin;
5814 
5815  // If _Size <= 1, no more sorting needs to be done.
5816  if (_Size <= 1)
5817  {
5818  return;
5819  }
5820 
5821  _AllocatedBufferHolder<_Allocator> _Holder(_Size, _Alloc);
5822 
5823  // Prevent cancellation from happening during the algorithm, in case it leaves the buffers in an unknown state.
5824  run_with_cancellation_token([=, &_Holder]() {
5825  _Parallel_integer_sort_asc(_Begin, _Size, stdext::make_unchecked_array_iterator(_Holder._Get_buffer()), _Proj_func, _Chunk_size);
5826  }, cancellation_token::none());
5827 }
5828 
5865 
5866 template<typename _Allocator, typename _Random_iterator, typename _Function>
5867 inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Proj_func, const size_t _Chunk_size = 256 * 256)
5868 {
5869  _Allocator _Alloc;
5870  return parallel_radixsort<_Allocator, _Random_iterator, _Function>(_Alloc, _Begin, _End, _Proj_func, _Chunk_size);
5871 }
5872 
5906 
5907 template<typename _Random_iterator, typename _Function>
5908 inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Proj_func, const size_t _Chunk_size = 256 * 256)
5909 {
5910  parallel_radixsort<std::allocator<typename std::iterator_traits<_Random_iterator>::value_type>>(
5911  _Begin, _End, _Proj_func, _Chunk_size);
5912 }
5913 
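A usage sketch for the projection-based overload above (illustrative, not part of ppl.h). The record type and projection functor are invented for the example; the projection must return a size_t key, and the default chunk size of 256 * 256 is used.

    #include <ppl.h>
    #include <vector>

    struct _Employee
    {
        unsigned int _Id;
        const char *_Name;
    };

    struct _Id_projection
    {
        size_t operator()(const _Employee &_E) const { return _E._Id; }
    };

    int main()
    {
        std::vector<_Employee> _Staff = { { 42u, "Berglund" }, { 7u, "Adams" }, { 1701u, "Chen" } };

        // Radix-sorts ascending by the projected key, so _Staff ends up ordered 7, 42, 1701.
        concurrency::parallel_radixsort(_Staff.begin(), _Staff.end(), _Id_projection());
    }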
5914 #pragma pop_macro("_SORT_MAX_RECURSION_DEPTH")
5915 #pragma pop_macro("_MAX_NUM_TASKS_PER_CORE")
5916 #pragma pop_macro("_FINE_GRAIN_CHUNK_SIZE")
5917 }
5918 
5919 namespace concurrency = ::Concurrency;
5920 
5921 #pragma pop_macro("new")
5922 #pragma pack(pop)