STLdoc
STL documentation
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
ppl.h
Go to the documentation of this file.
1 /***
2 * ==++==
3 *
4 * Copyright (c) Microsoft Corporation. All rights reserved.
5 *
6 * ==--==
7 * =+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
8 *
9 * ppl.h
10 *
11 * Parallel Patterns Library
12 *
13 * =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
14 ****/
15 
16 #pragma once
17 
18 #include <crtdefs.h>
19 #include <concrt.h>
20 #include <stdexcept>
21 #include <iterator>
22 #include <functional>
23 #include <memory>
24 #include <type_traits>
25 #include <algorithm>
26 #include <malloc.h>
27 
28 #include <pplwin.h>
29 
30 #define _PPL_H
31 
32 #pragma pack(push,_CRT_PACKING)
33 #pragma push_macro("new")
34 #undef new
35 
36 // Define the level of tracing to use
37 
38 #define _TRACE_LEVEL_INFORMATION 4
39 
44 
45 namespace Concurrency
46 {
47 namespace details
48 {
49  _CONCRTIMP size_t __cdecl _GetCombinableSize();
50 } // namespace details
51 
52 class structured_task_group;
53 class task_group;
54 
83 
template<typename _Function>
// NOTE(review): the class-head line is omitted from this listing (expected
// something like "class task_handle : public ::Concurrency::details::_UnrealizedChore",
// given the _UnrealizedChore bridge used below) -- confirm against the shipped ppl.h.
{
public:

    // Wrap a copy of the user's functor so the runtime can schedule and
    // invoke it as a task.
    task_handle(const _Function& _Func) : _M_function(_Func)
    {
        // Route the runtime's C-style TaskProc entry point back to this
        // handle's operator() through the chore invoke bridge.
        m_pFunction = reinterpret_cast <TaskProc> (&::Concurrency::details::_UnrealizedChore::_InvokeBridge<task_handle>);
    }

    // Destructor -- NOTE(review): its signature and body statements are
    // omitted from this listing; only braces and commentary survived.
    {
        //
        // We only need to perform a liveness check if the client owns the lifetime of the handle. Doing this for runtime owned handles
        // is not only unnecessary -- it is also dangerous.
        //
        {
        }
    }

    // Invoke the stored functor; called by the runtime when the task runs.
    void operator()() const
    {
        _M_function();
    }

private:

    friend class task_group;
    friend class structured_task_group;

    // The function object invoked to perform the body of the task.
    _Function _M_function;

    task_handle const & operator=(task_handle const&); // no assignment operator

};
142 
163 
164 template <class _Function>
165 task_handle<_Function> make_task(const _Function& _Func)
166 {
167  return task_handle<_Function>(_Func);
168 }
169 
204 
// NOTE(review): the class-head line ("class structured_task_group") and several
// member signatures below are omitted from this listing -- confirm against the
// shipped ppl.h before relying on this text.
{
public:

    // Default constructor -- body omitted in this listing.
    {
    }

    // Constructor taking a cancellation_token; only the mem-initializer
    // survived here. A token with a NULL implementation maps to the shared
    // "none" cancellation state.
    _M_task_collection(_CancellationToken._GetImpl() != NULL ? _CancellationToken._GetImpl() : ::Concurrency::details::_CancellationTokenState::_None())
    {
    }

    // Destructor -- signature omitted in this listing.
    {
    }

    // Schedule a caller-owned task handle on the underlying task collection.
    // The caller keeps lifetime ownership (_SetRuntimeOwnsLifetime(false)),
    // so the handle must outlive the group's wait.
    template<class _Function>
    void run(task_handle<_Function>& _Task_handle)
    {
        _Task_handle._SetRuntimeOwnsLifetime(false);
        _M_task_collection._Schedule(&_Task_handle);
    }

    // Same as run() above, with a scheduler placement hint.
    template<class _Function>
    void run(task_handle<_Function>& _Task_handle, location& _Placement)
    {
        _Task_handle._SetRuntimeOwnsLifetime(false);
        _M_task_collection._Schedule(&_Task_handle, &_Placement);
    }

    // wait() (presumed) -- signature and return statement omitted in this listing.
    {
        //
        // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
        //
    }

    // run_and_wait(task_handle&) -- the signature line is omitted in this listing.
    template<class _Function>
    {
        //
        // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
        //
        return (task_group_status)_M_task_collection._RunAndWait(&_Task_handle);
    }

    // Run _Func inline as the last task of the group and wait for the whole
    // group. NOTE(review): the return statement is omitted in this listing.
    template<class _Function>
    task_group_status run_and_wait(const _Function& _Func)
    {
        //
        // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
        //
        task_handle<_Function> _Task(_Func);
    }

    // Cancel all tasks in the group -- body omitted in this listing.
    void cancel()
    {
    }

    // is_canceling() (presumed) -- signature and body omitted in this listing.
    {
    }

private:

    // Disallow passing in an r-value for a task handle argument
    template<class _Function> void run(task_handle<_Function>&& _Task_handle);

    // The underlying group of tasks as known to the runtime.
    // NOTE(review): the member declaration (_M_task_collection, per the
    // mem-initializer above) is omitted from this listing.
};
476 
489 
// NOTE(review): the class-head line ("class task_group") and several member
// signatures below are omitted from this listing -- confirm against the
// shipped ppl.h.
{
public:

    // Default constructor -- body omitted in this listing.
    {
    }

    // Construct a task_group governed by a cancellation token. A token with a
    // NULL implementation maps to the shared "none" cancellation state.
    task_group(cancellation_token _CancellationToken) :
        _M_task_collection(_CancellationToken._GetImpl() != NULL ? _CancellationToken._GetImpl() : ::Concurrency::details::_CancellationTokenState::_None())
    {
    }

    // Destructor -- signature omitted in this listing.
    {
    }

    // Schedule a functor -- the scheduling statement is omitted in this listing.
    template<typename _Function>
    void run(const _Function& _Func)
    {
    }

    // Schedule a functor with a placement hint -- body omitted in this listing.
    template<typename _Function>
    void run(const _Function& _Func, location& _Placement)
    {
    }

    // Schedule a caller-owned handle; the caller retains lifetime ownership,
    // so the handle must outlive the group's wait.
    template<typename _Function>
    void run(task_handle<_Function>& _Task_handle)
    {
        _Task_handle._SetRuntimeOwnsLifetime(false);
        _M_task_collection._Schedule(&_Task_handle);
    }

    // Same as above, with a scheduler placement hint.
    template<typename _Function>
    void run(task_handle<_Function>& _Task_handle, location& _Placement)
    {
        _Task_handle._SetRuntimeOwnsLifetime(false);
        _M_task_collection._Schedule(&_Task_handle, &_Placement);
    }

    // wait() (presumed) -- the signature line is omitted in this listing.
    {
        //
        // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
        //
        return static_cast<task_group_status>(_M_task_collection._Wait());
    }

    // run_and_wait(task_handle&) -- the signature line is omitted in this listing.
    template<class _Function>
    {
        //
        // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
        //
        _Task_handle._SetRuntimeOwnsLifetime(false);
        return (task_group_status)_M_task_collection._RunAndWait(&_Task_handle);
    }

    // Run _Func inline and wait for the group.
    // NOTE(review): the statement(s) performing this are omitted in this listing.
    template<class _Function>
    task_group_status run_and_wait(const _Function& _Func)
    {
        //
        // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.
        //
    }

    // Cancel all tasks in the group -- body omitted in this listing.
    void cancel()
    {
    }

    // is_canceling() (presumed) -- signature and body omitted in this listing.
    {
    }

private:

    // Disallow passing in an r-value for a task handle argument
    template<class _Function> void run(task_handle<_Function>&& _Task_handle);

    // The underlying group of tasks as known to the runtime.
    // NOTE(review): the member declaration (_M_task_collection) is omitted here.
};
844 
863 
864 template<typename _Function>
865 void run_with_cancellation_token(const _Function& _Func, cancellation_token _Ct)
866 {
867  structured_task_group _Stg(_Ct);
868  _Stg.run_and_wait(_Func);
869 }
870 
879 
// Cooperative cancellation check: waiting on a task group gives the runtime a
// chance to observe a pending cancellation and unwind this context.
// NOTE(review): the declaration of the local _Stg (a structured_task_group,
// per the shipped ppl.h) is missing from this listing -- confirm.
inline void interruption_point()
{
    _Stg.wait();
}
885 
899 
901 
902 // Parallel Algorithms and Patterns
903 
904 // Helper function that implements parallel_invoke with two functions
905 // Used by parallel_for and parallel_for_each implementations
906 
907 template <typename _Function1, typename _Function2>
908 void _Parallel_invoke_impl(const _Function1& _Func1, const _Function2& _Func2)
909 {
910  structured_task_group _Task_group;
911 
912  task_handle<_Function1> _Task_handle1(_Func1);
913  _Task_group.run(_Task_handle1);
914 
915  // We inline the last item to prevent the unnecessary push/pop on the work queue.
916  task_handle<_Function2> _Task_handle2(_Func2);
917  _Task_group.run_and_wait(_Task_handle2);
918 }
919 
943 
944 template <typename _Function1, typename _Function2>
945 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2)
946 {
948 
949  _Parallel_invoke_impl(_Func1, _Func2);
950 
952 }
953 
983 
984 template <typename _Function1, typename _Function2, typename _Function3>
985 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3)
986 {
988 
989  structured_task_group _Task_group;
990 
991  task_handle<_Function1> _Task_handle1(_Func1);
992  _Task_group.run(_Task_handle1);
993 
994  task_handle<_Function2> _Task_handle2(_Func2);
995  _Task_group.run(_Task_handle2);
996 
997  task_handle<_Function3> _Task_handle3(_Func3);
998  _Task_group.run_and_wait(_Task_handle3);
999 
1001 }
1002 
1038 
1039 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4>
1040 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4)
1041 {
1043 
1044  structured_task_group _Task_group;
1045 
1046  task_handle<_Function1> _Task_handle1(_Func1);
1047  _Task_group.run(_Task_handle1);
1048 
1049  task_handle<_Function2> _Task_handle2(_Func2);
1050  _Task_group.run(_Task_handle2);
1051 
1052  task_handle<_Function3> _Task_handle3(_Func3);
1053  _Task_group.run(_Task_handle3);
1054 
1055  task_handle<_Function4> _Task_handle4(_Func4);
1056  _Task_group.run_and_wait(_Task_handle4);
1057 
1059 }
1060 
1102 
1103 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5>
1104 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5)
1105 {
1107 
1108  structured_task_group _Task_group;
1109 
1110  task_handle<_Function1> _Task_handle1(_Func1);
1111  _Task_group.run(_Task_handle1);
1112 
1113  task_handle<_Function2> _Task_handle2(_Func2);
1114  _Task_group.run(_Task_handle2);
1115 
1116  task_handle<_Function3> _Task_handle3(_Func3);
1117  _Task_group.run(_Task_handle3);
1118 
1119  task_handle<_Function4> _Task_handle4(_Func4);
1120  _Task_group.run(_Task_handle4);
1121 
1122  task_handle<_Function5> _Task_handle5(_Func5);
1123  _Task_group.run_and_wait(_Task_handle5);
1124 
1126 }
1127 
1175 
1176 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
1177  typename _Function6>
1178 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
1179  const _Function6& _Func6)
1180 {
1182 
1183  structured_task_group _Task_group;
1184 
1185  task_handle<_Function1> _Task_handle1(_Func1);
1186  _Task_group.run(_Task_handle1);
1187 
1188  task_handle<_Function2> _Task_handle2(_Func2);
1189  _Task_group.run(_Task_handle2);
1190 
1191  task_handle<_Function3> _Task_handle3(_Func3);
1192  _Task_group.run(_Task_handle3);
1193 
1194  task_handle<_Function4> _Task_handle4(_Func4);
1195  _Task_group.run(_Task_handle4);
1196 
1197  task_handle<_Function5> _Task_handle5(_Func5);
1198  _Task_group.run(_Task_handle5);
1199 
1200  task_handle<_Function6> _Task_handle6(_Func6);
1201  _Task_group.run_and_wait(_Task_handle6);
1202 
1204 }
1205 
1259 
1260 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
1261  typename _Function6, typename _Function7>
1262 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
1263  const _Function6& _Func6, const _Function7& _Func7)
1264 {
1266 
1267  structured_task_group _Task_group;
1268 
1269  task_handle<_Function1> _Task_handle1(_Func1);
1270  _Task_group.run(_Task_handle1);
1271 
1272  task_handle<_Function2> _Task_handle2(_Func2);
1273  _Task_group.run(_Task_handle2);
1274 
1275  task_handle<_Function3> _Task_handle3(_Func3);
1276  _Task_group.run(_Task_handle3);
1277 
1278  task_handle<_Function4> _Task_handle4(_Func4);
1279  _Task_group.run(_Task_handle4);
1280 
1281  task_handle<_Function5> _Task_handle5(_Func5);
1282  _Task_group.run(_Task_handle5);
1283 
1284  task_handle<_Function6> _Task_handle6(_Func6);
1285  _Task_group.run(_Task_handle6);
1286 
1287  task_handle<_Function7> _Task_handle7(_Func7);
1288  _Task_group.run_and_wait(_Task_handle7);
1289 
1291 }
1292 
1352 
1353 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
1354  typename _Function6, typename _Function7, typename _Function8>
1355 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
1356  const _Function6& _Func6, const _Function7& _Func7, const _Function8& _Func8)
1357 {
1359 
1360  structured_task_group _Task_group;
1361 
1362  task_handle<_Function1> _Task_handle1(_Func1);
1363  _Task_group.run(_Task_handle1);
1364 
1365  task_handle<_Function2> _Task_handle2(_Func2);
1366  _Task_group.run(_Task_handle2);
1367 
1368  task_handle<_Function3> _Task_handle3(_Func3);
1369  _Task_group.run(_Task_handle3);
1370 
1371  task_handle<_Function4> _Task_handle4(_Func4);
1372  _Task_group.run(_Task_handle4);
1373 
1374  task_handle<_Function5> _Task_handle5(_Func5);
1375  _Task_group.run(_Task_handle5);
1376 
1377  task_handle<_Function6> _Task_handle6(_Func6);
1378  _Task_group.run(_Task_handle6);
1379 
1380  task_handle<_Function7> _Task_handle7(_Func7);
1381  _Task_group.run(_Task_handle7);
1382 
1383  task_handle<_Function8> _Task_handle8(_Func8);
1384  _Task_group.run_and_wait(_Task_handle8);
1385 
1387 }
1388 
1454 
1455 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
1456  typename _Function6, typename _Function7, typename _Function8, typename _Function9>
1457 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
1458  const _Function6& _Func6, const _Function7& _Func7, const _Function8& _Func8, const _Function9& _Func9)
1459 {
1461 
1462  structured_task_group _Task_group;
1463 
1464  task_handle<_Function1> _Task_handle1(_Func1);
1465  _Task_group.run(_Task_handle1);
1466 
1467  task_handle<_Function2> _Task_handle2(_Func2);
1468  _Task_group.run(_Task_handle2);
1469 
1470  task_handle<_Function3> _Task_handle3(_Func3);
1471  _Task_group.run(_Task_handle3);
1472 
1473  task_handle<_Function4> _Task_handle4(_Func4);
1474  _Task_group.run(_Task_handle4);
1475 
1476  task_handle<_Function5> _Task_handle5(_Func5);
1477  _Task_group.run(_Task_handle5);
1478 
1479  task_handle<_Function6> _Task_handle6(_Func6);
1480  _Task_group.run(_Task_handle6);
1481 
1482  task_handle<_Function7> _Task_handle7(_Func7);
1483  _Task_group.run(_Task_handle7);
1484 
1485  task_handle<_Function8> _Task_handle8(_Func8);
1486  _Task_group.run(_Task_handle8);
1487 
1488  task_handle<_Function9> _Task_handle9(_Func9);
1489  _Task_group.run_and_wait(_Task_handle9);
1490 
1492 }
1493 
1565 
1566 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,
1567  typename _Function6, typename _Function7, typename _Function8, typename _Function9, typename _Function10>
1568 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,
1569  const _Function6& _Func6, const _Function7& _Func7, const _Function8& _Func8, const _Function9& _Func9, const _Function10& _Func10)
1570 {
1572 
1573  structured_task_group _Task_group;
1574 
1575  task_handle<_Function1> _Task_handle1(_Func1);
1576  _Task_group.run(_Task_handle1);
1577 
1578  task_handle<_Function2> _Task_handle2(_Func2);
1579  _Task_group.run(_Task_handle2);
1580 
1581  task_handle<_Function3> _Task_handle3(_Func3);
1582  _Task_group.run(_Task_handle3);
1583 
1584  task_handle<_Function4> _Task_handle4(_Func4);
1585  _Task_group.run(_Task_handle4);
1586 
1587  task_handle<_Function5> _Task_handle5(_Func5);
1588  _Task_group.run(_Task_handle5);
1589 
1590  task_handle<_Function6> _Task_handle6(_Func6);
1591  _Task_group.run(_Task_handle6);
1592 
1593  task_handle<_Function7> _Task_handle7(_Func7);
1594  _Task_group.run(_Task_handle7);
1595 
1596  task_handle<_Function8> _Task_handle8(_Func8);
1597  _Task_group.run(_Task_handle8);
1598 
1599  task_handle<_Function9> _Task_handle9(_Func9);
1600  _Task_group.run(_Task_handle9);
1601 
1602  task_handle<_Function10> _Task_handle10(_Func10);
1603  _Task_group.run_and_wait(_Task_handle10);
1604 
1606 }
1607 
1613 
// NOTE(review): the class-head (presumably "class auto_partitioner"), its
// constructor/destructor, and the _Get_num_chunks signature and body are all
// omitted from this listing; only the braces survive. Confirm against the
// shipped ppl.h.
{
public:

    // (constructor/destructor lines omitted in this listing)

    // _Get_num_chunks (presumed) -- signature and body omitted.
    template<class _Type>
    {
    }
};
1635 
1640 
// NOTE(review): the class-head (presumably "class static_partitioner"), the
// constructor signature, and the _Get_num_chunks signature and body are
// omitted from this listing. Confirm against the shipped ppl.h.
{
public:

    // Constructor (signature omitted in this listing).
    {
    }

    // _Get_num_chunks (presumed) -- signature and body omitted.
    template<class _Type>
    {
    }
};
1664 
1669 
// NOTE(review): the class-head line (presumably "class simple_partitioner",
// per the constructor below) is omitted from this listing.
{
private:
    typedef unsigned long long _Size_type;

public:

    // Construct a partitioner that divides the iteration space into
    // fixed-size chunks. A zero chunk size is rejected up front because it
    // would make the quotient below meaningless.
    explicit simple_partitioner(_Size_type _Chunk_size) : _M_chunk_size(_Chunk_size)
    {
        if (_Chunk_size == 0)
        {
            throw std::invalid_argument("_Chunk_size");
        }
    }

    // Number of chunks needed to cover _Range_arg iterations: the integer
    // quotient range / chunk_size, clamped to a minimum of 1 so a non-empty
    // range always yields at least one chunk.
    template<class _Type>
    _Type _Get_num_chunks(_Type _Range_arg) const
    {
        static_assert(sizeof(_Type) <= sizeof(_Size_type), "Potential truncation of _Range_arg");
        _Size_type _Num_chunks = (static_cast<_Size_type>(_Range_arg) / _M_chunk_size);

        if (_Num_chunks == 0)
        {
            _Num_chunks = 1;
        }

        return static_cast<_Type>(_Num_chunks);
    }

private:

    // Size of each chunk, in iterations.
    _Size_type _M_chunk_size;
};
1715 
1722 
// NOTE(review): the class-head (presumably "class affinity_partitioner"), the
// constructor and destructor signatures, the _Get_num_chunks signature, and
// the _M_pChunk_locations member declaration are omitted from this listing.
{
public:

    // Constructor -- signature and body lines omitted in this listing.
    {
    }

    // Destructor (signature omitted): releases the remembered chunk locations.
    {
        delete [] _M_pChunk_locations;
    }

    // Return the remembered location for a chunk so subsequent loops can be
    // re-affinitized to where that chunk ran before. No bounds check:
    // _ChunkIndex must be less than the recorded chunk count.
    location& _Get_chunk_location(unsigned int _ChunkIndex)
    {
        return _M_pChunk_locations[_ChunkIndex];
    }

    // _Get_num_chunks (presumed; signature omitted). Appears to lazily size
    // the chunk array on first use -- the statements inside the zero-check
    // are omitted in this listing.
    template<class _Type>
    {
        if (_M_num_chunks == 0)
        {
        }

        return static_cast<_Type>(_M_num_chunks);
    }

private:
    // The number of chunks the partitioner will record affinity for.
    unsigned int _M_num_chunks;

    // Array of remembered locations.
    // NOTE(review): the declaration (a location* _M_pChunk_locations, per the
    // destructor above) is omitted from this listing.
};
1768 
1769 // Helper methods for scheduling and executing parallel tasks
1770 
1771 // Disable C4180: qualifier applied to function type has no meaning; ignored
1772 // Warning fires for passing Foo function pointer to parallel_for instead of &Foo.
1773 #pragma warning(push)
1774 #pragma warning(disable: 4180)
1775 // Disable C6263: using _alloca in a loop; this can quickly overflow stack
1776 #pragma warning(disable: 6263)
1777 
1778 // Template class that invokes user function on a parallel_for_each
1779 
template <typename _Random_iterator, typename _Index_type, typename _Function, bool _Is_iterator>
// NOTE(review): the class-head line ("class _Parallel_chunk_helper_invoke")
// is omitted from this listing.
{
public:
    // parallel_for_each flavor: apply the user functor to the dereferenced
    // element at _Index (_First[_Index]).
    static void __cdecl _Invoke(const _Random_iterator& _First, _Index_type& _Index, const _Function& _Func)
    {
        _Func(_First[_Index]);
    }
};
1789 
1790 // Template specialized class that invokes user function on a parallel_for
1791 
template <typename _Random_iterator, typename _Index_type, typename _Function>
class _Parallel_chunk_helper_invoke<_Random_iterator, _Index_type, _Function, false>
{
public:
    // parallel_for flavor: the functor receives the iterator/index value
    // itself (_First advanced by _Index) rather than a dereferenced element.
    // The cast keeps the argument's type exactly _Random_iterator even when
    // operator+ yields a different (convertible) type.
    static void __cdecl _Invoke(const _Random_iterator& _First, _Index_type& _Index, const _Function& _Func)
    {
        _Func(static_cast<_Random_iterator>(_First + _Index));
    }
};
1801 
1802 // Represents a range of iteration
1803 
// A half-open span of loop iterations shared between a worker and a helper.
// The members are volatile and the cross-thread protocol depends on the exact
// order of the reads/writes below -- do not reorder.
template<typename _Index_type>
class _Range
{
public:

    // Construct an object for the range [_Current_iteration, _Last_iteration)
    _Range(_Index_type _Current_iteration, _Index_type _Last_iteration) :
        _M_current(_Current_iteration), _M_last(_Last_iteration)
    {
        // On creation, the range shall have at least 1 iteration.
        // NOTE(review): the assertion enforcing this is omitted from this listing.
    }

    // Send a portion of the range to the helper
    void _Send_range(_Range<_Index_type> * _Helper_range)
    {
        // If there are no iterations other than the current one left until finish, there is no help
        // needed. Set the pointer to a special value that helper will understand and continue
        // doing the work.
        _Index_type _Remaining_iterations = _Number_of_iterations();
        if (_Remaining_iterations > 1)
        {
            // Compute the two pieces of the work range: one for the worker and one for helper class.
            _M_last_iteration = _M_current_iteration + _Remaining_iterations / 2;

            // There needs to be at least 1 iteration left because the current iteration cannot be sent.
            // NOTE(review): the assertion enforcing this is omitted from this listing.
        }

        // This is also a signal for the helper that a range has been sent to it.
        _Helper_range->_M_current_iteration = _M_last_iteration;
    }

    // Steal the entire range and give it to the helper
    void _Steal_range(_Range<_Index_type> * _Helper_range)
    {
        // We allow stealing only from a range that has at least 1 iteration
        // NOTE(review): the corresponding assertion is omitted from this listing.

        _Index_type _Current_iter = _M_current_iteration;

        // Helper takes everything after the iteration the worker is executing...
        _Helper_range->_M_current_iteration = _Current_iter + 1;
        _Helper_range->_M_last_iteration = _M_last_iteration;

        // ...and the worker keeps only its in-flight iteration.
        _M_last_iteration = _Current_iter + 1;
    }

    // Returns the number of iterations in this range
    _Index_type _Number_of_iterations() const
    {
        return (_M_last_iteration - _M_current_iteration);
    }

    // Returns the current iteration in the range
    _Index_type _Get_current_iteration() const
    {
        return _M_current;
    }

    // Sets the current iteration in the range
    void _Set_current_iteration(const _Index_type _I)
    {
        _M_current = _I;
    }

    __declspec(property(get=_Get_current_iteration, put=_Set_current_iteration)) _Index_type _M_current_iteration;

    // Returns the last iteration in the range
    _Index_type _Get_last_iteration() const
    {
        return _M_last;
    }

    // Sets the last iteration in the range
    void _Set_last_iteration(const _Index_type _I)
    {
        _M_last = _I;
    }

    __declspec(property(get=_Get_last_iteration, put=_Set_last_iteration)) _Index_type _M_last_iteration;

private:

    // These members are volatile because they are updated by the helper
    // and used by the worker.
    volatile _Index_type _M_current;
    volatile _Index_type _M_last;
};
1892 
1893 // A proxy for the worker responsible for maintaining communication with the helper
1894 
template<typename _Index_type>
// NOTE(review): the class-head line ("class _Worker_proxy") is omitted from
// this listing, as are several member signatures and declarations below --
// confirm against the shipped ppl.h.
{
public:
    _Worker_proxy(_Worker_proxy *_PParent_worker = NULL) :
        _M_pHelper_range(NULL), _M_pParent_worker(_PParent_worker), _M_pWorker_range(NULL), _M_completion_count(0), _M_stop_iterating(0)
    {
        // NOTE(review): one constructor-body line is omitted from this listing.
    }

    // Destructor (signature omitted in this listing).
    {
        // Make the check to avoid doing extra work in the non-exceptional cases
        if (_M_completion_count != _Tree_Complete)
        {
            // On exception, notify our parent so it breaks out of its loop.
            _Propagate_cancel();

            // On exception, we need to set _M_completion_count to ensure that the helper breaks out of its spin wait.
            _Set_done();
        }
    }

    // Obtain a range from the worker
    bool _Receive_range(_Range<_Index_type> * _Helper_range)
    {
        // If the worker already finished, then there is no work left for the helper
        if (_M_completion_count)
        {
            return false;
        }

        _CONCRT_ASSERT(_Helper_range != NULL);

        // There are two special values for _M_current_iteration that are not valid: one is the
        // initial value of the working class which it will never share, and the other is
        // the last exclusive iteration of the working class, which has no work to be done.
        // We use the former value so that we can understand worker's response.
        _Index_type _Cached_first_iteration = _Helper_range->_M_current_iteration;

        // Following operation is not done via interlocked operation because it does not have to.
        // Helper lazily registers that it would like to help the worker, but it allows for some
        // time to elapse before that information has made it over to the worker. The idea
        // is not to disturb the worker if it is not necessary. It is possible to add interlocked
        // operation in the future if the time spent in the busy wait loop is too big.
        _CONCRT_ASSERT(_M_pHelper_range == NULL);
        _M_pHelper_range = _Helper_range;

        // NOTE(review): the declaration of the local spin-wait object
        // ("spinWait", used below) is omitted from this listing.

        // If the worker is done, it will flush the store buffer and signal the helper by
        // changing _M_current_iteration in the helper's range.
        while ((_Helper_range->_M_current_iteration == _Cached_first_iteration) && !_M_completion_count)
        {
            if ((_M_pWorker_range != NULL) && _M_context._IsSynchronouslyBlocked())
            {
                // Attempt to steal the entire range from the worker if it is synchronously blocked.

                // Make sure that worker makes no forward progress while helper is attempting to
                // steal its range. If worker does get unblocked, simply back off in the helper.
                // Note that there could be another helper running if a range has already been
                // sent to us.
                long _Stop_iterating = _InterlockedIncrement(&_M_stop_iterating);
                _CONCRT_ASSERT(_Stop_iterating > 0);

                // We need to make a local copy as the pointer could be changed by the worker.
                _Range<_Index_type> * _Worker_range = _M_pWorker_range;

                // The order of comparison needs to be preserved. If the parent is blocked, then
                // it cannot send a range (because _M_stop_iterating is already set). If it sent a range
                // before being synchronously blocked, then we are no longer the helper. Refrain
                // from intrusively stealing the range.
                if ((_Worker_range != NULL) && _M_context._IsSynchronouslyBlocked()
                    && (_Helper_range->_M_current_iteration == _Cached_first_iteration) && !_M_completion_count)
                {
                    _CONCRT_ASSERT(_M_pHelper_range == _Helper_range);

                    _M_pHelper_range = NULL;
                    _Worker_range->_Steal_range(_Helper_range);

                    _CONCRT_ASSERT(_Helper_range->_M_current_iteration != _Cached_first_iteration);
                }

                // At this point, worker is either:
                //
                // a) no longer blocked so range will come to the helper naturally, or
                // b) out of iterations because helper stole all of it
                _Stop_iterating = _InterlockedDecrement(&_M_stop_iterating);
                _CONCRT_ASSERT(_Stop_iterating >= 0);
            }
            else
            {
                // If there is no work received in a full spin, then start yielding the context
                spinWait._SpinOnce();
            }
        }

        // If the initial iteration is the same as the original first iteration then the
        // worker class is sending the signal that it does not need any help.
        if (_Helper_range->_M_current_iteration == _Cached_first_iteration)
        {
            return false;
        }

        return (_Helper_range->_Number_of_iterations() > 0);
    }

    // Send a portion of our range and notify the helper.
    bool _Send_range(_Range<_Index_type> * _Worker_range)
    {
        // Worker range shall not be available for stealing at this time.
        _CONCRT_ASSERT(_M_pWorker_range == NULL);

        // Helper shall be registered.
        _CONCRT_ASSERT(_M_pHelper_range != NULL);

        // Send the range
        _Worker_range->_Send_range(_M_pHelper_range);

        // Notify the helper. The fence ensures that the prior updates are visible.
        _InterlockedExchangePointer(reinterpret_cast<void * volatile *>(&_M_pHelper_range), NULL);

        // The current iteration should still be left
        _CONCRT_ASSERT(_Worker_range->_Number_of_iterations() >= 1);

        // Indicate if we need another helper
        return (_Worker_range->_Number_of_iterations() > 1);
    }

    // Let the helper know that it is ok to intrusively steal range from the worker by publishing the
    // remaining range.
    // NOTE(review): the method signature (taking a _Range<_Index_type>* _Worker_range)
    // is omitted from this listing.
    {
        _M_pWorker_range = _Worker_range;
    }

    // Prevent the helper from intrusively stealing range from the worker
    // NOTE(review): the method signature is omitted from this listing.
    {
        _M_pWorker_range = NULL;
        _Wait_on_intrusive_steal();
    }

    // Returns whether a helper has registered for work (signature omitted in
    // this listing).
    {
        return (_M_pHelper_range != NULL);
    }

    bool _Is_done()
    {
        return (_M_completion_count != 0);
    }

    void _Set_done()
    {
        // Let the helper know that this class is done with work and flush the store buffer. This operation
        // ensures that any buffered store to helper range in _Send_range is flushed and
        // available in _Receive_range (so there will be no lost ranges).
        _InterlockedExchange(&_M_completion_count, 1);
    }

    // Marks the entire sub-tree as complete (signature omitted in this listing).
    {
        // Make sure that **WE** know when our destructor hits that the entire tree is complete.
        _M_completion_count = _Tree_Complete;
    }

    // Beacon query (signature omitted in this listing).
    {
        return _M_beacon._Is_signaled();
    }

    // Beacon cancel confirmation (signature omitted in this listing).
    {
        return _M_beacon._Confirm_cancel();
    }

private:

    // Spin wait for any intrusive steal that is in progress.
    // NOTE(review): the method signature (_Wait_on_intrusive_steal, per the
    // call above) is omitted from this listing.
    {
        // This code is used to synchronize with helper in case of worker cooperative blocking.
        if (_M_stop_iterating != 0)
        {
            // NOTE(review): the declaration of the local spin-wait object is
            // omitted from this listing.

            while (_M_stop_iterating != 0)
            {
                spinWait._SpinOnce();
            }
        }
    }

    // Raises the cancellation beacon (signature omitted in this listing).
    {
        _M_beacon._Raise();
    }

    // Propagates cancellation up the worker tree (signature omitted in this
    // listing).
    {
        if (_M_pParent_worker != NULL)
        {
            _M_pParent_worker->_NotifyCancel();
        }
    }

    // Constant indicating sub-tree completion
    static const long _Tree_Complete = 2;

    // Read in the loop
    // NOTE(review): the member declaration (the helper-range pointer
    // _M_pHelper_range, per the constructor) is omitted from this listing.

    // Read at the end of the loop
    _Worker_proxy * _M_pParent_worker;

    // Written rarely
    // NOTE(review): member declarations (including _M_pWorker_range, _M_context
    // and _M_beacon, per their uses above) are omitted from this listing.

    volatile long _M_completion_count;

    // Written to in the loop
    volatile long _M_stop_iterating;

    _Worker_proxy const & operator=(_Worker_proxy const&); // no assignment operator

};
2124 
2125 // parallel_for -- Performs parallel iteration over a range of indices from _First to _Last,
2126 // excluding _Last. The order in which each iteration is executed is unspecified and non-deterministic.
2127 
2128 // Closure (binding) classes for invoking parallel_for and parallel_for_each, with chunks
2129 
2130 // A dynamically rebalancing closure class used for packaging parallel_for or parallel_for_each for invocation in chunks.
2131 // If some tasks finish earlier than others, helper tasks get executed which ensures further distribution of work.
2132 
2133 template <typename _Random_iterator, typename _Index_type, typename _Function, typename _Partitioner, bool _Is_iterator>
2135 {
2136 public:
2137  _Parallel_chunk_helper(_Index_type, const _Random_iterator& _First, _Index_type _First_iteration, _Index_type _Last_iteration, const _Index_type& _Step,
2138  const _Function& _Func, const _Partitioner&, _Worker_proxy<_Index_type> * const _Parent_data = NULL) :
2139  _M_first(_First), _M_first_iteration(_First_iteration), _M_last_iteration(_Last_iteration), _M_step(_Step), _M_function(_Func),
2140  _M_parent_worker(_Parent_data)
2141  {
2142  // Empty constructor because members are already assigned
2143  }
2144 
2145  // Constructor overload that accepts a range
2146  _Parallel_chunk_helper(const _Random_iterator& _First, const _Index_type& _Step, const _Function& _Func,
2147  const _Range<_Index_type>& _Worker_range, _Worker_proxy<_Index_type> * const _Parent_data = NULL) :
2148  _M_first(_First), _M_first_iteration(_Worker_range._M_current_iteration), _M_last_iteration(_Worker_range._M_last_iteration), _M_step(_Step), _M_function(_Func),
2149  _M_parent_worker(_Parent_data)
2150  {
2151  // Empty constructor because members are already assigned
2152  }
2153 
// NOTE(review): this text is a doxygen extraction; several statements that
// contained hyperlinked identifiers were dropped (marked inline below).
// Confirm against the original ppl.h before editing.
2154  // The main helper function which iterates over the given collection and invokes user function on every iteration.
2155  // Function is marked as const even though it does mutate some of its members (those are declared as mutable). This is done
2156  // in order to easily communicate between a worker and a helper instance, without holding references to many local variables.
2157  // However, this function does not mutate any state that is visible to anyone outside of this class, nor would that be
2158  // possible due to the implicit copy of the functor that happens when a new task_handle is created.
2159  void operator()() const
2160  {
2161  _Range<_Index_type> _Worker_range(_M_first_iteration, _M_last_iteration);
2162 
2163  // This class has two modes: worker and helper. The originally split chunk is always a
2164  // worker, while any subsequent class spawned from this class is in the helper
2165  // mode, which is signified using a link to the worker class through _M_pOwning_worker
2166  // handle. So, it will wait for work to be dished out by the working class while in helper mode.
2167  if (_M_parent_worker != NULL && !_M_parent_worker->_Receive_range(&_Worker_range))
2168  {
2169  // If the worker class rejected the help, simply return
2170  return;
2171  }
2172 
2173  // Keep the secondary, scaled, loop index for quick indexing into the data structure
2174  _Index_type _Current_iteration = _Worker_range._M_current_iteration;
2175  _Index_type _Scaled_index = _Current_iteration * _M_step;
2176 
2177  // If there is only one iteration to be executed there is no need to initialize any
2178  // helper classes (work is indivisible).
2179  if (_Worker_range._Number_of_iterations() == 1)
2180  {
2181  // Execute one iteration
// NOTE(review): line 2182 -- presumably the single-iteration
// _Parallel_chunk_helper_invoke call -- was dropped by the extractor.
2183  return;
2184  }
2185 
2186  // If the execution reaches this point it means that this class now has a chunk of work
2187  // that it needs to get done, so it has transitioned into the worker mode.
2188  structured_task_group _Helper_group;
2189 
2190  // Initialize fields that are needed in the helper
2191  _Worker_proxy<_Index_type> _Worker(_M_parent_worker);
2192 
2193  // Instantiate a helper class for this working class and put it on the work queue.
2194  // If some thread is idle it will be able to steal the helper and help this class
2195  // finish its work by stealing a piece of the work range.
2196  task_handle<_Parallel_chunk_helper> _Helper_task(_Parallel_chunk_helper(_M_first, _M_step, _M_function, _Worker_range, &_Worker));
2197 
2198  _Helper_group.run(_Helper_task);
// NOTE(review): line 2200 (likely a _MallocaListHolder declaration used by the
// in-loop placement news below) was dropped by the extractor.
2201 
2202  // Normally, for a cancellation semantic in cooperation with the helper, we would run_and_wait the below code on the Helper_group. Unfortunately,
2203  // the capture by reference of things which must be shared (_Worker, and so forth) will cause the loop below to add additional indirection
2204  // instructions. The loop below *MUST* be as tight as possible with the defined semantics. Instead, we will manually notify our parent if the
2205  // worker's destructor runs without hitting the bottom of our chunk. This is done through notification on the beacon.
2206 
2207  for (_Index_type _I = _Current_iteration; _I < _Worker_range._M_last_iteration; (_I++, _Worker_range._M_current_iteration =_I, _Scaled_index += _M_step))
2208  {
2209  if (_Worker._Is_beacon_signaled())
2210  {
2211  // Either a parent task group is canceled or one of the other iterations
2212  // threw an exception. Abort the remaining iterations
2213  //
2214  // Note that this could be a false positive that we must verify.
2215  if (_Worker._Is_done() || _Worker._Verify_beacon_cancellation())
2216  {
2217  break;
2218  }
2219  }
2220 
2221  if (_Worker._Is_helper_registered())
2222  {
2223  // The helper class (there can only be one) registered to help this class with the work.
2224  // Thus, figure out if this class needs help and split the range among the two classes.
2225 
2226  if (_Worker._Send_range(&_Worker_range))
2227  {
2228  // Construct every new instance of a helper class on the stack because it is beneficial to use
2229  // a structured task group where the class itself is responsible for task handle's lifetime.
// NOTE(review): line 2230 -- presumably the _malloca-based allocation of
// _Helper_subtask -- was dropped by the extractor.
2231 
2232  new(_Helper_subtask) task_handle<_Parallel_chunk_helper>
2233  (_Parallel_chunk_helper(_M_first, _M_step, _M_function, _Worker_range, &_Worker));
2234 
2235  // If _Send_range returns true, that means that there is still some non-trivial
2236  // work to be done, so this class will potentially need another helper.
2237  _Helper_group.run(*_Helper_subtask);
2238  }
2239  }
2240 
2241  // Allow intrusive stealing by the helper
2242  _Worker._Enable_intrusive_steal(&_Worker_range);
2243 
2244  // Execute one iteration: the element is at scaled index away from the first element.
// NOTE(review): line 2245 -- presumably the per-iteration
// _Parallel_chunk_helper_invoke call -- was dropped by the extractor.
2246 
2247  // Helper shall not steal a range after this call
2248  _Worker._Disable_intrusive_steal();
2249  }
2250 
2251  // Indicate that the worker is done with its iterations.
2252  _Worker._Set_done();
2253 
2254  // Wait for all worker/helper iterations to finish
2255  _Helper_group.wait();
2256 
2257  // Make sure that we've signaled that the tree is complete. This is used to detect any exception out of either _Parallel_chunk_helper_invoke or
2258  // _Helper_group.wait() above as a cancellation of the loop which must propagate upwards because we do not wrap the loop body in run_and_wait.
2259  _Worker._Set_tree_done();
2260  }
2261 
2262 private:
2263 
// References into the caller's frame: _Parallel_chunk_impl keeps the iterator,
// step and functor alive for the whole loop, so references avoid copies on
// every chunk/helper construction.
2264  const _Random_iterator& _M_first;
2265  const _Index_type& _M_step;
2266  const _Function& _M_function;
2267 
// Chunk bounds are captured by value; workers mutate their own _Range view,
// never these originals.
2268  const _Index_type _M_first_iteration;
2269  const _Index_type _M_last_iteration;
2270 
// NOTE(review): line 2271 -- presumably the `_Worker_proxy<_Index_type> * const
// _M_parent_worker;` member declaration (it is initialized by both
// constructors) -- was dropped by the doc extractor.
2272 
2273  _Parallel_chunk_helper const & operator=(_Parallel_chunk_helper const&); // no assignment operator
2274 };
2275 
// Statically-partitioned chunk worker: executes its [first, last) iterations
// serially with no range stealing, unlike _Parallel_chunk_helper above.
// NOTE(review): line 2277 (the `class _Parallel_fixed_chunk_helper` head) was
// dropped by the doc extractor.
2276 template <typename _Random_iterator, typename _Index_type, typename _Function, typename _Partitioner, bool _Is_iterator>
2278 {
2279 public:
2280  _Parallel_fixed_chunk_helper(_Index_type, const _Random_iterator& _First, _Index_type _First_iteration,
2281  _Index_type _Last_iteration, const _Index_type& _Step, const _Function& _Func, const _Partitioner&) :
2282  _M_first(_First), _M_first_iteration(_First_iteration), _M_last_iteration(_Last_iteration), _M_step(_Step), _M_function(_Func)
2283  {
2284  // Empty constructor because members are already assigned
2285  }
2286 
2287  void operator()() const
2288  {
2289  // Keep the secondary, scaled, loop index for quick indexing into the data structure
2290  _Index_type _Scaled_index = _M_first_iteration * _M_step;
2291 
2292  for (_Index_type _I = _M_first_iteration; _I < _M_last_iteration; (_I++, _Scaled_index += _M_step))
2293  {
2294  // Execute one iteration: the element is at scaled index away from the first element.
// NOTE(review): line 2295 -- presumably the _Parallel_chunk_helper_invoke
// call for this iteration -- was dropped by the extractor.
2296  }
2297  }
2298 private:
2299 
2300  const _Random_iterator& _M_first;
2301  const _Index_type& _M_step;
2302  const _Function& _M_function;
2303 
2304  const _Index_type _M_first_iteration;
2305  const _Index_type _M_last_iteration;
2306 
2307  _Parallel_fixed_chunk_helper const & operator=(_Parallel_fixed_chunk_helper const&); // no assignment operator
2308 };
2309 
// Affinity-aware chunk worker: wraps a fixed (static) chunk helper and records
// the execution location of its chunk so subsequent parallel_for calls with
// the same affinity_partitioner can be scheduled to the same place.
// NOTE(review): line 2311 (the `class _Parallel_localized_chunk_helper` head)
// was dropped by the doc extractor, as was line 2314.
2310 template <typename _Random_iterator, typename _Index_type, typename _Function, bool _Is_iterator>
2312 {
2313 public:
2315 
2316  _Parallel_localized_chunk_helper(_Index_type _Chunk_index, const _Random_iterator& _First, _Index_type _First_iteration, _Index_type _Last_iteration, const _Index_type& _Step,
2317  const _Function& _Func, affinity_partitioner& _Part) :
2318  _M_fixed_helper(_Chunk_index, _First, _First_iteration, _Last_iteration, _Step, _Func, static_partitioner()),
2319  _M_chunk_location(_Part._Get_chunk_location(static_cast<unsigned int>(_Chunk_index)))
2320  {
2321  // Empty constructor because members are already assigned
2322  }
2323 
2324  // Override the operator() in the base class. Note that this is not a virtual override.
2325  void operator()() const
2326  {
2327  // Check here if location needs to be saved.
2328  if (_M_chunk_location._Is_system())
2329  {
2330  _M_chunk_location = location::current();
2331  }
2332 
2333  _M_fixed_helper();
2334  }
2335 private:
2336 
// NOTE(review): lines 2337-2338 -- presumably the `_M_fixed_helper` and
// (mutable) `_M_chunk_location` member declarations used above -- were
// dropped by the doc extractor.
2339  _Parallel_localized_chunk_helper const & operator=(_Parallel_localized_chunk_helper const&); // no assignment operator
2340 };
2341 
2342 #pragma warning(pop)
2343 
// Schedules one pre-constructed chunk task on the task group. The generic
// overload ignores the partitioner; the affinity overload below also passes
// the chunk's remembered location so the scheduler can place it there.
// NOTE(review): lines 2345 and 2354 (the function-head lines, presumably
// `void _Parallel_chunk_task_group_run(structured_task_group& _Task_group,`)
// were dropped by the doc extractor -- _Task_group is used in both bodies.
2344 template <typename _Worker_class, typename _Index_type, typename Partitioner>
2346  task_handle<_Worker_class>* _Chunk_helpers,
2347  const Partitioner&,
2348  _Index_type _I)
2349 {
2350  _Task_group.run(_Chunk_helpers[_I]);
2351 }
2352 
2353 template <typename _Worker_class, typename _Index_type>
2355  task_handle<_Worker_class>* _Chunk_helpers,
2356  affinity_partitioner& _Part,
2357  _Index_type _I)
2358 {
2359  _Task_group.run(_Chunk_helpers[_I], _Part._Get_chunk_location(static_cast<unsigned int>(_I)));
2360 }
2361 
2362 
2363 // Helper functions that implement parallel_for
2364 
// Core scheduler for parallel_for/parallel_for_each: splits _Num_iterations
// into _Num_chunks (as chosen by the partitioner), constructs one
// _Worker_class task per chunk in stack (_malloca) storage, runs the first
// _Num_chunks-1 in parallel and executes the last one inline via run_and_wait.
2365 template <typename _Worker_class, typename _Random_iterator, typename _Index_type, typename _Function, typename _Partitioner>
2366 void _Parallel_chunk_impl(const _Random_iterator& _First, _Index_type _Range_arg, const _Index_type& _Step, const _Function& _Func, _Partitioner&& _Part)
2367 {
2368  _CONCRT_ASSERT(_Range_arg > 1);
2369  _CONCRT_ASSERT(_Step > 0);
2370 
// Number of loop iterations = ceil(range / step).
2371  _Index_type _Num_iterations = (_Step == 1) ? _Range_arg : (((_Range_arg - 1) / _Step) + 1);
2372  _CONCRT_ASSERT(_Num_iterations > 1);
2373 
2374  _Index_type _Num_chunks = _Part._Get_num_chunks(_Num_iterations);
2375  _CONCRT_ASSERT(_Num_chunks > 0);
2376 
2377  // Allocate memory on the stack for task_handles to ensure everything is properly structured.
// NOTE(review): line 2378 -- presumably the `_MallocaArrayHolder` declaration
// of _Holder used on the next line -- was dropped by the doc extractor.
2379  task_handle<_Worker_class> * _Chunk_helpers = _Holder._InitOnRawMalloca(_malloca(sizeof(task_handle<_Worker_class>) * static_cast<size_t>(_Num_chunks)));
2380 
2381  structured_task_group _Task_group;
2382 
2383  _Index_type _Iterations_per_chunk = _Num_iterations / _Num_chunks;
2384  _Index_type _Remaining_iterations = _Num_iterations % _Num_chunks;
2385 
2386  // If there are less iterations than desired chunks, set the chunk number
2387  // to be the number of iterations.
2388  if (_Iterations_per_chunk == 0)
2389  {
2390  _Num_chunks = _Remaining_iterations;
2391  }
2392 
2393  _Index_type _Work_size = 0;
2394  _Index_type _Start_iteration = 0;
2395  _Index_type _I;
2396 
2397  // Split the available work in chunks
2398  for (_I = 0; _I < _Num_chunks - 1; _I++)
2399  {
2400  if (_Remaining_iterations > 0)
2401  {
2402  // Iterations are not divided evenly, so add 1 remainder iteration each time
2403  _Work_size = _Iterations_per_chunk + 1;
2404  _Remaining_iterations--;
2405  }
2406  else
2407  {
2408  _Work_size = _Iterations_per_chunk;
2409  }
2410 
2411  // New up a task_handle "in-place", in the array preallocated on the stack
2412  new(&_Chunk_helpers[_I]) task_handle<_Worker_class>(_Worker_class(_I, _First, _Start_iteration, _Start_iteration + _Work_size, _Step, _Func, std::forward<_Partitioner>(_Part)));
// NOTE(review): line 2413 -- presumably
// `_Holder._IncrementConstructedElemsCount();` (exception-safety bookkeeping
// for the placement-new above) -- was dropped by the doc extractor.
2414 
2415  // Run each of the chunk tasks in parallel
2416  _Parallel_chunk_task_group_run(_Task_group, _Chunk_helpers, std::forward<_Partitioner>(_Part), _I);
2417 
2418  // Prepare for the next iteration
2419  _Start_iteration += _Work_size;
2420  }
2421 
2422  // Because this is the last iteration, then work size might be different
2423  _CONCRT_ASSERT((_Remaining_iterations == 0) || ((_Iterations_per_chunk == 0) && (_Remaining_iterations == 1)));
2424  _Work_size = _Num_iterations - _Start_iteration;
2425 
2426  // New up a task_handle "in-place", in the array preallocated on the stack
2427  new(&_Chunk_helpers[_I]) task_handle<_Worker_class>(_Worker_class(_I, _First, _Start_iteration, _Start_iteration + _Work_size, _Step, _Func, std::forward<_Partitioner>(_Part)));
// NOTE(review): line 2428 -- presumably another
// `_Holder._IncrementConstructedElemsCount();` -- was dropped by the extractor.
2429 
// The last chunk runs inline on this thread while waiting for the others.
2430  _Task_group.run_and_wait(_Chunk_helpers[_I]);
2431 }
2432 
2433 template <typename _Worker_class, typename _Random_iterator, typename _Index_type, typename _Function>
2434 void _Parallel_chunk_impl(const _Random_iterator& _First, _Index_type _Range_arg, const _Index_type& _Step, const _Function& _Func)
2435 {
2436  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, auto_partitioner());
2437 }
2438 
2439 // Helper for the parallel for API with the default dynamic partitioner which implements range-stealing to balance load.
2440 template <typename _Index_type, typename _Diff_type, typename _Function>
2441 void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, const auto_partitioner& _Part)
2442 {
// NOTE(review): line 2443 -- presumably the `typedef _Parallel_chunk_helper<...>
// _Worker_class;` selecting the range-stealing worker -- was dropped by the
// doc extractor. The same typedef line is missing from each overload below
// (lines 2451, 2459 and 2468), each presumably selecting the worker type
// matching its partitioner; confirm against the original ppl.h.
2444  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2445 }
2446 
2447 // Helper for the parallel_for API with a static partitioner - creates a fixed number of chunks up front with no range-stealing enabled.
2448 template <typename _Index_type, typename _Diff_type, typename _Function>
2449 void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, const static_partitioner& _Part)
2450 {
2452  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2453 }
2454 
2455 // Helper for the parallel_for API with a simple partitioner - creates a fixed number of chunks up front with no range-stealing enabled.
2456 template <typename _Index_type, typename _Diff_type, typename _Function>
2457 void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, const simple_partitioner& _Part)
2458 {
2460  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2461 }
2462 
2463 // Helper for the parallel_for API with an affinity partitioner - creates a fixed number of chunks up front with no range-stealing enabled. subsequent
2464 // calls to parallel_for with the same affinity partitioner (pass in as a non-const reference) are scheduled to the same location they previously ran on
2465 template <typename _Index_type, typename _Diff_type, typename _Function>
2466 void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, affinity_partitioner& _Part)
2467 {
2469  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2470 }
2471 
2472 template <typename _Index_type, typename _Function, typename _Partitioner>
2473 void _Parallel_for_impl(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func, _Partitioner&& _Part)
2474 {
2475  // The step argument must be 1 or greater; otherwise it is an invalid argument
2476  if (_Step < 1)
2477  {
2478  throw std::invalid_argument("_Step");
2479  }
2480 
2481  // If there are no elements in this range we just return
2482  if (_First >= _Last)
2483  {
2484  return;
2485  }
2486 
2487  // Compute the difference type based on the arguments and avoid signed overflow for int, long, and long long
2488  typedef typename std::conditional<std::is_same<_Index_type, int>::value, unsigned int,
2489  typename std::conditional<std::is_same<_Index_type, long>::value, unsigned long,
2490  typename std::conditional<std::is_same<_Index_type, long long>::value, unsigned long long, decltype(_Last - _First)
2491  >::type
2492  >::type
2493  >::type _Diff_type;
2494 
2495  _Diff_type _Range_size = _Diff_type(_Last) - _Diff_type(_First);
2496  _Diff_type _Diff_step = _Step;
2497 
2498  if (_Range_size <= _Diff_step)
2499  {
2500  _Func(_First);
2501  }
2502  else
2503  {
2504  _Parallel_for_partitioned_impl<_Index_type, _Diff_type, _Function>(_First, _Range_size, _Step, _Func, std::forward<_Partitioner>(_Part));
2505  }
2506 }
2507 
2508 template <typename _Index_type, typename _Function>
2509 void _Parallel_for_impl(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func)
2510 {
2511  _Parallel_for_impl(_First, _Last, _Step, _Func, auto_partitioner());
2512 }
2513 
2553 
// Public parallel_for entry point taking an explicit step and partitioner;
// validation and scheduling are delegated to _Parallel_for_impl.
2554 template <typename _Index_type, typename _Function, typename _Partitioner>
2555 void parallel_for(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func, _Partitioner&& _Part)
2556 {
// NOTE(review): lines 2557 and 2559 -- presumably the start/end
// _Trace_ppl_function ETW tracing calls bracketing the implementation call --
// were dropped by the doc extractor.
2558  _Parallel_for_impl(_First, _Last, _Step, _Func, std::forward<_Partitioner>(_Part));
2560 }
2561 
2589 
2590 template <typename _Index_type, typename _Function>
2591 void parallel_for(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func)
2592 {
2593  parallel_for(_First, _Last, _Step, _Func, auto_partitioner());
2594 }
2595 
2628 
2629 template <typename _Index_type, typename _Function>
2630 void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, const auto_partitioner& _Part = auto_partitioner())
2631 {
2632  parallel_for(_First, _Last, _Index_type(1), _Func, _Part);
2633 }
2634 
2667 
2668 template <typename _Index_type, typename _Function>
2669 void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, const static_partitioner& _Part)
2670 {
2671  parallel_for(_First, _Last, _Index_type(1), _Func, _Part);
2672 }
2673 
2706 
2707 template <typename _Index_type, typename _Function>
2708 void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, const simple_partitioner& _Part)
2709 {
2710  parallel_for(_First, _Last, _Index_type(1), _Func, _Part);
2711 }
2712 
2745 
2746 template <typename _Index_type, typename _Function>
2747 void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, affinity_partitioner& _Part)
2748 {
2749  parallel_for(_First, _Last, _Index_type(1), _Func, _Part);
2750 }
2751 
2752 // parallel_for_each -- This function will iterate over all elements in the iterator's range.
2753 
2754 // Closure (binding) classes for invoking parallel_for_each recursively
2755 
2756 // A closure class used for packaging chunk of elements in parallel_for_each for parallel invocation
2757 
2758 // Forward iterator for_each using unstructured task group
2759 
2760 // Disable C4180: qualifier applied to function type has no meaning; ignored
2761 // Warning fires for passing Foo function pointer to parallel_for instead of &Foo.
2762 #pragma warning(push)
2763 #pragma warning(disable: 4180)
2764 
// Batches up to _Chunk_size elements from a forward-iterator range (as raw
// pointers) and, when invoked, processes the batch with a nested
// _Parallel_for_impl over the pointer array.
// NOTE(review): line 2766 (the `class _Parallel_for_each_helper` head) was
// dropped by the doc extractor.
2765 template <typename _Forward_iterator, typename _Function, unsigned int _Chunk_size>
2767 {
2768 public:
2769  typedef typename std::iterator_traits<_Forward_iterator>::value_type _Value_type;
2770  static const unsigned int _Size = _Chunk_size;
2771 
// Advances _First (by reference) past the elements it consumes, so the caller
// can continue batching from where this chunk stopped.
2772  _Parallel_for_each_helper(_Forward_iterator& _First, const _Forward_iterator& _Last, const _Function& _Func) :
2773  _M_function(_Func), _M_len(0)
2774  {
2775  static_assert(std::is_lvalue_reference<decltype(*_First)>::value, "lvalue required for forward iterator operator *");
2776  // Add a batch of work items to this functor's array
2777  for (unsigned int _Index=0; (_Index < _Size) && (_First != _Last); _Index++)
2778  {
// Store the address of each element; the elements themselves must outlive
// this helper (the range is owned by the parallel_for_each caller).
2779  _M_element[_M_len++] = &(*_First++);
2780  }
2781  }
2782 
2783  void operator()() const
2784  {
2785  // Invoke parallel_for on the batched up array of elements
2786  _Parallel_for_impl(0U, _M_len, 1U,
2787  [this] (unsigned int _Index)
2788  {
2789  _M_function(*(_M_element[_Index]));
2790  }
2791  );
2792  }
2793 
2794 private:
2795 
2796  const _Function& _M_function;
2797  typename std::iterator_traits<_Forward_iterator>::pointer _M_element[_Size];
2798  unsigned int _M_len;
2799 
2800  _Parallel_for_each_helper const & operator=(_Parallel_for_each_helper const&); // no assignment operator
2801 };
2802 
2803 #pragma warning(pop)
2804 
2805 // Helper functions that implement parallel_for_each
2806 
// Packages the next batch of up to _Chunk_size elements (advancing _First)
// and schedules it on the unstructured task group.
2807 template <typename _Forward_iterator, typename _Function>
2808 void _Parallel_for_each_chunk(_Forward_iterator& _First, const _Forward_iterator& _Last, const _Function& _Func, task_group& _Task_group)
2809 {
2810  // The chunk size selection depends more on the internal implementation of parallel_for than
2811  // on the actual input. Also, it does not have to be dynamically computed, but it helps
2812  // parallel_for if it is a power of 2 (easy to divide).
2813  const unsigned int _Chunk_size = 1024;
2814 
2815  // This functor will be copied on the heap and will execute the chunk in parallel
// NOTE(review): line 2816 -- presumably the construction of `_Functor`, a
// `_Parallel_for_each_helper<_Forward_iterator, _Function, _Chunk_size>`
// built from (_First, _Last, _Func) -- was dropped by the doc extractor.
2817 
2818  // Because this is an unstructured task group, running the task will make a copy of the necessary data
2819  // on the heap, ensuring that it is available at the time of execution.
2820  _Task_group.run(_Functor);
2821 }
2822 
2823 template <typename _Forward_iterator, typename _Function>
2824 void _Parallel_for_each_forward_impl(_Forward_iterator& _First, const _Forward_iterator& _Last, const _Function& _Func, task_group& _Task_group)
2825 {
2826  _Parallel_for_each_chunk(_First, _Last, _Func, _Task_group);
2827 
2828  // If there is a tail, push the tail
2829  if (_First != _Last)
2830  {
2831  _Task_group.run(
2832  [&_First, &_Last, &_Func, &_Task_group]
2833  {
2834  ::Concurrency::_Parallel_for_each_forward_impl(_First, _Last, _Func, _Task_group);
2835  }
2836  );
2837  }
2838 }
2839 
2840 template <typename _Forward_iterator, typename _Function>
2841 void _Parallel_for_each_impl(_Forward_iterator _First, const _Forward_iterator& _Last, const _Function& _Func, const auto_partitioner&, std::forward_iterator_tag)
2842 {
2843  // Because this is a forward iterator, it is difficult to validate that _First comes before _Last, so
2844  // it is up to the user to provide valid range.
2845  if (_First != _Last)
2846  {
2847  task_group _Task_group;
2848 
2849  _Parallel_for_each_forward_impl(_First, _Last, _Func, _Task_group);
2850 
2851  _Task_group.wait();
2852  }
2853 }
2854 
2855 template <typename _Random_iterator, typename _Index_type, typename _Function>
2856 void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, const auto_partitioner& _Part)
2857 {
// NOTE(review): line 2858 -- presumably the `typedef _Parallel_chunk_helper<...>
// _Worker_class;` selecting the range-stealing worker for this partitioner --
// was dropped by the doc extractor. The same typedef line is missing from each
// overload below (lines 2866, 2874 and 2882); confirm against the original ppl.h.
2859  // Use the same function that schedules work for parallel for
2860  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2861 }
2862 
2863 template <typename _Random_iterator, typename _Index_type, typename _Function>
2864 void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, const static_partitioner& _Part)
2865 {
2867  // Use the same function that schedules work for parallel for
2868  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2869 }
2870 
2871 template <typename _Random_iterator, typename _Index_type, typename _Function>
2872 void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, const simple_partitioner& _Part)
2873 {
2875  // Use the same function that schedules work for parallel for
2876  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2877 }
2878 
2879 template <typename _Random_iterator, typename _Index_type, typename _Function>
2880 void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, affinity_partitioner& _Part)
2881 {
2883  // Use the same function that schedules work for parallel for
2884  _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);
2885 }
2886 
// parallel_for_each implementation for random-access iterators: the range
// size is known up front, so it is handed to the chunking scheduler used by
// parallel_for.
template <typename _Random_iterator, typename _Function, typename _Partitioner>
void _Parallel_for_each_impl(const _Random_iterator& _First, const _Random_iterator& _Last, const _Function& _Func, _Partitioner&& _Part, std::random_access_iterator_tag)
{
    typedef typename std::iterator_traits<_Random_iterator>::difference_type _Index_type;

    // Empty (or inverted) range: nothing to do.
    if (_First >= _Last)
    {
        return;
    }

    const _Index_type _Range_size = _Last - _First;

    if (_Range_size == 1)
    {
        // A single element does not warrant any parallel machinery.
        _Func(*_First);
    }
    else
    {
        _Index_type _Step = 1;

        _Parallel_for_each_partitioned_impl(_First, _Range_size, _Step, _Func, std::forward<_Partitioner>(_Part));
    }
}
2911 
2939 
2940 template <typename _Iterator, typename _Function>
2941 void parallel_for_each(_Iterator _First, _Iterator _Last, const _Function& _Func)
2942 {
2943  parallel_for_each(_First, _Last, _Func, auto_partitioner());
2944 }
2945 
2982 
// Public parallel_for_each entry point with an explicit partitioner; iterator
// category dispatch selects the forward-iterator or random-access strategy.
2983 template <typename _Iterator, typename _Function, typename _Partitioner>
2984 void parallel_for_each(_Iterator _First, _Iterator _Last, const _Function& _Func, _Partitioner&& _Part)
2985 {
// NOTE(review): lines 2986 and 2988 -- presumably the start/end
// _Trace_ppl_function ETW tracing calls bracketing the implementation call --
// were dropped by the doc extractor.
2987  _Parallel_for_each_impl(_First, _Last, _Func, std::forward<_Partitioner>(_Part), typename std::iterator_traits<_Iterator>::iterator_category());
2989 }
2990 
2991 // Disable C4180: qualifier applied to function type has no meaning; ignored
2992 // Warning fires for passing Foo function pointer to parallel_for instead of &Foo.
2993 #pragma warning(push)
2994 #pragma warning(disable: 4180)
2995 
2996 // Helper function assemble all functors
// Bundles everything a reduction worker needs: the range-reduce functor, the
// identity value, and the (ordered) combinable that collects partial results.
// NOTE(review): several lines were dropped by the doc extractor: 2998 (the
// `struct _Reduce_functor_helper` head), 3001 (presumably the
// `_Identity_value` member initialized by the constructor below), 3005 and
// 3014 (presumably a typedef and the deleted assignment operator). Confirm
// against the original ppl.h.
2997 template <typename _Reduce_type, typename _Sub_function, typename _Combinable_type>
2999 {
3000  const _Sub_function &_Sub_fun;
3002 
3003  _Combinable_type &_Combinable;
3004 
3006  typedef typename _Combinable_type::_Bucket _Bucket_type;
3007 
3008  _Reduce_functor_helper(const _Reduce_type &_Identity, const _Sub_function &_Sub_fun, _Combinable_type &&_Comb):
3009  _Sub_fun(_Sub_fun), _Combinable(_Comb), _Identity_value(_Identity)
3010  {
3011  }
3012 
3013 private:
3015 };
3016 
3017 // Ordered serial combinable object
// Ordered serial combinable object
// Collects one bucket of partial-reduction state per worker in a singly
// linked list, then combines them serially (in list order) at the end.
// NOTE(review): doxygen extraction dropped several declaration lines in this
// class (marked inline below); confirm against the original ppl.h.
3018 template<typename _Ty, typename _Sym_fun>
// NOTE(review): line 3019 (the `class _Order_combinable` head) was dropped.
3020 {
3021 public:
3022  // Only write once, limited contention will be caused
3023  struct _Bucket
3024  {
3025  // Allocate enough space in the Bucket to hold a value
3026  char _Value[(sizeof(_Ty) / sizeof(char))];
// NOTE(review): line 3027 -- presumably the `_Bucket * _Next;` link used
// throughout this class -- was dropped.
3028 
// NOTE(review): line 3029 -- presumably the bucket constructor signature
// `_Bucket(_Bucket * _N)` -- was dropped.
3030  {
3031  _Next = _N;
3032  }
3033 
3034  void _Insert(_Bucket *_Item)
3035  {
3036  // No need to lock, only one thread will insert
3037  _Item->_Next = _Next;
3038  _Next = _Item;
3039  }
3040 
3041  // Construct value in bucket
3042  void _Put(const _Ty &_Cur)
3043  {
// Placement-construct a copy of _Cur into the raw _Value storage; it is
// destroyed explicitly by the combine/release paths below.
3044  new(reinterpret_cast<_Ty *>(&_Value)) _Ty(_Cur);
3045  }
3046  };
3047 
3048 private:
3049  const _Sym_fun &_M_fun;
3050  size_t _M_number;
// NOTE(review): lines 3051-3052 -- presumably including the `_Bucket *_M_root;`
// list head used below -- were dropped.
3053 
3054 public:
// NOTE(review): line 3055 -- presumably the signature of `_Construct`,
// a _Bucket factory taking the current list head -- was dropped.
3056  {
3057  _Bucket * _Ret = static_cast<_Bucket *>(::Concurrency::Alloc(sizeof(_Bucket)));
3058  return new(_Ret)_Bucket(_Par);
3059  }
3060 
3061  _Order_combinable(const _Sym_fun &_Fun): _M_fun(_Fun)
3062  {
3063  _M_root = 0;
3064  _M_number = 0;
3065  }
3066 
// NOTE(review): line 3067 -- presumably the destructor signature
// `~_Order_combinable()` -- was dropped.
3068  {
// Destroy any values still held and free their buckets.
3069  while (_M_root)
3070  {
3071  _Bucket *_Cur = _M_root;
3072  _M_root = _M_root->_Next;
3073  reinterpret_cast<_Ty &>(_Cur->_Value).~_Ty();
3074  ::Concurrency::Free(_Cur);
3075  }
3076  }
3077 
3078  // Serially combine and release the list, return result
// NOTE(review): line 3079 -- presumably the signature
// `_Ty _Serial_combine_release()` (it is called by the reduce impls) --
// was dropped.
3080  {
3081  _Ty _Ret(reinterpret_cast<_Ty &>(_M_root->_Value));
3082  _Bucket *_Cur = _M_root;
3083  _M_root = _M_root->_Next;
3084 
// Walk the remaining buckets, folding each value into _Ret with the
// symmetric combine function, destroying and freeing as we go.
3085  while (_M_root)
3086  {
3087  reinterpret_cast<_Ty &>(_Cur->_Value).~_Ty();
3088  ::Concurrency::Free(_Cur);
3089  _Cur = _M_root;
3090  _Ret = _M_fun(reinterpret_cast <_Ty &> (_Cur->_Value), _Ret);
3091  _M_root = _M_root->_Next;
3092  }
3093 
3094  reinterpret_cast<_Ty &>(_Cur->_Value).~_Ty();
3095  ::Concurrency::Free(_Cur);
3096 
3097  return _Ret;
3098  }
3099 
3100  // allocate a bucket and push back to the list
// NOTE(review): line 3101 -- presumably the signature
// `_Bucket *_Unsafe_push_back()` (called by _Parallel_reduce_fixed_worker) --
// was dropped.
3102  {
3103  return _M_root = _Construct(_M_root);
3104  }
3105 };
3106 
3164 
// Most general parallel_reduce: a range-reduce functor folds each chunk from
// _Identity, and a symmetric functor combines the per-chunk partial results.
3165 template<typename _Reduce_type, typename _Forward_iterator, typename _Range_reduce_fun, typename _Sym_reduce_fun>
3166 inline _Reduce_type parallel_reduce(_Forward_iterator _Begin, _Forward_iterator _End, const _Reduce_type& _Identity,
3167  const _Range_reduce_fun &_Range_fun, const _Sym_reduce_fun &_Sym_fun)
3168 {
3169  typedef typename std::iterator_traits<_Forward_iterator>::value_type _Value_type;
3170 
// Input/output iterators cannot be traversed more than once / read from, so
// they are rejected at compile time.
3171  static_assert(!std::is_same<typename std::iterator_traits<_Forward_iterator>::iterator_category, std::input_iterator_tag>::value
3172  && !std::is_same<typename std::iterator_traits<_Forward_iterator>::iterator_category, std::output_iterator_tag>::value,
3173  "iterator can not be input_iterator or output_iterator.");
3174 
3175  return _Parallel_reduce_impl(_Begin, _End,
3176  _Reduce_functor_helper<_Reduce_type, _Range_reduce_fun,
// NOTE(review): line 3177 -- presumably the _Order_combinable template
// argument and the _Reduce_functor_helper construction from (_Identity,
// _Range_fun, and an _Order_combinable built over _Sym_fun) -- was dropped
// by the doc extractor.
3178  typename std::iterator_traits<_Forward_iterator>::iterator_category());
3179 }
3180 
3227 
// parallel_reduce taking only an identity and a symmetric combine function:
// a range-reduce adapter is synthesized from the combine function and the
// call is forwarded to the general three-functor overload.
template<typename _Forward_iterator, typename _Sym_reduce_fun>
inline typename std::iterator_traits<_Forward_iterator>::value_type parallel_reduce(_Forward_iterator _Begin, _Forward_iterator _End,
    const typename std::iterator_traits<_Forward_iterator>::value_type &_Identity, _Sym_reduce_fun _Sym_fun)
{
    typedef typename std::remove_cv<typename std::iterator_traits<_Forward_iterator>::value_type>::type _Reduce_type;

    return parallel_reduce(_Begin, _End, _Identity,
        [_Sym_fun](_Forward_iterator _Chunk_begin, _Forward_iterator _Chunk_end, _Reduce_type _Accum)->_Reduce_type
        {
            // Fold one chunk serially, seeding from the supplied value.
            for (; _Chunk_begin != _Chunk_end; ++_Chunk_begin)
            {
                _Accum = _Sym_fun(_Accum, *_Chunk_begin);
            }

            return _Accum;
        },
        _Sym_fun);
}
3246 
3285 
// parallel_reduce taking only an identity value: the combine operation
// defaults to addition via std::plus.
template<typename _Forward_iterator>
inline typename std::iterator_traits<_Forward_iterator>::value_type parallel_reduce(
    _Forward_iterator _Begin, _Forward_iterator _End, const typename std::iterator_traits<_Forward_iterator>::value_type &_Identity)
{
    typedef typename std::iterator_traits<_Forward_iterator>::value_type _Value_type;

    return parallel_reduce(_Begin, _End, _Identity, std::plus<_Value_type>());
}
3292 
3293 // Implementation for the parallel reduce
3294 template <typename _Forward_iterator, typename _Function>
3295 typename _Function::_Reduce_type _Parallel_reduce_impl(_Forward_iterator _First, const _Forward_iterator& _Last, const _Function& _Func,
3296  std::forward_iterator_tag)
3297 {
3298  // Because this is a forward iterator, it is difficult to validate that _First comes before _Last, so
3299  // it is up to the user to provide valid range.
3300  if (_First != _Last)
3301  {
3302  task_group _Task_group;
3303  _Parallel_reduce_forward_executor(_First, _Last, _Func, _Task_group);
3304  _Task_group.wait();
3305  return _Func._Combinable._Serial_combine_release();
3306  }
3307  else
3308  {
3309  return _Func._Identity_value;
3310  }
3311 }
3312 
3313 // All the code below is the worker without range stealing
3314 template<typename _Forward_iterator, typename _Functor>
3316 {
3317 public:
3318  // The bucket allocation order will depend on the worker construction order
3319  _Parallel_reduce_fixed_worker(_Forward_iterator _Begin, _Forward_iterator _End, const _Functor &_Fun):
3320  _M_begin(_Begin), _M_end(_End), _M_fun(_Fun), _M_bucket(_M_fun._Combinable._Unsafe_push_back())
3321  {
3322  }
3323 
3324  void operator ()() const
3325  {
3326  _M_bucket->_Put(_M_fun._Sub_fun(_M_begin, _M_end, _M_fun._Identity_value));
3327  }
3328 
3329 private:
3330  const _Functor &_M_fun;
3331  const _Forward_iterator _M_begin, _M_end;
3332  typename _Functor::_Bucket_type * const _M_bucket;
3334 };
3335 
// Forward declaration of the random-access executor used below.
3336 template <typename _Worker, typename _Random_iterator, typename _Function>
3337 void _Parallel_reduce_random_executor(_Random_iterator _Begin, _Random_iterator _End, const _Function& _Fun);
3338 
// parallel_reduce implementation for random-access iterators: the range size
// is known, so fixed ordered chunks are used (no range stealing).
3339 template <typename _Random_iterator, typename _Function>
3340 typename _Function::_Reduce_type _Parallel_reduce_impl(_Random_iterator _First, _Random_iterator _Last, const _Function& _Func,
3341  std::random_access_iterator_tag)
3342 {
// NOTE(review): line 3343 -- presumably the `typedef
// _Parallel_reduce_fixed_worker<_Random_iterator, _Function> _Worker_class;`
// used twice below -- was dropped by the doc extractor.
3344 
3345  // Special case for 0, 1 element
3346  if (_First >= _Last)
3347  {
3348  return _Func._Identity_value;
3349  }
3350  // Directly compute if size is too small
3351  else if (_Last - _First <= 1)
3352  {
// Run the single chunk inline on this thread, then combine.
3353  _Worker_class(_First, _Last, _Func)();
3354  return _Func._Combinable._Serial_combine_release();
3355  }
3356  else
3357  {
3358  // Use fixed ordered chunk partition to schedule works
3359  _Parallel_reduce_random_executor<_Worker_class>(_First, _Last, _Func);
3360  return _Func._Combinable._Serial_combine_release();
3361  }
3362 }
3363 
3364 // the parallel worker executor for fixed iterator
3365 // it will divide fixed number of chunks
3366 // almost same as fixed parallel for, except keep the chunk dividing order
3367 template <typename _Worker, typename _Random_iterator, typename _Function>
3368 void _Parallel_reduce_random_executor(_Random_iterator _Begin, _Random_iterator _End, const _Function& _Fun)
3369 {
// One chunk per virtual processor; the remainder iterations are spread one
// each over the leading chunks.
3370  size_t _Cpu_num = static_cast<size_t>(::Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors()), _Size = _End - _Begin;
3371 
// NOTE(review): lines 3372-3373 -- presumably the declarations of the
// structured task group `_Tg` and the `_MallocaArrayHolder` `_Holder` used
// below -- were dropped by the doc extractor.
3374  task_handle<_Worker> *_Tasks = _Holder._InitOnRawMalloca(_malloca(sizeof(task_handle<_Worker>) * (_Cpu_num - 1)));
3375 
3376  size_t _Begin_index = 0;
3377  size_t _Step = _Size / _Cpu_num;
3378  size_t _NumRemaining = _Size - _Step * _Cpu_num;
3379 
3380  for(size_t _I = 0; _I < _Cpu_num - 1; _I++)
3381  {
3382  size_t _Next = _Begin_index + _Step;
3383 
3384  // Add remaining to each chunk
3385  if (_NumRemaining)
3386  {
3387  --_NumRemaining;
3388  ++_Next;
3389  }
3390 
3391  // New up a task_handle "in-place", in the array preallocated on the stack
3392  new (_Tasks + _I) task_handle<_Worker>(_Worker(_Begin + _Begin_index, _Begin + _Next, _Fun));
// NOTE(review): line 3393 -- presumably
// `_Holder._IncrementConstructedElemsCount();` (exception-safety bookkeeping
// for the placement-new above) -- was dropped by the doc extractor.
3394 
3395  // Run each of the chunk _Tasks in parallel
3396  _Tg.run(_Tasks[_I]);
3397  _Begin_index = _Next;
3398  }
3399 
// The final chunk runs inline on this thread while waiting for the rest.
3400  task_handle<_Worker> _Tail(_Worker(_Begin + _Begin_index, _End, _Fun));
3401  _Tg.run_and_wait(_Tail);
3402 }
3403 
3404 // The parallel worker executor for forward iterators
3405 // Divide chunks on the fly
// Helper functor for reducing over forward iterators: eagerly walks the range
// to carve out up to _Default_worker_size chunks of _Default_chunk_size
// elements, builds a worker task for each, and runs them when invoked.
// Ownership of the heap-allocated _Workers array transfers on copy.
// NOTE(review): the struct name line, member typedefs, the copy-ctor and dtor
// signature lines, and the task-group declaration were elided by the doc
// extraction -- confirm details against the shipped ppl.h.
3406 template <typename _Forward_iterator, typename _Function, int _Default_worker_size, int _Default_chunk_size>
3408 {
3412 
3413  _Parallel_reduce_forward_executor_helper(_Forward_iterator &_First, _Forward_iterator _Last, const _Function& _Func):
3414  _Workers(static_cast<task_handle<_Worker_class> *>(::Concurrency::Alloc(sizeof(task_handle<_Worker_class>) * _Default_worker_size)))
3415  {
3416  _Worker_size = 0;
3417  while (_Worker_size < _Default_worker_size && _First != _Last)
3418  {
3419  // Copy the range _Head
3420  _Forward_iterator _Head = _First;
3421 
3422  // Read from forward iterator
 // Advance _First by up to _Default_chunk_size positions; forward
 // iterators cannot jump, so the walk is element by element.
3423  for (size_t _I = 0; _I < _Default_chunk_size && _First != _Last; ++_I, ++_First)
3424  {
3425  // Body is empty
3426  }
3427 
3428  // _First will be the end of current chunk
3429  new (_Workers + _Worker_size++) task_handle<_Worker_class>(_Worker_class(_Head, _First, _Func));
3430  }
3431  }
3432 
3434  _Workers(_Other._Workers), _Worker_size(_Other._Worker_size)
3435  {
3436  _Other._Workers = 0; // _Other releases ownership of _Workers on copy.
3437  }
3438 
 // Invocation runs every prepared chunk task and blocks until all finish.
3439  void operator ()() const
3440  {
3442  for(int _I = 0; _I < _Worker_size; _I++)
3443  {
3444  _Tg.run(_Workers[_I]);
3445  }
3446  _Tg.wait();
3447  }
3448 
 // Destructor: only the owning copy (non-null _Workers) destroys the
 // placement-new'ed task handles and frees the array.
3450  {
3451  if (_Workers)
3452  {
3453  for (int _I = 0; _I < _Worker_size; _I++)
3454  {
3455  _Workers[_I].~task_handle<_Worker_class>();
3456  }
3457  ::Concurrency::Free(_Workers);
3458  _Workers = 0;
3459  }
3460  }
3461 };
3462 
// Reduction executor for forward iterators: divides chunks on the fly since
// the range length is unknown. Up to _Internal_worker_number chunks of
// _Default_chunk_size elements are scheduled immediately; any remainder is
// handled by the loop at the bottom.
// NOTE(review): the _Worker_class typedef, the _malloca holder declaration,
// and the body of the trailing while-loop (source line 3496 -- presumably a
// recursive/continuation dispatch via _Task_group) were elided by the doc
// extraction; confirm against the shipped header.
3463 template <typename _Forward_iterator, typename _Function>
3464 void _Parallel_reduce_forward_executor(_Forward_iterator _First, _Forward_iterator _Last, const _Function& _Func, task_group& _Task_group)
3465 {
3466  const static int _Internal_worker_number = 1024, _Default_chunk_size = 512;
3468 
3469  structured_task_group _Worker_group;
3471  task_handle<_Worker_class>* _Workers = _Holder._InitOnRawMalloca(_malloca(_Internal_worker_number * sizeof(task_handle<_Worker_class>)));
3472 
3473  // Start execution first
3474  int _Index = 0;
3475  while (_Index < _Internal_worker_number && _First != _Last)
3476  {
3477  // Copy the range _Head
3478  _Forward_iterator _Head = _First;
3479 
3480  // Read from forward iterator
3481  for (size_t _I = 0; _I < _Default_chunk_size && _First != _Last; ++_I, ++_First)
3482  {
3483  // Body is empty
3484  };
3485 
3486  // Create a new task, _First is range _End
3487  new (_Workers + _Index) task_handle<_Worker_class>(_Worker_class(_Head, _First, _Func));
3489  _Worker_group.run(_Workers[_Index]);
3490  ++_Index;
3491  }
3492 
3493  // Divide and append the left
3494  while (_First != _Last)
3495  {
3497  }
3498 
3499  _Worker_group.wait();
3500 }
3501 
3502 #pragma warning(pop)
3503 
3504 
3505 // Disable C4180: qualifier applied to function type has no meaning; ignored
3506 // Warning fires for passing Foo function pointer to parallel_for instead of &Foo.
3507 #pragma warning(push)
3508 #pragma warning(disable: 4180)
3509 
3510 //
3511 // Dispatch the execution and handle the condition that all of the iterators are random access
3512 //
// Primary template: used when at least one of the iterators is not random
// access. Chunks are produced incrementally via _Parallel_transform_unary_impl2
// under a task_group, and we block until all chunks complete.
// NOTE(review): the struct name line (source line 3514) was elided by the doc
// extraction.
3513 template<typename _Any_input_traits, typename _Any_output_traits>
3515 {
3516  template<typename _Input_iterator, typename _Output_iterator, typename _Unary_operator>
3517  static void _Parallel_transform_unary_impl(_Input_iterator _Begin, _Input_iterator _End, _Output_iterator& _Result, const _Unary_operator& _Unary_op, const auto_partitioner&)
3518  {
3519  task_group _Tg;
3520  _Parallel_transform_unary_impl2(_Begin, _End, _Result, _Unary_op, _Tg);
3521  _Tg.wait();
3522  }
3523 };
3524 
3525 template<>
3526 struct _Unary_transform_impl_helper<std::random_access_iterator_tag, std::random_access_iterator_tag>
3527 {
3528  template<typename _Random_input_iterator, typename _Random_output_iterator, typename _Unary_operator, typename _Partitioner>
3529  static void _Parallel_transform_unary_impl(_Random_input_iterator _Begin, _Random_input_iterator _End,
3530  _Random_output_iterator& _Result, const _Unary_operator& _Unary_op, _Partitioner&& _Part)
3531  {
3532  if (_Begin < _End)
3533  {
3534  ::Concurrency::_Parallel_for_impl(static_cast<size_t>(0), static_cast<size_t>(_End - _Begin), static_cast<size_t>(1),
3535  [_Begin, &_Result, &_Unary_op](size_t _Index)
3536  {
3537  _Result[_Index] = _Unary_op(_Begin[_Index]);
3538  },
3539  std::forward<_Partitioner>(_Part));
3540  _Result += _End - _Begin;
3541  }
3542  }
3543 };
3544 
// Primary template: used when any of the three iterators is not random
// access. Delegates to the chunking _Parallel_transform_binary_impl2 under a
// task_group and blocks until all chunks complete.
// NOTE(review): the struct name line (source line 3546) was elided by the doc
// extraction.
3545 template<typename _Any_input_traits1, typename _Any_input_traits2, typename _Any_output_traits>
3547 {
3548 
3549  template<typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>
3550  static void _Parallel_transform_binary_impl(_Input_iterator1 _Begin1, _Input_iterator1 _End1, _Input_iterator2 _Begin2,
3551  _Output_iterator& _Result, const _Binary_operator& _Binary_op, const auto_partitioner&)
3552  {
3553  task_group _Tg;
3554  _Parallel_transform_binary_impl2(_Begin1, _End1, _Begin2, _Result, _Binary_op, _Tg);
3555  _Tg.wait();
3556  }
3557 };
3558 
3559 template<>
3560 struct _Binary_transform_impl_helper<std::random_access_iterator_tag, std::random_access_iterator_tag, std::random_access_iterator_tag>
3561 {
3562  template<typename _Random_input_iterator1, typename _Random_input_iterator2, typename _Random_output_iterator, typename _Binary_operator, typename _Partitioner>
3563  static void _Parallel_transform_binary_impl(_Random_input_iterator1 _Begin1, _Random_input_iterator1 _End1,
3564  _Random_input_iterator2 _Begin2, _Random_output_iterator& _Result, const _Binary_operator& _Binary_op, _Partitioner&& _Part)
3565  {
3566  if (_Begin1 < _End1)
3567  {
3568  ::Concurrency::_Parallel_for_impl(static_cast<size_t>(0), static_cast<size_t>(_End1 - _Begin1), static_cast<size_t>(1),
3569  [_Begin1, _Begin2, &_Result, &_Binary_op](size_t _Index)
3570  {
3571  _Result[_Index] = _Binary_op(_Begin1[_Index], _Begin2[_Index]);
3572  },
3573  std::forward<_Partitioner>(_Part));
3574  _Result += _End1 - _Begin1;
3575  }
3576  }
3577 };
3578 
3579 //
3580 // The implementation for at least one of the iterator is forward iterator
3581 //
// Batching helper for forward (non-random-access) iterators: caches pointers
// to up to _Size l-value elements so a later parallel_for can address them by
// index. Stores addresses only -- the iterated elements must stay alive while
// this helper is in use.
// NOTE(review): the class name line (3583) and the default constructor's
// signature line (3589) were elided by the doc extraction.
3582 template <typename _Forward_iterator, typename _Iterator_kind>
3584 {
3585 public:
3586  static const size_t _Size = 1024;
3587  typedef typename std::iterator_traits<_Forward_iterator>::value_type value_type;
3588 
 // Caching addresses is only meaningful for forward/bidirectional
 // iterators; pure input/output iterators are rejected at compile time.
3590  {
3591  static_assert(!std::is_same<_Iterator_kind, std::input_iterator_tag>::value
3592  && !std::is_same<_Iterator_kind, std::output_iterator_tag>::value,
3593  "iterator can not be input_iterator or output_iterator.");
3594  }
3595 
 // Fills the cache from [_First, _Last), advancing _First; returns the
 // number of elements captured (at most _Size).
3596  size_t _Populate(_Forward_iterator& _First, _Forward_iterator _Last)
3597  {
3598  size_t _Length = 0;
3599  static_assert(std::is_lvalue_reference<decltype(*_First)>::value, "lvalue required for forward iterator operator *");
3600 
3601  for (size_t _Index=0; (_Index < _Size) && (_First != _Last); _Index++)
3602  {
3603  // We only support l-value here, so it's safe
3604  _M_element_array[_Length++] = &(*_First++);
3605  }
3606 
3607  return _Length;
3608  }
3609 
 // Fills exactly _Length cache slots, advancing _First accordingly.
3610  void _Populate(_Forward_iterator& _First, size_t _Length)
3611  {
3612  for (size_t _Index=0; _Index < _Length; _Index++)
3613  {
3614  _M_element_array[_Index] = &(*_First++);
3615  }
3616  }
3617 
 // Writes _Elem through the cached address at _Index.
3618  void _Store(const value_type& _Elem, size_t _Index) const
3619  {
3620  *(_M_element_array[_Index]) = _Elem;
3621  }
3622 
 // Reads the element behind the cached address at _Index.
3623  typename std::iterator_traits<_Forward_iterator>::reference _Load(size_t _Index) const
3624  {
3625  return *(_M_element_array[_Index]);
3626  }
3627 
3628 private:
3629  typename std::iterator_traits<_Forward_iterator>::pointer _M_element_array[_Size];
3630 };
3631 
// Batching helper for random-access iterators: no address cache is needed --
// it remembers the chunk's first iterator and indexes from it directly.
// NOTE(review): the class name line (3633) and the default constructor's
// signature line (3639) were elided by the doc extraction.
3632 template <typename _Random_iterator>
3634 {
3635 public:
3636  static const size_t _Size = 1024;
3637  typedef typename std::iterator_traits<_Random_iterator>::value_type value_type;
3638 
3640  {
3641  }
3642 
 // Claims up to _Size elements starting at _First, advancing _First past
 // them; returns the number claimed.
3643  size_t _Populate(_Random_iterator& _First, _Random_iterator _Last)
3644  {
3645  typename std::iterator_traits<_Random_iterator>::difference_type _Range_size = _Last - _First;
3646  typename std::iterator_traits<_Random_iterator>::difference_type _Sized = _Size;
3647  _M_first = _First;
3648 
3649  if (_Range_size > _Sized)
3650  {
3651  _First += _Size;
3652  return _Size;
3653  }
3654  else
3655  {
3656  _First += _Range_size;
3657  return static_cast<size_t>(_Range_size);
3658  }
3659  }
3660 
 // Claims exactly _Length elements starting at _First, advancing _First.
3661  void _Populate(_Random_iterator& _First, size_t _Length)
3662  {
3663  _M_first = _First;
3664  _First += _Length;
3665  }
3666 
3667  void _Store(const value_type& _Elem, size_t _Index) const
3668  {
3669  _M_first[_Index] = _Elem;
3670  }
3671 
3672  typename std::iterator_traits<_Random_iterator>::reference _Load(size_t _Index) const
3673  {
3674  // We only support l-value here
3675  return _M_first[_Index];
3676  }
3677 
3678 private:
3679  _Random_iterator _M_first;
3680 };
3681 
// Functor that batches one chunk of a binary transform: the constructor
// populates input/output helpers (advancing the caller's iterators), and
// invocation runs a parallel_for over the batched indices.
// NOTE(review): the class name line (3683) and the three helper member
// declarations (3707-3709) were elided by the doc extraction. The functor
// holds _Binary_op by reference -- safe only while the caller's operator
// outlives this object.
3682 template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>
3684 {
3685 public:
3686  _Parallel_transform_binary_helper(_Input_iterator1& _First1, _Input_iterator1 _Last1, _Input_iterator2& _First2,
3687  _Output_iterator& _Result, const _Binary_operator& _Binary_op) :
3688  _M_binary_op(_Binary_op), _M_len(0)
3689  {
 // The first input bounds the chunk length; the second input and the
 // output are advanced by the same count.
3690  _M_len = _M_input_helper1._Populate(_First1, _Last1);
3691  _M_input_helper2._Populate(_First2, _M_len);
3692  _M_output_helper._Populate(_Result, _M_len);
3693  }
3694 
3695  void operator()() const
3696  {
3697  // Invoke parallel_for on the batched up array of elements
3698  ::Concurrency::_Parallel_for_impl(static_cast<size_t>(0), _M_len, static_cast<size_t>(1),
3699  [this] (size_t _Index)
3700  {
3701  _M_output_helper._Store(_M_binary_op(_M_input_helper1._Load(_Index), _M_input_helper2._Load(_Index)), _Index);
3702  });
3703  }
3704 
3705 private:
3706 
3710  const _Binary_operator& _M_binary_op;
3711  size_t _M_len;
3712 
3713  _Parallel_transform_binary_helper const & operator=(_Parallel_transform_binary_helper const&); // no assignment operator
3714 };
3715 
// Chunking driver for the non-random-access binary transform: builds one
// batching functor for the next chunk (advancing the iterators), schedules
// it on _Tg, then schedules a continuation to process the remaining tail
// recursively.
// NOTE(review): the declaration of _Functor (source line 3722, presumably a
// _Parallel_transform_binary_helper constructed from the iterators) was
// elided by the doc extraction.
3716 template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>
3717 void _Parallel_transform_binary_impl2(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Input_iterator2 _First2, _Output_iterator &_Result,
3718  const _Binary_operator& _Binary_op, task_group& _Tg)
3719 {
3720  // This functor will be copied on the heap and will execute the chunk in parallel
3721  {
3723  _Tg.run(_Functor);
3724  }
3725 
3726  // If there is a tail, push the tail
3727  if (_First1 != _Last1)
3728  {
3729  _Tg.run(
3730  [=, &_Result, &_Binary_op, &_Tg]
3731  {
3732  _Parallel_transform_binary_impl2(_First1, _Last1, _First2, _Result, _Binary_op, _Tg);
3733  });
3734  }
3735 }
3736 
// Functor that batches one chunk of a unary transform: the constructor
// populates the input/output helpers (advancing the caller's iterators), and
// invocation runs a parallel_for over the batched indices.
// NOTE(review): the class name line (3738) and the two helper member
// declarations (3760-3761) were elided by the doc extraction. _Unary_op is
// held by reference and must outlive this object.
3737 template <typename _Input_iterator, typename _Output_iterator, typename _Unary_operator>
3739 {
3740 public:
3741  _Parallel_transform_unary_helper(_Input_iterator& _First, _Input_iterator _Last, _Output_iterator &_Result, const _Unary_operator& _Unary_op) :
3742  _M_unary_op(_Unary_op), _M_len(0)
3743  {
3744  _M_len = _M_input_helper._Populate(_First, _Last);
3745  _M_output_helper._Populate(_Result, _M_len);
3746  }
3747 
3748  void operator()() const
3749  {
3750  // Invoke parallel_for on the batched up array of elements
3751  ::Concurrency::_Parallel_for_impl(static_cast<size_t>(0), _M_len, static_cast<size_t>(1),
3752  [this] (size_t _Index)
3753  {
3754  _M_output_helper._Store(_M_unary_op(_M_input_helper._Load(_Index)), _Index);
3755  });
3756  }
3757 
3758 private:
3759 
3762  const _Unary_operator& _M_unary_op;
3763  size_t _M_len;
3764 
3765  _Parallel_transform_unary_helper const & operator=(_Parallel_transform_unary_helper const&); // no assignment operator
3766 };
3767 
// Chunking driver for the non-random-access unary transform: builds one
// batching functor for the next chunk (advancing the iterators), schedules it
// on _Tg, then schedules a continuation to process the tail recursively.
// NOTE(review): the declaration of _Functor (source line 3774, presumably a
// _Parallel_transform_unary_helper constructed from the iterators) was elided
// by the doc extraction.
3768 template <typename _Input_iterator, typename _Output_iterator, typename _Unary_operator>
3769 void _Parallel_transform_unary_impl2(_Input_iterator _First, _Input_iterator _Last, _Output_iterator &_Result,
3770  const _Unary_operator& _Unary_op, task_group& _Tg)
3771 {
3772  // This functor will be copied on the heap and will execute the chunk in parallel
3773  {
3775  _Tg.run(_Functor);
3776  }
3777 
3778  // If there is a tail, push the tail
3779  if (_First != _Last)
3780  {
3781  _Tg.run(
3782  [=, &_Result, &_Unary_op, &_Tg]
3783  {
3784  _Parallel_transform_unary_impl2(_First, _Last, _Result, _Unary_op, _Tg);
3785  });
3786  }
3787 }
3788 
// Dispatcher for the unary parallel transform: selects the implementation
// based on the input/output iterator categories and returns the advanced
// output iterator (one past the last element written).
// NOTE(review): the helper-type qualifier before ::_Parallel_transform_unary_impl
// (source line 3797, presumably _Unary_transform_impl_helper<_Input_iterator_type,
// _Output_iterator_type>) was elided by the doc extraction.
3789 template <typename _Input_iterator, typename _Output_iterator, typename _Unary_operator, typename _Partitioner>
3790 _Output_iterator _Parallel_transform_unary_impl(_Input_iterator _First, _Input_iterator _Last, _Output_iterator _Result, const _Unary_operator& _Unary_op, _Partitioner&& _Part)
3791 {
3792  typedef typename std::iterator_traits<_Input_iterator>::iterator_category _Input_iterator_type;
3793  typedef typename std::iterator_traits<_Output_iterator>::iterator_category _Output_iterator_type;
3794 
3795  if (_First != _Last)
3796  {
3798  ::_Parallel_transform_unary_impl(_First, _Last, _Result, _Unary_op, std::forward<_Partitioner>(_Part));
3799  }
3800 
3801  return _Result;
3802 }
3803 
3854 
3855 template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>
3856 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, const auto_partitioner& _Part = auto_partitioner())
3857 {
3858  return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);
3859 }
3860 
3911 
3912 template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>
3913 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, const static_partitioner& _Part)
3914 {
3915  return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);
3916 }
3917 
3968 
3969 template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>
3970 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, const simple_partitioner& _Part)
3971 {
3972  return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);
3973 }
3974 
4025 
4026 template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>
4027 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, affinity_partitioner& _Part)
4028 {
4029  return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);
4030 }
4031 
4088 
// Binary parallel_transform with an explicit partitioner: dispatches on the
// iterator categories of both inputs and the output, and returns the advanced
// output iterator. The second input range must contain at least as many
// elements as [_First1, _Last1).
// NOTE(review): the helper-type qualifier before ::_Parallel_transform_binary_impl
// (source line 4099, presumably _Binary_transform_impl_helper<_Input_iterator_type1,
// _Input_iterator_type2, _Output_iterator_type>) was elided by the doc extraction.
4089 template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator, typename _Partitioner>
4090 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Input_iterator2 _First2,
4091  _Output_iterator _Result, const _Binary_operator& _Binary_op, _Partitioner&& _Part)
4092 {
4093  typedef typename std::iterator_traits<_Input_iterator1>::iterator_category _Input_iterator_type1;
4094  typedef typename std::iterator_traits<_Input_iterator2>::iterator_category _Input_iterator_type2;
4095  typedef typename std::iterator_traits<_Output_iterator>::iterator_category _Output_iterator_type;
4096 
4097  if (_First1 != _Last1)
4098  {
4100  ::_Parallel_transform_binary_impl(_First1, _Last1, _First2, _Result, _Binary_op, std::forward<_Partitioner>(_Part));
4101  }
4102 
4103  return _Result;
4104 }
4105 
4153 
4154 template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>
4155 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Input_iterator2 _First2,
4156  _Output_iterator _Result, const _Binary_operator& _Binary_op)
4157 {
4158  return parallel_transform(_First1, _Last1, _First2, _Result, _Binary_op, auto_partitioner());
4159 }
4160 
4161 #pragma warning(pop)
4162 
4163 #pragma warning(push)
4164 // object allocated on the heap may not be aligned 64
4165 #pragma warning(disable: 4316)
4166 
4180 
// combinable<_Ty>: thread-local accumulation container. Each thread gets its
// own lazily-initialized _Ty via local(); the per-thread values are later
// folded together with combine()/combine_each(). Storage is a fixed-size hash
// table of lock-free singly-linked bucket chains keyed by a per-thread key.
// local()/combine*() may run concurrently; clear() and assignment are not
// concurrency-safe with respect to them.
// NOTE(review): the class-head line, several member-function signature lines,
// and the _Key/_M_size initialization lines were elided by the doc
// extraction; confirm details against the shipped ppl.h.
4181 template<typename _Ty>
4183 {
4184 private:
4185 
4186 // Disable warning C4324: structure was padded due to __declspec(align())
4187 // This padding is expected and necessary.
4188 #pragma warning(push)
4189 #pragma warning(disable: 4324)
 // One thread's slot: key identifying the thread, its accumulated value,
 // and the next node in this bucket's chain.
4191  struct _Node
4192  {
4193  unsigned long _M_key;
4194  _Ty _M_value;
4195  _Node* _M_chain;
4196 
4197  _Node(unsigned long _Key, _Ty _InitialValue)
4198  : _M_key(_Key), _M_value(_InitialValue), _M_chain(NULL)
4199  {
4200  }
4201  };
4202 #pragma warning(pop)
4203 
 // Default per-thread initializer: value-initialized _Ty.
4204  static _Ty _DefaultInit()
4205  {
4206  return _Ty();
4207  }
4208 
4209 public:
4220 
4222  : _M_fnInitialize(_DefaultInit)
4223  {
4224  _InitNew();
4225  }
4226 
4244 
 // Constructs with a user-supplied initializer invoked once per thread to
 // produce that thread's starting value.
4245  template <typename _Function>
4246  explicit combinable(_Function _FnInitialize)
4247  : _M_fnInitialize(_FnInitialize)
4248  {
4249  _InitNew();
4250  }
4251 
4265 
 // Copy constructor: deep-copies every node of every bucket.
4266  combinable(const combinable& _Copy)
4267  : _M_size(_Copy._M_size), _M_fnInitialize(_Copy._M_fnInitialize)
4268  {
4269  _InitCopy(_Copy);
4270  }
4271 
4281 
 // Copy assignment: tears down existing nodes and buckets, then deep-copies.
4283  {
4284  clear();
4285  delete [] _M_buckets;
4286  _M_fnInitialize = _Copy._M_fnInitialize;
4287  _M_size = _Copy._M_size;
4288  _InitCopy(_Copy);
4289 
4290  return *this;
4291  }
4292 
4296 
4298  {
4299  clear();
4300  delete [] _M_buckets;
4301  }
4302 
4310 
 // Returns a reference to the calling thread's value, creating it with
 // _M_fnInitialize on first use.
 // NOTE(review): the _Key acquisition line (4313) was elided by the doc
 // extraction -- presumably a per-thread key from the runtime.
4311  _Ty& local()
4312  {
4314  size_t _Index;
4315  _Node* _ExistingNode = _FindLocalItem(_Key, &_Index);
4316  if (_ExistingNode == NULL)
4317  {
4318  _ExistingNode = _AddLocalItem(_Key, _Index);
4319  }
4320 
4321  _CONCRT_ASSERT(_ExistingNode != NULL);
4322  return _ExistingNode->_M_value;
4323  }
4324 
4337 
 // As local(), additionally reporting through _Exists whether the thread's
 // value already existed (true) or was created by this call (false).
4338  _Ty& local(bool& _Exists)
4339  {
4341  size_t _Index;
4342  _Node* _ExistingNode = _FindLocalItem(_Key, &_Index);
4343  if (_ExistingNode == NULL)
4344  {
4345  _Exists = false;
4346  _ExistingNode = _AddLocalItem(_Key, _Index);
4347  }
4348  else
4349  {
4350  _Exists = true;
4351  }
4352 
4353  _CONCRT_ASSERT(_ExistingNode != NULL);
4354  return _ExistingNode->_M_value;
4355  }
4356 
4360 
 // Deletes all per-thread nodes and resets every bucket head to NULL.
4361  void clear()
4362  {
4363  for (size_t _Index = 0; _Index < _M_size; ++_Index)
4364  {
4365  _Node* _CurrentNode = _M_buckets[_Index];
4366  while (_CurrentNode != NULL)
4367  {
4368  _Node* _NextNode = _CurrentNode->_M_chain;
4369  delete _CurrentNode;
4370  _CurrentNode = _NextNode;
4371  }
4372  }
4373  memset((void*)_M_buckets, 0, _M_size * sizeof _M_buckets[0]);
4374  }
4375 
4390 
 // Folds all per-thread values with _FnCombine and returns the result;
 // returns _M_fnInitialize() when no thread has created a value.
4391  template<typename _Function>
4392  _Ty combine(_Function _FnCombine) const
4393  {
4394  _Node* _CurrentNode = NULL;
4395  size_t _Index;
4396 
4397  // Look for the first value in the set, and use (a copy of) that as the result.
4398  // This eliminates a single call (of unknown cost) to _M_fnInitialize.
4399  for (_Index = 0; _Index < _M_size; ++_Index)
4400  {
4401  _CurrentNode = _M_buckets[_Index];
4402  if (_CurrentNode != NULL)
4403  {
4404  break;
4405  }
4406  }
4407 
4408  // No values... return the initializer value.
4409  if (_CurrentNode == NULL)
4410  {
4411  return _M_fnInitialize();
4412  }
4413 
4414  // Accumulate the rest of the items in the current bucket.
4415  _Ty _Result = _CurrentNode->_M_value;
4416  for (_CurrentNode = _CurrentNode->_M_chain; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)
4417  {
4418  _Result = _FnCombine(_Result, _CurrentNode->_M_value);
4419  }
4420 
4421  // Accumulate values from the rest of the buckets.
4422  _CONCRT_ASSERT(_Index < _M_size);
4423  for (++_Index; _Index < _M_size; ++_Index)
4424  {
4425  for (_CurrentNode = _M_buckets[_Index]; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)
4426  {
4427  _Result = _FnCombine(_Result, _CurrentNode->_M_value);
4428  }
4429  }
4430 
4431  return _Result;
4432  }
4433 
4446 
 // Applies _FnCombine to each per-thread value in turn (no return value).
4447  template<typename _Function>
4448  void combine_each(_Function _FnCombine) const
4449  {
4450  for (size_t _Index = 0; _Index < _M_size; ++_Index)
4451  {
4452  for (_Node* _CurrentNode = _M_buckets[_Index]; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)
4453  {
4454  _FnCombine(_CurrentNode->_M_value);
4455  }
4456  }
4457  }
4458 
4459 private:
 // Allocates and zeroes a fresh bucket array.
 // NOTE(review): the _M_size initialization line (4462, which calls
 // details::_GetCombinableSize() per the declaration near the top of this
 // header) was elided by the doc extraction.
4460  void _InitNew()
4461  {
4463  _M_buckets = new _Node*[_M_size];
4464  memset((void*)_M_buckets, 0, _M_size * sizeof _M_buckets[0]);
4465  }
4466 
 // Deep-copies _Copy's chains; note each chain is rebuilt by head
 // insertion, so node order within a bucket is reversed (harmless: order
 // is not part of the container's contract as used by combine*()).
4467  void _InitCopy(const combinable& _Copy)
4468  {
4469  _M_buckets = new _Node*[_M_size];
4470  for (size_t _Index = 0; _Index < _M_size; ++_Index)
4471  {
4472  _M_buckets[_Index] = NULL;
4473  for (_Node* _CurrentNode = _Copy._M_buckets[_Index]; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)
4474  {
4475  _Node* _NewNode = new _Node(_CurrentNode->_M_key, _CurrentNode->_M_value);
4476  _NewNode->_M_chain = _M_buckets[_Index];
4477  _M_buckets[_Index] = _NewNode;
4478  }
4479  }
4480  }
4481 
 // Hash lookup: writes the bucket index to *_PIndex and returns the node
 // matching _Key, or NULL if the thread has no entry yet.
4482  _Node* _FindLocalItem(unsigned long _Key, size_t* _PIndex)
4483  {
4484  _CONCRT_ASSERT(_PIndex != NULL);
4485 
4486  *_PIndex = _Key % _M_size;
4487 
4488  // Search at this index for an existing value.
4489  _Node* _CurrentNode = _M_buckets[*_PIndex];
4490  while (_CurrentNode != NULL)
4491  {
4492  if (_CurrentNode->_M_key == _Key)
4493  {
4494  return _CurrentNode;
4495  }
4496 
4497  _CurrentNode = _CurrentNode->_M_chain;
4498  }
4499 
4500  return NULL;
4501  }
4502 
 // Lock-free head insertion: CAS-retries publishing the new node as the
 // bucket head so concurrent threads can add entries without a lock.
4503  _Node* _AddLocalItem(unsigned long _Key, size_t _Index)
4504  {
4505  _Node* _NewNode = new _Node(_Key, _M_fnInitialize());
4506  _Node* _TopNode;
4507  do
4508  {
4509  _TopNode = _M_buckets[_Index];
4510  _NewNode->_M_chain = _TopNode;
4511  } while (_InterlockedCompareExchangePointer(reinterpret_cast<void * volatile *>(&_M_buckets[_Index]), _NewNode, _TopNode) != _TopNode);
4512 
4513  return _NewNode;
4514  }
4515 
4516 private:
4517  _Node *volatile * _M_buckets;
4518  size_t _M_size;
4519  std::function<_Ty ()> _M_fnInitialize;
4520 };
4521 
4522 #pragma warning(pop) // C4316
4523 
4524 #pragma push_macro("_MAX_NUM_TASKS_PER_CORE")
4525 #pragma push_macro("_FINE_GRAIN_CHUNK_SIZE")
4526 #pragma push_macro("_SORT_MAX_RECURSION_DEPTH")
4527 
4528 // This number is used to control dynamic task splitting
4529 // The ideal chunk (task) division is that the number of cores is equal to the number of tasks, but it will
4530 // perform very poorly when tasks are not balanced. The simple solution is to allocate more tasks than number
4531 // of cores. _MAX_NUM_TASKS_PER_CORE provides a maximum number of tasks that will be allocated per core.
4532 // If this number is too small, the load balancing problem will affect efficiency very seriously, especially
4533 // when the compare operation is expensive.
4534 //
4535 // Note that this number is a maximum number -- the dynamic partition system will reduce the number of partitions
4536 // per core based on the dynamic load. If all cores are very busy, the number of partitions will shrink to
4537 // reduce the scheduler overhead.
4538 //
4539 // Initially, the total tasks(chunks) number of partitions "_Div_num" will be: core number * _MAX_NUM_TASKS_PER_CORE.
4540 // The _Div_num will be divided by 2 after each task splitting. There are two special numbers for _Div_num:
4541 // 1. When _Div_num reaches the point that _Div_num < _MAX_NUM_TASKS_PER_CORE, it means we have split more tasks than cores.
4542 // 2. When _Div_num reaches the point that _Div_num <= 1, it means stop splitting more tasks and begin sorting serially.
4543 #define _MAX_NUM_TASKS_PER_CORE 1024
4544 
4545 // This number is mainly used to control the sampling and dynamic task splitting strategies.
4546 // If the user configurable minimal divisible chunk size (default is 2048) is smaller than FINE_GRAIN_CHUNK_SIZE,
4547 // the random sampling algorithm for quicksort will enter fine-grained mode, and take a strategy that reduces the sampling
4548 // overhead. Also, the dynamic task splitting will enter fine-grained mode, which will split as many tasks as possible.
4549 #define _FINE_GRAIN_CHUNK_SIZE 512
4550 
4551 // This is the maximum depth to which the quicksort may recurse. If recursion were allowed to go deeper, a stack overflow might occur.
4552 #define _SORT_MAX_RECURSION_DEPTH 64
4553 
// Returns the index (_A, _B, or _C) of the median of _Begin[_A], _Begin[_B],
// _Begin[_C] under the strict ordering _Func. _Potentially_equal is set to
// true only in the one outcome where the samples may all be equivalent
// (neither A < B nor B < C holds); quicksort uses that as an all-equal-keys
// hint.
template<typename _Random_iterator, typename _Function>
inline size_t _Median_of_three(const _Random_iterator &_Begin, size_t _A, size_t _B, size_t _C, const _Function &_Func, bool &_Potentially_equal)
{
    _Potentially_equal = false;

    if (_Func(_Begin[_A], _Begin[_B]))
    {
        if (!_Func(_Begin[_A], _Begin[_C]))
        {
            return _A;                                  // C <= A < B: A is the median
        }
        return _Func(_Begin[_B], _Begin[_C]) ? _B : _C; // A < B and A < C
    }

    if (_Func(_Begin[_B], _Begin[_C]))
    {
        return _Func(_Begin[_A], _Begin[_C]) ? _A : _C; // B <= A and B < C
    }

    // Neither A < B nor B < C: the three samples may all be equivalent.
    _Potentially_equal = true;
    return _B;
}
4582 
// Median-of-nine pivot sampling: computes the median-of-three of three
// median-of-three samples spread across [0, _Size). _Potentially_equal ends
// up as the flag from the final median-of-three over the triple-medians,
// refined so it stays set only when the first- and third-triple medians also
// compare equivalent (!(C < A)) -- a hint that the buffer may hold all-equal
// keys.
template<typename _Random_iterator, typename _Function>
inline size_t _Median_of_nine(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, bool &_Potentially_equal)
{
    const size_t _Offset = _Size / 8;

    const size_t _A = _Median_of_three(_Begin, 0, _Offset, _Offset * 2, _Func, _Potentially_equal);
    const size_t _B = _Median_of_three(_Begin, _Offset * 3, _Offset * 4, _Offset * 5, _Func, _Potentially_equal);
    const size_t _C = _Median_of_three(_Begin, _Offset * 6, _Offset * 7, _Size - 1, _Func, _Potentially_equal);

    const size_t _Median = _Median_of_three(_Begin, _A, _B, _C, _Func, _Potentially_equal);

    if (_Potentially_equal)
    {
        _Potentially_equal = !_Func(_Begin[_C], _Begin[_A]);
    }

    return _Median;
}
4599 
4600 // _Potentially_equal means that potentially all the values in the buffer are equal to the pivot value
4601 template<typename _Random_iterator, typename _Function>
4602 inline size_t _Select_median_pivot(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, const size_t _Chunk_size, bool &_Potentially_equal)
4603 {
4604  // Base on different chunk size, apply different sampling optimization
4605  if (_Chunk_size < _FINE_GRAIN_CHUNK_SIZE && _Size <= std::max<size_t>(_Chunk_size * 4, static_cast<size_t>(15)))
4606  {
4607  bool _Never_care_equal;
4608  return _Median_of_three(_Begin, 0, _Size / 2, _Size - 1, _Func, _Never_care_equal);
4609  }
4610  else
4611  {
4612  return _Median_of_nine(_Begin, _Size, _Func, _Potentially_equal);
4613  }
4614 }
4615 
// Binary-searches two sorted sequences for a pair of split points such that
// the combined left parts hold exactly (_Len1 + _Len2) / 2 elements and every
// element left of the splits is <= every element right of them. On return,
// _Len1 and _Len2 are overwritten with the left-part lengths of each
// sequence; the return value is their sum.
template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
size_t _Search_mid_point(const _Random_iterator &_Begin1, size_t &_Len1, const _Random_buffer_iterator &_Begin2, size_t &_Len2, const _Function &_Func)
{
    const size_t _Half = (_Len1 + _Len2) / 2;
    size_t _Low1 = 0, _Low2 = 0;

    while (_Low1 < _Len1 && _Low2 < _Len2)
    {
        const size_t _Probe1 = (_Low1 + _Len1) / 2;
        const size_t _Probe2 = (_Low2 + _Len2) / 2;
        const bool _First_smaller = _Func(_Begin1[_Probe1], _Begin2[_Probe2]);
        const bool _Under_target = (_Probe1 + _Probe2 < _Half);

        if (_First_smaller)
        {
            // The smaller probe belongs to the left part (grow its lower
            // bound) or the larger probe bounds the right part (shrink it),
            // depending on which side of the target split we are on.
            if (_Under_target)
            {
                _Low1 = _Probe1 + 1;
            }
            else
            {
                _Len2 = _Probe2;
            }
        }
        else
        {
            if (_Under_target)
            {
                _Low2 = _Probe2 + 1;
            }
            else
            {
                _Len1 = _Probe1;
            }
        }
    }

    // One side collapsed; the other side's split is whatever is needed to
    // make the left parts total exactly _Half elements.
    if (_Low1 == _Len1)
    {
        _Len2 = _Half - _Len1;
    }
    else
    {
        _Len1 = _Half - _Len2;
    }

    return _Half;
}
4661 
4662 // "move" operation is applied between buffers
4663 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Random_output_iterator, typename _Function>
4664 void _Merge_chunks(_Random_iterator _Begin1, const _Random_iterator &_End1, _Random_buffer_iterator _Begin2, const _Random_buffer_iterator &_End2,
4665  _Random_output_iterator _Output, const _Function &_Func)
4666 {
4667  while (_Begin1 != _End1 && _Begin2 != _End2)
4668  {
4669  if (_Func(*_Begin1, *_Begin2))
4670  {
4671  *_Output++ = std::move(*_Begin1++);
4672  }
4673  else
4674  {
4675  *_Output++ = std::move(*_Begin2++);
4676  }
4677  }
4678 
4679  if (_Begin1 != _End1)
4680  {
4681  std::_Move_no_deprecate(_Begin1, _End1, _Output);
4682  }
4683  else if (_Begin2 != _End2)
4684  {
4685  std::_Move_no_deprecate(_Begin2, _End2, _Output);
4686  }
4687 }
4688 
4689 // _Div_num of threads(tasks) merge two chunks in parallel, _Div_num should be power of 2, if not, the largest power of 2 that is
4690 // smaller than _Div_num will be used
// Recursively splits the merge at a balanced midpoint (_Search_mid_point):
// the left halves merge in a spawned task while the right halves merge on the
// calling thread, halving _Div_num at each level until the serial base case.
// NOTE(review): the declaration of the task group "_Tg" (source line 4705)
// was elided by the doc extraction.
4691 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Random_output_iterator, typename _Function>
4692 void _Parallel_merge(_Random_iterator _Begin1, size_t _Len1, _Random_buffer_iterator _Begin2, size_t _Len2, _Random_output_iterator _Output,
4693  const _Function &_Func, size_t _Div_num)
4694 {
4695  // Turn to serial merge or continue splitting chunks base on "_Div_num"
4696  if (_Div_num <= 1 || (_Len1 <= 1 && _Len2 <= 1))
4697  {
4698  _Merge_chunks(_Begin1, _Begin1 + _Len1, _Begin2, _Begin2 + _Len2, _Output, _Func);
4699  }
4700  else
4701  {
 // _Search_mid_point rewrites _Mid_len1/_Mid_len2 to the left-part
 // lengths; _Mid is their sum, i.e. the output offset of the right part.
4702  size_t _Mid_len1 = _Len1, _Mid_len2 = _Len2;
4703  size_t _Mid = _Search_mid_point(_Begin1, _Mid_len1, _Begin2, _Mid_len2, _Func);
4704 
4706  auto _Handle = make_task([&]
4707  {
4708  _Parallel_merge(_Begin1, _Mid_len1, _Begin2, _Mid_len2, _Output, _Func, _Div_num / 2);
4709  });
4710  _Tg.run(_Handle);
4711 
4712  _Parallel_merge(_Begin1 + _Mid_len1, _Len1 - _Mid_len1, _Begin2 + _Mid_len2, _Len2 - _Mid_len2, _Output + _Mid, _Func, _Div_num / 2);
4713 
 // The by-reference captures in _Handle are only safe because we wait
 // here, inside the scope that owns them.
4714  _Tg.wait();
4715  }
4716 }
4717 
// Extracts the byte the current radix pass sorts on: byte number _Radix
// (0 = least significant) of the projected key _Proj_func(_Val).
template<typename _Ty, typename _Function>
inline size_t _Radix_key(const _Ty& _Val, size_t _Radix, _Function _Proj_func)
{
    const int _Shift = static_cast<int>(8 * _Radix);
    return static_cast<size_t>((_Proj_func(_Val) >> _Shift) & 255);
}
4724 
// One counting-sort pass over byte _Radix of the projected keys: elements are
// moved from _Begin into _Output sorted by that byte, preserving the relative
// order of equal bytes (a stable pass, as LSD radix sort requires).
template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
void _Integer_radix_pass(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output, size_t _Radix, _Function _Proj_func)
{
    if (_Size == 0)
    {
        return;
    }

    // Histogram of the 256 possible values of the sort byte.
    size_t _Counts[256] = {0};
    for (size_t _I = 0; _I < _Size; ++_I)
    {
        ++_Counts[_Radix_key(_Begin[_I], _Radix, _Proj_func)];
    }

    // Prefix sums: _Counts[_B] becomes one past the last output slot for _B.
    for (size_t _B = 1; _B < 256; ++_B)
    {
        _Counts[_B] += _Counts[_B - 1];
    }

    // Place elements back-to-front so equal bytes keep their relative order.
    // (size_t cannot go negative, hence the decrement-in-condition form.)
    for (size_t _I = _Size; _I-- > 0;)
    {
        _Output[--_Counts[_Radix_key(_Begin[_I], _Radix, _Proj_func)]] = std::move(_Begin[_I]);
    }
}
4754 
4755 // Serial least-significant-byte radix sort, it will sort base on last "_Radix" number of bytes
// Runs one counting-sort pass per byte (radix 0 .. _Radix inclusive),
// ping-ponging between the _Begin and _Output buffers, then moves the result
// so it lands in the buffer implied by the overall pass parity. _Deep is the
// number of passes already performed by an enclosing (parallel MSB) sort and
// participates in that parity decision.
4756 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
4757 void _Integer_radix_sort(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output,
4758  size_t _Radix, _Function _Proj_func, size_t _Deep = 0)
4759 {
4760  size_t _Cur_radix = 0;
4761  if (_Size == 0)
4762  {
4763  return;
4764  }
4765 
 // Pairs of passes: _Begin -> _Output, then _Output -> _Begin.
4766  while (_Cur_radix < _Radix)
4767  {
4768  _Integer_radix_pass(_Begin, _Size, _Output, _Cur_radix++, _Proj_func);
4769  _Integer_radix_pass(_Output, _Size, _Begin, _Cur_radix++, _Proj_func);
4770  }
4771 
 // Odd total pass count: one final _Begin -> _Output pass remains.
4772  if (_Cur_radix == _Radix)
4773  {
4774  _Integer_radix_pass(_Begin, _Size, _Output, _Cur_radix++, _Proj_func);
4775  }
4776 
4777  // if odd round is passed, then move result back to input buffer
 // Precedence note: '+' binds tighter than '&', so these test the parity
 // of (_Deep + _Radix + 1) and (_Radix + 1) respectively.
4778  if (_Deep + _Radix + 1 & 1)
4779  {
4780  if (_Radix + 1 & 1)
4781  {
4782  std::_Move_no_deprecate(_Output, _Output + _Size, _Begin);
4783  }
4784  else
4785  {
4786  std::_Move_no_deprecate(_Begin, _Begin + _Size, _Output);
4787  }
4788  }
4789 }
4790 
// Parallel most-significant-byte _Radix sort.
// In the end, it will turn to serial least-significant-byte radix sort
template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
void _Parallel_integer_radix_sort(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output,
    size_t _Radix, _Function _Proj_func, const size_t _Chunk_size, size_t _Deep = 0)
{
    // If the chunk _Size is too small, then turn to serial least-significant-byte radix sort
    if (_Size <= _Chunk_size || _Radix < 1)
    {
        return _Integer_radix_sort(_Begin, _Size, _Output, _Radix, _Proj_func, _Deep);
    }

    // NOTE(review): the declarations of _Threads_num (the number of parallel
    // segments used below) and _Holder (the RAII guard that owns the _malloca
    // allocation) were on lines lost when this listing was extracted -- confirm
    // against the original header before relying on this copy.
    size_t _Buffer_size = sizeof(size_t) * 256 * _Threads_num;
    size_t _Step = _Size / _Threads_num;
    size_t _Remain = _Size % _Threads_num;

    // One 256-entry counting array per thread, carved out of a single
    // stack-preferred (_malloca) allocation and zero-initialized below.
    using _Chunk_ptr_t = size_t (*)[256];
    _Chunk_ptr_t _Chunks = reinterpret_cast<_Chunk_ptr_t>(_Holder._InitOnRawMalloca(_malloca(_Buffer_size)));

    memset(_Chunks, 0, _Buffer_size);

    // Our purpose is to map unsorted data in buffer "_Begin" to buffer "_Output" so that all elements who have the same
    // byte value in the "_Radix" position will be grouped together in the buffer "_Output"
    //
    // Serial version:
    // To understand this algorithm, first consider a serial version. In following example, we treat 1 digit as 1 byte, so we have a
    // total of 10 elements for each digit instead of 256 elements in each byte. Let's suppose "_Radix" == 1 (right most is 0), and:
    //
    // begin: [ 32 | 62 | 21 | 43 | 55 | 43 | 23 | 44 ]
    //
    // We want to divide the output buffer "_Output" into 10 chunks, and each the element in the "_Begin" buffer should be mapped into
    // the proper destination chunk based on its current digit (byte) indicated by "_Radix"
    //
    // Because "_Radix" == 1, after a pass of this function, the chunks in the "_Output" should look like:
    //
    // buffer: [ | | 21 23 | 32 | 43 43 44 | 55 | 62 | | | ]
    // 0 1 2 3 4 5 6 7 8 9
    //
    // The difficulty is determining where to insert values into the "_Output" to get the above result. The way to get the
    // start position of each chunk of the buffer is:
    // 1. Count the number of elements for each chunk (in above example, chunk0 is 0, chunk1 is 0, chunk2 is 2, chunk3 is 1 ...
    // 2. Make a partial sum for these chunks( in above example, we will get chunk0=chunk0=0, chunk1=chunk0+chunk1=0,
    // chunk2=chunk0+chunk1+chunk2=2, chunk3=chunk0+chunk1+chunk2+chunk3=3
    //
    // After these steps, we will get the end position of each chunk in the "_Output". The begin position of each chunk will be the end
    // point of last chunk (begin point is close but the end point is open). After that, we can scan the original array again and directly
    // put elements from original buffer "_Begin" into specified chunk on buffer "_Output".
    // Finally, we invoke _parallel_integer_radix_sort in parallel for each chunk and sort them in parallel based on the next digit (byte).
    // Because this is a STABLE sort algorithm, if two numbers has same key value on this byte (digit), their original order should be kept.
    //
    // Parallel version:
    // Almost the same as the serial version, the differences are:
    // 1. The count for each chunk is executed in parallel, and each thread will count one segment of the input buffer "_Begin".
    // The count result will be separately stored in their own chunk size counting arrays so we have a total of threads-number
    // of chunk count arrays.
    // For example, we may have chunk00, chunk01, ..., chunk09 for first thread, chunk10, chunk11, ..., chunk19 for second thread, ...
    // 2. The partial sum should be executed across these chunk counting arrays that belong to different threads, instead of just
    // making a partial sum in one counting array.
    // This is because we need to put values from different segments into one final buffer, and the absolute buffer position for
    // each chunkXX is needed.
    // 3. Make a parallel scan for original buffer again, and move numbers in parallel into the corresponding chunk on each buffer based
    // on these threads' chunk size counters.

    // Count in parallel and separately save their local results without reducing
    ::Concurrency::parallel_for(static_cast<size_t>(0), _Threads_num, [=](size_t _Index)
    {
        size_t _Beg_index, _End_index;

        // Calculate the segment position: the first _Remain segments get one
        // extra element so the whole input is covered exactly.
        if (_Index < _Remain)
        {
            _Beg_index = _Index * (_Step + 1);
            _End_index = _Beg_index + (_Step + 1);
        }
        else
        {
            _Beg_index = _Remain * (_Step + 1) + (_Index - _Remain) * _Step;
            _End_index = _Beg_index + _Step;
        }

        // Do a counting
        while (_Beg_index != _End_index)
        {
            ++_Chunks[_Index][_Radix_key(_Begin[_Beg_index++], _Radix, _Proj_func)];
        }
    });

    int _Index = -1, _Count = 0;

    // Partial sum cross different threads' chunk counters
    for (int _I = 0; _I < 256; _I++)
    {
        size_t _Last = _I ? _Chunks[_Threads_num - 1][_I - 1] : 0;
        _Chunks[0][_I] += _Last;

        for (size_t _J = 1; _J < _Threads_num; _J++)
        {
            _Chunks[_J][_I] += _Chunks[_J - 1][_I];
        }

        // "_Chunks[_Threads_num - 1][_I] - _Last" will get the global _Size for chunk _I (including all threads' local _Size for chunk _I);
        // this checks whether the chunk _I is empty or not. If it's not empty, it will be recorded.
        if (_Chunks[_Threads_num - 1][_I] - _Last)
        {
            ++_Count;
            _Index = _I;
        }
    }

    // If there is more than 1 chunk that has content, then continue the original algorithm
    if (_Count > 1)
    {
        // Move the elements in parallel into each chunk
        ::Concurrency::parallel_for(static_cast<size_t>(0), _Threads_num, [=](size_t _Index)
        {
            size_t _Beg_index, _End_index;

            // Calculate the segment position (same partitioning as the counting pass above)
            if (_Index < _Remain)
            {
                _Beg_index = _Index * (_Step + 1);
                _End_index = _Beg_index + (_Step + 1);
            }
            else
            {
                _Beg_index = _Remain * (_Step + 1) + (_Index - _Remain) * _Step;
                _End_index = _Beg_index + _Step;
            }

            // Do a move operation to directly put each value into its destination chunk
            // Chunk pointer is moved after each put operation.
            // (backwards scan with an unsigned index: _End_index is pre-decremented
            // once, then element _Beg_index is handled separately after the loop)
            if (_Beg_index != _End_index--)
            {
                while (_Beg_index != _End_index)
                {
                    _Output[--_Chunks[_Index][_Radix_key(_Begin[_End_index], _Radix, _Proj_func)]] = std::move(_Begin[_End_index]);
                    --_End_index;
                }
                _Output[--_Chunks[_Index][_Radix_key(_Begin[_End_index], _Radix, _Proj_func)]] = std::move(_Begin[_End_index]);
            }
        });

        // Invoke _parallel_integer_radix_sort in parallel for each chunk
        // (_Chunks[0][_I] now holds the absolute start offset of chunk _I in _Output)
        ::Concurrency::parallel_for(static_cast<size_t>(0), static_cast<size_t>(256), [=](size_t _Index)
        {
            if (_Index < 256 - 1)
            {
                _Parallel_integer_radix_sort(_Output + _Chunks[0][_Index], _Chunks[0][_Index + 1] - _Chunks[0][_Index],
                    _Begin + _Chunks[0][_Index], _Radix - 1, _Proj_func, _Chunk_size, _Deep + 1);
            }
            else
            {
                _Parallel_integer_radix_sort(_Output + _Chunks[0][_Index], _Size - _Chunks[0][_Index],
                    _Begin + _Chunks[0][_Index], _Radix - 1, _Proj_func, _Chunk_size, _Deep + 1);
            }
        });
    }
    else
    {
        // Only one chunk has content
        // A special optimization is applied because one chunk means all numbers have a same value on this particular byte (digit).
        // Because we cannot sort them at all (they are all equal at this point), directly call _parallel_integer_radix_sort to
        // sort next byte (digit)
        _Parallel_integer_radix_sort(_Begin, _Size, _Output, _Radix - 1, _Proj_func, _Chunk_size, _Deep);
    }
}
4959 
4960 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
4961 void _Parallel_integer_sort_asc(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output,
4962  _Function _Proj_func, const size_t _Chunk_size)
4963 {
4964  typedef typename std::iterator_traits<_Random_iterator>::value_type _Value_type;
4965  // The key type of the radix sort, this must be an "unsigned integer-like" type, that is, it needs support:
4966  // operator>> (int), operator>>= (int), operator& (int), operator <, operator size_t ()
4967  typedef typename std::remove_const<typename std::remove_reference<decltype(_Proj_func(*_Begin))>::type>::type _Integer_type;
4968 
4969  // Find out the max value, which will be used to determine the highest differing byte (the radix position)
4970  _Integer_type _Max_val = ::Concurrency::parallel_reduce(_Begin, _Begin + _Size, _Proj_func(*_Begin),
4971  [=](_Random_iterator _Begin, _Random_iterator _End, _Integer_type _Init) -> _Integer_type
4972  {
4973  while (_Begin != _End)
4974  {
4975  _Integer_type _Ret = _Proj_func(*_Begin++);
4976  if (_Init < _Ret)
4977  {
4978  _Init = _Ret;
4979  }
4980  }
4981 
4982  return _Init;
4983  }, [](const _Integer_type &_A, const _Integer_type &_B) -> const _Integer_type& {return (_A < _B)? _B : _A;});
4984  size_t _Radix = 0;
4985 
4986  // Find out highest differing byte
4987  while (_Max_val >>= 8)
4988  {
4989  ++_Radix;
4990  }
4991 
4992  _Parallel_integer_radix_sort(_Begin, _Size, _Output, _Radix, _Proj_func, _Chunk_size);
4993 }
4994 
// Parallel quicksort over [_Begin, _Begin + _Size): partition around a sampled
// median pivot, sort the right half on a spawned task and the left half on the
// current thread. _Div_num throttles task creation; _Depth bounds recursion.
template<typename _Random_iterator, typename _Function>
void _Parallel_quicksort_impl(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, size_t _Div_num, const size_t _Chunk_size, int _Depth)
{
    // Serial cut-off: recursion too deep, range too small, or (for fine-grained
    // chunks) no task budget left. Note "&&" binds tighter than "||" here.
    if (_Depth >= _SORT_MAX_RECURSION_DEPTH || _Size <= _Chunk_size || _Size <= static_cast<size_t>(3) || _Chunk_size >= _FINE_GRAIN_CHUNK_SIZE && _Div_num <= 1)
    {
        return std::sort(_Begin, _Begin + _Size, _Func);
    }

    // Determine whether we need to do a three-way quick sort
    // We benefit from three-way merge if there are a lot of elements that are EQUAL to the median value,
    // _Select_median_pivot function will test redundant density by sampling
    bool _Is_three_way_split = false;
    size_t _Mid_index = _Select_median_pivot(_Begin, _Size, _Func, _Chunk_size, _Is_three_way_split);

    // Move the median value to the _Begin position.
    if (_Mid_index)
    {
        std::swap(*_Begin, _Begin[_Mid_index]);
    }
    size_t _I = 1, _J = _Size - 1;

    // Three-way or two-way partition
    // _Div_num < _MAX_NUM_TASKS_PER_CORE is checked to make sure it will never do three-way split before splitting enough tasks
    if (_Is_three_way_split && _Div_num < _MAX_NUM_TASKS_PER_CORE)
    {
        // Skip the suffix already greater than the pivot.
        while (_Func(*_Begin, _Begin[_J]))
        {
            --_J;
        }

        // Skip the prefix already less than the pivot.
        while (_Func(_Begin[_I], *_Begin))
        {
            ++_I;
        }

        // Starting from this point, left side of _I will less than median value, right side of _J will be greater than median value,
        // and the middle part will be equal to median. _K is used to scan between _I and _J
        size_t _K = _J;
        while (_I <= _K)
        {
            if (_Func(_Begin[_K], *_Begin))
            {
                std::swap(_Begin[_I++], _Begin[_K]);
            }
            else
            {
                --_K;
            }

            while (_Func(*_Begin, _Begin[_K]))
            {
                std::swap(_Begin[_K--], _Begin[_J--]);
            }
        }

        ++_J;
    }
    else
    {
        // Classic two-way Hoare-style partition around *_Begin.
        while (_I <= _J)
        {
            // Will stop before _Begin
            while (_Func(*_Begin, _Begin[_J]))
            {
                --_J;
            }

            // There must be another element equal or greater than *_Begin
            while (_Func(_Begin[_I], *_Begin))
            {
                ++_I;
            }

            if (_I < _J)
            {
                std::swap(_Begin[_I++], _Begin[_J--]);
            }
            else
            {
                break;
            }
        }

        _I = ++_J;
    }

    // Put the pivot into its final slot between the two partitions.
    std::swap(*_Begin, _Begin[--_I]);

    // NOTE(review): the declaration of _Tg (the task group used below) was on a
    // line lost when this listing was extracted -- confirm against the original
    // header. _Next_div is captured by reference by the spawned task and mutated
    // below; volatile presumably forces re-reads of the updated value -- confirm.
    volatile size_t _Next_div = _Div_num / 2;
    auto _Handle = make_task([&]
    {
        _Parallel_quicksort_impl(_Begin + _J, _Size - _J, _Func, _Next_div, _Chunk_size, _Depth+1);
    });
    _Tg.run(_Handle);

    _Parallel_quicksort_impl(_Begin, _I, _Func, _Next_div, _Chunk_size, _Depth+1);

    // If at this point, the work hasn't been scheduled, then slow down creating new tasks
    if (_Div_num < _MAX_NUM_TASKS_PER_CORE)
    {
        _Next_div /= 2;
    }

    _Tg.wait();
}
5101 
// This function will be called to sort the elements in the "_Begin" buffer. However, we can't tell whether the result will end up in buffer
// "_Begin", or buffer "_Output" when it returned. The return value is designed to indicate which buffer holds the sorted result.
// Return true if the sorted result is in the "_Output" buffer; return false if the result is in the "_Begin" buffer
// (this matches how the flag is consumed below: a "true" recursive result is merged out of _Output).
// We can't always put the result into one assigned buffer because that may cause frequent buffer copies at return time.
template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>
inline bool _Parallel_buffered_sort_impl(const _Random_iterator &_Begin, size_t _Size, _Random_buffer_iterator _Output, const _Function &_Func,
    int _Div_num, const size_t _Chunk_size)
{
    static_assert(std::is_same<typename std::iterator_traits<_Random_iterator>::value_type, typename std::iterator_traits<_Random_buffer_iterator>::value_type>::value,
        "same value type expected");

    // Leaf case: stop dividing and quicksort this range in place.
    if (_Div_num <= 1 || _Size <= _Chunk_size)
    {
        _Parallel_quicksort_impl(_Begin, _Size, _Func, _MAX_NUM_TASKS_PER_CORE, _Chunk_size, 0);

        // In case _Size <= _Chunk_size happened BEFORE the planned stop time (when _Div_num == 1) we need to calculate how many turns of
        // binary divisions are left. If there are an odd number of turns left, then the buffer move is necessary to make sure the final
        // merge result will be in the original input array.
        int _Left_div_turns = 0;
        while (_Div_num >>= 1)
        {
            _Left_div_turns++;
        }

        if (_Left_div_turns & 1)
        {
            std::move(_Begin, _Begin + _Size, _Output);
            return true;
        }
        else
        {
            return false;
        }
    }
    else
    {
        size_t _Mid = _Size / 2;
        // NOTE(review): the declaration of _Tg (the task group used below) was
        // on a line lost when this listing was extracted -- confirm against the
        // original header.

        // Sort the left half on a spawned task, the right half on this thread.
        auto _Handle = make_task([&, _Chunk_size]
        {
            _Parallel_buffered_sort_impl(_Begin, _Mid, _Output, _Func, _Div_num / 2, _Chunk_size);
        });
        _Tg.run(_Handle);

        bool _Is_buffer_swap = _Parallel_buffered_sort_impl(_Begin + _Mid, _Size - _Mid, _Output + _Mid, _Func, _Div_num / 2, _Chunk_size);

        _Tg.wait();

        // Both halves report the same buffer (same _Div_num path); merge from
        // that buffer into the other one, and report the flip to the caller.
        if (_Is_buffer_swap)
        {
            _Parallel_merge(_Output, _Mid, _Output + _Mid, _Size - _Mid, _Begin, _Func, _Div_num);
        }
        else
        {
            _Parallel_merge(_Begin, _Mid, _Begin + _Mid, _Size - _Mid, _Output, _Func, _Div_num);
        }

        return !_Is_buffer_swap;
    }
}
5163 
// Disable the warning about a constant value in a conditional expression.
// This is by design: it lets the compiler optimize away the trivial-type checks below.
5166 #pragma warning (push)
5167 #pragma warning (disable: 4127)
5168 
// Allocate a buffer of _N elements from _Alloc and, when the element type is
// not trivially default constructible, default-construct every slot.
// Returns the pointer to the first element; release it with _Destroy_buffer.
template<typename _Allocator>
inline typename _Allocator::pointer _Construct_buffer(size_t _N, _Allocator &_Alloc)
{
    typename _Allocator::pointer _Buf = _Alloc.allocate(_N);

    // Trivially default constructible elements are usable as-is straight from
    // the allocator, so the construction loop is skipped for performance.
    if (!std::is_trivially_default_constructible<typename _Allocator::value_type>::value)
    {
        for (size_t _Idx = 0; _Idx != _N; ++_Idx)
        {
            // The element type must be default constructible: build a temporary
            // and move it into the raw slot through the allocator.
            typename _Allocator::value_type _Tmp;
            _Alloc.construct(_Buf + _Idx, std::forward<typename _Allocator::value_type>(_Tmp));
        }
    }

    return _Buf;
}
5189 
// Tear down a buffer produced by _Construct_buffer: run destructors only when
// the element type actually needs them, then return the storage to _Alloc.
template<typename _Allocator>
inline void _Destroy_buffer(typename _Allocator::pointer _P, size_t _N, _Allocator &_Alloc)
{
    // Trivially destructible elements need no per-element teardown; skipping
    // the loop benefits performance.
    if (!std::is_trivially_destructible<typename _Allocator::value_type>::value)
    {
        for (size_t _Idx = 0; _Idx != _N; ++_Idx)
        {
            _Alloc.destroy(_P + _Idx);
        }
    }

    _Alloc.deallocate(_P, _N);
}
5206 
5207 //
5208 // Exception safe RAII wrapper for the allocated buffers
5209 //
5210 
// Exception-safe RAII owner of a sort scratch buffer: allocates/constructs in
// the constructor via _Construct_buffer and destroys/deallocates in the
// destructor via _Destroy_buffer, so the buffer is released on every exit path.
// (The class-head and destructor declaration lines were missing from this
// listing -- restored here; the class name is taken from its own constructor.)
template<typename _Allocator>
class _AllocatedBufferHolder
{
public:
    // Allocate (and, for non-trivial types, default-construct) _Size elements.
    _AllocatedBufferHolder(size_t _Size, const _Allocator & _Alloc): _M_alloc(_Alloc)
    {
        _M_size = _Size;
        _M_buffer = _Construct_buffer(_Size, _M_alloc);
    }

    // Destroy the elements and release the storage.
    ~_AllocatedBufferHolder()
    {
        _Destroy_buffer(_M_buffer, _M_size, _M_alloc);
    }

    // Raw access to the owned buffer; ownership stays with the holder.
    typename _Allocator::pointer _Get_buffer()
    {
        return _M_buffer;
    }

private:
    size_t _M_size;       // element count of the owned buffer
    _Allocator _M_alloc;  // copy of the allocator used for construction/teardown
    typename _Allocator::pointer _M_buffer;
};
5236 
5237 
5238 #pragma warning (pop)
5239 
5272 
// In-place parallel sort of [_Begin, _End) ordered by _Func; _Chunk_size is the
// cut-off below which a range is sorted serially.
template<typename _Random_iterator, typename _Function>
inline void parallel_sort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)
{
    _CONCRT_ASSERT(_Chunk_size > 0);

    // Check for cancellation before the algorithm starts.
    // NOTE(review): the cancellation-check call and the declaration of
    // _Core_num (the hardware concurrency used below) were on lines lost when
    // this listing was extracted -- confirm against the original header.

    size_t _Size = _End - _Begin;

    // Too little work (or a single core): a serial std::sort is cheaper than
    // spawning tasks.
    if (_Size <= _Chunk_size || _Core_num < 2)
    {
        return std::sort(_Begin, _End, _Func);
    }

    _Parallel_quicksort_impl(_Begin, _Size, _Func, _Core_num * _MAX_NUM_TASKS_PER_CORE, _Chunk_size, 0);
}
5291 
5313 
// parallel_sort overload that orders elements with the default std::less.
template<typename _Random_iterator>
inline void parallel_sort(const _Random_iterator &_Begin, const _Random_iterator &_End)
{
    typedef typename std::iterator_traits<_Random_iterator>::value_type _Value_type;
    parallel_sort(_Begin, _End, std::less<_Value_type>());
}
5319 
5362 
// Parallel merge sort of [_Begin, _End) ordered by _Func, using a scratch
// buffer obtained from _Alloc; _Chunk_size is the serial cut-off.
template<typename _Allocator, typename _Random_iterator, typename _Function>
inline void parallel_buffered_sort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)
{
    _CONCRT_ASSERT(_Chunk_size > 0);

    // Check cancellation before the algorithm starts.
    // NOTE(review): the cancellation-check call and the declaration of
    // _Core_num (hardware concurrency) were on lines lost when this listing
    // was extracted -- confirm against the original header.

    size_t _Size = _End - _Begin;

    // Too little work (or a single core): sort serially, no buffer needed.
    if (_Size <= _Chunk_size || _Core_num < 2)
    {
        return std::sort(_Begin, _End, _Func);
    }
    // Mask of all bit patterns of the form power(2, even number); used below to
    // round _Core_num up to such a power (see the long explanation).
    const static size_t _CORE_NUM_MASK = 0x55555555;

    _AllocatedBufferHolder<_Allocator> _Holder(_Size, _Alloc);

    // Prevent cancellation from happening during the algorithm in case it leaves buffers in unknown state.
    // This buffered sort algorithm will divide chunks and apply parallel quicksort on each chunk. In the end, it will
    // apply parallel merge to these sorted chunks.
    //
    // We need to decide on the number of chunks to divide the input buffer into. If we divide it into n chunks, log(n)
    // merges will be needed to get the final sorted result. In this algorithm, we have two buffers for each merge
    // operation, let's say buffer A and B. Buffer A is the original input array, buffer B is the additional allocated
    // buffer. Each turn's merge will put the merge result into the other buffer; for example, if we decided to split
    // into 8 chunks in buffer A at very beginning, after one pass of merging, there will be 4 chunks in buffer B.
    // If we apply one more pass of merging, there will be 2 chunks in buffer A again.
    //
    // The problem is we want the final merge pass to put the result back in buffer A, so that we don't need
    // one extra copy to put the sorted data back to buffer A.
    // To make sure the final result is in buffer A (original input array), we need an even number of merge passes,
    // which means log(n) must be an even number. Thus n must be a number power(2, even number). For example, when the
    // even number is 2, n is power(2, 2) = 4, when even number is 4, n is power(2, 4) = 16. When we divide chunks
    // into these numbers, the final merge result will be in the original input array. Now we need to decide the chunk(split)
    // number based on this property and the number of cores.
    //
    // We want to get a chunk (split) number close to the core number (or a little more than the number of cores),
    // and it also needs to satisfy above property. For a 8 core machine, the best chunk number should be 16, because it's
    // the smallest number that satisfies the above property and is bigger than the core number (so that we can utilize all
    // cores, a little more than core number is OK, we need to split more tasks anyway).
    //
    // In this algorithm, we will make this alignment by bit operations (it's easy and clear). For a binary representation,
    // all the numbers that satisfy power(2, even number) will be 1, 100, 10000, 1000000, 100000000 ...
    // After OR-ing these numbers together, we will get a mask (... 0101 0101 0101) which is all possible combinations of
    // power(2, even number). We use _Core_num & _CORE_NUM_MASK | _Core_num << 1 & _CORE_NUM_MASK, a bit-wise operation to align
    // _Core_num's highest bit into a power(2, even number).
    //
    // It means if _Core_num = 8, the highest bit in binary is bin(1000) which is not power(2, even number). After this
    // bit-wise operation, it will align to bin(10000) = 16 which is power(2, even number). If the _Core_num = 16, after
    // alignment it still returns 16. The trick is to make sure the highest bit of _Core_num will align to the "1" bit of the
    // mask bin(... 0101 0101 0101) We don't care about the other bits on the aligned result except the highest bit, because they
    // will be ignored in the function.
    // NOTE(review): the call that launches _Parallel_buffered_sort_impl with the
    // holder's buffer (and the cancellation guard wrapping it) lost its opening
    // and closing lines when this listing was extracted; only the argument line
    // below survives -- confirm against the original header.
    _Func, _Core_num & _CORE_NUM_MASK | _Core_num << 1 & _CORE_NUM_MASK, _Chunk_size);

}
5423 
5463 
// parallel_buffered_sort overload that supplies a default-constructed
// allocator of the caller-chosen _Allocator type.
template<typename _Allocator, typename _Random_iterator, typename _Function>
inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)
{
    return parallel_buffered_sort<_Allocator, _Random_iterator, _Function>(_Allocator(), _Begin, _End, _Func, _Chunk_size);
}
5470 
5507 
// parallel_buffered_sort overload that uses std::allocator for the scratch buffer.
template<typename _Random_iterator, typename _Function>
inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)
{
    typedef typename std::iterator_traits<_Random_iterator>::value_type _Value_type;
    parallel_buffered_sort<std::allocator<_Value_type>>(_Begin, _End, _Func, _Chunk_size);
}
5513 
5539 
// parallel_buffered_sort overload using std::allocator and std::less ordering.
template<typename _Random_iterator>
inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End)
{
    typedef typename std::iterator_traits<_Random_iterator>::value_type _Value_type;
    parallel_buffered_sort<std::allocator<_Value_type>>(_Begin, _End, std::less<_Value_type>());
}
5546 
5575 
// parallel_buffered_sort overload with a caller-chosen allocator type and
// std::less ordering.
template<typename _Allocator, typename _Random_iterator>
inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End)
{
    typedef typename std::iterator_traits<_Random_iterator>::value_type _Value_type;
    parallel_buffered_sort<_Allocator>(_Begin, _End, std::less<_Value_type>());
}
5582 
5614 
// parallel_buffered_sort overload with a caller-supplied allocator instance and
// std::less ordering.
template<typename _Allocator, typename _Random_iterator>
inline void parallel_buffered_sort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End)
{
    typedef typename std::iterator_traits<_Random_iterator>::value_type _Value_type;
    parallel_buffered_sort<_Allocator>(_Alloc, _Begin, _End, std::less<_Value_type>());
}
5620 
5621 #pragma warning(push)
5622 #pragma warning (disable: 4127)
5623 //
5624 // This is a default function used for parallel_radixsort which will return just the value.
5625 // It also performs compile-time checks to ensure that the data type is integral.
5626 //
// Default projection functor for parallel_radixsort: maps an integral value to
// the size_t key space while preserving the type's natural ordering.
// (The struct-head line was missing from this listing -- restored here; the
// name matches the functor parallel_radixsort defaults to in ppl.h.)
template <typename _DataType>
struct _Radix_sort_default_function
{
    size_t operator()(const _DataType& _Val) const
    {
        // An instance of the type predicate returns the value if the type _DataType is one of the integral types, otherwise it
        // statically asserts.
        // An integral type is one of: bool, char, unsigned char, signed char, wchar_t, short, unsigned short, int, unsigned int, long,
        // and unsigned long.
        // In addition, with compilers that provide them, an integral type can be one of long long, unsigned long long, __int64, and
        // unsigned __int64
        static_assert(std::is_integral<_DataType>::value,
            "Type should be integral to use default radix function. For more information on integral types, please refer to https://msdn.microsoft.com/en-us/library/bb983099.aspx.");
        static_assert((sizeof(_DataType) <= sizeof(size_t)), "Passed Type is bigger than size_t.");

        if (std::is_unsigned<_DataType>::value)
        {
            // Unsigned values already order correctly as size_t keys.
            return _Val;
        }
        else
        {
            // The default function needs to take the signed integer-like representation and map it to an unsigned one. The
            // following code will take the midpoint of the unsigned representable range (SIZE_MAX/2)+1 and does an unsigned
            // add of the value. Thus, it maps a [-signed_min,+signed_max] range into a [0, unsigned_max] range.
            return (((SIZE_MAX/2) + 1) + static_cast<size_t>(_Val));
        }
    }
};
5655 #pragma warning (pop)
5656 
5681 
// parallel_radixsort overload using std::allocator, the default projection,
// and the default chunk size.
template<typename _Random_iterator>
inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End)
{
    typedef typename std::iterator_traits<_Random_iterator>::value_type _DataType;

    // NOTE(review): the declaration of _Proj_func (presumably the default
    // _DataType -> size_t projection functor) was on a line lost when this
    // listing was extracted -- confirm against the original header.

    parallel_radixsort<std::allocator<_DataType>>(_Begin, _End, _Proj_func, 256 * 256);
}
5691 
5722 
// parallel_radixsort overload with a caller-supplied allocator instance and the
// default projection.
template<typename _Allocator, typename _Random_iterator>
inline void parallel_radixsort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End)
{
    typedef typename std::iterator_traits<_Random_iterator>::value_type _DataType;

    // NOTE(review): the declaration of _Proj_func (presumably the default
    // _DataType -> size_t projection functor) was on a line lost when this
    // listing was extracted -- confirm against the original header.

    parallel_radixsort<_Allocator>(_Alloc, _Begin, _End, _Proj_func);
}
5732 
5760 
// parallel_radixsort overload that supplies a default-constructed allocator of
// the caller-chosen _Allocator type.
template<typename _Allocator, typename _Random_iterator>
inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End)
{
    return parallel_radixsort<_Allocator, _Random_iterator>(_Allocator(), _Begin, _End);
}
5767 
5807 
// Parallel radix sort of [_Begin, _End) by the size_t key that _Proj_func
// extracts from each element, using a scratch buffer obtained from _Alloc.
// _Chunk_size is the cut-off below which the serial LSD radix sort is used.
template<typename _Allocator, typename _Random_iterator, typename _Function>
inline void parallel_radixsort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Proj_func, const size_t _Chunk_size = 256 * 256)
{
    _CONCRT_ASSERT(_Chunk_size > 0);

    // Check for cancellation before the algorithm starts.
    // NOTE(review): the cancellation-check call was on a line lost when this
    // listing was extracted -- confirm against the original header.

    size_t _Size = _End - _Begin;

    // If _Size <= 1, no more sorting needs to be done.
    if (_Size <= 1)
    {
        return;
    }

    // RAII scratch buffer: one element per input element, released on all paths.
    _AllocatedBufferHolder<_Allocator> _Holder(_Size, _Alloc);

    // Prevent cancellation from happening during the algorithm in case it leaves the buffers in unknown state.
    // NOTE(review): the cancellation guard wrapping the call below lost its
    // opening and closing lines when this listing was extracted -- confirm
    // against the original header.
    _Parallel_integer_sort_asc(_Begin, _Size, stdext::make_unchecked_array_iterator(_Holder._Get_buffer()), _Proj_func, _Chunk_size);
}
5831 
5868 
// parallel_radixsort overload that supplies a default-constructed allocator of
// the caller-chosen _Allocator type.
template<typename _Allocator, typename _Random_iterator, typename _Function>
inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Proj_func, const size_t _Chunk_size = 256 * 256)
{
    return parallel_radixsort<_Allocator, _Random_iterator, _Function>(_Allocator(), _Begin, _End, _Proj_func, _Chunk_size);
}
5875 
5909 
// parallel_radixsort overload that uses std::allocator for the scratch buffer.
template<typename _Random_iterator, typename _Function>
inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Proj_func, const size_t _Chunk_size = 256 * 256)
{
    typedef typename std::iterator_traits<_Random_iterator>::value_type _Value_type;
    parallel_radixsort<std::allocator<_Value_type>>(_Begin, _End, _Proj_func, _Chunk_size);
}
5916 
5917 #pragma pop_macro("_SORT_MAX_RECURSION_DEPTH")
5918 #pragma pop_macro("_MAX_NUM_TASKS_PER_CORE")
5919 #pragma pop_macro("_FINE_GRAIN_CHUNK_SIZE")
5920 }
5921 
5922 namespace concurrency = ::Concurrency;
5923 
5924 #pragma pop_macro("new")
5925 #pragma pack(pop)
void _Parallel_for_impl(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function &_Func, _Partitioner &&_Part)
Definition: ppl.h:2473
Definition: ppl.h:3019
_Type _Get_num_chunks(_Type)
Definition: ppl.h:1750
volatile long _M_completion_count
Definition: ppl.h:2115
A cancellation beacon is a flag which can be polled in an inlinable fashion using the is_signaled met...
Definition: concrt.h:5299
void _Wait_on_intrusive_steal()
Definition: ppl.h:2075
void _Parallel_for_each_chunk(_Forward_iterator &_First, const _Forward_iterator &_Last, const _Function &_Func, task_group &_Task_group)
Definition: ppl.h:2808
Definition: concrt.h:364
_CRTIMP2 long __cdecl GetCurrentThreadId()
_CONCRTIMP bool _IsCanceling()
Informs the caller whether or not the task collection is currently in the midst of cancellation...
void _Parallel_quicksort_impl(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, size_t _Div_num, const size_t _Chunk_size, int _Depth)
Definition: ppl.h:4996
size_t _Select_median_pivot(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, const size_t _Chunk_size, bool &_Potentially_equal)
Definition: ppl.h:4602
std::function< _Ty()> _M_fnInitialize
Definition: ppl.h:4519
_Variant_copymove_layer_ & operator=(_Variant_copymove_layer_ &&_That) _NOEXCEPT_OP((conjunction< is_nothrow_move_constructible< _Types >...
::Concurrency::details::_TaskCollection _M_task_collection
Definition: ppl.h:842
static _CONCRTIMP void __cdecl _Yield()
_Ty & local()
Returns a reference to the thread-private sub-computation.
Definition: ppl.h:4311
void _Parallel_chunk_task_group_run(structured_task_group &_Task_group, task_handle< _Worker_class > *_Chunk_helpers, const Partitioner &, _Index_type _I)
Definition: ppl.h:2345
static void __cdecl _Invoke(const _Random_iterator &_First, _Index_type &_Index, const _Function &_Func)
Definition: ppl.h:1784
An event type that marks the beginning of a start/end event pair.
Definition: concrt.h:5465
void _Parallel_integer_sort_asc(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output, _Function _Proj_func, const size_t _Chunk_size)
Definition: ppl.h:4961
Definition: xutility:531
const _Sub_function & _Sub_fun
Definition: ppl.h:3000
bool _Send_range(_Range< _Index_type > *_Worker_range)
Definition: ppl.h:2003
size_t _M_size
Definition: ppl.h:4518
_Parallel_fixed_chunk_helper< _Random_iterator, _Index_type, _Function, static_partitioner, _Is_iterator > _Base
Definition: ppl.h:2314
The combinable object is intended to provide thread-private copies of data, to perform lock-free t...
Definition: ppl.h:4182
Structured task collections represent groups of work which follow a strictly LIFO ordered paradigm qu...
Definition: concrt.h:4495
volatile _Index_type _M_current
Definition: ppl.h:1889
const _Unary_operator& _M_unary_op
Definition: ppl.h:3762
~combinable()
Destroys a combinable object.
Definition: ppl.h:4297
size_t _M_number
Definition: ppl.h:3050
void _Populate(_Forward_iterator &_First, size_t _Length)
Definition: ppl.h:3610
unsigned int _M_len
Definition: ppl.h:2798
_Bucket * _M_root
Definition: ppl.h:3051
#define _TRACE_LEVEL_INFORMATION
Definition: ppl.h:38
void _NotifyCancel()
Definition: ppl.h:2089
_Worker_proxy * _M_pParent_worker
Definition: ppl.h:2109
Implements busy wait with no backoff
Definition: concrt.h:578
TaskProc m_pFunction
Definition: concrt.h:4191
void run_with_cancellation_token(const _Function &_Func, cancellation_token _Ct)
Executes a function object immediately and synchronously in the context of a given cancellation token...
Definition: ppl.h:865
#define _CONCRT_ASSERT(x)
Definition: concrt.h:123
location & _M_chunk_location
Definition: ppl.h:2337
_CONCRTIMP void __cdecl _Trace_ppl_function(const GUID &_Guid, unsigned char _Level, ConcRT_EventType _Type)
_Parallel_chunk_helper(_Index_type, const _Random_iterator &_First, _Index_type _First_iteration, _Index_type _Last_iteration, const _Index_type &_Step, const _Function &_Func, const _Partitioner &, _Worker_proxy< _Index_type > *const _Parent_data=NULL)
Definition: ppl.h:2137
combinable(const combinable &_Copy)
Constructs a new combinable object.
Definition: ppl.h:4266
_Index_type _Number_of_iterations() const
Definition: ppl.h:1852
task_group_status run_and_wait(const _Function &_Func)
Schedules a task to be run inline on the calling context with the assistance of the structured_task_g...
Definition: ppl.h:427
location * _M_pChunk_locations
Definition: ppl.h:1766
~_Worker_proxy()
Definition: ppl.h:1905
void _Integer_radix_pass(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output, size_t _Radix, _Function _Proj_func)
Definition: ppl.h:4727
task_handle< _Function > make_task(const _Function &_Func)
A factory method for creating a task_handle object.
Definition: ppl.h:165
unsigned int size_t
Definition: sourceannotations.h:19
std::iterator_traits< _Forward_iterator >::value_type parallel_reduce(_Forward_iterator _Begin, _Forward_iterator _End, const typename std::iterator_traits< _Forward_iterator >::value_type &_Identity)
Computes the sum of all elements in a specified range by computing successive partial sums...
Definition: ppl.h:3287
static cancellation_token none()
Returns a cancellation token which can never be subject to cancellation.
Definition: pplcancellation_token.h:628
_Node * _FindLocalItem(unsigned long _Key, size_t *_PIndex)
Definition: ppl.h:4482
_CONCRTIMP const GUID PPLParallelForEventGuid
A category GUID describing ETW events fired by the Concurrency Runtime that are directly related to u...
_CONCRTIMP void _Cancel()
Cancels work on the task collection.
void operator()() const
Definition: ppl.h:3695
_In_ long
Definition: corecrt_wstdlib.h:88
std::iterator_traits< _Random_iterator >::reference _Load(size_t _Index) const
Definition: ppl.h:3672
void(__cdecl * TaskProc)(void *)
Concurrency::details contains definitions of support routines in the public namespaces and one or mor...
Definition: concrt.h:251
_CONCRTIMP bool __cdecl is_current_task_group_canceling()
Returns an indication of whether the task group which is currently executing inline on the current co...
_Bucket * _Next
Definition: ppl.h:3027
_Parallel_transform_unary_helper(_Input_iterator &_First, _Input_iterator _Last, _Output_iterator &_Result, const _Unary_operator&_Unary_op)
Definition: ppl.h:3741
void * align(size_t _Bound, size_t _Size, void *&_Ptr, size_t &_Space) _NOEXCEPT
Definition: memory:2409
void operator()() const
Definition: ppl.h:2325
_In_ size_t _Deref_pre_opt_z_ char const _In_ size_t _N
Definition: wchar.h:78
_CONCRTIMP const GUID PPLParallelInvokeEventGuid
A category GUID describing ETW events fired by the Concurrency Runtime that are directly related to u...
_CONCRTIMP void _Schedule(_UnrealizedChore *_PChore, location *_PLocation)
Schedules a chore that can potentially run in parallel. The chore is pushed onto the associated works...
_Combinable_type & _Combinable
Definition: ppl.h:3003
void _Parallel_merge(_Random_iterator _Begin1, size_t _Len1, _Random_buffer_iterator _Begin2, size_t _Len2, _Random_output_iterator _Output, const _Function &_Func, size_t _Div_num)
Definition: ppl.h:4692
void combine_each(_Function _FnCombine) const
Computes a final value from the set of thread-local sub-computations by calling the supplied combine ...
Definition: ppl.h:4448
_Parallel_chunk_helper(const _Random_iterator &_First, const _Index_type &_Step, const _Function &_Func, const _Range< _Index_type > &_Worker_range, _Worker_proxy< _Index_type > *const _Parent_data=NULL)
Definition: ppl.h:2146
void _IncrementConstructedElemsCount()
Definition: concrt.h:1072
location & _Get_chunk_location(unsigned int _ChunkIndex)
Definition: ppl.h:1744
_Functor::_Bucket_type *const _M_bucket
Definition: ppl.h:3332
_Bucket(_Bucket *_N)
Definition: ppl.h:3029
void operator()() const
Definition: ppl.h:2287
STL namespace.
_CONCRTIMP void *__cdecl Alloc(size_t _NumBytes)
Allocates a block of memory of the size specified from the Concurrency Runtime Caching Suballocator...
__declspec(align(64)) struct _Node
Definition: ppl.h:4190
const _Function & _M_function
Definition: ppl.h:2266
_Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator&_Unary_op, const auto_partitioner &_Part=auto_partitioner())
Applies a specified function object to each element in a source range, or to a pair of elements from ...
Definition: ppl.h:3856
_Parallel_transform_binary_helper(_Input_iterator1 &_First1, _Input_iterator1 _Last1, _Input_iterator2 &_First2, _Output_iterator &_Result, const _Binary_operator&_Binary_op)
Definition: ppl.h:3686
The Concurrency namespace provides classes and functions that provide access to the Concurrency Runti...
Definition: agents.h:43
void _Parallel_for_each_forward_impl(_Forward_iterator &_First, const _Forward_iterator &_Last, const _Function &_Func, task_group &_Task_group)
Definition: ppl.h:2824
static void _Parallel_transform_unary_impl(_Input_iterator _Begin, _Input_iterator _End, _Output_iterator &_Result, const _Unary_operator&_Unary_op, const auto_partitioner &)
Definition: ppl.h:3517
void _Parallel_integer_radix_sort(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output, size_t _Radix, _Function _Proj_func, const size_t _Chunk_size, size_t _Deep=0)
Definition: ppl.h:4794
std::iterator_traits< _Forward_iterator >::value_type _Value_type
Definition: ppl.h:2769
unchecked_array_iterator< _Iterator > make_unchecked_array_iterator(const _Iterator _Ptr)
Definition: iterator:693
void _Integer_radix_sort(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output, size_t _Radix, _Function _Proj_func, size_t _Deep=0)
Definition: ppl.h:4757
void _Disable_intrusive_steal()
Definition: ppl.h:2032
_CRT_BEGIN_C_HEADER _Check_return_ _Ret_maybenull_ _In_ size_t _Size
Definition: corecrt_malloc.h:58
~task_handle()
Destroys the task_handle object.
Definition: ppl.h:110
void interruption_point()
Creates an interruption point for cancellation. If a cancellation is in progress in the context where...
Definition: ppl.h:880
_CONCRTIMP void _Cancel()
Cancels work on the task collection.
__declspec(property(get=_Get_current_iteration, put=_Set_current_iteration)) _Index_type _M_current_iteration
_ElemType * _InitOnRawMalloca(void *_MallocaRet)
Definition: concrt.h:1062
void clear()
Clears any intermediate computational results from a previous usage.
Definition: ppl.h:4361
void parallel_invoke(const _Function1 &_Func1, const _Function2 &_Func2)
Executes the function objects supplied as parameters in parallel, and blocks until they have finished...
Definition: ppl.h:945
task_handle const & operator=(task_handle const &)
_Allocator _M_alloc
Definition: ppl.h:5233
_Parallel_reduce_fixed_worker(_Forward_iterator _Begin, _Forward_iterator _End, const _Functor &_Fun)
Definition: ppl.h:3319
_Iterator_helper< _Input_iterator2, typename std::iterator_traits< _Input_iterator2 >::iterator_category > _M_input_helper2
Definition: ppl.h:3708
_In_ size_t _In_ int _Index
Definition: time.h:102
task_group()
Constructs a new task_group object.
Definition: ppl.h:504
_CONCRTIMP const GUID PPLParallelForeachEventGuid
A category GUID describing ETW events fired by the Concurrency Runtime that are directly related to u...
_OutIt _Move_no_deprecate(_InIt _First, _InIt _Last, _OutIt _Dest)
Definition: xutility:2558
_In_ size_t _In_ int _Radix
Definition: corecrt_wstdlib.h:53
void operator()() const
Definition: ppl.h:2159
void _Set_last_iteration(const _Index_type _I)
Definition: ppl.h:1878
_TaskCollectionStatus _Wait()
Waits for all chores running in the _StructuredTaskCollection to finish (normally or abnormally)...
Definition: concrt.h:4604
static _CONCRTIMP _Context __cdecl _CurrentContext()
The static_partitioner class represents a static partitioning of the range iterated over by parallel_...
Definition: ppl.h:1641
_Iterator_helper< _Input_iterator1, typename std::iterator_traits< _Input_iterator1 >::iterator_category > _M_input_helper1
Definition: ppl.h:3707
The simple_partitioner class represents a static partitioning of the range iterated over by parallel_...
Definition: ppl.h:1670
structured_task_group()
Constructs a new structured_task_group object.
Definition: ppl.h:219
size_t _M_size
Definition: ppl.h:5232
void _Set_tree_done()
Definition: ppl.h:2056
void _Set_current_iteration(const _Index_type _I)
Definition: ppl.h:1864
_TaskCollectionStatus _Wait()
Waits for all chores running in the _TaskCollection to finish (normally or abnormally). This method encapsulates all the running tasks in an exception handling block, and will re-throw any exceptions that occur in any of it tasks (if those exceptions occur on another thread, they are marshaled from that thread to the thread where the _TaskCollection was created, and re-thrown). After this function returns, the _TaskCollection cannot be used for scheduling further work.
Definition: concrt.h:4809
size_t _Search_mid_point(const _Random_iterator &_Begin1, size_t &_Len1, const _Random_buffer_iterator &_Begin2, size_t &_Len2, const _Function &_Func)
Definition: ppl.h:4619
Definition: ppl.h:1805
::Concurrency::details::_Cancellation_beacon _M_beacon
Definition: ppl.h:2112
bool _Parallel_buffered_sort_impl(const _Random_iterator &_Begin, size_t _Size, _Random_buffer_iterator _Output, const _Function &_Func, int _Div_num, const size_t _Chunk_size)
Definition: ppl.h:5107
volatile _Index_type _M_last
Definition: ppl.h:1890
_Bucket * _Unsafe_push_back()
Definition: ppl.h:3101
_Parallel_for_each_helper(_Forward_iterator &_First, const _Forward_iterator &_Last, const _Function &_Func)
Definition: ppl.h:2772
bool _SpinOnce()
Spins for one time quantum,until a maximum spin is reached.
Definition: concrt.h:626
~task_group()
Destroys a task_group object. You are expected to call the either the wait or run_and_wait method on ...
Definition: ppl.h:536
void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function &_Func, const auto_partitioner &_Part)
Definition: ppl.h:2441
const _Index_type _M_first_iteration
Definition: ppl.h:2304
_Worker_proxy(_Worker_proxy *_PParent_worker=NULL)
Definition: ppl.h:1899
_Ty _Serial_combine_release()
Definition: ppl.h:3079
void _InitCopy(const combinable &_Copy)
Definition: ppl.h:4467
_Range(_Index_type _Current_iteration, _Index_type _Last_iteration)
Definition: ppl.h:1810
void run(const _Function &_Func)
Schedules a task on the task_group object. If a task_handle object is passed as a parameter to run...
Definition: ppl.h:573
~structured_task_group()
Destroys a structured_task_group object. You are expected to call either the wait or run_and_wait met...
Definition: ppl.h:252
void run(task_handle< _Function > &_Task_handle)
Schedules a task on the structured_task_group object. The caller manages the lifetime of the task_han...
Definition: ppl.h:285
#define _malloca(size)
Definition: malloc.h:127
void _Parallel_chunk_impl(const _Random_iterator &_First, _Index_type _Range_arg, const _Index_type &_Step, const _Function &_Func, _Partitioner &&_Part)
Definition: ppl.h:2366
task_group_status run_and_wait(task_handle< _Function > &_Task_handle)
Schedules a task to be run inline on the calling context with the assistance of the structured_task_g...
Definition: ppl.h:388
void _Put(const _Ty &_Cur)
Definition: ppl.h:3042
bool _Is_done()
Definition: ppl.h:2043
_In_ wctype_t _Type
Definition: corecrt_wctype.h:111
_Ty & local(bool &_Exists)
Returns a reference to the thread-private sub-computation.
Definition: ppl.h:4338
void run(task_handle< _Function > &_Task_handle, location &_Placement)
Schedules a task on the task_group object. If a task_handle object is passed as a parameter to run...
Definition: ppl.h:696
void operator()() const
Definition: ppl.h:3748
static void _Parallel_transform_unary_impl(_Random_input_iterator _Begin, _Random_input_iterator _End, _Random_output_iterator &_Result, const _Unary_operator&_Unary_op, _Partitioner &&_Part)
Definition: ppl.h:3529
std::iterator_traits< _Forward_iterator >::value_type value_type
Definition: ppl.h:3587
_Reduce_type parallel_reduce(_Forward_iterator _Begin, _Forward_iterator _End, const _Reduce_type &_Identity, const _Range_reduce_fun &_Range_fun, const _Sym_reduce_fun &_Sym_fun)
Computes the sum of all elements in a specified range by computing successive partial sums...
Definition: ppl.h:3166
void _Parallel_reduce_random_executor(_Random_iterator _Begin, _Random_iterator _End, const _Function &_Fun)
Definition: ppl.h:3368
_CONCRTIMP void __cdecl Free(_Pre_maybenull_ _Post_invalid_ void *_PAllocation)
Releases a block of memory previously allocated by the Alloc method to the Concurrency Runtime Cachin...
void _Parallel_for_each_partitioned_impl(const _Random_iterator &_First, _Index_type _Range_arg, _Index_type _Step, const _Function &_Func, const auto_partitioner &_Part)
Definition: ppl.h:2856
const _Random_iterator & _M_first
Definition: ppl.h:2300
void parallel_buffered_sort(const _Allocator &_Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size=2048)
Arranges the elements in a specified range into a nondescending order, or according to an ordering cr...
Definition: ppl.h:5364
~auto_partitioner()
Destroys a auto_partitioner object.
Definition: ppl.h:1627
const _Index_type & _M_step
Definition: ppl.h:2301
_Bucket * _Construct(_Bucket *_Par=0)
Definition: ppl.h:3055
size_t _GetAllocationSize() const
Definition: concrt.h:1104
The auto_partitioner class represents the default method parallel_for, parallel_for_each and parallel...
Definition: ppl.h:1614
_CONCRTIMP _TaskCollectionStatus __stdcall _RunAndWait(_UnrealizedChore *_PChore=NULL)
A cancellation friendly wrapper with which to execute _PChore and then waits for all chores running i...
const _Binary_operator& _M_binary_op
Definition: ppl.h:3710
void parallel_sort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size=2048)
Arranges the elements in a specified range into a nondescending order, or according to an ordering cr...
Definition: ppl.h:5274
affinity_partitioner()
Constructs an affinity_partitioner object.
Definition: ppl.h:1731
void parallel_for_each(const extent< _Rank > &_Compute_domain, const _Kernel_type &_Kernel)
Invokes a parallel computation of a kernel function over a compute domain on an accelerator_view. The accelerator_view is determined from the arrays and/or array_views captured by the kernel function, or if no accelerator_view can be derived, the default is chosen.
Definition: amp.h:7020
void parallel_for(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function &_Func, _Partitioner &&_Part)
parallel_for iterates over a range of indices and executes a user-supplied function at each iteration...
Definition: ppl.h:2555
~_AllocatedBufferHolder()
Definition: ppl.h:5221
void _Enable_intrusive_steal(_Range< _Index_type > *_Worker_range)
Definition: ppl.h:2026
auto_partitioner()
Constructs a auto_partitioner object.
Definition: ppl.h:1621
#define _FINE_GRAIN_CHUNK_SIZE
Definition: ppl.h:4549
void _Store(const value_type &_Elem, size_t _Index) const
Definition: ppl.h:3667
#define _SORT_MAX_RECURSION_DEPTH
Definition: ppl.h:4552
An event type that marks the beginning of a start/end event pair.
Definition: concrt.h:5460
void _Propagate_cancel()
Definition: ppl.h:2094
void _Parallel_for_each_impl(_Forward_iterator _First, const _Forward_iterator &_Last, const _Function &_Func, const auto_partitioner &, std::forward_iterator_tag)
Definition: ppl.h:2841
_Node *volatile * _M_buckets
Definition: ppl.h:4517
_Iterator_helper< _Output_iterator, typename std::iterator_traits< _Output_iterator >::iterator_category > _M_output_helper
Definition: ppl.h:3709
char int *typedef int(__CRTDECL *_CRT_REPORT_HOOKW)(int
Definition: crtdbg.h:45
structured_task_group(cancellation_token _CancellationToken)
Constructs a new structured_task_group object.
Definition: ppl.h:236
_In_ wchar_t _C
Definition: wchar.h:253
task_group_status run_and_wait(const _Function &_Func)
Schedules a task to be run inline on the calling context with the assistance of the task_group object...
Definition: ppl.h:796
_CONCRTIMP bool _IsCanceling()
Informs the caller whether or not the task collection is currently in the midst of a cancellation...
_Base _M_fixed_helper
Definition: ppl.h:2338
bool _GetRuntimeOwnsLifetime() const
Definition: concrt.h:4232
_Parallel_reduce_forward_executor_helper(const _Parallel_reduce_forward_executor_helper &_Other)
Definition: ppl.h:3433
_Allocator::pointer _M_buffer
Definition: ppl.h:5234
size_t _Median_of_nine(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, bool &_Potentially_equal)
Definition: ppl.h:4584
static _Ty _DefaultInit()
Definition: ppl.h:4204
void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End)
Arranges elements in a specified range into an non descending order using a radix sorting algorithm...
Definition: ppl.h:5683
_CONCRTIMP void _Schedule(_UnrealizedChore *_PChore, location *_PLocation)
Schedules a chore that can potentially run in parallel. The chore is pushed onto the associated works...
#define false
Definition: stdbool.h:16
_Allocator::pointer _Construct_buffer(size_t _N, _Allocator &_Alloc)
Definition: ppl.h:5171
::Concurrency::details::_TaskCollectionBase * _OwningCollection() const
Definition: concrt.h:4218
const _Index_type _M_last_iteration
Definition: ppl.h:2269
unsigned long long _Size_type
Definition: ppl.h:1673
The structured_task_group class represents a highly structured collection of parallel work...
Definition: ppl.h:205
combinable & operator=(const combinable &_Copy)
Assigns to a combinable object from another combinable object.
Definition: ppl.h:4282
void * _InterlockedCompareExchangePointer(void *volatile *, void *, void *)
const _Functor & _M_fun
Definition: ppl.h:3330
combinable(_Function _FnInitialize)
Constructs a new combinable object.
Definition: ppl.h:4246
const _Reduce_type & _Identity_value
Definition: ppl.h:3001
task_group_status wait()
Waits until all work on the structured_task_group has completed or is canceled.
Definition: ppl.h:349
void _SetRuntimeOwnsLifetime(bool _FValue)
Definition: concrt.h:4225
_Range< _Index_type > *volatile _M_pWorker_range
Definition: ppl.h:2118
void _Parallel_invoke_impl(const _Function1 &_Func1, const _Function2 &_Func2)
Definition: ppl.h:908
_Parallel_localized_chunk_helper(_Index_type _Chunk_index, const _Random_iterator &_First, _Index_type _First_iteration, _Index_type _Last_iteration, const _Index_type &_Step, const _Function &_Func, affinity_partitioner &_Part)
Definition: ppl.h:2316
Task collections represent groups of work which step outside the strict structuring of the _Structure...
Definition: concrt.h:4702
_Function _M_function
Definition: ppl.h:137
Definition: ppl.h:3583
~simple_partitioner()
Destroys a simple_partitioner object.
Definition: ppl.h:1695
_Size_type _M_chunk_size
Definition: ppl.h:1713
size_t _Radix_key(const _Ty &_Val, size_t _Radix, _Function _Proj_func)
Definition: ppl.h:4720
The task_group class represents a collection of parallel work which can be waited on or canceled...
Definition: ppl.h:490
_CONCRTIMP _TaskCollectionStatus __stdcall _RunAndWait(_UnrealizedChore *_PChore=NULL)
A cancellation friendly wrapper with which to execute _PChore and then waits for all chores running i...
An abstraction of a physical location on hardware.
Definition: concrt.h:1825
const _Forward_iterator _M_end
Definition: ppl.h:3331
void run(const _Function &_Func, location &_Placement)
Schedules a task on the task_group object. If a task_handle object is passed as a parameter to run...
Definition: ppl.h:614
void _Insert(_Bucket *_Item)
Definition: ppl.h:3034
static _CONCRTIMP location __cdecl current()
Returns a location object representing the most specific place the calling thread is executing...
task_group(cancellation_token _CancellationToken)
Constructs a new task_group object.
Definition: ppl.h:520
bool is_canceling()
Informs the caller whether or not the task group is currently in the midst of a cancellation. This does not necessarily indicate that the cancel method was called on the structured_task_group object (although such certainly qualifies this method to return true). It may be the case that the structured_task_group object is executing inline and a task group further up in the work tree was canceled. In cases such as these where the runtime can determine ahead of time that cancellation will flow through this structured_task_group object, true will be returned as well.
Definition: ppl.h:463
size_t _Populate(_Random_iterator &_First, _Random_iterator _Last)
Definition: ppl.h:3643
unsigned int _M_num_chunks
Definition: ppl.h:1763
void operator()() const
The function call operator that the runtime invokes to perform the work of the task handle...
Definition: ppl.h:126
void _Store(const value_type &_Elem, size_t _Index) const
Definition: ppl.h:3618
void _Send_range(_Range< _Index_type > *_Helper_range)
Definition: ppl.h:1818
_Type _Get_num_chunks(_Type) const
Definition: ppl.h:1630
std::iterator_traits< _Forward_iterator >::reference _Load(size_t _Index) const
Definition: ppl.h:3623
#define _MAX_NUM_TASKS_PER_CORE
Definition: ppl.h:4543
void cancel()
Makes a best effort attempt to cancel the sub-tree of work rooted at this task group. Every task scheduled on the task group will get canceled transitively if possible.
Definition: ppl.h:812
_ElemType * _AddRawMallocaNode(void *_MallocaRet)
Definition: concrt.h:1124
_In_ _Value
Definition: corecrt_wstdlib.h:65
void _Set_done()
Definition: ppl.h:2048
bool _Receive_range(_Range< _Index_type > *_Helper_range)
Definition: ppl.h:1919
bool _Verify_beacon_cancellation()
Definition: ppl.h:2067
~_Parallel_reduce_forward_executor_helper()
Definition: ppl.h:3449
#define _T(x)
Definition: tchar.h:2427
void _Steal_range(_Range< _Index_type > *_Helper_range)
Definition: ppl.h:1838
task_group_status
Describes the execution status of a task_group or structured_task_group object. A value of this type ...
Definition: pplinterface.h:102
void _Destroy_buffer(typename _Allocator::pointer _P, size_t _N, _Allocator &_Alloc)
Definition: ppl.h:5192
_Reduce_type _Reduce_type
Definition: ppl.h:3005
size_t _Median_of_three(const _Random_iterator &_Begin, size_t _A, size_t _B, size_t _C, const _Function &_Func, bool &_Potentially_equal)
Definition: ppl.h:4555
_Iterator_helper< _Input_iterator, typename std::iterator_traits< _Input_iterator >::iterator_category > _M_input_helper
Definition: ppl.h:3760
_Allocator::pointer _Get_buffer()
Definition: ppl.h:5226
_Parallel_reduce_forward_executor_helper(_Forward_iterator &_First, _Forward_iterator _Last, const _Function &_Func)
Definition: ppl.h:3413
void run(task_handle< _Function > &_Task_handle)
Schedules a task on the task_group object. If a task_handle object is passed as a parameter to run...
Definition: ppl.h:653
void operator()() const
Definition: ppl.h:2783
task_group_status run_and_wait(task_handle< _Function > &_Task_handle)
Schedules a task to be run inline on the calling context with the assistance of the task_group object...
Definition: ppl.h:757
simple_partitioner(_Size_type _Chunk_size)
Constructs a simple_partitioner object.
Definition: ppl.h:1683
_Order_combinable(const _Sym_fun &_Fun)
Definition: ppl.h:3061
_Type _Get_num_chunks(_Type) const
Definition: ppl.h:1659
void swap(any &_Left, any &_Right) _NOEXCEPT
Definition: any:450
bool _Is_helper_registered()
Definition: ppl.h:2038
long __cdecl _InterlockedDecrement(long volatile *)
const _Sym_fun & _M_fun
Definition: ppl.h:3049
static_partitioner()
Constructs a static_partitioner object.
Definition: ppl.h:1648
#define _CONCRTIMP
Definition: crtdefs.h:48
_CONCRTIMP size_t __cdecl _GetCombinableSize()
_Iterator_helper< _Output_iterator, typename std::iterator_traits< _Output_iterator >::iterator_category > _M_output_helper
Definition: ppl.h:3761
constexpr remove_reference< _Ty >::type && move(_Ty &&_Arg) _NOEXCEPT
Definition: type_traits:1349
_Diff _Count
Definition: algorithm:1941
task_handle< _Worker_class > * _Workers
Definition: ppl.h:3410
const _Index_type _M_last_iteration
Definition: ppl.h:2305
_Worker_proxy< _Index_type > *const _M_parent_worker
Definition: ppl.h:2271
static _CONCRTIMP unsigned int __cdecl _GetNumberOfVirtualProcessors()
_Check_return_ _Ret_maybenull_ _In_ size_t _In_ size_t _Offset
Definition: corecrt_malloc.h:159
void _Merge_chunks(_Random_iterator _Begin1, const _Random_iterator &_End1, _Random_buffer_iterator _Begin2, const _Random_buffer_iterator &_End2, _Random_output_iterator _Output, const _Function &_Func)
Definition: ppl.h:4664
::Concurrency::details::_Context _M_context
Definition: ppl.h:2113
std::iterator_traits< _Random_iterator >::value_type value_type
Definition: ppl.h:3637
void _Populate(_Random_iterator &_First, size_t _Length)
Definition: ppl.h:3661
The affinity_partitioner class is similar to the static_partitioner class, but it improves cache affi...
Definition: ppl.h:1723
void _Parallel_for_impl(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function &_Func)
Definition: ppl.h:2509
char _Value[(sizeof(_Ty)/sizeof(char))]
Definition: ppl.h:3026
_AllocatedBufferHolder(size_t _Size, const _Allocator &_Alloc)
Definition: ppl.h:5215
const _Index_type _M_first_iteration
Definition: ppl.h:2268
_Ty combine(_Function _FnCombine) const
Computes a final value from the set of thread-local sub-computations by calling the supplied combine ...
Definition: ppl.h:4392
static void _Parallel_transform_binary_impl(_Random_input_iterator1 _Begin1, _Random_input_iterator1 _End1, _Random_input_iterator2 _Begin2, _Random_output_iterator &_Result, const _Binary_operator&_Binary_op, _Partitioner &&_Part)
Definition: ppl.h:3563
void cancel()
Makes a best effort attempt to cancel the sub-tree of work rooted at this task group. Every task scheduled on the task group will get canceled transitively if possible.
Definition: ppl.h:444
Definition: ppl.h:1896
~static_partitioner()
Destroys a static_partitioner object.
Definition: ppl.h:1656
static void __cdecl _Invoke(const _Random_iterator &_First, _Index_type &_Index, const _Function &_Func)
Definition: ppl.h:1796
_Type _Get_num_chunks(_Type _Range_arg) const
Definition: ppl.h:1698
void _InitNew()
Definition: ppl.h:4460
long __cdecl _InterlockedIncrement(long volatile *)
_Parallel_fixed_chunk_helper(_Index_type, const _Random_iterator &_First, _Index_type _First_iteration, _Index_type _Last_iteration, const _Index_type &_Step, const _Function &_Func, const _Partitioner &)
Definition: ppl.h:2280
void _Parallel_transform_binary_impl2(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Input_iterator2 _First2, _Output_iterator &_Result, const _Binary_operator&_Binary_op, task_group &_Tg)
Definition: ppl.h:3717
bool _Is_beacon_signaled()
Definition: ppl.h:2062
size_t _Populate(_Forward_iterator &_First, _Forward_iterator _Last)
Definition: ppl.h:3596
_Iterator_helper()
Definition: ppl.h:3589
_Combinable_type::_Bucket _Bucket_type
Definition: ppl.h:3006
_Parallel_reduce_fixed_worker< _Forward_iterator, _Function > _Worker_class
Definition: ppl.h:3409
_FwdIt const _Ty _Val
Definition: algorithm:1938
const _Function & _M_function
Definition: ppl.h:2302
#define SIZE_MAX
Definition: limits.h:76
size_t operator()(const _DataType &_Val) const
Definition: ppl.h:5630
The task_handle class represents an individual parallel work item. It encapsulates the instructions a...
Definition: ppl.h:85
_Result
Definition: corecrt_wconio.h:362
_Index_type _Get_current_iteration() const
Definition: ppl.h:1858
_Reduce_functor_helper(const _Reduce_type &_Identity, const _Sub_function &_Sub_fun, _Combinable_type &&_Comb)
Definition: ppl.h:3008
Definition: type_traits:1311
const _Function & _M_function
Definition: ppl.h:2796
~affinity_partitioner()
Destroys an affinity_partitioner object.
Definition: ppl.h:1739
_FwdIt _Last
Definition: algorithm:1936
void run(task_handle< _Function > &_Task_handle, location &_Placement)
Schedules a task on the structured_task_group object. The caller manages the lifetime of the task_han...
Definition: ppl.h:323
combinable()
Constructs a new combinable object.
Definition: ppl.h:4221
_Function::_Reduce_type _Parallel_reduce_impl(_Forward_iterator _First, const _Forward_iterator &_Last, const _Function &_Func, std::forward_iterator_tag)
Definition: ppl.h:3295
_Range< _Index_type > *volatile _M_pHelper_range
Definition: ppl.h:2106
_Index_type _Get_last_iteration() const
Definition: ppl.h:1872
const _Random_iterator & _M_first
Definition: ppl.h:2264
const _Index_type & _M_step
Definition: ppl.h:2265
void sort(_RanIt _First, _RanIt _Last, _Pr _Pred)
Definition: algorithm:2913
::Concurrency::details::_StructuredTaskCollection _M_task_collection
Definition: ppl.h:474
~_Order_combinable()
Definition: ppl.h:3067
void _Parallel_transform_unary_impl2(_Input_iterator _First, _Input_iterator _Last, _Output_iterator &_Result, const _Unary_operator&_Unary_op, task_group &_Tg)
Definition: ppl.h:3769
void parallel_for(_Index_type _First, _Index_type _Last, const _Function &_Func, affinity_partitioner &_Part)
parallel_for iterates over a range of indices and executes a user-supplied function at each iteration...
Definition: ppl.h:2747
_Node * _AddLocalItem(unsigned long _Key, size_t _Index)
Definition: ppl.h:4503
#define NULL
Definition: corecrt.h:158
void _Parallel_reduce_forward_executor(_Forward_iterator _First, _Forward_iterator _Last, const _Function &_Func, task_group &_Task_group)
Definition: ppl.h:3464
volatile long _M_stop_iterating
Definition: ppl.h:2119
bool is_canceling()
Informs the caller whether or not the task group is currently in the midst of a cancellation. This does not necessarily indicate that the cancel method was called on the task_group object (although such certainly qualifies this method to return true). It may be the case that the task_group object is executing inline and a task group further up in the work tree was canceled. In cases such as these where the runtime can determine ahead of time that cancellation will flow through this task_group object, true will be returned as well.
Definition: ppl.h:831
_Output_iterator _Parallel_transform_unary_impl(_Input_iterator _First, _Input_iterator _Last, _Output_iterator _Result, const _Unary_operator&_Unary_op, _Partitioner &&_Part)
Definition: ppl.h:3790
The cancellation_token class represents the ability to determine whether some operation has been requ...
Definition: pplcancellation_token.h:616
static _ChoreType * _InternalAlloc(const _Function &_Func)
Definition: concrt.h:4239
task_group_status wait()
Waits until all work on the task_group object has either completed or been canceled.
Definition: ppl.h:719
task_handle(const _Function &_Func)
Constructs a new task_handle object. The work of the task is performed by invoking the function speci...
Definition: ppl.h:101
static void _Parallel_transform_binary_impl(_Input_iterator1 _Begin1, _Input_iterator1 _End1, _Input_iterator2 _Begin2, _Output_iterator &_Result, const _Binary_operator&_Binary_op, const auto_partitioner &)
Definition: ppl.h:3550