STLdoc/VS2015/ppl_8h_source.html

 /***

 * ==++==

 *

 * Copyright (c) Microsoft Corporation. All rights reserved.

 *

 * ==--==

 * =+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+

 *

 * ppl.h

 *

 * Parallel Patterns Library

 *

 * =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

 ****/


 #pragma once


 #include <crtdefs.h>

 #include <concrt.h>

 #include <stdexcept>

 #include <iterator>

 #include <functional>

 #include <memory>

 #include <type_traits>

 #include <algorithm>

 #include <malloc.h>


 #include <pplwin.h>


 #define _PPL_H


 #pragma pack(push,_CRT_PACKING)

 #pragma push_macro("new")

 #undef new


 // Define the level of tracing to use


 #define _TRACE_LEVEL_INFORMATION 4


 namespace Concurrency

 {

 namespace details

 {

     _CONCRTIMP size_t __cdecl _GetCombinableSize();

 } // namespace details


 class structured_task_group;

 class task_group;


 template<typename _Function>

 class task_handle : public ::Concurrency::details::_UnrealizedChore

 {

 public:


     task_handle(const _Function& _Func) : _M_function(_Func)

     {

         m_pFunction = reinterpret_cast <TaskProc> (&::Concurrency::details::_UnrealizedChore::_InvokeBridge<task_handle>);

     }


     ~task_handle()

     {

         //

         // We only need to perform a liveness check if the client owns the lifetime of the handle. Doing this for runtime owned handles

         // is not only unnecessary -- it is also dangerous.

         //

         if (_OwningCollection() != NULL && !_GetRuntimeOwnsLifetime())

         {

             _CheckTaskCollection();

         }

     }


     void operator()() const

     {

         _M_function();

     }


 private:


     friend class task_group;

     friend class structured_task_group;


     // The function object invoked to perform the body of the task.

     _Function _M_function;


     task_handle const & operator=(task_handle const&);    // no assignment operator


 };


 template <class _Function>

 task_handle<_Function> make_task(const _Function& _Func)

 {

     return task_handle<_Function>(_Func);

 }


 class structured_task_group

 {

 public:


     structured_task_group()

     {

     }


     structured_task_group(cancellation_token _CancellationToken) :

         _M_task_collection(_CancellationToken._GetImpl() != NULL ? _CancellationToken._GetImpl() : ::Concurrency::details::_CancellationTokenState::_None())

     {

     }


     ~structured_task_group()

     {

     }


     template<class _Function>

     void run(task_handle<_Function>& _Task_handle)

     {

         _Task_handle._SetRuntimeOwnsLifetime(false);

         _M_task_collection._Schedule(&_Task_handle);

     }


     template<class _Function>

     void run(task_handle<_Function>& _Task_handle, location& _Placement)

     {

         _Task_handle._SetRuntimeOwnsLifetime(false);

         _M_task_collection._Schedule(&_Task_handle, &_Placement);

     }


     task_group_status wait()

     {

         //

         // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.

         //

         return (task_group_status)_M_task_collection._Wait();

     }


     template<class _Function>

     task_group_status run_and_wait(task_handle<_Function>& _Task_handle)

     {

         //

         // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.

         //

         return (task_group_status)_M_task_collection._RunAndWait(&_Task_handle);

     }


     template<class _Function>

     task_group_status run_and_wait(const _Function& _Func)

     {

         //

         // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.

         //

         task_handle<_Function> _Task(_Func);

         return (task_group_status)_M_task_collection._RunAndWait(&_Task);

     }


     void cancel()

     {

         _M_task_collection._Cancel();

     }


     bool is_canceling()

     {

         return _M_task_collection._IsCanceling();

     }


 private:


     // Disallow passing in an r-value for a task handle argument

     template<class _Function> void run(task_handle<_Function>&& _Task_handle);


     // The underlying group of tasks as known to the runtime.

     ::Concurrency::details::_StructuredTaskCollection _M_task_collection;

 };


 class task_group

 {

 public:


     task_group()

     {

     }


     task_group(cancellation_token _CancellationToken) :

         _M_task_collection(_CancellationToken._GetImpl() != NULL ? _CancellationToken._GetImpl() : ::Concurrency::details::_CancellationTokenState::_None())

     {

     }


     ~task_group()

     {

     }


     template<typename _Function>

     void run(const _Function& _Func)

     {

         _M_task_collection._Schedule(::Concurrency::details::_UnrealizedChore::_InternalAlloc<task_handle<_Function>, _Function>(_Func));

     }


     template<typename _Function>

     void run(const _Function& _Func, location& _Placement)

     {

         _M_task_collection._Schedule(::Concurrency::details::_UnrealizedChore::_InternalAlloc<task_handle<_Function>, _Function>(_Func), &_Placement);

     }


     template<typename _Function>

     void run(task_handle<_Function>& _Task_handle)

     {

         _Task_handle._SetRuntimeOwnsLifetime(false);

         _M_task_collection._Schedule(&_Task_handle);

     }


     template<typename _Function>

     void run(task_handle<_Function>& _Task_handle, location& _Placement)

     {

         _Task_handle._SetRuntimeOwnsLifetime(false);

         _M_task_collection._Schedule(&_Task_handle, &_Placement);

     }


     task_group_status wait()

     {

         //

         // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.

         //

         return static_cast<task_group_status>(_M_task_collection._Wait());

     }


     template<class _Function>

     task_group_status run_and_wait(task_handle<_Function>& _Task_handle)

     {

         //

         // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.

         //

         _Task_handle._SetRuntimeOwnsLifetime(false);

         return (task_group_status)_M_task_collection._RunAndWait(&_Task_handle);

     }


     template<class _Function>

     task_group_status run_and_wait(const _Function& _Func)

     {

         //

         // The underlying scheduler's definitions map exactly to the PPL's. No translation beyond the cast is necessary.

         //

         return (task_group_status)_M_task_collection._RunAndWait(::Concurrency::details::_UnrealizedChore::_InternalAlloc<task_handle<_Function>, _Function>(_Func));

     }


     void cancel()

     {

         _M_task_collection._Cancel();

     }


     bool is_canceling()

     {

         return _M_task_collection._IsCanceling();

     }


 private:


     // Disallow passing in an r-value for a task handle argument

     template<class _Function> void run(task_handle<_Function>&& _Task_handle);


     // The underlying group of tasks as known to the runtime.

     ::Concurrency::details::_TaskCollection _M_task_collection;

 };


 template<typename _Function>

 void run_with_cancellation_token(const _Function& _Func, cancellation_token _Ct)

 {

     structured_task_group _Stg(_Ct);

     _Stg.run_and_wait(_Func);

 }


 inline void interruption_point()

 {

     structured_task_group _Stg;

     _Stg.wait();

 }


 _CONCRTIMP bool __cdecl is_current_task_group_canceling();


 // Parallel Algorithms and Patterns


 // Helper function that implements parallel_invoke with two functions

 // Used by parallel_for and parallel_for_each implementations


 template <typename _Function1, typename _Function2>

 void _Parallel_invoke_impl(const _Function1& _Func1, const _Function2& _Func2)

 {

     structured_task_group _Task_group;


     task_handle<_Function1> _Task_handle1(_Func1);

     _Task_group.run(_Task_handle1);


     // We inline the last item to prevent the unnecessary push/pop on the work queue.

     task_handle<_Function2> _Task_handle2(_Func2);

     _Task_group.run_and_wait(_Task_handle2);

 }


 template <typename _Function1, typename _Function2>

 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2)

 {

     _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);


     _Parallel_invoke_impl(_Func1, _Func2);


     _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);

 }


 template <typename _Function1, typename _Function2, typename _Function3>

 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3)

 {

     _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);


     structured_task_group _Task_group;


     task_handle<_Function1> _Task_handle1(_Func1);

     _Task_group.run(_Task_handle1);


     task_handle<_Function2> _Task_handle2(_Func2);

     _Task_group.run(_Task_handle2);


     task_handle<_Function3> _Task_handle3(_Func3);

     _Task_group.run_and_wait(_Task_handle3);


     _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);

 }


 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4>

 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4)

 {

     _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);


     structured_task_group _Task_group;


     task_handle<_Function1> _Task_handle1(_Func1);

     _Task_group.run(_Task_handle1);


     task_handle<_Function2> _Task_handle2(_Func2);

     _Task_group.run(_Task_handle2);


     task_handle<_Function3> _Task_handle3(_Func3);

     _Task_group.run(_Task_handle3);


     task_handle<_Function4> _Task_handle4(_Func4);

     _Task_group.run_and_wait(_Task_handle4);


     _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);

 }


 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5>

 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5)

 {

     _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);


     structured_task_group _Task_group;


     task_handle<_Function1> _Task_handle1(_Func1);

     _Task_group.run(_Task_handle1);


     task_handle<_Function2> _Task_handle2(_Func2);

     _Task_group.run(_Task_handle2);


     task_handle<_Function3> _Task_handle3(_Func3);

     _Task_group.run(_Task_handle3);


     task_handle<_Function4> _Task_handle4(_Func4);

     _Task_group.run(_Task_handle4);


     task_handle<_Function5> _Task_handle5(_Func5);

     _Task_group.run_and_wait(_Task_handle5);


     _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);

 }


 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,

     typename _Function6>

 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,

     const _Function6& _Func6)

 {

     _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);


     structured_task_group _Task_group;


     task_handle<_Function1> _Task_handle1(_Func1);

     _Task_group.run(_Task_handle1);


     task_handle<_Function2> _Task_handle2(_Func2);

     _Task_group.run(_Task_handle2);


     task_handle<_Function3> _Task_handle3(_Func3);

     _Task_group.run(_Task_handle3);


     task_handle<_Function4> _Task_handle4(_Func4);

     _Task_group.run(_Task_handle4);


     task_handle<_Function5> _Task_handle5(_Func5);

     _Task_group.run(_Task_handle5);


     task_handle<_Function6> _Task_handle6(_Func6);

     _Task_group.run_and_wait(_Task_handle6);


     _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);

 }


 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,

     typename _Function6, typename _Function7>

 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,

     const _Function6& _Func6, const _Function7& _Func7)

 {

     _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);


     structured_task_group _Task_group;


     task_handle<_Function1> _Task_handle1(_Func1);

     _Task_group.run(_Task_handle1);


     task_handle<_Function2> _Task_handle2(_Func2);

     _Task_group.run(_Task_handle2);


     task_handle<_Function3> _Task_handle3(_Func3);

     _Task_group.run(_Task_handle3);


     task_handle<_Function4> _Task_handle4(_Func4);

     _Task_group.run(_Task_handle4);


     task_handle<_Function5> _Task_handle5(_Func5);

     _Task_group.run(_Task_handle5);


     task_handle<_Function6> _Task_handle6(_Func6);

     _Task_group.run(_Task_handle6);


     task_handle<_Function7> _Task_handle7(_Func7);

     _Task_group.run_and_wait(_Task_handle7);


     _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);

 }


 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,

     typename _Function6, typename _Function7, typename _Function8>

 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,

     const _Function6& _Func6, const _Function7& _Func7, const _Function8& _Func8)

 {

     _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);


     structured_task_group _Task_group;


     task_handle<_Function1> _Task_handle1(_Func1);

     _Task_group.run(_Task_handle1);


     task_handle<_Function2> _Task_handle2(_Func2);

     _Task_group.run(_Task_handle2);


     task_handle<_Function3> _Task_handle3(_Func3);

     _Task_group.run(_Task_handle3);


     task_handle<_Function4> _Task_handle4(_Func4);

     _Task_group.run(_Task_handle4);


     task_handle<_Function5> _Task_handle5(_Func5);

     _Task_group.run(_Task_handle5);


     task_handle<_Function6> _Task_handle6(_Func6);

     _Task_group.run(_Task_handle6);


     task_handle<_Function7> _Task_handle7(_Func7);

     _Task_group.run(_Task_handle7);


     task_handle<_Function8> _Task_handle8(_Func8);

     _Task_group.run_and_wait(_Task_handle8);


     _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);

 }


 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,

     typename _Function6, typename _Function7, typename _Function8, typename _Function9>

 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,

     const _Function6& _Func6, const _Function7& _Func7, const _Function8& _Func8, const _Function9& _Func9)

 {

     _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);


     structured_task_group _Task_group;


     task_handle<_Function1> _Task_handle1(_Func1);

     _Task_group.run(_Task_handle1);


     task_handle<_Function2> _Task_handle2(_Func2);

     _Task_group.run(_Task_handle2);


     task_handle<_Function3> _Task_handle3(_Func3);

     _Task_group.run(_Task_handle3);


     task_handle<_Function4> _Task_handle4(_Func4);

     _Task_group.run(_Task_handle4);


     task_handle<_Function5> _Task_handle5(_Func5);

     _Task_group.run(_Task_handle5);


     task_handle<_Function6> _Task_handle6(_Func6);

     _Task_group.run(_Task_handle6);


     task_handle<_Function7> _Task_handle7(_Func7);

     _Task_group.run(_Task_handle7);


     task_handle<_Function8> _Task_handle8(_Func8);

     _Task_group.run(_Task_handle8);


     task_handle<_Function9> _Task_handle9(_Func9);

     _Task_group.run_and_wait(_Task_handle9);


     _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);

 }


 template <typename _Function1, typename _Function2, typename _Function3, typename _Function4, typename _Function5,

     typename _Function6, typename _Function7, typename _Function8, typename _Function9, typename _Function10>

 void parallel_invoke(const _Function1& _Func1, const _Function2& _Func2, const _Function3& _Func3, const _Function4& _Func4, const _Function5& _Func5,

     const _Function6& _Func6, const _Function7& _Func7, const _Function8& _Func8, const _Function9& _Func9, const _Function10& _Func10)

 {

     _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);


     structured_task_group _Task_group;


     task_handle<_Function1> _Task_handle1(_Func1);

     _Task_group.run(_Task_handle1);


     task_handle<_Function2> _Task_handle2(_Func2);

     _Task_group.run(_Task_handle2);


     task_handle<_Function3> _Task_handle3(_Func3);

     _Task_group.run(_Task_handle3);


     task_handle<_Function4> _Task_handle4(_Func4);

     _Task_group.run(_Task_handle4);


     task_handle<_Function5> _Task_handle5(_Func5);

     _Task_group.run(_Task_handle5);


     task_handle<_Function6> _Task_handle6(_Func6);

     _Task_group.run(_Task_handle6);


     task_handle<_Function7> _Task_handle7(_Func7);

     _Task_group.run(_Task_handle7);


     task_handle<_Function8> _Task_handle8(_Func8);

     _Task_group.run(_Task_handle8);


     task_handle<_Function9> _Task_handle9(_Func9);

     _Task_group.run(_Task_handle9);


     task_handle<_Function10> _Task_handle10(_Func10);

     _Task_group.run_and_wait(_Task_handle10);


     _Trace_ppl_function(PPLParallelInvokeEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);

 }


 class auto_partitioner

 {

 public:


     auto_partitioner() {}


     ~auto_partitioner() {}


     template<class _Type>

     _Type _Get_num_chunks(_Type ) const

     {

         return static_cast<_Type>(::Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors());

     }

 };


 class static_partitioner

 {

 public:


     static_partitioner()

     {

     }


     ~static_partitioner() {}


     template<class _Type>

     _Type _Get_num_chunks(_Type ) const

     {

         return static_cast<_Type>(::Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors());

     }

 };


 class simple_partitioner

 {

 private:

     typedef unsigned long long _Size_type;


 public:


     explicit simple_partitioner(_Size_type _Chunk_size) : _M_chunk_size(_Chunk_size)

     {

         if (_Chunk_size == 0)

         {

             throw std::invalid_argument("_Chunk_size");

         }

     }


     ~simple_partitioner() {}


     template<class _Type>

     _Type _Get_num_chunks(_Type _Range_arg) const

     {

         static_assert(sizeof(_Type) <= sizeof(_Size_type), "Potential truncation of _Range_arg");

         _Size_type _Num_chunks = (static_cast<_Size_type>(_Range_arg) / _M_chunk_size);


         if (_Num_chunks == 0)

         {

            _Num_chunks = 1;

         }


         return static_cast<_Type>(_Num_chunks);

     }


 private:


     _Size_type _M_chunk_size;

 };


 class affinity_partitioner

 {

 public:


     affinity_partitioner() : _M_num_chunks(0), _M_pChunk_locations(NULL)

     {

     }


     ~affinity_partitioner()

     {

         delete [] _M_pChunk_locations;

     }


     location& _Get_chunk_location(unsigned int _ChunkIndex)

     {

         return _M_pChunk_locations[_ChunkIndex];

     }


     template<class _Type>

     _Type _Get_num_chunks(_Type )

     {

         if (_M_num_chunks == 0)

         {

             _M_num_chunks = ::Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors();

             _M_pChunk_locations = new location[_M_num_chunks];

         }


         return static_cast<_Type>(_M_num_chunks);

     }


 private:

     // The number of chunks the partitioner will record affinity for.

     unsigned int _M_num_chunks;


     // Array of remembered locations.

     location * _M_pChunk_locations;

 };


 // Helper methods for scheduling and executing parallel tasks


 // Disable C4180: qualifier applied to function type has no meaning; ignored

 // Warning fires for passing Foo function pointer to parallel_for instead of &Foo.

 #pragma warning(push)

 #pragma warning(disable: 4180)

 // Disable C6263: using _alloca in a loop; this can quickly overflow stack

 #pragma warning(disable: 6263)


 // Template class that invokes user function on a parallel_for_each


 template <typename _Random_iterator, typename _Index_type, typename _Function, bool _Is_iterator>

 class _Parallel_chunk_helper_invoke

 {

 public:

     static void __cdecl _Invoke(const _Random_iterator& _First, _Index_type& _Index, const _Function& _Func)

     {

         _Func(_First[_Index]);

     }

 };


 // Template specialized class that invokes user function on a parallel_for


 template <typename _Random_iterator, typename _Index_type, typename _Function>

 class _Parallel_chunk_helper_invoke<_Random_iterator, _Index_type, _Function, false>

 {

 public:

     static void __cdecl _Invoke(const _Random_iterator& _First, _Index_type& _Index, const _Function& _Func)

     {

         _Func(static_cast<_Random_iterator>(_First + _Index));

     }

 };


 // Represents a range of iteration


 template<typename _Index_type>

 class _Range

 {

 public:


     // Construct an object for the range [_Current_iteration, _Last_iteration)

     _Range(_Index_type _Current_iteration, _Index_type _Last_iteration) :

       _M_current(_Current_iteration), _M_last(_Last_iteration)

     {

         // On creation, the range shall have at least 1 iteration.

         _CONCRT_ASSERT(_Number_of_iterations() > 0);

     }


     // Send a portion of the range to the helper

     void _Send_range(_Range<_Index_type> * _Helper_range)

     {

         // If there are no iterations other than the current one left until finish, there is no help

         // needed. Set the pointer to a special value that helper will understand and continue

         // doing the work.

         _Index_type _Remaining_iterations = _Number_of_iterations();

         if (_Remaining_iterations > 1)

         {

             // Compute the two pieces of the work range: one for the worker and one for helper class.

             _M_last_iteration = _M_current_iteration + _Remaining_iterations / 2;


             // There needs to be at least 1 iteration left because the current iteration cannot be sent.

             _CONCRT_ASSERT(_Number_of_iterations() > 0);

         }


         // This is also a signal for the helper that a range has been sent to it.

         _Helper_range->_M_current_iteration = _M_last_iteration;

     }


     // Steal the entire range and give it to the helper

     void _Steal_range(_Range<_Index_type>  * _Helper_range)

     {

         // We allow stealing only from a range that has at least 1 iteration

         _CONCRT_ASSERT(_Number_of_iterations() > 0);


         _Index_type _Current_iter = _M_current_iteration;


         _Helper_range->_M_current_iteration = _Current_iter + 1;

         _Helper_range->_M_last_iteration = _M_last_iteration;


         _M_last_iteration = _Current_iter + 1;

     }


     // Returns the number of iterations in this range

     _Index_type _Number_of_iterations() const

     {

         return (_M_last_iteration - _M_current_iteration);

     }


     // Returns the current iteration in the range

     _Index_type _Get_current_iteration() const

     {

         return _M_current;

     }


     // Sets the current iteration in the range

     void _Set_current_iteration(const _Index_type _I)

     {

         _M_current = _I;

     }


     __declspec(property(get=_Get_current_iteration, put=_Set_current_iteration)) _Index_type _M_current_iteration;


     // Returns the last iteration in the range

     _Index_type _Get_last_iteration() const

     {

         return _M_last;

     }


     // Sets the last iteration in the range

     void _Set_last_iteration(const _Index_type _I)

     {

         _M_last = _I;

     }


     __declspec(property(get=_Get_last_iteration, put=_Set_last_iteration)) _Index_type _M_last_iteration;


 private:


     // These members are volatile because they are updated by the helper

     // and used by the worker.

     volatile _Index_type _M_current;

     volatile _Index_type _M_last;

 };


 // A proxy for the worker responsible for maintaining communication with the helper


 template<typename _Index_type>

 class _Worker_proxy

 {

 public:

     _Worker_proxy(_Worker_proxy *_PParent_worker = NULL) :

       _M_pHelper_range(NULL), _M_pParent_worker(_PParent_worker), _M_pWorker_range(NULL), _M_completion_count(0), _M_stop_iterating(0)

     {

         _M_context = ::Concurrency::details::_Context::_CurrentContext();

     }


     ~_Worker_proxy()

     {

         // Make the check to avoid doing extra work in the non-exceptional cases

         if (_M_completion_count != _Tree_Complete)

         {

             // On exception, notify our parent so it breaks out of its loop.

             _Propagate_cancel();


             // On exception, we need to set _M_completion_count to ensure that the helper breaks out of its spin wait.

             _Set_done();

         }

     }


     // Obtain a range from the worker

     bool _Receive_range(_Range<_Index_type> * _Helper_range)

     {

         // If the worker already finished, then there is no work left for the helper

         if (_M_completion_count)

         {

             return false;

         }


         _CONCRT_ASSERT(_Helper_range != NULL);


         // There are two special values for _M_current_iteration that are not valid: one is the

         // initial value of the working class which it will never share, and the other is

         // the last exclusive iteration of the working class, which has no work to be done.

         // We use the former value so that we can understand worker's response.

         _Index_type _Cached_first_iteration = _Helper_range->_M_current_iteration;


         // Following operation is not done via interlocked operation because it does not have to.

         // Helper lazily registers that it would like to help the worker, but it allows for some

         // time to elapse before that information has made it over to the worker. The idea

         // is not to disturb the worker if it is not necessary. It is possible to add interlocked

         // operation in the future if the time spent in the busy wait loop is too big.

         _CONCRT_ASSERT(_M_pHelper_range == NULL);

         _M_pHelper_range = _Helper_range;


         ::Concurrency::details::_SpinWaitBackoffNone spinWait(::Concurrency::details::_Context::_Yield);


         // If the worker is done, it will flush the store buffer and signal the helper by

         // changing _M_current_iteration in the helper's range.

         while ((_Helper_range->_M_current_iteration == _Cached_first_iteration) && !_M_completion_count)

         {

             if ((_M_pWorker_range != NULL) && _M_context._IsSynchronouslyBlocked())

             {

                 // Attempt to steal the entire range from the worker if it is synchronously blocked.


                 // Make sure that worker makes no forward progress while helper is attempting to

                 // steal its range. If worker does get unblocked, simply back off in the helper.

                 // Note that there could be another helper running if a range has already been

                 // sent to us.

                 long _Stop_iterating = _InterlockedIncrement(&_M_stop_iterating);

                 _CONCRT_ASSERT(_Stop_iterating > 0);


                 // We need to make a local copy as the pointer could be changed by the worker.

                 _Range<_Index_type> * _Worker_range = _M_pWorker_range;


                 // The order of comparison needs to be preserved. If the parent is blocked, then

                 // it cannot send a range (because _M_stop_iterating is already set). If it sent a range

                 // before being synchronously blocked, then we are no longer the helper. Refrain

                 // from intrusively stealing the range.

                 if ((_Worker_range != NULL) && _M_context._IsSynchronouslyBlocked()

                     && (_Helper_range->_M_current_iteration == _Cached_first_iteration) && !_M_completion_count)

                 {

                     _CONCRT_ASSERT(_M_pHelper_range == _Helper_range);


                     _M_pHelper_range = NULL;

                     _Worker_range->_Steal_range(_Helper_range);


                     _CONCRT_ASSERT(_Helper_range->_M_current_iteration != _Cached_first_iteration);

                 }


                 // At this point, worker is either:

                 //

                 // a) no longer blocked so range will come to the helper naturally, or

                 // b) out of iterations because helper stole all of it

                 _Stop_iterating = _InterlockedDecrement(&_M_stop_iterating);

                 _CONCRT_ASSERT(_Stop_iterating >= 0);

             }

             else

             {

                 // If there is no work received in a full spin, then start yielding the context

                 spinWait._SpinOnce();

             }

         }


         // If the initial iteration is the same as the original first iteration then the

         // worker class is sending the signal that it does not need any help.

         if (_Helper_range->_M_current_iteration == _Cached_first_iteration)

         {

             return false;

         }


         return (_Helper_range->_Number_of_iterations() > 0);

     }


     // Send a portion of our range and notify the helper.

     bool _Send_range(_Range<_Index_type> * _Worker_range)

     {

         // Worker range shall not be available for stealing at this time.

         _CONCRT_ASSERT(_M_pWorker_range == NULL);


         // Helper shall be registered.

         _CONCRT_ASSERT(_M_pHelper_range != NULL);


         // Send the range

         _Worker_range->_Send_range(_M_pHelper_range);


         // Notify the helper. The fence ensures that the prior updates are visible.

         _InterlockedExchangePointer(reinterpret_cast<void * volatile *>(&_M_pHelper_range), NULL);


         // The current iteration should still be left

         _CONCRT_ASSERT(_Worker_range->_Number_of_iterations() >= 1);


         // Indicate if we need another helper

         return (_Worker_range->_Number_of_iterations() > 1);

     }


     // Let the helper know that it is ok to intrusively steal range from the worker by publishing the

     // remaining range.

     void _Enable_intrusive_steal(_Range<_Index_type> * _Worker_range)

     {

         _M_pWorker_range = _Worker_range;

     }


     // Prevent the helper from intrusively stealing range from the worker

     void _Disable_intrusive_steal()

     {

         _M_pWorker_range = NULL;

         _Wait_on_intrusive_steal();

     }


     bool _Is_helper_registered()

     {

         return (_M_pHelper_range != NULL);

     }


     bool _Is_done()

     {

         return (_M_completion_count != 0);

     }


     void _Set_done()

     {

         // Let the helper know that this class is done with work and flush the store buffer. This operation

         // ensures that any buffered store to helper range in _Send_range is flushed and

         // available in _Receive_range (so there will be no lost ranges).

         _InterlockedExchange(&_M_completion_count, 1);

     }


     void _Set_tree_done()

     {

         // Make sure that **WE** know when our destructor hits that the entire tree is complete.

         _M_completion_count = _Tree_Complete;

     }


     bool _Is_beacon_signaled()

     {

         return _M_beacon._Is_signaled();

     }


     bool _Verify_beacon_cancellation()

     {

         return _M_beacon._Confirm_cancel();

     }


 private:


     // Spin wait for any intrusive steal that is in progress.

     void _Wait_on_intrusive_steal()

     {

         // This code is used to synchronize with helper in case of worker cooperative blocking.

         if (_M_stop_iterating != 0)

         {

             ::Concurrency::details::_SpinWaitBackoffNone spinWait;


             while (_M_stop_iterating != 0)

             {

                 spinWait._SpinOnce();

             }

         }

     }


     void _NotifyCancel()

     {

         _M_beacon._Raise();

     }


     void _Propagate_cancel()

     {

         if (_M_pParent_worker != NULL)

         {

             _M_pParent_worker->_NotifyCancel();

         }

     }


     // Constant indicating sub-tree completion

     static const long _Tree_Complete = 2;


     // Read in the loop

     _Range<_Index_type> * volatile               _M_pHelper_range;


     // Read at the end of the loop

     _Worker_proxy *                              _M_pParent_worker;


     // Written rarely

     ::Concurrency::details::_Cancellation_beacon _M_beacon;

     ::Concurrency::details::_Context             _M_context;


     volatile long                                _M_completion_count;


     // Written to in the loop

     _Range<_Index_type> * volatile               _M_pWorker_range;

     volatile long                                _M_stop_iterating;


     _Worker_proxy const & operator=(_Worker_proxy const&);    // no assignment operator


 };


 // parallel_for -- Performs parallel iteration over a range of indices from _First to _Last,

 // excluding _Last. The order in which each iteration is executed is unspecified and non-deterministic.


 // Closure (binding) classes for invoking parallel_for and parallel_for_each, with chunks


 // A dynamically rebalancing closure class used for packaging parallel_for or parallel_for_each for invocation in chunks.

 // If some tasks finish earlier than others, helper tasks get executed which ensures further distribution of work.


 template <typename _Random_iterator, typename _Index_type, typename _Function, typename _Partitioner, bool _Is_iterator>

 class _Parallel_chunk_helper

 {

 public:

     _Parallel_chunk_helper(_Index_type, const _Random_iterator& _First, _Index_type _First_iteration, _Index_type _Last_iteration, const _Index_type& _Step,

         const _Function& _Func, const _Partitioner&, _Worker_proxy<_Index_type> * const _Parent_data = NULL) :

         _M_first(_First), _M_first_iteration(_First_iteration), _M_last_iteration(_Last_iteration), _M_step(_Step), _M_function(_Func),

         _M_parent_worker(_Parent_data)

     {

         // Empty constructor because members are already assigned

     }


         // Constructor overload that accepts a range

     _Parallel_chunk_helper(const _Random_iterator& _First,  const _Index_type& _Step, const _Function& _Func,

         const _Range<_Index_type>& _Worker_range, _Worker_proxy<_Index_type> * const _Parent_data = NULL) :

         _M_first(_First), _M_first_iteration(_Worker_range._M_current_iteration), _M_last_iteration(_Worker_range._M_last_iteration), _M_step(_Step), _M_function(_Func),

         _M_parent_worker(_Parent_data)

     {

         // Empty constructor because members are already assigned

     }


     // The main helper function which iterates over the given collection and invokes user function on every iteration.

     // Function is marked as const even though it does mutate some of its members (those are declared as mutable). This is done

     // in order to easily communicate between a worker and a helper instance, without holding references to many local variables.

     // However, this function does not mutate any state that is visible to anyone outside of this class, nor would that be

     // possible due to the implicit copy of the functor that happens when a new task_handle is created.

     void operator()() const

     {

         _Range<_Index_type> _Worker_range(_M_first_iteration, _M_last_iteration);


         // This class has two modes: worker and helper. The originally split chunk is always a

         // worker, while any subsequent class spawned from this class is in the helper

         // mode, which is signified using a link to the worker class through _M_pOwning_worker

         // handle. So, it will wait for work to be dished out by the working class while in helper mode.

         if (_M_parent_worker != NULL && !_M_parent_worker->_Receive_range(&_Worker_range))

         {

             // If the worker class rejected the help, simply return

             return;

         }


         // Keep the secondary, scaled, loop index for quick indexing into the data structure

         _Index_type _Current_iteration = _Worker_range._M_current_iteration;

         _Index_type _Scaled_index = _Current_iteration * _M_step;


         // If there is only one iteration to be executed there is no need to initialize any

         // helper classes (work is indivisible).

         if (_Worker_range._Number_of_iterations() == 1)

         {

             // Execute one iteration

             _Parallel_chunk_helper_invoke<_Random_iterator, _Index_type, _Function, _Is_iterator>::_Invoke(_M_first, _Scaled_index, _M_function);

             return;

         }


         // If the execution reaches this point it means that this class now has a chunk of work

         // that it needs to get done, so it has transitioned into the worker mode.

         structured_task_group _Helper_group;


         // Initialize fields that are needed in the helper

         _Worker_proxy<_Index_type> _Worker(_M_parent_worker);


         // Instantiate a helper class for this working class and put it on the work queue.

         // If some thread is idle it will be able to steal the helper and help this class

         // finish its work by stealing a piece of the work range.

         task_handle<_Parallel_chunk_helper> _Helper_task(_Parallel_chunk_helper(_M_first, _M_step, _M_function, _Worker_range, &_Worker));


         _Helper_group.run(_Helper_task);


         ::Concurrency::details::_MallocaListHolder<task_handle<_Parallel_chunk_helper>> _Holder;


         // Normally, for a cancellation semantic in cooperation with the helper, we would run_and_wait the below code on the Helper_group. Unfortunately,

         // the capture by reference of things which must be shared (_Worker, and so forth) will cause the loop below to add additional indirection

         // instructions. The loop below *MUST* be as tight as possible with the defined semantics. Instead, we will manually notify our parent if the

         // worker's destructor runs without hitting the bottom of our chunk. This is done through notification on the beacon.


         for (_Index_type _I = _Current_iteration; _I < _Worker_range._M_last_iteration; (_I++, _Worker_range._M_current_iteration =_I, _Scaled_index += _M_step))

         {

             if (_Worker._Is_beacon_signaled())

             {

                 // Either a parent task group is canceled or one of the other iterations

                 // threw an exception. Abort the remaining iterations

                 //

                 // Note that this could be a false positive that we must verify.

                 if (_Worker._Is_done() || _Worker._Verify_beacon_cancellation())

                 {

                     break;

                 }

             }


             if (_Worker._Is_helper_registered())

             {

                 // The helper class (there can only be one) registered to help this class with the work.

                 // Thus, figure out if this class needs help and split the range among the two classes.


                 if (_Worker._Send_range(&_Worker_range))

                 {

                     // Construct every new instance of a helper class on the stack because it is beneficial to use

                     // a structured task group where the class itself is responsible for task handle's lifetime.

                     task_handle<_Parallel_chunk_helper> * _Helper_subtask = _Holder._AddRawMallocaNode(_malloca(_Holder._GetAllocationSize()));


                     new(_Helper_subtask) task_handle<_Parallel_chunk_helper>

                         (_Parallel_chunk_helper(_M_first, _M_step, _M_function, _Worker_range, &_Worker));


                     // If _Send_range returns true, that means that there is still some non-trivial

                     // work to be done, so this class will potentially need another helper.

                     _Helper_group.run(*_Helper_subtask);

                 }

             }


             // Allow intrusive stealing by the helper

             _Worker._Enable_intrusive_steal(&_Worker_range);


             // Execute one iteration: the element is at scaled index away from the first element.

             _Parallel_chunk_helper_invoke<_Random_iterator, _Index_type, _Function, _Is_iterator>::_Invoke(_M_first, _Scaled_index, _M_function);


             // Helper shall not steal a range after this call

             _Worker._Disable_intrusive_steal();

         }


         // Indicate that the worker is done with its iterations.

         _Worker._Set_done();


         // Wait for all worker/helper iterations to finish

         _Helper_group.wait();


         // Make sure that we've signaled that the tree is complete. This is used to detect any exception out of either _Parallel_chunk_helper_invoke or

         // _Helper_group.wait() above as a cancellation of the loop which must propagate upwards because we do not wrap the loop body in run_and_wait.

         _Worker._Set_tree_done();

     }


 private:


     const _Random_iterator&            _M_first;

     const _Index_type&                 _M_step;

     const _Function&                   _M_function;


     const _Index_type                  _M_first_iteration;

     const _Index_type                  _M_last_iteration;


     _Worker_proxy<_Index_type> * const _M_parent_worker;


     _Parallel_chunk_helper const & operator=(_Parallel_chunk_helper const&);    // no assignment operator

 };


 template <typename _Random_iterator, typename _Index_type, typename _Function, typename _Partitioner, bool _Is_iterator>

 class _Parallel_fixed_chunk_helper

 {

 public:

     _Parallel_fixed_chunk_helper(_Index_type, const _Random_iterator& _First, _Index_type _First_iteration,

          _Index_type _Last_iteration, const _Index_type& _Step, const _Function& _Func, const _Partitioner&) :

         _M_first(_First), _M_first_iteration(_First_iteration), _M_last_iteration(_Last_iteration), _M_step(_Step), _M_function(_Func)

     {

         // Empty constructor because members are already assigned

     }


     void operator()() const

     {

         // Keep the secondary, scaled, loop index for quick indexing into the data structure

         _Index_type _Scaled_index = _M_first_iteration * _M_step;


         for (_Index_type _I = _M_first_iteration; _I < _M_last_iteration; (_I++, _Scaled_index += _M_step))

         {

             // Execute one iteration: the element is at scaled index away from the first element.

             _Parallel_chunk_helper_invoke<_Random_iterator, _Index_type, _Function, _Is_iterator>::_Invoke(_M_first, _Scaled_index, _M_function);

         }

     }

 private:


     const _Random_iterator&            _M_first;

     const _Index_type&                 _M_step;

     const _Function&                   _M_function;


     const _Index_type                  _M_first_iteration;

     const _Index_type                  _M_last_iteration;


     _Parallel_fixed_chunk_helper const & operator=(_Parallel_fixed_chunk_helper const&);    // no assignment operator

 };


 template <typename _Random_iterator, typename _Index_type, typename _Function, bool _Is_iterator>

 class _Parallel_localized_chunk_helper

 {

 public:

     typedef _Parallel_fixed_chunk_helper<_Random_iterator, _Index_type, _Function, static_partitioner, _Is_iterator> _Base;


     _Parallel_localized_chunk_helper(_Index_type _Chunk_index, const _Random_iterator& _First, _Index_type _First_iteration, _Index_type _Last_iteration, const _Index_type& _Step,

         const _Function& _Func, affinity_partitioner& _Part) :

         _M_fixed_helper(_Chunk_index, _First, _First_iteration, _Last_iteration, _Step, _Func, static_partitioner()),

         _M_chunk_location(_Part._Get_chunk_location(static_cast<unsigned int>(_Chunk_index)))

     {

         // Empty constructor because members are already assigned

     }


     // Override the operator() in the base class. Note that this is not a virtual override.

     void operator()() const

     {

         // Check here if location needs to be saved.

         if (_M_chunk_location._Is_system())

         {

             _M_chunk_location = location::current();

         }


         _M_fixed_helper();

     }

 private:


     location&       _M_chunk_location;

     _Base           _M_fixed_helper;

     _Parallel_localized_chunk_helper const & operator=(_Parallel_localized_chunk_helper const&);    // no assignment operator

 };


 #pragma warning(pop)


 template <typename _Worker_class, typename _Index_type, typename Partitioner>

 void _Parallel_chunk_task_group_run(structured_task_group& _Task_group,

                                     task_handle<_Worker_class>* _Chunk_helpers,

                                     const Partitioner&,

                                     _Index_type _I)

 {

     _Task_group.run(_Chunk_helpers[_I]);

 }


 template <typename _Worker_class, typename _Index_type>

 void _Parallel_chunk_task_group_run(structured_task_group& _Task_group,

                                     task_handle<_Worker_class>* _Chunk_helpers,

                                     affinity_partitioner& _Part,

                                     _Index_type _I)

 {

      _Task_group.run(_Chunk_helpers[_I], _Part._Get_chunk_location(static_cast<unsigned int>(_I)));

 }


 // Helper functions that implement parallel_for


 template <typename _Worker_class, typename _Random_iterator, typename _Index_type, typename _Function, typename _Partitioner>

 void _Parallel_chunk_impl(const _Random_iterator& _First, _Index_type _Range_arg, const _Index_type& _Step, const _Function& _Func, _Partitioner&& _Part)

 {

     _CONCRT_ASSERT(_Range_arg > 1);

     _CONCRT_ASSERT(_Step > 0);


     _Index_type _Num_iterations = (_Step == 1) ? _Range_arg : (((_Range_arg - 1) / _Step) + 1);

     _CONCRT_ASSERT(_Num_iterations > 1);


     _Index_type _Num_chunks = _Part._Get_num_chunks(_Num_iterations);

     _CONCRT_ASSERT(_Num_chunks > 0);


     // Allocate memory on the stack for task_handles to ensure everything is properly structured.

     ::Concurrency::details::_MallocaArrayHolder<task_handle<_Worker_class>> _Holder;

     task_handle<_Worker_class> * _Chunk_helpers = _Holder._InitOnRawMalloca(_malloca(sizeof(task_handle<_Worker_class>) * static_cast<size_t>(_Num_chunks)));


     structured_task_group _Task_group;


     _Index_type _Iterations_per_chunk = _Num_iterations / _Num_chunks;

     _Index_type _Remaining_iterations = _Num_iterations % _Num_chunks;


     // If there are less iterations than desired chunks, set the chunk number

     // to be the number of iterations.

     if (_Iterations_per_chunk == 0)

     {

         _Num_chunks = _Remaining_iterations;

     }


     _Index_type _Work_size = 0;

     _Index_type _Start_iteration = 0;

     _Index_type _I;


     // Split the available work in chunks

     for (_I = 0; _I < _Num_chunks - 1; _I++)

     {

         if (_Remaining_iterations > 0)

         {

             // Iterations are not divided evenly, so add 1 remainder iteration each time

             _Work_size = _Iterations_per_chunk + 1;

             _Remaining_iterations--;

         }

         else

         {

             _Work_size = _Iterations_per_chunk;

         }


         // New up a task_handle "in-place", in the array preallocated on the stack

         new(&_Chunk_helpers[_I]) task_handle<_Worker_class>(_Worker_class(_I, _First, _Start_iteration, _Start_iteration + _Work_size, _Step, _Func, std::forward<_Partitioner>(_Part)));

         _Holder._IncrementConstructedElemsCount();


         // Run each of the chunk tasks in parallel

         _Parallel_chunk_task_group_run(_Task_group, _Chunk_helpers, std::forward<_Partitioner>(_Part), _I);


         // Prepare for the next iteration

         _Start_iteration += _Work_size;

     }


     // Because this is the last iteration, then work size might be different

     _CONCRT_ASSERT((_Remaining_iterations == 0) || ((_Iterations_per_chunk == 0) && (_Remaining_iterations == 1)));

     _Work_size = _Num_iterations - _Start_iteration;


     // New up a task_handle "in-place", in the array preallocated on the stack

     new(&_Chunk_helpers[_I]) task_handle<_Worker_class>(_Worker_class(_I, _First, _Start_iteration, _Start_iteration + _Work_size, _Step, _Func, std::forward<_Partitioner>(_Part)));

     _Holder._IncrementConstructedElemsCount();


     _Task_group.run_and_wait(_Chunk_helpers[_I]);

 }


 template <typename _Worker_class, typename _Random_iterator, typename _Index_type, typename _Function>

 void _Parallel_chunk_impl(const _Random_iterator& _First, _Index_type _Range_arg, const _Index_type& _Step, const _Function& _Func)

 {

     _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, auto_partitioner());

 }


 // Helper for the parallel for API with the default dynamic partitioner which implements range-stealing to balance load.

 template <typename _Index_type, typename _Diff_type, typename _Function>

 void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, const auto_partitioner& _Part)

 {

     typedef _Parallel_chunk_helper<_Index_type, _Diff_type, _Function, auto_partitioner, false> _Worker_class;

     _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);

 }


 // Helper for the parallel_for API with a static partitioner - creates a fixed number of chunks up front with no range-stealing enabled.

 template <typename _Index_type, typename _Diff_type, typename _Function>

 void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, const static_partitioner& _Part)

 {

     typedef _Parallel_fixed_chunk_helper<_Index_type, _Diff_type, _Function, static_partitioner, false> _Worker_class;

     _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);

 }


 // Helper for the parallel_for API with a simple partitioner - creates a fixed number of chunks up front with no range-stealing enabled.

 template <typename _Index_type, typename _Diff_type, typename _Function>

 void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, const simple_partitioner& _Part)

 {

     typedef _Parallel_fixed_chunk_helper<_Index_type, _Diff_type, _Function, simple_partitioner, false> _Worker_class;

     _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);

 }


 // Helper for the parallel_for API with an affinity partitioner - creates a fixed number of chunks up front with no range-stealing enabled. subsequent

 // calls to parallel_for with the same affinity partitioner (pass in as a non-const reference) are scheduled to the same location they previously ran on

 template <typename _Index_type, typename _Diff_type, typename _Function>

 void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function& _Func, affinity_partitioner& _Part)

 {

     typedef _Parallel_localized_chunk_helper<_Index_type, _Diff_type, _Function, false> _Worker_class;

     _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);

 }


 template <typename _Index_type, typename _Function, typename _Partitioner>

 void _Parallel_for_impl(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func, _Partitioner&& _Part)

 {

     // The step argument must be 1 or greater; otherwise it is an invalid argument

     if (_Step < 1)

     {

         throw std::invalid_argument("_Step");

     }


     // If there are no elements in this range we just return

     if (_First >= _Last)

     {

         return;

     }


     // Compute the difference type based on the arguments and avoid signed overflow for int, long, and long long

     typedef typename std::conditional<std::is_same<_Index_type, int>::value, unsigned int,

         typename std::conditional<std::is_same<_Index_type, long>::value, unsigned long,

             typename std::conditional<std::is_same<_Index_type, long long>::value, unsigned long long, decltype(_Last - _First)

             >::type

         >::type

     >::type _Diff_type;


     _Diff_type _Range_size = _Diff_type(_Last) - _Diff_type(_First);

     _Diff_type _Diff_step = _Step;


     if (_Range_size <= _Diff_step)

     {

         _Func(_First);

     }

     else

     {

         _Parallel_for_partitioned_impl<_Index_type, _Diff_type, _Function>(_First, _Range_size, _Step, _Func, std::forward<_Partitioner>(_Part));

     }

 }


 template <typename _Index_type, typename _Function>

 void _Parallel_for_impl(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func)

 {

     _Parallel_for_impl(_First, _Last, _Step, _Func, auto_partitioner());

 }


 template <typename _Index_type, typename _Function, typename _Partitioner>

 void parallel_for(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func, _Partitioner&& _Part)

 {

     _Trace_ppl_function(PPLParallelForEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);

     _Parallel_for_impl(_First, _Last, _Step, _Func, std::forward<_Partitioner>(_Part));

     _Trace_ppl_function(PPLParallelForEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);

 }


 template <typename _Index_type, typename _Function>

 void parallel_for(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function& _Func)

 {

     parallel_for(_First, _Last, _Step, _Func, auto_partitioner());

 }


 template <typename _Index_type, typename _Function>

 void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, const auto_partitioner& _Part = auto_partitioner())

 {

     parallel_for(_First, _Last, _Index_type(1), _Func, _Part);

 }


 template <typename _Index_type, typename _Function>

 void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, const static_partitioner& _Part)

 {

     parallel_for(_First, _Last, _Index_type(1), _Func, _Part);

 }


 template <typename _Index_type, typename _Function>

 void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, const simple_partitioner& _Part)

 {

     parallel_for(_First, _Last, _Index_type(1), _Func, _Part);

 }


 template <typename _Index_type, typename _Function>

 void parallel_for(_Index_type _First, _Index_type _Last, const _Function& _Func, affinity_partitioner& _Part)

 {

     parallel_for(_First, _Last, _Index_type(1), _Func, _Part);

 }


 // parallel_for_each -- This function will iterate over all elements in the iterator's range.


 // Closure (binding) classes for invoking parallel_for_each recursively


 // A closure class used for packaging chunk of elements in parallel_for_each for parallel invocation


 // Forward iterator for_each using unstructured task group


 // Disable C4180: qualifier applied to function type has no meaning; ignored

 // Warning fires for passing Foo function pointer to parallel_for instead of &Foo.

 #pragma warning(push)

 #pragma warning(disable: 4180)


 template <typename _Forward_iterator, typename _Function, unsigned int _Chunk_size>

 class _Parallel_for_each_helper

 {

 public:

     typedef typename std::iterator_traits<_Forward_iterator>::value_type _Value_type;

     static const unsigned int _Size = _Chunk_size;


     _Parallel_for_each_helper(_Forward_iterator& _First, const _Forward_iterator& _Last, const _Function& _Func) :

         _M_function(_Func), _M_len(0)

     {

         static_assert(std::is_lvalue_reference<decltype(*_First)>::value, "lvalue required for forward iterator operator *");

         // Add a batch of work items to this functor's array

         for (unsigned int _Index=0; (_Index < _Size) && (_First != _Last); _Index++)

         {

             _M_element[_M_len++] = &(*_First++);

         }

     }


     void operator()() const

     {

         // Invoke parallel_for on the batched up array of elements

         _Parallel_for_impl(0U, _M_len, 1U,

             [this] (unsigned int _Index)

             {

                 _M_function(*(_M_element[_Index]));

             }

         );

     }


 private:


     const _Function& _M_function;

     typename std::iterator_traits<_Forward_iterator>::pointer    _M_element[_Size];

     unsigned int     _M_len;


     _Parallel_for_each_helper const & operator=(_Parallel_for_each_helper const&);    // no assignment operator

 };


 #pragma warning(pop)


 // Helper functions that implement parallel_for_each


 template <typename _Forward_iterator, typename _Function>

 void _Parallel_for_each_chunk(_Forward_iterator& _First, const _Forward_iterator& _Last, const _Function& _Func, task_group& _Task_group)

 {

     // The chunk size selection depends more on the internal implementation of parallel_for than

     // on the actual input. Also, it does not have to be dynamically computed, but it helps

     // parallel_for if it is a power of 2 (easy to divide).

     const unsigned int _Chunk_size = 1024;


     // This functor will be copied on the heap and will execute the chunk in parallel

     _Parallel_for_each_helper<_Forward_iterator, _Function, _Chunk_size> _Functor(_First, _Last, _Func);


     // Because this is an unstructured task group, running the task will make a copy of the necessary data

     // on the heap, ensuring that it is available at the time of execution.

     _Task_group.run(_Functor);

 }


 template <typename _Forward_iterator, typename _Function>

 void _Parallel_for_each_forward_impl(_Forward_iterator& _First, const _Forward_iterator& _Last, const _Function& _Func, task_group& _Task_group)

 {

     _Parallel_for_each_chunk(_First, _Last, _Func, _Task_group);


     // If there is a tail, push the tail

     if (_First != _Last)

     {

         _Task_group.run(

             [&_First, &_Last, &_Func, &_Task_group]

             {

                 ::Concurrency::_Parallel_for_each_forward_impl(_First, _Last, _Func, _Task_group);

             }

         );

     }

 }


 template <typename _Forward_iterator, typename _Function>

 void _Parallel_for_each_impl(_Forward_iterator _First, const _Forward_iterator& _Last, const _Function& _Func, const auto_partitioner&, std::forward_iterator_tag)

 {

     // Because this is a forward iterator, it is difficult to validate that _First comes before _Last, so

     // it is up to the user to provide valid range.

     if (_First != _Last)

     {

         task_group _Task_group;


         _Parallel_for_each_forward_impl(_First, _Last, _Func, _Task_group);


         _Task_group.wait();

     }

 }


 template <typename _Random_iterator, typename _Index_type, typename _Function>

 void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, const auto_partitioner& _Part)

 {

     typedef _Parallel_chunk_helper<_Random_iterator, _Index_type, _Function, auto_partitioner, true> _Worker_class;

         // Use the same function that schedules work for parallel for

     _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);

 }


 template <typename _Random_iterator, typename _Index_type, typename _Function>

 void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, const static_partitioner& _Part)

 {

     typedef _Parallel_fixed_chunk_helper<_Random_iterator, _Index_type, _Function, static_partitioner, true> _Worker_class;

     // Use the same function that schedules work for parallel for

     _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);

 }


 template <typename _Random_iterator, typename _Index_type, typename _Function>

 void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, const simple_partitioner& _Part)

 {

     typedef _Parallel_fixed_chunk_helper<_Random_iterator, _Index_type, _Function, simple_partitioner, true> _Worker_class;

     // Use the same function that schedules work for parallel for

     _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);

 }


 template <typename _Random_iterator, typename _Index_type, typename _Function>

 void _Parallel_for_each_partitioned_impl(const _Random_iterator& _First, _Index_type _Range_arg, _Index_type _Step, const _Function& _Func, affinity_partitioner& _Part)

 {

     typedef _Parallel_localized_chunk_helper<_Random_iterator, _Index_type, _Function, true> _Worker_class;

         // Use the same function that schedules work for parallel for

     _Parallel_chunk_impl<_Worker_class>(_First, _Range_arg, _Step, _Func, _Part);

 }


 template <typename _Random_iterator, typename _Function, typename _Partitioner>

 void _Parallel_for_each_impl(const _Random_iterator& _First, const _Random_iterator& _Last, const _Function& _Func, _Partitioner&& _Part, std::random_access_iterator_tag)

 {

     typedef typename std::iterator_traits<_Random_iterator>::difference_type _Index_type;


     // Exit early if there is nothing in the collection

     if (_First >= _Last)

     {

         return;

     }


     _Index_type _Range_size = _Last - _First;


     if (_Range_size == 1)

     {

         _Func(*_First);

     }

     else

     {

         _Index_type _Step = 1;


         _Parallel_for_each_partitioned_impl(_First, _Range_size, _Step, _Func, std::forward<_Partitioner>(_Part));

    }

 }


 template <typename _Iterator, typename _Function>

 void parallel_for_each(_Iterator _First, _Iterator _Last, const _Function& _Func)

 {

     parallel_for_each(_First, _Last, _Func, auto_partitioner());

 }


 template <typename _Iterator, typename _Function, typename _Partitioner>

 void parallel_for_each(_Iterator _First, _Iterator _Last, const _Function& _Func, _Partitioner&& _Part)

 {

     _Trace_ppl_function(PPLParallelForeachEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_START);

     _Parallel_for_each_impl(_First, _Last, _Func, std::forward<_Partitioner>(_Part), typename std::iterator_traits<_Iterator>::iterator_category());

     _Trace_ppl_function(PPLParallelForeachEventGuid, _TRACE_LEVEL_INFORMATION, CONCRT_EVENT_END);

 }


 // Disable C4180: qualifier applied to function type has no meaning; ignored

 // Warning fires for passing Foo function pointer to parallel_for instead of &Foo.

 #pragma warning(push)

 #pragma warning(disable: 4180)


 // Helper function assemble all functors

 template <typename _Reduce_type, typename _Sub_function, typename _Combinable_type>

 struct _Reduce_functor_helper

 {

     const _Sub_function &_Sub_fun;

     const _Reduce_type &_Identity_value;


     _Combinable_type &_Combinable;


     typedef _Reduce_type _Reduce_type;

     typedef typename _Combinable_type::_Bucket _Bucket_type;


     _Reduce_functor_helper(const _Reduce_type &_Identity, const _Sub_function &_Sub_fun, _Combinable_type &&_Comb):

         _Sub_fun(_Sub_fun), _Combinable(_Comb), _Identity_value(_Identity)

     {

     }


 private:

     _Reduce_functor_helper &operator =(const _Reduce_functor_helper &_Other);

 };


 // Ordered serial combinable object

 template<typename _Ty, typename _Sym_fun>

 class _Order_combinable

 {

 public:

     // Only write once, limited contention will be caused

     struct _Bucket

     {

         // Allocate enough space in the Bucket to hold a value

         char _Value[(sizeof(_Ty) / sizeof(char))];

         _Bucket * _Next;


         _Bucket(_Bucket *_N)

         {

             _Next = _N;

         }


         void _Insert(_Bucket *_Item)

         {

             // No need to lock, only one thread will insert

             _Item->_Next = _Next;

             _Next = _Item;

         }


         // Construct value in bucket

         void _Put(const _Ty &_Cur)

         {

             new(reinterpret_cast<_Ty *>(&_Value)) _Ty(_Cur);

         }

     };


 private:

     const _Sym_fun &_M_fun;

     size_t _M_number;

     _Bucket *_M_root;

     _Order_combinable &operator =(const _Order_combinable &_Other);


 public:

     _Bucket *_Construct(_Bucket *_Par = 0)

     {

         _Bucket * _Ret = static_cast<_Bucket *>(::Concurrency::Alloc(sizeof(_Bucket)));

         return new(_Ret)_Bucket(_Par);

     }


     _Order_combinable(const _Sym_fun &_Fun): _M_fun(_Fun)

     {

         _M_root = 0;

         _M_number = 0;

     }


     ~_Order_combinable()

     {

         while (_M_root)

         {

             _Bucket *_Cur = _M_root;

             _M_root = _M_root->_Next;

             reinterpret_cast<_Ty &>(_Cur->_Value).~_Ty();

             ::Concurrency::Free(_Cur);

         }

     }


     // Serially combine and release the list, return result

     _Ty _Serial_combine_release()

     {

         _Ty _Ret(reinterpret_cast<_Ty &>(_M_root->_Value));

         _Bucket *_Cur = _M_root;

         _M_root = _M_root->_Next;


         while (_M_root)

         {

             reinterpret_cast<_Ty &>(_Cur->_Value).~_Ty();

             ::Concurrency::Free(_Cur);

             _Cur = _M_root;

             _Ret = _M_fun(reinterpret_cast <_Ty &> (_Cur->_Value), _Ret);

             _M_root = _M_root->_Next;

         }


         reinterpret_cast<_Ty &>(_Cur->_Value).~_Ty();

         ::Concurrency::Free(_Cur);


         return _Ret;

     }


     // allocate a bucket and push back to the list

     _Bucket *_Unsafe_push_back()

     {

         return _M_root = _Construct(_M_root);

     }

 };


 template<typename _Reduce_type, typename _Forward_iterator, typename _Range_reduce_fun, typename _Sym_reduce_fun>

 inline _Reduce_type parallel_reduce(_Forward_iterator _Begin, _Forward_iterator _End, const _Reduce_type& _Identity,

     const _Range_reduce_fun &_Range_fun, const _Sym_reduce_fun &_Sym_fun)

 {

     typedef typename std::iterator_traits<_Forward_iterator>::value_type _Value_type;


     static_assert(!std::is_same<typename std::iterator_traits<_Forward_iterator>::iterator_category, std::input_iterator_tag>::value

         && !std::is_same<typename std::iterator_traits<_Forward_iterator>::iterator_category, std::output_iterator_tag>::value,

         "iterator can not be input_iterator or output_iterator.");


     return _Parallel_reduce_impl(_Begin, _End,

         _Reduce_functor_helper<_Reduce_type, _Range_reduce_fun,

             _Order_combinable<_Reduce_type, _Sym_reduce_fun>>(_Identity, _Range_fun, _Order_combinable<_Reduce_type, _Sym_reduce_fun>(_Sym_fun)),

         typename std::iterator_traits<_Forward_iterator>::iterator_category());

 }


 template<typename _Forward_iterator, typename _Sym_reduce_fun>

 inline typename std::iterator_traits<_Forward_iterator>::value_type parallel_reduce(_Forward_iterator _Begin, _Forward_iterator _End,

     const typename std::iterator_traits<_Forward_iterator>::value_type &_Identity, _Sym_reduce_fun _Sym_fun)

 {

     typedef typename std::remove_cv<typename std::iterator_traits<_Forward_iterator>::value_type>::type _Reduce_type;


     return parallel_reduce(_Begin, _End, _Identity,

         [_Sym_fun](_Forward_iterator _Begin, _Forward_iterator _End, _Reduce_type _Init)->_Reduce_type

         {

             while (_Begin != _End)

             {

                 _Init = _Sym_fun(_Init, *_Begin++);

             }


             return _Init;

         },

         _Sym_fun);

 }


 template<typename _Forward_iterator>

 inline typename std::iterator_traits<_Forward_iterator>::value_type parallel_reduce(

     _Forward_iterator _Begin, _Forward_iterator _End, const typename std::iterator_traits<_Forward_iterator>::value_type &_Identity)

 {

     return parallel_reduce(_Begin, _End, _Identity, std::plus<typename std::iterator_traits<_Forward_iterator>::value_type>());

 }


 // Implementation for the parallel reduce

 template <typename _Forward_iterator, typename _Function>

 typename _Function::_Reduce_type _Parallel_reduce_impl(_Forward_iterator _First, const _Forward_iterator& _Last, const _Function& _Func,

     std::forward_iterator_tag)

 {

     // Because this is a forward iterator, it is difficult to validate that _First comes before _Last, so

     // it is up to the user to provide valid range.

     if (_First != _Last)

     {

         task_group _Task_group;

         _Parallel_reduce_forward_executor(_First, _Last, _Func, _Task_group);

         _Task_group.wait();

         return _Func._Combinable._Serial_combine_release();

     }

     else

     {

         return _Func._Identity_value;

     }

 }


 // All the code below is the worker without range stealing

 template<typename _Forward_iterator, typename _Functor>

 class _Parallel_reduce_fixed_worker

 {

 public:

     // The bucket allocation order will depend on the worker construction order

     _Parallel_reduce_fixed_worker(_Forward_iterator _Begin, _Forward_iterator _End, const _Functor &_Fun):

         _M_begin(_Begin), _M_end(_End), _M_fun(_Fun), _M_bucket(_M_fun._Combinable._Unsafe_push_back())

     {

     }


     void operator ()() const

     {

         _M_bucket->_Put(_M_fun._Sub_fun(_M_begin, _M_end, _M_fun._Identity_value));

     }


 private:

     const _Functor &_M_fun;

     const _Forward_iterator _M_begin, _M_end;

     typename _Functor::_Bucket_type * const _M_bucket;

     _Parallel_reduce_fixed_worker &operator =(const _Parallel_reduce_fixed_worker &_Other);

 };


 template <typename _Worker, typename _Random_iterator, typename _Function>

 void _Parallel_reduce_random_executor(_Random_iterator _Begin, _Random_iterator _End, const _Function& _Fun);


 template <typename _Random_iterator, typename _Function>

 typename _Function::_Reduce_type _Parallel_reduce_impl(_Random_iterator _First, _Random_iterator _Last, const _Function& _Func,

     std::random_access_iterator_tag)

 {

     typedef _Parallel_reduce_fixed_worker<_Random_iterator, _Function> _Worker_class;


     // Special case for 0, 1 element

     if (_First >= _Last)

     {

         return _Func._Identity_value;

     }

     // Directly compute if size is too small

     else if (_Last - _First <= 1)

     {

         _Worker_class(_First, _Last, _Func)();

         return _Func._Combinable._Serial_combine_release();

     }

     else

     {

         // Use fixed ordered chunk partition to schedule works

         _Parallel_reduce_random_executor<_Worker_class>(_First, _Last, _Func);

         return _Func._Combinable._Serial_combine_release();

     }

 }


 // the parallel worker executor for fixed iterator

 // it will divide fixed number of chunks

 // almost same as fixed parallel for, except keep the chunk dividing order

 template <typename _Worker, typename _Random_iterator, typename _Function>

 void _Parallel_reduce_random_executor(_Random_iterator _Begin, _Random_iterator _End, const _Function& _Fun)

 {

     size_t _Cpu_num = static_cast<size_t>(::Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors()), _Size = _End - _Begin;


     structured_task_group _Tg;

     ::Concurrency::details::_MallocaArrayHolder<task_handle<_Worker>> _Holder;

     task_handle<_Worker> *_Tasks = _Holder._InitOnRawMalloca(_malloca(sizeof(task_handle<_Worker>) * (_Cpu_num - 1)));


     size_t _Begin_index = 0;

     size_t _Step = _Size / _Cpu_num;

     size_t _NumRemaining = _Size - _Step * _Cpu_num;


     for(size_t _I = 0; _I < _Cpu_num - 1; _I++)

     {

         size_t _Next = _Begin_index + _Step;


         // Add remaining to each chunk

         if (_NumRemaining)

         {

             --_NumRemaining;

             ++_Next;

         }


         // New up a task_handle "in-place", in the array preallocated on the stack

         new (_Tasks + _I) task_handle<_Worker>(_Worker(_Begin + _Begin_index, _Begin + _Next, _Fun));

         _Holder._IncrementConstructedElemsCount();


         // Run each of the chunk _Tasks in parallel

         _Tg.run(_Tasks[_I]);

         _Begin_index = _Next;

     }


     task_handle<_Worker> _Tail(_Worker(_Begin + _Begin_index, _End, _Fun));

     _Tg.run_and_wait(_Tail);

 }


 // The parallel worker executor for forward iterators

 // Divide chunks on the fly

 template <typename _Forward_iterator, typename _Function, int _Default_worker_size, int _Default_chunk_size>

 struct _Parallel_reduce_forward_executor_helper

 {

     typedef _Parallel_reduce_fixed_worker<_Forward_iterator, _Function> _Worker_class;

     mutable std::auto_ptr<task_handle<_Worker_class>> _Workers;

     int _Worker_size;


     _Parallel_reduce_forward_executor_helper(_Forward_iterator &_First, _Forward_iterator _Last, const _Function& _Func):

         _Workers(static_cast<task_handle<_Worker_class> *>(::Concurrency::Alloc(sizeof(task_handle<_Worker_class>) * _Default_worker_size)))

     {

         _Worker_size = 0;

         while (_Worker_size < _Default_worker_size && _First != _Last)

         {

             // Copy the range _Head

             _Forward_iterator _Head = _First;


             // Read from forward iterator

             for (size_t _I = 0; _I < _Default_chunk_size && _First != _Last; ++_I, ++_First)

             {

                 // Body is empty

             }


             // _First will be the end of current chunk

             new (_Workers.get() + _Worker_size++) task_handle<_Worker_class>(_Worker_class(_Head, _First, _Func));

         }

     }


     _Parallel_reduce_forward_executor_helper(const _Parallel_reduce_forward_executor_helper &_Other):

         _Workers(_Other._Workers), _Worker_size(_Other._Worker_size)

     {

     }


     void operator ()() const

     {

         structured_task_group _Tg;

         for(int _I = 0; _I < _Worker_size; _I++)

         {

             _Tg.run(_Workers.get()[_I]);

         }

         _Tg.wait();

     }


     ~_Parallel_reduce_forward_executor_helper()

     {

         if (_Workers.get())

         {

             for (int _I = 0; _I < _Worker_size; _I++)

             {

                 _Workers.get()[_I].~task_handle<_Worker_class>();

             }

             ::Concurrency::Free(_Workers.release());

         }

     }

 };


 template <typename _Forward_iterator, typename _Function>

 void _Parallel_reduce_forward_executor(_Forward_iterator _First, _Forward_iterator _Last, const _Function& _Func, task_group& _Task_group)

 {

     const static int _Internal_worker_number = 1024, _Default_chunk_size = 512;

     typedef _Parallel_reduce_fixed_worker<_Forward_iterator, _Function> _Worker_class;


     structured_task_group _Worker_group;

     ::Concurrency::details::_MallocaArrayHolder<task_handle<_Worker_class>> _Holder;

     task_handle<_Worker_class>* _Workers = _Holder._InitOnRawMalloca(_malloca(_Internal_worker_number * sizeof(task_handle<_Worker_class>)));


     // Start execution first

     int _Index = 0;

     while (_Index < _Internal_worker_number && _First != _Last)

     {

         // Copy the range _Head

         _Forward_iterator _Head = _First;


         // Read from forward iterator

         for (size_t _I = 0; _I < _Default_chunk_size && _First != _Last; ++_I, ++_First)

         {

             // Body is empty

         };


         // Create a new task, _First is range _End

         new (_Workers + _Index) task_handle<_Worker_class>(_Worker_class(_Head, _First, _Func));

         _Holder._IncrementConstructedElemsCount();

         _Worker_group.run(_Workers[_Index]);

         ++_Index;

     }


     // Divide and append the left

     while (_First != _Last)

     {

         _Task_group.run(_Parallel_reduce_forward_executor_helper<_Forward_iterator, _Function, _Internal_worker_number, _Default_chunk_size>(_First, _Last, _Func));

     }


     _Worker_group.wait();

 }


 #pragma warning(pop)


 // Disable C4180: qualifier applied to function type has no meaning; ignored

 // Warning fires for passing Foo function pointer to parallel_for instead of &Foo.

 #pragma warning(push)

 #pragma warning(disable: 4180)


 //

 // Dispatch the execution and handle the condition that all of the iterators are random access

 //

 template<typename _Any_input_traits, typename _Any_output_traits>

 struct _Unary_transform_impl_helper

 {

     template<typename _Input_iterator, typename _Output_iterator, typename _Unary_operator>

     static void _Parallel_transform_unary_impl(_Input_iterator _Begin, _Input_iterator _End, _Output_iterator& _Result, const _Unary_operator& _Unary_op, const auto_partitioner&)

     {

         task_group _Tg;

         _Parallel_transform_unary_impl2(_Begin, _End, _Result, _Unary_op, _Tg);

         _Tg.wait();

     }

 };


 template<>

 struct _Unary_transform_impl_helper<std::random_access_iterator_tag, std::random_access_iterator_tag>

 {

     template<typename _Random_input_iterator, typename _Random_output_iterator, typename _Unary_operator, typename _Partitioner>

     static void _Parallel_transform_unary_impl(_Random_input_iterator _Begin, _Random_input_iterator _End,

         _Random_output_iterator& _Result, const _Unary_operator& _Unary_op, _Partitioner&& _Part)

     {

         if (_Begin < _End)

         {

             ::Concurrency::_Parallel_for_impl(static_cast<size_t>(0), static_cast<size_t>(_End - _Begin), static_cast<size_t>(1),

                 [_Begin, &_Result, &_Unary_op](size_t _Index)

                 {

                     _Result[_Index] = _Unary_op(_Begin[_Index]);

                 },

                 std::forward<_Partitioner>(_Part));

             _Result += _End - _Begin;

         }

     }

 };


 template<typename _Any_input_traits1, typename _Any_input_traits2, typename _Any_output_traits>

 struct _Binary_transform_impl_helper

 {


     template<typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>

     static void _Parallel_transform_binary_impl(_Input_iterator1 _Begin1, _Input_iterator1 _End1, _Input_iterator2 _Begin2,

         _Output_iterator& _Result, const _Binary_operator& _Binary_op, const auto_partitioner&)

     {

         task_group _Tg;

         _Parallel_transform_binary_impl2(_Begin1, _End1, _Begin2, _Result, _Binary_op, _Tg);

         _Tg.wait();

     }

 };


 template<>

 struct _Binary_transform_impl_helper<std::random_access_iterator_tag, std::random_access_iterator_tag, std::random_access_iterator_tag>

 {

     template<typename _Random_input_iterator1, typename _Random_input_iterator2, typename _Random_output_iterator, typename _Binary_operator, typename _Partitioner>

     static void _Parallel_transform_binary_impl(_Random_input_iterator1 _Begin1, _Random_input_iterator1 _End1,

         _Random_input_iterator2 _Begin2, _Random_output_iterator& _Result, const _Binary_operator& _Binary_op, _Partitioner&& _Part)

     {

         if (_Begin1 < _End1)

         {

             ::Concurrency::_Parallel_for_impl(static_cast<size_t>(0), static_cast<size_t>(_End1 - _Begin1), static_cast<size_t>(1),

                 [_Begin1, _Begin2, &_Result, &_Binary_op](size_t _Index)

                 {

                     _Result[_Index] = _Binary_op(_Begin1[_Index], _Begin2[_Index]);

                 },

                 std::forward<_Partitioner>(_Part));

             _Result += _End1 - _Begin1;

         }

     }

 };


 //

 // The implementation for at least one of the iterator is forward iterator

 //

 template <typename _Forward_iterator, typename _Iterator_kind>

 class _Iterator_helper

 {

 public:

     static const size_t _Size = 1024;

     typedef typename std::iterator_traits<_Forward_iterator>::value_type value_type;


     _Iterator_helper()

     {

         static_assert(!std::is_same<_Iterator_kind, std::input_iterator_tag>::value

             && !std::is_same<_Iterator_kind, std::output_iterator_tag>::value,

             "iterator can not be input_iterator or output_iterator.");

     }


     size_t _Populate(_Forward_iterator& _First, _Forward_iterator _Last)

     {

         size_t _Length = 0;

         static_assert(std::is_lvalue_reference<decltype(*_First)>::value, "lvalue required for forward iterator operator *");


         for (size_t _Index=0; (_Index < _Size) && (_First != _Last); _Index++)

         {

             // We only support l-value here, so it's safe

             _M_element_array[_Length++] = &(*_First++);

         }


         return _Length;

     }


     void _Populate(_Forward_iterator& _First, size_t _Length)

     {

         for (size_t _Index=0; _Index < _Length; _Index++)

         {

             _M_element_array[_Index] = &(*_First++);

         }

     }


     void _Store(const value_type& _Elem, size_t _Index) const

     {

         *(_M_element_array[_Index]) = _Elem;

     }


     typename std::iterator_traits<_Forward_iterator>::reference _Load(size_t _Index) const

     {

         return *(_M_element_array[_Index]);

     }


 private:

     typename std::iterator_traits<_Forward_iterator>::pointer _M_element_array[_Size];

 };


 template <typename _Random_iterator>

 class _Iterator_helper<_Random_iterator, std::random_access_iterator_tag>

 {

 public:

     static const size_t _Size = 1024;

     typedef typename std::iterator_traits<_Random_iterator>::value_type value_type;


     _Iterator_helper()

     {

     }


     size_t _Populate(_Random_iterator& _First, _Random_iterator _Last)

     {

         typename std::iterator_traits<_Random_iterator>::difference_type _Range_size = _Last - _First;

         typename std::iterator_traits<_Random_iterator>::difference_type _Sized = _Size;

         _M_first = _First;


         if (_Range_size > _Sized)

         {

             _First += _Size;

             return _Size;

         }

         else

         {

             _First += _Range_size;

             return static_cast<size_t>(_Range_size);

         }

     }


     void _Populate(_Random_iterator& _First, size_t _Length)

     {

         _M_first = _First;

         _First += _Length;

     }


     void _Store(const value_type& _Elem, size_t _Index) const

     {

         _M_first[_Index] = _Elem;

     }


     typename std::iterator_traits<_Random_iterator>::reference _Load(size_t _Index) const

     {

         // We only support l-value here

         return _M_first[_Index];

     }


 private:

     _Random_iterator _M_first;

 };


 template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>

 class _Parallel_transform_binary_helper

 {

 public:

     _Parallel_transform_binary_helper(_Input_iterator1& _First1, _Input_iterator1 _Last1, _Input_iterator2& _First2,

         _Output_iterator& _Result, const _Binary_operator& _Binary_op) :

         _M_binary_op(_Binary_op), _M_len(0)

         {

             _M_len = _M_input_helper1._Populate(_First1, _Last1);

             _M_input_helper2._Populate(_First2, _M_len);

             _M_output_helper._Populate(_Result, _M_len);

         }


         void operator()() const

         {

             // Invoke parallel_for on the batched up array of elements

             ::Concurrency::_Parallel_for_impl(static_cast<size_t>(0), _M_len, static_cast<size_t>(1),

                 [this] (size_t _Index)

                 {

                     _M_output_helper._Store(_M_binary_op(_M_input_helper1._Load(_Index), _M_input_helper2._Load(_Index)), _Index);

                 });

         }


 private:


     _Iterator_helper<_Input_iterator1, typename std::iterator_traits<_Input_iterator1>::iterator_category>   _M_input_helper1;

     _Iterator_helper<_Input_iterator2, typename std::iterator_traits<_Input_iterator2>::iterator_category>   _M_input_helper2;

     _Iterator_helper<_Output_iterator, typename std::iterator_traits<_Output_iterator>::iterator_category>   _M_output_helper;

     const _Binary_operator&                                                                                  _M_binary_op;

     size_t                                                                                                   _M_len;


     _Parallel_transform_binary_helper const & operator=(_Parallel_transform_binary_helper const&);    // no assignment operator

 };


 template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>

 void _Parallel_transform_binary_impl2(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Input_iterator2 _First2, _Output_iterator &_Result,

     const _Binary_operator& _Binary_op, task_group& _Tg)

 {

     // This functor will be copied on the heap and will execute the chunk in parallel

     {

         _Parallel_transform_binary_helper<_Input_iterator1, _Input_iterator2, _Output_iterator, _Binary_operator> _Functor(_First1, _Last1, _First2, _Result, _Binary_op);

         _Tg.run(_Functor);

     }


     // If there is a tail, push the tail

     if (_First1 != _Last1)

     {

         _Tg.run(

             [=, &_Result, &_Binary_op, &_Tg]

             {

                 _Parallel_transform_binary_impl2(_First1, _Last1, _First2, _Result, _Binary_op, _Tg);

             });

     }

 }


 template <typename _Input_iterator, typename _Output_iterator, typename _Unary_operator>

 class _Parallel_transform_unary_helper

 {

 public:

     _Parallel_transform_unary_helper(_Input_iterator& _First, _Input_iterator _Last, _Output_iterator &_Result, const _Unary_operator& _Unary_op) :

         _M_unary_op(_Unary_op), _M_len(0)

         {

             _M_len = _M_input_helper._Populate(_First, _Last);

             _M_output_helper._Populate(_Result, _M_len);

         }


         void operator()() const

         {

             // Invoke parallel_for on the batched up array of elements

             ::Concurrency::_Parallel_for_impl(static_cast<size_t>(0), _M_len, static_cast<size_t>(1),

                 [this] (size_t _Index)

                 {

                     _M_output_helper._Store(_M_unary_op(_M_input_helper._Load(_Index)), _Index);

                 });

         }


 private:


     _Iterator_helper<_Input_iterator, typename std::iterator_traits<_Input_iterator>::iterator_category>   _M_input_helper;

     _Iterator_helper<_Output_iterator, typename std::iterator_traits<_Output_iterator>::iterator_category> _M_output_helper;

     const _Unary_operator&                                                                                     _M_unary_op;

     size_t                                                                                              _M_len;


     _Parallel_transform_unary_helper const & operator=(_Parallel_transform_unary_helper const&);    // no assignment operator

 };


 template <typename _Input_iterator, typename _Output_iterator, typename _Unary_operator>

 void _Parallel_transform_unary_impl2(_Input_iterator _First, _Input_iterator _Last, _Output_iterator &_Result,

     const _Unary_operator& _Unary_op, task_group& _Tg)

 {

     // This functor will be copied on the heap and will execute the chunk in parallel

     {

         _Parallel_transform_unary_helper<_Input_iterator, _Output_iterator, _Unary_operator> _Functor(_First, _Last, _Result, _Unary_op);

         _Tg.run(_Functor);

     }


     // If there is a tail, push the tail

     if (_First != _Last)

     {

         _Tg.run(

             [=, &_Result, &_Unary_op, &_Tg]

             {

                 _Parallel_transform_unary_impl2(_First, _Last, _Result, _Unary_op, _Tg);

             });

     }

 }


 template <typename _Input_iterator, typename _Output_iterator, typename _Unary_operator, typename _Partitioner>

 _Output_iterator _Parallel_transform_unary_impl(_Input_iterator _First, _Input_iterator _Last, _Output_iterator _Result, const _Unary_operator& _Unary_op, _Partitioner&& _Part)

 {

     typedef typename std::iterator_traits<_Input_iterator>::iterator_category _Input_iterator_type;

     typedef typename std::iterator_traits<_Output_iterator>::iterator_category _Output_iterator_type;


     if (_First != _Last)

     {

         _Unary_transform_impl_helper<_Input_iterator_type, _Output_iterator_type>

             ::_Parallel_transform_unary_impl(_First, _Last, _Result, _Unary_op, std::forward<_Partitioner>(_Part));

     }


     return _Result;

 }


 template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>

 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, const auto_partitioner& _Part = auto_partitioner())

 {

     return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);

 }


 template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>

 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, const static_partitioner& _Part)

 {

     return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);

 }


 template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>

 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, const simple_partitioner& _Part)

 {

     return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);

 }


 template <typename _Input_iterator1, typename _Output_iterator, typename _Unary_operator>

 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator& _Unary_op, affinity_partitioner& _Part)

 {

     return _Parallel_transform_unary_impl(_First1, _Last1, _Result, _Unary_op, _Part);

 }


 template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator, typename _Partitioner>

 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Input_iterator2 _First2,

     _Output_iterator _Result, const _Binary_operator& _Binary_op, _Partitioner&& _Part)

 {

     typedef typename std::iterator_traits<_Input_iterator1>::iterator_category _Input_iterator_type1;

     typedef typename std::iterator_traits<_Input_iterator2>::iterator_category _Input_iterator_type2;

     typedef typename std::iterator_traits<_Output_iterator>::iterator_category _Output_iterator_type;


     if (_First1 != _Last1)

     {

         _Binary_transform_impl_helper<_Input_iterator_type1, _Input_iterator_type2, _Output_iterator_type>

             ::_Parallel_transform_binary_impl(_First1, _Last1, _First2, _Result, _Binary_op, std::forward<_Partitioner>(_Part));

     }


     return _Result;

 }


 template <typename _Input_iterator1, typename _Input_iterator2, typename _Output_iterator, typename _Binary_operator>

 _Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Input_iterator2 _First2,

     _Output_iterator _Result, const _Binary_operator& _Binary_op)

 {

     return parallel_transform(_First1, _Last1, _First2, _Result, _Binary_op, auto_partitioner());

 }


 #pragma warning(pop)


 #pragma warning(push)

 // object allocated on the heap may not be aligned 64

 #pragma warning(disable: 4316)


 template<typename _Ty>

 class combinable

 {

 private:


 // Disable warning C4324: structure was padded due to __declspec(align())

 // This padding is expected and necessary.

 #pragma warning(push)

 #pragma warning(disable: 4324)

     __declspec(align(64))

     struct _Node

     {

         unsigned long _M_key;

         _Ty _M_value;

         _Node* _M_chain;


         _Node(unsigned long _Key, _Ty _InitialValue)

             : _M_key(_Key), _M_value(_InitialValue), _M_chain(NULL)

         {

         }

     };

 #pragma warning(pop)


     static _Ty _DefaultInit()

     {

         return _Ty();

     }


 public:


     combinable()

         : _M_fnInitialize(_DefaultInit)

     {

         _InitNew();

     }


     template <typename _Function>

     explicit combinable(_Function _FnInitialize)

         : _M_fnInitialize(_FnInitialize)

     {

         _InitNew();

     }


     combinable(const combinable& _Copy)

         : _M_size(_Copy._M_size), _M_fnInitialize(_Copy._M_fnInitialize)

     {

         _InitCopy(_Copy);

     }


     combinable& operator=(const combinable& _Copy)

     {

         clear();

         delete [] _M_buckets;

         _M_fnInitialize = _Copy._M_fnInitialize;

         _M_size = _Copy._M_size;

         _InitCopy(_Copy);


         return *this;

     }


     ~combinable()

     {

         clear();

         delete [] _M_buckets;

     }


     _Ty& local()

     {

         unsigned long _Key = ::Concurrency::details::platform::GetCurrentThreadId();

         size_t _Index;

         _Node* _ExistingNode = _FindLocalItem(_Key, &_Index);

         if (_ExistingNode == NULL)

         {

             _ExistingNode = _AddLocalItem(_Key, _Index);

         }


         _CONCRT_ASSERT(_ExistingNode != NULL);

         return _ExistingNode->_M_value;

     }


     _Ty& local(bool& _Exists)

     {

         unsigned long _Key = ::Concurrency::details::platform::GetCurrentThreadId();

         size_t _Index;

         _Node* _ExistingNode = _FindLocalItem(_Key, &_Index);

         if (_ExistingNode == NULL)

         {

             _Exists = false;

             _ExistingNode = _AddLocalItem(_Key, _Index);

         }

         else

         {

             _Exists = true;

         }


         _CONCRT_ASSERT(_ExistingNode != NULL);

         return _ExistingNode->_M_value;

     }


     void clear()

     {

         for (size_t _Index = 0; _Index < _M_size; ++_Index)

         {

             _Node* _CurrentNode = _M_buckets[_Index];

             while (_CurrentNode != NULL)

             {

                 _Node* _NextNode = _CurrentNode->_M_chain;

                 delete _CurrentNode;

                 _CurrentNode = _NextNode;

             }

         }

         memset((void*)_M_buckets, 0, _M_size * sizeof _M_buckets[0]);

     }


     template<typename _Function>

     _Ty combine(_Function _FnCombine) const

     {

         _Node* _CurrentNode = NULL;

         size_t _Index;


         // Look for the first value in the set, and use (a copy of) that as the result.

         // This eliminates a single call (of unknown cost) to _M_fnInitialize.

         for (_Index = 0; _Index < _M_size; ++_Index)

         {

             _CurrentNode = _M_buckets[_Index];

             if (_CurrentNode != NULL)

             {

                  break;

             }

         }


         // No values... return the initializer value.

         if (_CurrentNode == NULL)

         {

             return _M_fnInitialize();

         }


         // Accumulate the rest of the items in the current bucket.

         _Ty _Result = _CurrentNode->_M_value;

         for (_CurrentNode = _CurrentNode->_M_chain; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)

         {

             _Result = _FnCombine(_Result, _CurrentNode->_M_value);

         }


         // Accumulate values from the rest of the buckets.

         _CONCRT_ASSERT(_Index < _M_size);

         for (++_Index; _Index < _M_size; ++_Index)

         {

             for (_CurrentNode = _M_buckets[_Index]; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)

             {

                 _Result = _FnCombine(_Result, _CurrentNode->_M_value);

             }

         }


         return _Result;

     }


     template<typename _Function>

     void combine_each(_Function _FnCombine) const

     {

         for (size_t _Index = 0; _Index < _M_size; ++_Index)

         {

             for (_Node* _CurrentNode = _M_buckets[_Index]; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)

             {

                 _FnCombine(_CurrentNode->_M_value);

             }

         }

     }


 private:

     void _InitNew()

     {

         _M_size = ::Concurrency::details::_GetCombinableSize();

         _M_buckets = new _Node*[_M_size];

         memset((void*)_M_buckets, 0, _M_size * sizeof _M_buckets[0]);

     }


     void _InitCopy(const combinable& _Copy)

     {

         _M_buckets = new _Node*[_M_size];

         for (size_t _Index = 0; _Index < _M_size; ++_Index)

         {

             _M_buckets[_Index] = NULL;

             for (_Node* _CurrentNode = _Copy._M_buckets[_Index]; _CurrentNode != NULL; _CurrentNode = _CurrentNode->_M_chain)

             {

                 _Node* _NewNode = new _Node(_CurrentNode->_M_key, _CurrentNode->_M_value);

                 _NewNode->_M_chain = _M_buckets[_Index];

                 _M_buckets[_Index] = _NewNode;

             }

         }

     }


     _Node* _FindLocalItem(unsigned long _Key, size_t* _PIndex)

     {

         _CONCRT_ASSERT(_PIndex != NULL);


         *_PIndex = _Key % _M_size;


         // Search at this index for an existing value.

         _Node* _CurrentNode = _M_buckets[*_PIndex];

         while (_CurrentNode != NULL)

         {

             if (_CurrentNode->_M_key == _Key)

             {

                 return _CurrentNode;

             }


             _CurrentNode = _CurrentNode->_M_chain;

         }


         return NULL;

     }


     _Node* _AddLocalItem(unsigned long _Key, size_t _Index)

     {

         _Node* _NewNode = new _Node(_Key, _M_fnInitialize());

         _Node* _TopNode;

         do

         {

             _TopNode = _M_buckets[_Index];

             _NewNode->_M_chain = _TopNode;

         } while (_InterlockedCompareExchangePointer(reinterpret_cast<void * volatile *>(&_M_buckets[_Index]), _NewNode, _TopNode) != _TopNode);


         return _NewNode;

     }


 private:

     _Node *volatile * _M_buckets;

     size_t _M_size;

     std::function<_Ty ()> _M_fnInitialize;

 };


 #pragma warning(pop) // C4316


 #pragma push_macro("_MAX_NUM_TASKS_PER_CORE")

 #pragma push_macro("_FINE_GRAIN_CHUNK_SIZE")

 #pragma push_macro("_SORT_MAX_RECURSION_DEPTH")


 // This number is used to control dynamic task splitting

 // The ideal chunk (task) division is that the number of cores is equal to the number of tasks, but it will

 // perform very poorly when tasks are not balanced. The simple solution is to allocate more tasks than number

 // of cores. _MAX_NUM_TASKS_PER_CORE provides a maximum number of tasks that will be allocated per core.

 // If this number is too small, the load balancing problem will affect efficiency very seriously, especially

 // when the compare operation is expensive.

 //

 // Note that this number is a maximum number -- the dynamic partition system will reduce the number of partitions

 // per core based on the dynamic load. If all cores are very busy, the number of partitions will shrink to

 // reduce the scheduler overhead.

 //

 // Initially, the total tasks(chunks) number of partitions "_Div_num" will be: core number * _MAX_NUM_TASKS_PER_CORE.

 // The _Div_num will be divided by 2 after each task splitting. There are two special numbers for _Div_num:

 //     1. When _Div_num reaches the point that _Div_num < _MAX_NUM_TASKS_PER_CORE, it means we have split more tasks than cores.

 //     2. When _Div_num reaches the point that _Div_num <= 1, it means stop splitting more tasks and begin sorting serially.

 #define _MAX_NUM_TASKS_PER_CORE 1024


 // This is a number mainly is used to control the sampling and dynamic task splitting strategies.

 // If the user configurable minimal divisible chunk size (default is 2048) is smaller than FINE_GRAIN_CHUNK_SIZE,

 // the random sampling algorithm for quicksort will enter fine-grained mode, and take a strategy that reduces the sampling

 // overhead. Also, the dynamic task splitting will enter fine-grained mode, which will split as many tasks as possible.

 #define _FINE_GRAIN_CHUNK_SIZE 512


 // This is the maximum depth that the quicksort will be called recursively. If we allow too far, a stack overflow may occur.

 #define _SORT_MAX_RECURSION_DEPTH 64


 template<typename _Random_iterator, typename _Function>

 inline size_t _Median_of_three(const _Random_iterator &_Begin, size_t _A, size_t _B, size_t _C, const _Function &_Func, bool &_Potentially_equal)

 {

     _Potentially_equal = false;

     if (_Func(_Begin[_A], _Begin[_B]))

     {

         if (_Func(_Begin[_A], _Begin[_C]))

         {

             return _Func(_Begin[_B], _Begin[_C]) ? _B : _C;

         }

         else

         {

             return _A;

         }

     }

     else

     {

         if (_Func(_Begin[_B], _Begin[_C]))

         {

             return _Func(_Begin[_A], _Begin[_C]) ? _A : _C;

         }

         else

         {

             _Potentially_equal = true;

             return _B;

         }

     }

 }


 template<typename _Random_iterator, typename _Function>

 inline size_t _Median_of_nine(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, bool &_Potentially_equal)

 {

     size_t _Offset = _Size / 8;

     size_t _A = _Median_of_three(_Begin, 0, _Offset, _Offset * 2, _Func, _Potentially_equal),

         _B = _Median_of_three(_Begin, _Offset * 3, _Offset * 4, _Offset * 5, _Func, _Potentially_equal),

         _C = _Median_of_three(_Begin, _Offset * 6, _Offset * 7, _Size - 1, _Func, _Potentially_equal);

     _B = _Median_of_three(_Begin, _A, _B, _C, _Func, _Potentially_equal);


     if (_Potentially_equal)

     {

         _Potentially_equal = !_Func(_Begin[_C], _Begin[_A]);

     }


     return _B;

 }


 // _Potentially_equal means that potentially all the values in the buffer are equal to the pivot value

 template<typename _Random_iterator, typename _Function>

 inline size_t _Select_median_pivot(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, const size_t _Chunk_size, bool &_Potentially_equal)

 {

     // Base on different chunk size, apply different sampling optimization

     if (_Chunk_size < _FINE_GRAIN_CHUNK_SIZE && _Size <= std::max<size_t>(_Chunk_size * 4, static_cast<size_t>(15)))

     {

         bool _Never_care_equal;

         return _Median_of_three(_Begin, 0, _Size / 2, _Size - 1, _Func, _Never_care_equal);

     }

     else

     {

         return _Median_of_nine(_Begin, _Size, _Func, _Potentially_equal);

     }

 }


 // Find out two middle points for two sorted arrays by binary search so that the number of total elements on the left part of two middle points is equal

 // to the number of total elements on the right part of two sorted arrays and all elements on the left part is smaller than right part.

 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>

 size_t _Search_mid_point(const _Random_iterator &_Begin1, size_t &_Len1, const _Random_buffer_iterator &_Begin2, size_t &_Len2, const _Function &_Func)

 {

     size_t _Len = (_Len1 + _Len2) / 2, _Index1 = 0, _Index2 = 0;


     while (_Index1 < _Len1 && _Index2 < _Len2)

     {

         size_t _Mid1 = (_Index1 + _Len1) / 2, _Mid2 = (_Index2 + _Len2) / 2;

         if (_Func(_Begin1[_Mid1], _Begin2[_Mid2]))

         {

             if (_Mid1 + _Mid2 < _Len)

             {

                 _Index1 = _Mid1 + 1;

             }

             else

             {

                 _Len2 = _Mid2;

             }

         }

         else

         {

             if (_Mid1 + _Mid2 < _Len)

             {

                 _Index2 = _Mid2 + 1;

             }

             else

             {

                 _Len1 = _Mid1;

             }

         }

     }


     if (_Index1 == _Len1)

     {

         _Len2 = _Len - _Len1;

     }

     else

     {

         _Len1 = _Len - _Len2;

     }


     return _Len;

 }


 // "move" operation is applied between buffers

 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Random_output_iterator, typename _Function>

 void _Merge_chunks(_Random_iterator _Begin1, const _Random_iterator &_End1, _Random_buffer_iterator _Begin2, const _Random_buffer_iterator &_End2,

     _Random_output_iterator _Output, const _Function &_Func)

 {

     while (_Begin1 != _End1 && _Begin2 != _End2)

     {

         if (_Func(*_Begin1, *_Begin2))

         {

             *_Output++ = std::move(*_Begin1++);

         }

         else

         {

             *_Output++ = std::move(*_Begin2++);

         }

     }


     if (_Begin1 != _End1)

     {

         std::_Move_no_deprecate(_Begin1, _End1, _Output);

     }

     else if (_Begin2 != _End2)

     {

         std::_Move_no_deprecate(_Begin2, _End2, _Output);

     }

 }


 // _Div_num of threads(tasks) merge two chunks in parallel, _Div_num should be power of 2, if not, the largest power of 2 that is

 // smaller than _Div_num will be used

 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Random_output_iterator, typename _Function>

 void _Parallel_merge(_Random_iterator _Begin1, size_t _Len1, _Random_buffer_iterator _Begin2, size_t _Len2, _Random_output_iterator _Output,

     const _Function &_Func, size_t _Div_num)

 {

     // Turn to serial merge or continue splitting chunks base on "_Div_num"

     if (_Div_num <= 1 || (_Len1 <= 1 && _Len2 <= 1))

     {

         _Merge_chunks(_Begin1, _Begin1 + _Len1, _Begin2, _Begin2 + _Len2, _Output, _Func);

     }

     else

     {

         size_t _Mid_len1 = _Len1, _Mid_len2 = _Len2;

         size_t _Mid = _Search_mid_point(_Begin1, _Mid_len1, _Begin2, _Mid_len2, _Func);


         structured_task_group _Tg;

         auto _Handle = make_task([&]

         {

             _Parallel_merge(_Begin1, _Mid_len1, _Begin2, _Mid_len2, _Output, _Func, _Div_num / 2);

         });

         _Tg.run(_Handle);


         _Parallel_merge(_Begin1 + _Mid_len1, _Len1 - _Mid_len1, _Begin2 + _Mid_len2, _Len2 - _Mid_len2, _Output + _Mid, _Func, _Div_num / 2);


         _Tg.wait();

     }

 }


 // Return current sorting byte from key

 template<typename _Ty, typename _Function>

 inline size_t _Radix_key(const _Ty& _Val, size_t _Radix, _Function _Proj_func)

 {

     return static_cast<size_t>(_Proj_func(_Val) >> static_cast<int>(8 * _Radix) & 255);

 }


 // One pass of radix sort

 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>

 void _Integer_radix_pass(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output, size_t _Radix, _Function _Proj_func)

 {

     if (!_Size)

     {

         return;

     }


     size_t _Pos[256] = {0};


     for (size_t _I = 0; _I < _Size; _I++)

     {

         ++_Pos[_Radix_key(_Begin[_I], _Radix, _Proj_func)];

     }


     for (size_t _I = 1; _I < 256; _I++)

     {

         _Pos[_I] += _Pos[_I - 1];

     }


     // _Size > 0

     for (size_t _I = _Size - 1; _I != 0; _I--)

     {

         _Output[--_Pos[_Radix_key(_Begin[_I], _Radix, _Proj_func)]] = std::move(_Begin[_I]);

     }


     _Output[--_Pos[_Radix_key(_Begin[0], _Radix, _Proj_func)]] = std::move(_Begin[0]);

 }


 // Serial least-significant-byte radix sort, it will sort base on last "_Radix" number of bytes

 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>

 void _Integer_radix_sort(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output,

     size_t _Radix, _Function _Proj_func, size_t _Deep = 0)

 {

     size_t _Cur_radix = 0;

     if (_Size == 0)

     {

         return;

     }


     while (_Cur_radix < _Radix)

     {

         _Integer_radix_pass(_Begin, _Size, _Output, _Cur_radix++, _Proj_func);

         _Integer_radix_pass(_Output, _Size, _Begin, _Cur_radix++, _Proj_func);

     }


     if (_Cur_radix == _Radix)

     {

         _Integer_radix_pass(_Begin, _Size, _Output, _Cur_radix++, _Proj_func);

     }


     // if odd round is passed, then move result back to input buffer

     if (_Deep + _Radix + 1 & 1)

     {

         if (_Radix + 1 & 1)

         {

             std::_Move_no_deprecate(_Output, _Output + _Size, _Begin);

         }

         else

         {

             std::_Move_no_deprecate(_Begin, _Begin + _Size, _Output);

         }

     }

 }


 // Parallel most-significant-byte _Radix sort.

 // In the end, it will turn to serial least-significant-byte radix sort

 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>

 void _Parallel_integer_radix_sort(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output,

     size_t _Radix, _Function _Proj_func, const size_t _Chunk_size, size_t _Deep = 0)

 {

     // If the chunk _Size is too small, then turn to serial least-significant-byte radix sort

     if (_Size <= _Chunk_size || _Radix < 1)

     {

         return _Integer_radix_sort(_Begin, _Size, _Output, _Radix, _Proj_func, _Deep);

     }


     size_t _Threads_num = ::Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors();

     size_t _Buffer_size = sizeof(size_t) * 256 * _Threads_num;

     size_t _Step = _Size / _Threads_num;

     size_t _Remain = _Size % _Threads_num;


     ::Concurrency::details::_MallocaArrayHolder<size_t [256]> _Holder;

     size_t (*_Chunks)[256] = _Holder._InitOnRawMalloca(_malloca(_Buffer_size));


     memset(_Chunks, 0, _Buffer_size);


     // Our purpose is to map unsorted data in buffer "_Begin" to buffer "_Output" so that all elements who have the same

     // byte value in the "_Radix" position will be grouped together in the buffer "_Output"

     //

     // Serial version:

     // To understand this algorithm, first consider a serial version. In following example, we treat 1 digit as 1 byte, so we have a

     // total of 10 elements for each digit instead of 256 elements in each byte. Let's suppose "_Radix" == 1 (right most is 0), and:

     //

     //      begin:  [ 32 | 62 | 21 | 43 | 55 | 43 | 23 | 44 ]

     //

     // We want to divide the output buffer "_Output" into 10 chunks, and each the element in the "_Begin" buffer should be mapped into

     // the proper destination chunk based on its current digit (byte) indicated by "_Radix"

     //

     // Because "_Radix" == 1, after a pass of this function, the chunks in the "_Output" should look like:

     //

     //      buffer: [   |   | 21 23 | 32 | 43 43 44 | 55 | 62 |   |   |   ]

     //                0   1     2      3      4        5    6   7   8   9

     //

     // The difficulty is determining where to insert values into the "_Output" to get the above result. The way to get the

     // start position of each chunk of the buffer is:

     //      1. Count the number of elements for each chunk (in above example, chunk0 is 0, chunk1 is 0, chunk2 is 2, chunk3 is 1 ...

     //      2. Make a partial sum for these chunks( in above example,  we will get chunk0=chunk0=0, chunk1=chunk0+chunk1=0,

     //         chunk2=chunk0+chunk1+chunk2=2, chunk3=chunk0+chunk1+chunk2+chunk3=3

     //

     // After these steps, we will get the end position of each chunk in the "_Output". The begin position of each chunk will be the end

     // point of last chunk (begin point is close but the end point is open). After that,  we can scan the original array again and directly

     // put elements from original buffer "_Begin" into specified chunk on buffer "_Output".

     // Finally, we invoke _parallel_integer_radix_sort in parallel for each chunk and sort them in parallel based on the next digit (byte).

     // Because this is a STABLE sort algorithm, if two numbers has same key value on this byte (digit), their original order should be kept.

     //

     // Parallel version:

     // Almost the same as the serial version, the differences are:

     //      1. The count for each chunk is executed in parallel, and each thread will count one segment of the input buffer "_Begin".

     //         The count result will be separately stored in their own chunk size counting arrays so we have a total of threads-number

     //         of chunk count arrays.

     //         For example, we may have chunk00, chunk01, ..., chunk09 for first thread, chunk10, chunk11, ..., chunk19 for second thread, ...

     //      2. The partial sum should be executed across these chunk counting arrays that belong to different threads, instead of just

     //         making a partial sum in one counting array.

     //         This is because we need to put values from different segments into one final buffer, and the absolute buffer position for

     //         each chunkXX is needed.

     //      3. Make a parallel scan for original buffer again, and move numbers in parallel into the corresponding chunk on each buffer based

     //         on these threads' chunk size counters.


     // Count in parallel and separately save their local results without reducing

     ::Concurrency::parallel_for(static_cast<size_t>(0), _Threads_num, [=](size_t _Index)

     {

         size_t _Beg_index, _End_index;


         // Calculate the segment position

         if (_Index < _Remain)

         {

             _Beg_index = _Index * (_Step + 1);

             _End_index = _Beg_index + (_Step + 1);

         }

         else

         {

             _Beg_index = _Remain * (_Step + 1) + (_Index - _Remain) * _Step;

             _End_index = _Beg_index + _Step;

         }


         // Do a counting

         while (_Beg_index != _End_index)

         {

             ++_Chunks[_Index][_Radix_key(_Begin[_Beg_index++], _Radix, _Proj_func)];

         }

     });


     int _Index = -1, _Count = 0;


     // Partial sum cross different threads' chunk counters

     for (int _I = 0; _I < 256; _I++)

     {

         size_t _Last = _I ? _Chunks[_Threads_num - 1][_I - 1] : 0;

         _Chunks[0][_I] += _Last;


         for (size_t _J = 1; _J < _Threads_num; _J++)

         {

             _Chunks[_J][_I] += _Chunks[_J - 1][_I];

         }


         // "_Chunks[_Threads_num - 1][_I] - _Last" will get the global _Size for chunk _I(including all threads local _Size for chunk _I)

         // this will chunk whether the chunk _I is empty or not. If it's not empty, it will be recorded.

         if (_Chunks[_Threads_num - 1][_I] - _Last)

         {

             ++_Count;

             _Index = _I;

         }

     }


     // If there is more than 1 chunk that has content, then continue the original algorithm

     if (_Count > 1)

     {

         // Move the elements in parallel into each chunk

         ::Concurrency::parallel_for(static_cast<size_t>(0), _Threads_num, [=](size_t _Index)

         {

             size_t _Beg_index, _End_index;


             // Calculate the segment position

             if (_Index < _Remain)

             {

                 _Beg_index = _Index * (_Step + 1);

                 _End_index = _Beg_index + (_Step + 1);

             }

             else

             {

                 _Beg_index = _Remain * (_Step + 1) + (_Index - _Remain) * _Step;

                 _End_index = _Beg_index + _Step;

             }


             // Do a move operation to directly put each value into its destination chunk

             // Chunk pointer is moved after each put operation.

             if (_Beg_index != _End_index--)

             {

                 while (_Beg_index != _End_index)

                 {

                     _Output[--_Chunks[_Index][_Radix_key(_Begin[_End_index], _Radix, _Proj_func)]] = std::move(_Begin[_End_index]);

                     --_End_index;

                 }

                 _Output[--_Chunks[_Index][_Radix_key(_Begin[_End_index], _Radix, _Proj_func)]] = std::move(_Begin[_End_index]);

             }

         });


         // Invoke _parallel_integer_radix_sort in parallel for each chunk

         ::Concurrency::parallel_for(static_cast<size_t>(0), static_cast<size_t>(256), [=](size_t _Index)

         {

             if (_Index < 256 - 1)

             {

                 _Parallel_integer_radix_sort(_Output + _Chunks[0][_Index], _Chunks[0][_Index + 1] - _Chunks[0][_Index],

                     _Begin + _Chunks[0][_Index], _Radix - 1, _Proj_func, _Chunk_size, _Deep + 1);

             }

             else

             {

                 _Parallel_integer_radix_sort(_Output + _Chunks[0][_Index], _Size - _Chunks[0][_Index],

                     _Begin + _Chunks[0][_Index], _Radix - 1, _Proj_func, _Chunk_size, _Deep + 1);

             }

         });

     }

     else

     {

         // Only one chunk has content

         // A special optimization is applied because one chunk means all numbers have a same value on this particular byte (digit).

         // Because we cannot sort them at all (they are all equal at this point), directly call _parallel_integer_radix_sort to

         // sort next byte (digit)

         _Parallel_integer_radix_sort(_Begin, _Size, _Output, _Radix - 1, _Proj_func, _Chunk_size, _Deep);

     }

 }


 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>

 void _Parallel_integer_sort_asc(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output,

     _Function _Proj_func, const size_t _Chunk_size)

 {

     typedef typename std::iterator_traits<_Random_iterator>::value_type _Value_type;

     // The key type of the radix sort, this must be an "unsigned integer-like" type, that is, it needs support:

     //     operator>> (int), operator>>= (int), operator& (int), operator <, operator size_t ()

     typedef typename std::remove_const<typename std::remove_reference<decltype(_Proj_func(*_Begin))>::type>::type _Integer_type;


     // Find out the max value, which will be used to determine the highest differing byte (the radix position)

     _Integer_type _Max_val = ::Concurrency::parallel_reduce(_Begin, _Begin + _Size, _Proj_func(*_Begin),

         [=](_Random_iterator _Begin, _Random_iterator _End, _Integer_type _Init) -> _Integer_type

         {

             while (_Begin != _End)

             {

                 _Integer_type _Ret = _Proj_func(*_Begin++);

                 if (_Init < _Ret)

                 {

                     _Init = _Ret;

                 }

             }


             return _Init;

         }, [](const _Integer_type &_A, const _Integer_type &_B) -> const _Integer_type& {return (_A < _B)? _B : _A;});

     size_t _Radix = 0;


     // Find out highest differing byte

     while (_Max_val >>= 8)

     {

         ++_Radix;

     }


     _Parallel_integer_radix_sort(_Begin, _Size, _Output, _Radix, _Proj_func, _Chunk_size);

 }


 template<typename _Random_iterator, typename _Function>

 void _Parallel_quicksort_impl(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, size_t _Div_num, const size_t _Chunk_size, int _Depth)

 {

     if (_Depth >= _SORT_MAX_RECURSION_DEPTH || _Size <= _Chunk_size || _Size <= static_cast<size_t>(3) || _Chunk_size >= _FINE_GRAIN_CHUNK_SIZE && _Div_num <= 1)

     {

         return std::sort(_Begin, _Begin + _Size, _Func);

     }


     // Determine whether we need to do a three-way quick sort

     // We benefit from three-way merge if there are a lot of elements that are EQUAL to the median value,

     // _Select_median_pivot function will test redundant density by sampling

     bool _Is_three_way_split = false;

     size_t _Mid_index = _Select_median_pivot(_Begin, _Size, _Func, _Chunk_size, _Is_three_way_split);


     // Move the median value to the _Begin position.

     if (_Mid_index)

     {

         std::swap(*_Begin, _Begin[_Mid_index]);

     }

     size_t _I = 1, _J = _Size - 1;


     // Three-way or two-way partition

     // _Div_num < _MAX_NUM_TASKS_PER_CORE is checked to make sure it will never do three-way split before splitting enough tasks

     if (_Is_three_way_split && _Div_num < _MAX_NUM_TASKS_PER_CORE)

     {

         while (_Func(*_Begin, _Begin[_J]))

         {

             --_J;

         }


         while (_Func(_Begin[_I], *_Begin))

         {

             ++_I;

         }


         // Starting from this point, left side of _I will less than median value, right side of _J will be greater than median value,

         // and the middle part will be equal to median. _K is used to scan between _I and _J

         size_t _K = _J;

         while (_I <= _K)

         {

             if (_Func(_Begin[_K], *_Begin))

             {

                 std::swap(_Begin[_I++], _Begin[_K]);

             }

             else

             {

                 --_K;

             }


             while (_Func(*_Begin, _Begin[_K]))

             {

                 std::swap(_Begin[_K--], _Begin[_J--]);

             }

         }


         ++_J;

     }

     else

     {

         while (_I <= _J)

         {

             // Will stop before _Begin

             while (_Func(*_Begin, _Begin[_J]))

             {

                 --_J;

             }


             // There must be another element equal or greater than *_Begin

             while (_Func(_Begin[_I], *_Begin))

             {

                 ++_I;

             }


             if (_I < _J)

             {

                 std::swap(_Begin[_I++], _Begin[_J--]);

             }

             else

             {

                 break;

             }

         }


         _I = ++_J;

     }


     std::swap(*_Begin, _Begin[--_I]);


     structured_task_group _Tg;

     volatile size_t _Next_div = _Div_num / 2;

     auto _Handle = make_task([&]

     {

         _Parallel_quicksort_impl(_Begin + _J, _Size - _J, _Func, _Next_div, _Chunk_size, _Depth+1);

     });

     _Tg.run(_Handle);


     _Parallel_quicksort_impl(_Begin, _I, _Func, _Next_div, _Chunk_size, _Depth+1);


     // If at this point, the work hasn't been scheduled, then slow down creating new tasks

     if (_Div_num < _MAX_NUM_TASKS_PER_CORE)

     {

         _Next_div /= 2;

     }


     _Tg.wait();

 }


 // This function will be called to sort the elements in the "_Begin" buffer. However, we can't tell whether the result will end up in buffer

 // "_Begin", or buffer "_Output" when it returned. The return value is designed to indicate which buffer holds the sorted result.

 // Return true if the merge result is in the "_Begin" buffer; return false if the result is in the "_Output" buffer.

 // We can't always put the result into one assigned buffer because that may cause frequent buffer copies at return time.

 template<typename _Random_iterator, typename _Random_buffer_iterator, typename _Function>

 inline bool _Parallel_buffered_sort_impl(const _Random_iterator &_Begin, size_t _Size, _Random_buffer_iterator _Output, const _Function &_Func,

     int _Div_num, const size_t _Chunk_size)

 {

     static_assert(std::is_same<typename std::iterator_traits<_Random_iterator>::value_type, typename std::iterator_traits<_Random_buffer_iterator>::value_type>::value,

         "same value type expected");


     if (_Div_num <= 1 || _Size <= _Chunk_size)

     {

         _Parallel_quicksort_impl(_Begin, _Size, _Func, _MAX_NUM_TASKS_PER_CORE, _Chunk_size, 0);


         // In case _Size <= _Chunk_size happened BEFORE the planned stop time (when _Div_num == 1) we need to calculate how many turns of

         // binary divisions are left. If there are an odd number of turns left, then the buffer move is necessary to make sure the final

         // merge result will be in the original input array.

         int _Left_div_turns = 0;

         while (_Div_num >>= 1)

         {

             _Left_div_turns++;

         }


         if (_Left_div_turns & 1)

         {

             std::move(_Begin, _Begin + _Size, _Output);

             return true;

         }

         else

         {

             return false;

         }

     }

     else

     {

         size_t _Mid = _Size / 2;

         structured_task_group _Tg;


         auto _Handle = make_task([&, _Chunk_size]

         {

             _Parallel_buffered_sort_impl(_Begin, _Mid, _Output, _Func, _Div_num / 2, _Chunk_size);

         });

         _Tg.run(_Handle);


         bool _Is_buffer_swap = _Parallel_buffered_sort_impl(_Begin + _Mid, _Size - _Mid, _Output + _Mid, _Func, _Div_num / 2, _Chunk_size);


         _Tg.wait();


         if (_Is_buffer_swap)

         {

             _Parallel_merge(_Output, _Mid, _Output + _Mid, _Size - _Mid, _Begin, _Func, _Div_num);

         }

         else

         {

             _Parallel_merge(_Begin, _Mid, _Begin + _Mid, _Size - _Mid, _Output, _Func, _Div_num);

         }


         return !_Is_buffer_swap;

     }

 }


 // Disable the warning saying constant value in condition expression.

 // This is by design that lets the compiler optimize the trivial constructor.

 #pragma warning (push)

 #pragma warning (disable: 4127)


 // Allocate and construct a buffer

 template<typename _Allocator>

 inline typename _Allocator::pointer _Construct_buffer(size_t _N, _Allocator &_Alloc)

 {

     typename _Allocator::pointer _P = _Alloc.allocate(_N);


     // If the objects being sorted have trivial default constructors, they do not need to be

     // constructed here. This can benefit performance.

     if (!std::is_trivially_default_constructible<typename _Allocator::value_type>::value)

     {

         for (size_t _I = 0; _I < _N; _I++)

         {

             // Objects being sorted must have a default constructor

             typename _Allocator::value_type _T;

             _Alloc.construct(_P + _I, std::forward<typename _Allocator::value_type>(_T));

         }

     }


     return _P;

 }


 // Destroy and deallocate a buffer

 template<typename _Allocator>

 inline void _Destroy_buffer(typename _Allocator::pointer _P, size_t _N, _Allocator &_Alloc)

 {

     // If the objects being sorted have trivial destructors, they do not need to be

     // destructed here. This can benefit performance.

     if (!std::is_trivially_destructible<typename _Allocator::value_type>::value)

     {

         for (size_t _I = 0; _I < _N; _I++)

         {

             _Alloc.destroy(_P + _I);

         }

     }


     _Alloc.deallocate(_P, _N);

 }


 //

 // Exception safe RAII wrapper for the allocated buffers

 //


 template<typename _Allocator>

 class _AllocatedBufferHolder

 {

 public:

     _AllocatedBufferHolder(size_t _Size, const _Allocator & _Alloc): _M_alloc(_Alloc)

     {

         _M_size = _Size;

         _M_buffer = _Construct_buffer(_Size, _M_alloc);

     }


     ~_AllocatedBufferHolder()

     {

         _Destroy_buffer(_M_buffer, _M_size, _M_alloc);

     }


     typename _Allocator::pointer _Get_buffer()

     {

         return _M_buffer;

     }


 private:

     size_t _M_size;

     _Allocator _M_alloc;

     typename _Allocator::pointer _M_buffer;

 };


 #pragma warning (pop)


 template<typename _Random_iterator, typename _Function>

 inline void parallel_sort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)

 {

     _CONCRT_ASSERT(_Chunk_size > 0);


     // Check for cancellation before the algorithm starts.

     interruption_point();


     size_t _Size = _End - _Begin;

     size_t _Core_num = ::Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors();


     if (_Size <= _Chunk_size || _Core_num < 2)

     {

         return std::sort(_Begin, _End, _Func);

     }


     _Parallel_quicksort_impl(_Begin, _Size, _Func, _Core_num * _MAX_NUM_TASKS_PER_CORE, _Chunk_size, 0);

 }


 template<typename _Random_iterator>

 inline void parallel_sort(const _Random_iterator &_Begin, const _Random_iterator &_End)

 {

     parallel_sort(_Begin, _End, std::less<typename std::iterator_traits<_Random_iterator>::value_type>());

 }


 template<typename _Allocator, typename _Random_iterator, typename _Function>

 inline void parallel_buffered_sort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)

 {

     _CONCRT_ASSERT(_Chunk_size > 0);


     // Check cancellation before the algorithm starts.

     interruption_point();


     size_t _Size = _End - _Begin;

     size_t _Core_num = ::Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors();


     if (_Size <= _Chunk_size || _Core_num < 2)

     {

         return std::sort(_Begin, _End, _Func);

     }

     const static size_t _CORE_NUM_MASK = 0x55555555;


     _AllocatedBufferHolder<_Allocator> _Holder(_Size, _Alloc);


     // Prevent cancellation from happening during the algorithm in case it leaves buffers in unknown state.

     run_with_cancellation_token([&]() {

         // This buffered sort algorithm will divide chunks and apply parallel quicksort on each chunk. In the end, it will

         // apply parallel merge to these sorted chunks.

         //

         // We need to decide on the number of chunks to divide the input buffer into. If we divide it into n chunks, log(n)

         // merges will be needed to get the final sorted result. In this algorithm, we have two buffers for each merge

         // operation, let's say buffer A and B. Buffer A is the original input array, buffer B is the additional allocated

         // buffer. Each turn's merge will put the merge result into the other buffer; for example, if we decided to split

         // into 8 chunks in buffer A at very beginning, after one pass of merging, there will be 4 chunks in buffer B.

         // If we apply one more pass of merging, there will be 2 chunks in buffer A again.

         //

         // The problem is we want to the final merge pass to put the result back in buffer A, so that we don't need

         // one extra copy to put the sorted data back to buffer A.

         // To make sure the final result is in buffer A (original input array), we need an even number of merge passes,

         // which means log(n) must be an even number. Thus n must be a number power(2, even number). For example, when the

         // even number is 2, n is power(2, 2) = 4, when even number is 4, n is power(2, 4) = 16. When we divide chunks

         // into these numbers, the final merge result will be in the original input array. Now we need to decide the chunk(split)

         // number based on this property and the number of cores.

         //

         // We want to get a chunk (split) number close to the core number (or a little more than the number of cores),

         // and it also needs to satisfy above property. For a 8 core machine, the best chunk number should be 16, because it's

         // the smallest number that satisfies the above property and is bigger than the core number (so that we can utilize all

         // cores, a little more than core number is OK, we need to split more tasks anyway).

         //

         // In this algorithm, we will make this alignment by bit operations (it's easy and clear). For a binary representation,

         // all the numbers that satisfy power(2, even number) will be 1, 100, 10000, 1000000, 100000000 ...

         // After OR-ing these numbers together, we will get a mask (... 0101 0101 0101) which is all possible combinations of

         // power(2, even number). We use _Core_num & _CORE_NUM_MASK | _Core_num << 1 & _CORE_NUM_MASK, a bit-wise operation to align

         // _Core_num's highest bit into a power(2, even number).

         //

         // It means if _Core_num = 8, the highest bit in binary is bin(1000) which is not power(2, even number). After this

         // bit-wise operation, it will align to bin(10000) = 16 which is power(2, even number). If the _Core_num = 16, after

         // alignment it still returns 16. The trick is to make sure the highest bit of _Core_num will align to the "1" bit of the

         // mask bin(... 0101 0101 0101) We don't care about the other bits on the aligned result except the highest bit, because they

         // will be ignored in the function.

         _Parallel_buffered_sort_impl(_Begin, _Size, stdext::make_unchecked_array_iterator(_Holder._Get_buffer()),

             _Func, _Core_num & _CORE_NUM_MASK | _Core_num << 1 & _CORE_NUM_MASK, _Chunk_size);

     }, cancellation_token::none());


 }


 template<typename _Allocator, typename _Random_iterator, typename _Function>

 inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)

 {

     _Allocator _Alloc;

     return parallel_buffered_sort<_Allocator, _Random_iterator, _Function>(_Alloc, _Begin, _End, _Func, _Chunk_size);

 }


 template<typename _Random_iterator, typename _Function>

 inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size = 2048)

 {

     parallel_buffered_sort<std::allocator<typename std::iterator_traits<_Random_iterator>::value_type>>(_Begin, _End, _Func, _Chunk_size);

 }


 template<typename _Random_iterator>

 inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End)

 {

     parallel_buffered_sort<std::allocator<typename std::iterator_traits<_Random_iterator>::value_type>>(_Begin, _End,

         std::less<typename std::iterator_traits<_Random_iterator>::value_type>());

 }


 template<typename _Allocator, typename _Random_iterator>

 inline void parallel_buffered_sort(const _Random_iterator &_Begin, const _Random_iterator &_End)

 {

     parallel_buffered_sort<_Allocator>(_Begin, _End,

         std::less<typename std::iterator_traits<_Random_iterator>::value_type>());

 }


 template<typename _Allocator, typename _Random_iterator>

 inline void parallel_buffered_sort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End)

 {

     parallel_buffered_sort<_Allocator>(_Alloc, _Begin, _End, std::less<typename std::iterator_traits<_Random_iterator>::value_type>());

 }


 #pragma warning(push)

 #pragma warning (disable: 4127)

 //

 // This is a default function used for parallel_radixsort which will return just the value.

 // It also performs compile-time checks to ensure that the data type is integral.

 //

 template <typename _DataType>

 struct _Radix_sort_default_function

 {

     size_t operator()(const _DataType& _Val) const

     {

         // An instance of the type predicate returns the value if the type _DataType is one of the integral types, otherwise it

         // statically asserts.

         // An integral type is one of: bool, char, unsigned char, signed char, wchar_t, short, unsigned short, int, unsigned int, long,

         // and unsigned long.

         // In addition, with compilers that provide them, an integral type can be one of long long, unsigned long long, __int64, and

         // unsigned __int64

         static_assert(std::is_integral<_DataType>::value,

             "Type should be integral to use default radix function. For more information on integral types, please refer to https://msdn.microsoft.com/en-us/library/bb983099.aspx.");

         static_assert((sizeof(_DataType) <= sizeof(size_t)), "Passed Type is bigger than size_t.");


         if (std::is_unsigned<_DataType>::value)

         {

             return _Val;

         }

         else

         {

             // The default function needs to take the signed integer-like representation and map it to an unsigned one. The

             // following code will take the midpoint of the unsigned representable range (SIZE_MAX/2)+1 and does an unsigned

             // add of the value. Thus, it maps a [-signed_min,+signed_max] range into a [0, unsigned_max] range.

             return (((SIZE_MAX/2) + 1) + static_cast<size_t>(_Val));

         }

     }

 };

 #pragma warning (pop)


 template<typename _Random_iterator>

 inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End)

 {

     typedef typename std::iterator_traits<_Random_iterator>::value_type _DataType;


     _Radix_sort_default_function<_DataType> _Proj_func;


     parallel_radixsort<std::allocator<_DataType>>(_Begin, _End, _Proj_func, 256 * 256);

 }


 template<typename _Allocator, typename _Random_iterator>

 inline void parallel_radixsort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End)

 {

     typedef typename std::iterator_traits<_Random_iterator>::value_type _DataType;


     _Radix_sort_default_function<_DataType> _Proj_func;


     parallel_radixsort<_Allocator>(_Alloc, _Begin, _End, _Proj_func);

 }


 template<typename _Allocator, typename _Random_iterator>

 inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End)

 {

     _Allocator _Alloc;

     return parallel_radixsort<_Allocator, _Random_iterator>(_Alloc, _Begin, _End);

 }


 template<typename _Allocator, typename _Random_iterator, typename _Function>

 inline void parallel_radixsort(const _Allocator& _Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Proj_func, const size_t _Chunk_size = 256 * 256)

 {

     _CONCRT_ASSERT(_Chunk_size > 0);


     // Check for cancellation before the algorithm starts.

     interruption_point();


     size_t _Size = _End - _Begin;


     // If _Size <= 1, no more sorting needs to be done.

     if (_Size <= 1)

     {

         return;

     }


     _AllocatedBufferHolder<_Allocator> _Holder(_Size, _Alloc);


     // Prevent cancellation from happening during the algorithm in case it leaves the buffers in unknown state.

     run_with_cancellation_token([&]() {

         _Parallel_integer_sort_asc(_Begin, _Size, stdext::make_unchecked_array_iterator(_Holder._Get_buffer()), _Proj_func, _Chunk_size);

     }, cancellation_token::none());

 }


 template<typename _Allocator, typename _Random_iterator, typename _Function>

 inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Proj_func, const size_t _Chunk_size = 256 * 256)

 {

     _Allocator _Alloc;

     return parallel_radixsort<_Allocator, _Random_iterator, _Function>(_Alloc, _Begin, _End, _Proj_func, _Chunk_size);

 }


 template<typename _Random_iterator, typename _Function>

 inline void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Proj_func, const size_t _Chunk_size = 256 * 256)

 {

     parallel_radixsort<std::allocator<typename std::iterator_traits<_Random_iterator>::value_type>>(

         _Begin, _End, _Proj_func, _Chunk_size);

 }


 #pragma pop_macro("_SORT_MAX_RECURSION_DEPTH")

 #pragma pop_macro("_MAX_NUM_TASKS_PER_CORE")

 #pragma pop_macro("_FINE_GRAIN_CHUNK_SIZE")

 }


 namespace concurrency = ::Concurrency;


 #pragma pop_macro("new")

 #pragma pack(pop)

Concurrency::_Parallel_for_impl
void _Parallel_for_impl(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function &_Func, _Partitioner &&_Part)
Definition: ppl.h:2473

Concurrency::_Order_combinable
Definition: ppl.h:3019

Concurrency::affinity_partitioner::_Get_num_chunks
_Type _Get_num_chunks(_Type)
Definition: ppl.h:1750

Concurrency::_Worker_proxy::_M_completion_count
volatile long _M_completion_count
Definition: ppl.h:2115

Concurrency::details::_Cancellation_beacon
A cancellation beacon is a flag which can be polled in an inlinable fashion using the is_signaled met...
Definition: concrt.h:5299

Concurrency::_Worker_proxy::_Wait_on_intrusive_steal
void _Wait_on_intrusive_steal()
Definition: ppl.h:2075

Concurrency::_Parallel_for_each_chunk
void _Parallel_for_each_chunk(_Forward_iterator &_First, const _Forward_iterator &_Last, const _Function &_Func, task_group &_Task_group)
Definition: ppl.h:2808

Concurrency::details::_Context
Definition: concrt.h:364

Concurrency::details::platform::GetCurrentThreadId
_CRTIMP2 long __cdecl GetCurrentThreadId()

Concurrency::details::_StructuredTaskCollection::_IsCanceling
_CONCRTIMP bool _IsCanceling()
Informs the caller whether or not the task collection is currently in the midst of cancellation...

Concurrency::_Parallel_quicksort_impl
void _Parallel_quicksort_impl(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, size_t _Div_num, const size_t _Chunk_size, int _Depth)
Definition: ppl.h:4993

Concurrency::_Iterator_helper< _Random_iterator, std::random_access_iterator_tag >::_Iterator_helper
_Iterator_helper()
Definition: ppl.h:3637

Concurrency::_Select_median_pivot
size_t _Select_median_pivot(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, const size_t _Chunk_size, bool &_Potentially_equal)
Definition: ppl.h:4600

Concurrency::combinable::_M_fnInitialize
std::function< _Ty()> _M_fnInitialize
Definition: ppl.h:4517

Concurrency::_Parallel_for_each_helper
Definition: ppl.h:2766

Concurrency::task_group::_M_task_collection
::Concurrency::details::_TaskCollection _M_task_collection
Definition: ppl.h:842

Concurrency::details::_MallocaListHolder
Definition: concrt.h:1100

NULL
#define NULL
Definition: vcruntime.h:236

Concurrency::details::_Context::_Yield
static _CONCRTIMP void __cdecl _Yield()

Concurrency::combinable::local
_Ty & local()
Returns a reference to the thread-private sub-computation.
Definition: ppl.h:4309

crtdefs.h

Concurrency::_Parallel_chunk_task_group_run
void _Parallel_chunk_task_group_run(structured_task_group &_Task_group, task_handle< _Worker_class > *_Chunk_helpers, const Partitioner &, _Index_type _I)
Definition: ppl.h:2345

Concurrency::_Parallel_chunk_helper_invoke::_Invoke
static void __cdecl _Invoke(const _Random_iterator &_First, _Index_type &_Index, const _Function &_Func)
Definition: ppl.h:1784

Concurrency::CONCRT_EVENT_END
An event type that marks the beginning of a start/end event pair.
Definition: concrt.h:5465

Concurrency::_Parallel_integer_sort_asc
void _Parallel_integer_sort_asc(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output, _Function _Proj_func, const size_t _Chunk_size)
Definition: ppl.h:4958

random_access_iterator_tag
Definition: xutility:529

Concurrency::_Reduce_functor_helper::_Sub_fun
const _Sub_function & _Sub_fun
Definition: ppl.h:3000

Concurrency::_Worker_proxy::_Send_range
bool _Send_range(_Range< _Index_type > *_Worker_range)
Definition: ppl.h:2003

Concurrency::combinable::_M_size
size_t _M_size
Definition: ppl.h:4516

Concurrency::_Parallel_localized_chunk_helper::_Base
_Parallel_fixed_chunk_helper< _Random_iterator, _Index_type, _Function, static_partitioner, _Is_iterator > _Base
Definition: ppl.h:2314

Concurrency::combinable
The combinable object is intended to provide thread-private copies of data, to perform lock-free t...
Definition: ppl.h:4180

Concurrency::details::_StructuredTaskCollection
Structured task collections represent groups of work which follow a strictly LIFO ordered paradigm qu...
Definition: concrt.h:4495

make_unchecked_array_iterator
unchecked_array_iterator< _Iterator > make_unchecked_array_iterator(_Iterator _Ptr)
Definition: iterator:725

Concurrency::_Range::_M_current
volatile _Index_type _M_current
Definition: ppl.h:1889

Concurrency::_Parallel_transform_unary_helper::_M_unary_op
const _Unary_operator& _M_unary_op
Definition: ppl.h:3760

Concurrency::combinable::~combinable
~combinable()
Destroys a combinable object.
Definition: ppl.h:4295

Concurrency::_Order_combinable::_M_number
size_t _M_number
Definition: ppl.h:3050

Concurrency::_Iterator_helper::_Populate
void _Populate(_Forward_iterator &_First, size_t _Length)
Definition: ppl.h:3608

Concurrency::_Parallel_for_each_helper::_M_len
unsigned int _M_len
Definition: ppl.h:2798

Concurrency::_Order_combinable::_M_root
_Bucket * _M_root
Definition: ppl.h:3051

_Count
unsigned int _Count
Definition: xcomplex:668

_TRACE_LEVEL_INFORMATION
#define _TRACE_LEVEL_INFORMATION
Definition: ppl.h:38

Concurrency::_Worker_proxy::_NotifyCancel
void _NotifyCancel()
Definition: ppl.h:2089

Concurrency::_Worker_proxy::_M_pParent_worker
_Worker_proxy * _M_pParent_worker
Definition: ppl.h:2109

Concurrency::details::_SpinWait
Implements busy wait with no backoff
Definition: concrt.h:578

Concurrency::details::_Chore::m_pFunction
TaskProc m_pFunction
Definition: concrt.h:4191

Concurrency::run_with_cancellation_token
void run_with_cancellation_token(const _Function &_Func, cancellation_token _Ct)
Executes a function object immediately and synchronously in the context of a given cancellation token...
Definition: ppl.h:865

_CONCRT_ASSERT
#define _CONCRT_ASSERT(x)
Definition: concrt.h:123

Concurrency::_Parallel_localized_chunk_helper::_M_chunk_location
location & _M_chunk_location
Definition: ppl.h:2337

Concurrency::_Trace_ppl_function
_CONCRTIMP void __cdecl _Trace_ppl_function(const GUID &_Guid, unsigned char _Level, ConcRT_EventType _Type)

Concurrency::_Parallel_chunk_helper::_Parallel_chunk_helper
_Parallel_chunk_helper(_Index_type, const _Random_iterator &_First, _Index_type _First_iteration, _Index_type _Last_iteration, const _Index_type &_Step, const _Function &_Func, const _Partitioner &, _Worker_proxy< _Index_type > *const _Parent_data=NULL)
Definition: ppl.h:2137

Concurrency::combinable::combinable
combinable(const combinable &_Copy)
Constructs a new combinable object.
Definition: ppl.h:4264

Concurrency::_Range::_Number_of_iterations
_Index_type _Number_of_iterations() const
Definition: ppl.h:1852

Concurrency::_Unary_transform_impl_helper
Definition: ppl.h:3512

Concurrency::structured_task_group::run_and_wait
task_group_status run_and_wait(const _Function &_Func)
Schedules a task to be run inline on the calling context with the assistance of the structured_task_g...
Definition: ppl.h:427

Concurrency::affinity_partitioner::_M_pChunk_locations
location * _M_pChunk_locations
Definition: ppl.h:1766

Concurrency::_Worker_proxy::~_Worker_proxy
~_Worker_proxy()
Definition: ppl.h:1905

Concurrency::_Integer_radix_pass
void _Integer_radix_pass(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output, size_t _Radix, _Function _Proj_func)
Definition: ppl.h:4725

Concurrency::_Parallel_chunk_helper
Definition: ppl.h:2134

Concurrency::make_task
task_handle< _Function > make_task(const _Function &_Func)
A factory method for creating a task_handle object.
Definition: ppl.h:165

Concurrency::parallel_reduce
std::iterator_traits< _Forward_iterator >::value_type parallel_reduce(_Forward_iterator _Begin, _Forward_iterator _End, const typename std::iterator_traits< _Forward_iterator >::value_type &_Identity)
Computes the sum of all elements in a specified range by computing successive partial sums...
Definition: ppl.h:3287

Concurrency::cancellation_token::none
static cancellation_token none()
Returns a cancellation token which can never be subject to cancellation.
Definition: pplcancellation_token.h:628

Concurrency::combinable::_FindLocalItem
_Node * _FindLocalItem(unsigned long _Key, size_t *_PIndex)
Definition: ppl.h:4480

Concurrency::PPLParallelForEventGuid
_CONCRTIMP const GUID PPLParallelForEventGuid
A category GUID describing ETW events fired by the Concurrency Runtime that are directly related to u...

Concurrency::details::_TaskCollection::_Cancel
_CONCRTIMP void _Cancel()
Cancels work on the task collection.

Concurrency::_Parallel_transform_binary_helper::operator()
void operator()() const
Definition: ppl.h:3693

Concurrency::_Iterator_helper< _Random_iterator, std::random_access_iterator_tag >::_Load
std::iterator_traits< _Random_iterator >::reference _Load(size_t _Index) const
Definition: ppl.h:3670

Concurrency::TaskProc
void(__cdecl * TaskProc)(void *)
Concurrency::details contains definitions of support routines in the public namespaces and one or mor...
Definition: concrt.h:251

Concurrency::is_current_task_group_canceling
_CONCRTIMP bool __cdecl is_current_task_group_canceling()
Returns an indication of whether the task group which is currently executing inline on the current co...

Concurrency::_Order_combinable::_Bucket::_Next
_Bucket * _Next
Definition: ppl.h:3027

Concurrency::_Parallel_transform_unary_helper::_Parallel_transform_unary_helper
_Parallel_transform_unary_helper(_Input_iterator &_First, _Input_iterator _Last, _Output_iterator &_Result, const _Unary_operator&_Unary_op)
Definition: ppl.h:3739

align
void * align(size_t _Bound, size_t _Size, void *&_Ptr, size_t &_Space) _NOEXCEPT
Definition: memory:1985

Concurrency::_Binary_transform_impl_helper
Definition: ppl.h:3544

Concurrency::_Parallel_localized_chunk_helper::operator()
void operator()() const
Definition: ppl.h:2325

Concurrency::PPLParallelInvokeEventGuid
_CONCRTIMP const GUID PPLParallelInvokeEventGuid
A category GUID describing ETW events fired by the Concurrency Runtime that are directly related to u...

Concurrency::details::_TaskCollection::_Schedule
_CONCRTIMP void _Schedule(_UnrealizedChore *_PChore, location *_PLocation)
Schedules a chore that can potentially run in parallel. The chore is pushed onto the associated works...

Concurrency::_Reduce_functor_helper::_Combinable
_Combinable_type & _Combinable
Definition: ppl.h:3003

Concurrency::_Parallel_merge
void _Parallel_merge(_Random_iterator _Begin1, size_t _Len1, _Random_buffer_iterator _Begin2, size_t _Len2, _Random_output_iterator _Output, const _Function &_Func, size_t _Div_num)
Definition: ppl.h:4690

_Val
_In_ int _Val
Definition: vcruntime_string.h:62

Concurrency::combinable::combine_each
void combine_each(_Function _FnCombine) const
Computes a final value from the set of thread-local sub-computations by calling the supplied combine ...
Definition: ppl.h:4446

Concurrency::_Parallel_chunk_helper::_Parallel_chunk_helper
_Parallel_chunk_helper(const _Random_iterator &_First, const _Index_type &_Step, const _Function &_Func, const _Range< _Index_type > &_Worker_range, _Worker_proxy< _Index_type > *const _Parent_data=NULL)
Definition: ppl.h:2146

Concurrency::details::_MallocaArrayHolder::_IncrementConstructedElemsCount
void _IncrementConstructedElemsCount()
Definition: concrt.h:1072

Concurrency::affinity_partitioner::_Get_chunk_location
location & _Get_chunk_location(unsigned int _ChunkIndex)
Definition: ppl.h:1744

Concurrency::_Parallel_reduce_fixed_worker::_M_bucket
_Functor::_Bucket_type *const _M_bucket
Definition: ppl.h:3332

Concurrency::_Order_combinable::_Bucket::_Bucket
_Bucket(_Bucket *_N)
Definition: ppl.h:3029

Concurrency::_Parallel_fixed_chunk_helper::operator()
void operator()() const
Definition: ppl.h:2287

std
STL namespace.

Concurrency::_Parallel_reduce_forward_executor_helper
Definition: ppl.h:3407

Concurrency::Alloc
_CONCRTIMP void *__cdecl Alloc(size_t _NumBytes)
Allocates a block of memory of the size specified from the Concurrency Runtime Caching Suballocator...

Concurrency::combinable::__declspec
__declspec(align(64)) struct _Node
Definition: ppl.h:4188

Concurrency::_Parallel_chunk_helper::_M_function
const _Function & _M_function
Definition: ppl.h:2266

Concurrency::parallel_transform
_Output_iterator parallel_transform(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Output_iterator _Result, const _Unary_operator&_Unary_op, const auto_partitioner &_Part=auto_partitioner())
Applies a specified function object to each element in a source range, or to a pair of elements from ...
Definition: ppl.h:3854

Concurrency::_Parallel_transform_binary_helper::_Parallel_transform_binary_helper
_Parallel_transform_binary_helper(_Input_iterator1 &_First1, _Input_iterator1 _Last1, _Input_iterator2 &_First2, _Output_iterator &_Result, const _Binary_operator&_Binary_op)
Definition: ppl.h:3684

Concurrency
The Concurrency namespace provides classes and functions that provide access to the Concurrency Runti...
Definition: agents.h:43

Concurrency::_Parallel_for_each_forward_impl
void _Parallel_for_each_forward_impl(_Forward_iterator &_First, const _Forward_iterator &_Last, const _Function &_Func, task_group &_Task_group)
Definition: ppl.h:2824

Concurrency::_Unary_transform_impl_helper::_Parallel_transform_unary_impl
static void _Parallel_transform_unary_impl(_Input_iterator _Begin, _Input_iterator _End, _Output_iterator &_Result, const _Unary_operator&_Unary_op, const auto_partitioner &)
Definition: ppl.h:3515

Concurrency::_Parallel_integer_radix_sort
void _Parallel_integer_radix_sort(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output, size_t _Radix, _Function _Proj_func, const size_t _Chunk_size, size_t _Deep=0)
Definition: ppl.h:4792

Concurrency::_Parallel_for_each_helper::_Value_type
std::iterator_traits< _Forward_iterator >::value_type _Value_type
Definition: ppl.h:2769

Concurrency::_Integer_radix_sort
void _Integer_radix_sort(const _Random_iterator &_Begin, size_t _Size, const _Random_buffer_iterator &_Output, size_t _Radix, _Function _Proj_func, size_t _Deep=0)
Definition: ppl.h:4755

Concurrency::_Worker_proxy::_Disable_intrusive_steal
void _Disable_intrusive_steal()
Definition: ppl.h:2032

Concurrency::_AllocatedBufferHolder
Definition: ppl.h:5209

Concurrency::task_handle::~task_handle
~task_handle()
Destroys the task_handle object.
Definition: ppl.h:110

Concurrency::_Iterator_helper< _Random_iterator, std::random_access_iterator_tag >::_M_first
_Random_iterator _M_first
Definition: ppl.h:3677

Concurrency::interruption_point
void interruption_point()
Creates an interruption point for cancellation. If a cancellation is in progress in the context where...
Definition: ppl.h:880

Concurrency::details::_StructuredTaskCollection::_Cancel
_CONCRTIMP void _Cancel()
Cancels work on the task collection.

Concurrency::_Range::__declspec
__declspec(property(get=_Get_current_iteration, put=_Set_current_iteration)) _Index_type _M_current_iteration

Concurrency::details::_MallocaArrayHolder::_InitOnRawMalloca
_ElemType * _InitOnRawMalloca(void *_MallocaRet)
Definition: concrt.h:1062

Concurrency::combinable::clear
void clear()
Clears any intermediate computational results from a previous usage.
Definition: ppl.h:4359

_Construct
void _Construct(_Ty1 *_Ptr, _Ty2 &&_Val)
Definition: xmemory0:138

pplwin.h

Concurrency::parallel_invoke
void parallel_invoke(const _Function1 &_Func1, const _Function2 &_Func2)
Executes the function objects supplied as parameters in parallel, and blocks until they have finished...
Definition: ppl.h:945

Concurrency::task_handle::operator=
task_handle const & operator=(task_handle const &)

Concurrency::_AllocatedBufferHolder::_M_alloc
_Allocator _M_alloc
Definition: ppl.h:5230

Concurrency::_Parallel_reduce_fixed_worker::_Parallel_reduce_fixed_worker
_Parallel_reduce_fixed_worker(_Forward_iterator _Begin, _Forward_iterator _End, const _Functor &_Fun)
Definition: ppl.h:3319

Concurrency::_Parallel_transform_binary_helper::_M_input_helper2
_Iterator_helper< _Input_iterator2, typename std::iterator_traits< _Input_iterator2 >::iterator_category > _M_input_helper2
Definition: ppl.h:3706

Concurrency::_Parallel_reduce_forward_executor_helper::_Workers
std::auto_ptr< task_handle< _Worker_class > > _Workers
Definition: ppl.h:3410

Concurrency::task_group::task_group
task_group()
Constructs a new task_group object.
Definition: ppl.h:504

Concurrency::PPLParallelForeachEventGuid
_CONCRTIMP const GUID PPLParallelForeachEventGuid
A category GUID describing ETW events fired by the Concurrency Runtime that are directly related to u...

_Move_no_deprecate
_OutIt _Move_no_deprecate(_InIt _First, _InIt _Last, _OutIt _Dest)
Definition: xutility:2623

Concurrency::_Parallel_chunk_helper::operator()
void operator()() const
Definition: ppl.h:2159

Concurrency::details::_UnrealizedChore::_CheckTaskCollection
_CONCRTIMP void _CheckTaskCollection()

Concurrency::_Range::_Set_last_iteration
void _Set_last_iteration(const _Index_type _I)
Definition: ppl.h:1878

Platform::Details::int
unsigned int
Definition: vccorlib.h:2468

Concurrency::details::_StructuredTaskCollection::_Wait
_TaskCollectionStatus _Wait()
Waits for all chores running in the _StructuredTaskCollection to finish (normally or abnormally)...
Definition: concrt.h:4604

Concurrency::details::_Context::_CurrentContext
static _CONCRTIMP _Context __cdecl _CurrentContext()

Concurrency::static_partitioner
The static_partitioner class represents a static partitioning of the range iterated over by parallel_...
Definition: ppl.h:1641

Concurrency::_Parallel_transform_binary_helper::_M_input_helper1
_Iterator_helper< _Input_iterator1, typename std::iterator_traits< _Input_iterator1 >::iterator_category > _M_input_helper1
Definition: ppl.h:3705

Concurrency::simple_partitioner
The simple_partitioner class represents a static partitioning of the range iterated over by parallel_...
Definition: ppl.h:1670

Concurrency::structured_task_group::structured_task_group
structured_task_group()
Constructs a new structured_task_group object.
Definition: ppl.h:219

Concurrency::_AllocatedBufferHolder::_M_size
size_t _M_size
Definition: ppl.h:5229

Concurrency::_Worker_proxy::_Set_tree_done
void _Set_tree_done()
Definition: ppl.h:2056

Concurrency::_Range::_Set_current_iteration
void _Set_current_iteration(const _Index_type _I)
Definition: ppl.h:1864

Concurrency::details::_TaskCollection::_Wait
_TaskCollectionStatus _Wait()
Waits for all chores running in the _TaskCollection to finish (normally or abnormally). This method encapsulates all the running tasks in an exception handling block, and will re-throw any exceptions that occur in any of it tasks (if those exceptions occur on another thread, they are marshaled from that thread to the thread where the _TaskCollection was created, and re-thrown). After this function returns, the _TaskCollection cannot be used for scheduling further work.
Definition: concrt.h:4809

Concurrency::_Search_mid_point
size_t _Search_mid_point(const _Random_iterator &_Begin1, size_t &_Len1, const _Random_buffer_iterator &_Begin2, size_t &_Len2, const _Function &_Func)
Definition: ppl.h:4617

Concurrency::_Range
Definition: ppl.h:1805

Concurrency::_Worker_proxy::_M_beacon
::Concurrency::details::_Cancellation_beacon _M_beacon
Definition: ppl.h:2112

Concurrency::_Parallel_buffered_sort_impl
bool _Parallel_buffered_sort_impl(const _Random_iterator &_Begin, size_t _Size, _Random_buffer_iterator _Output, const _Function &_Func, int _Div_num, const size_t _Chunk_size)
Definition: ppl.h:5104

Concurrency::_Range::_M_last
volatile _Index_type _M_last
Definition: ppl.h:1890

Concurrency::_Order_combinable::_Unsafe_push_back
_Bucket * _Unsafe_push_back()
Definition: ppl.h:3101

Concurrency::_Parallel_for_each_helper::_Parallel_for_each_helper
_Parallel_for_each_helper(_Forward_iterator &_First, const _Forward_iterator &_Last, const _Function &_Func)
Definition: ppl.h:2772

Concurrency::details::_SpinWait::_SpinOnce
bool _SpinOnce()
Spins for one time quantum,until a maximum spin is reached.
Definition: concrt.h:626

Concurrency::task_group::~task_group
~task_group()
Destroys a task_group object. You are expected to call the either the wait or run_and_wait method on ...
Definition: ppl.h:536

Concurrency::_Parallel_chunk_helper_invoke
Definition: ppl.h:1781

Concurrency::_Parallel_for_partitioned_impl
void _Parallel_for_partitioned_impl(_Index_type _First, _Diff_type _Range_arg, _Diff_type _Step, const _Function &_Func, const auto_partitioner &_Part)
Definition: ppl.h:2441

Concurrency::_Parallel_fixed_chunk_helper::_M_first_iteration
const _Index_type _M_first_iteration
Definition: ppl.h:2304

Concurrency::_Worker_proxy::_Worker_proxy
_Worker_proxy(_Worker_proxy *_PParent_worker=NULL)
Definition: ppl.h:1899

Concurrency::_Order_combinable::_Serial_combine_release
_Ty _Serial_combine_release()
Definition: ppl.h:3079

Concurrency::combinable::_InitCopy
void _InitCopy(const combinable &_Copy)
Definition: ppl.h:4465

Concurrency::_Range::_Range
_Range(_Index_type _Current_iteration, _Index_type _Last_iteration)
Definition: ppl.h:1810

Concurrency::task_group::run
void run(const _Function &_Func)
Schedules a task on the task_group object. If a task_handle object is passed as a parameter to run...
Definition: ppl.h:573

Concurrency::_Reduce_functor_helper
Definition: ppl.h:2998

Concurrency::structured_task_group::~structured_task_group
~structured_task_group()
Destroys a structured_task_group object. You are expected to call either the wait or run_and_wait met...
Definition: ppl.h:252

Concurrency::structured_task_group::run
void run(task_handle< _Function > &_Task_handle)
Schedules a task on the structured_task_group object. The caller manages the lifetime of the task_han...
Definition: ppl.h:285

Concurrency::_Parallel_chunk_impl
void _Parallel_chunk_impl(const _Random_iterator &_First, _Index_type _Range_arg, const _Index_type &_Step, const _Function &_Func, _Partitioner &&_Part)
Definition: ppl.h:2366

Concurrency::_Parallel_localized_chunk_helper
Definition: ppl.h:2311

Concurrency::structured_task_group::run_and_wait
task_group_status run_and_wait(task_handle< _Function > &_Task_handle)
Schedules a task to be run inline on the calling context with the assistance of the structured_task_g...
Definition: ppl.h:388

Concurrency::_Parallel_transform_binary_helper::_M_len
size_t _M_len
Definition: ppl.h:3709

Concurrency::_Order_combinable::_Bucket::_Put
void _Put(const _Ty &_Cur)
Definition: ppl.h:3042

Concurrency::_Worker_proxy::_Is_done
bool _Is_done()
Definition: ppl.h:2043

Concurrency::_Parallel_transform_unary_helper
Definition: ppl.h:3736

Concurrency::combinable::local
_Ty & local(bool &_Exists)
Returns a reference to the thread-private sub-computation.
Definition: ppl.h:4336

Concurrency::task_group::run
void run(task_handle< _Function > &_Task_handle, location &_Placement)
Schedules a task on the task_group object. If a task_handle object is passed as a parameter to run...
Definition: ppl.h:696

Concurrency::_Parallel_transform_unary_helper::operator()
void operator()() const
Definition: ppl.h:3746

Concurrency::_Unary_transform_impl_helper< std::random_access_iterator_tag, std::random_access_iterator_tag >::_Parallel_transform_unary_impl
static void _Parallel_transform_unary_impl(_Random_input_iterator _Begin, _Random_input_iterator _End, _Random_output_iterator &_Result, const _Unary_operator&_Unary_op, _Partitioner &&_Part)
Definition: ppl.h:3527

Concurrency::_Iterator_helper::value_type
std::iterator_traits< _Forward_iterator >::value_type value_type
Definition: ppl.h:3585

Concurrency::_Parallel_reduce_forward_executor_helper::_Worker_size
int _Worker_size
Definition: ppl.h:3411

size_t
unsigned int size_t
Definition: sourceannotations.h:19

Concurrency::parallel_reduce
_Reduce_type parallel_reduce(_Forward_iterator _Begin, _Forward_iterator _End, const _Reduce_type &_Identity, const _Range_reduce_fun &_Range_fun, const _Sym_reduce_fun &_Sym_fun)
Computes the sum of all elements in a specified range by computing successive partial sums...
Definition: ppl.h:3166

Concurrency::_Parallel_reduce_random_executor
void _Parallel_reduce_random_executor(_Random_iterator _Begin, _Random_iterator _End, const _Function &_Fun)
Definition: ppl.h:3368

Concurrency::_Order_combinable::_Bucket
Definition: ppl.h:3023

Concurrency::Free
_CONCRTIMP void __cdecl Free(_Pre_maybenull_ _Post_invalid_ void *_PAllocation)
Releases a block of memory previously allocated by the Alloc method to the Concurrency Runtime Cachin...

Concurrency::_Parallel_for_each_partitioned_impl
void _Parallel_for_each_partitioned_impl(const _Random_iterator &_First, _Index_type _Range_arg, _Index_type _Step, const _Function &_Func, const auto_partitioner &_Part)
Definition: ppl.h:2856

Concurrency::_Parallel_fixed_chunk_helper::_M_first
const _Random_iterator & _M_first
Definition: ppl.h:2300

Concurrency::parallel_buffered_sort
void parallel_buffered_sort(const _Allocator &_Alloc, const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size=2048)
Arranges the elements in a specified range into a nondescending order, or according to an ordering cr...
Definition: ppl.h:5361

Concurrency::auto_partitioner::~auto_partitioner
~auto_partitioner()
Destroys a auto_partitioner object.
Definition: ppl.h:1627

Concurrency::_Parallel_fixed_chunk_helper::_M_step
const _Index_type & _M_step
Definition: ppl.h:2301

Concurrency::_Order_combinable::_Construct
_Bucket * _Construct(_Bucket *_Par=0)
Definition: ppl.h:3055

Concurrency::details::_MallocaListHolder::_GetAllocationSize
size_t _GetAllocationSize() const
Definition: concrt.h:1104

Concurrency::auto_partitioner
The auto_partitioner class represents the default method parallel_for, parallel_for_each and parallel...
Definition: ppl.h:1614

Concurrency::details::_StructuredTaskCollection::_RunAndWait
_CONCRTIMP _TaskCollectionStatus __stdcall _RunAndWait(_UnrealizedChore *_PChore=NULL)
A cancellation friendly wrapper with which to execute _PChore and then waits for all chores running i...

Concurrency::_Parallel_transform_binary_helper::_M_binary_op
const _Binary_operator& _M_binary_op
Definition: ppl.h:3708

Concurrency::parallel_sort
void parallel_sort(const _Random_iterator &_Begin, const _Random_iterator &_End, const _Function &_Func, const size_t _Chunk_size=2048)
Arranges the elements in a specified range into a nondescending order, or according to an ordering cr...
Definition: ppl.h:5271

Concurrency::affinity_partitioner::affinity_partitioner
affinity_partitioner()
Constructs an affinity_partitioner object.
Definition: ppl.h:1731

Concurrency::parallel_for_each
void parallel_for_each(const extent< _Rank > &_Compute_domain, const _Kernel_type &_Kernel)
Invokes a parallel computation of a kernel function over a compute domain on an accelerator_view. The accelerator_view is determined from the arrays and/or array_views captured by the kernel function, or if no accelerator_view can be derived, the default is chosen.
Definition: amp.h:7020

Concurrency::parallel_for
void parallel_for(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function &_Func, _Partitioner &&_Part)
parallel_for iterates over a range of indices and executes a user-supplied function at each iteration...
Definition: ppl.h:2555

Concurrency::_AllocatedBufferHolder::~_AllocatedBufferHolder
~_AllocatedBufferHolder()
Definition: ppl.h:5218

Concurrency::_Worker_proxy::_Enable_intrusive_steal
void _Enable_intrusive_steal(_Range< _Index_type > *_Worker_range)
Definition: ppl.h:2026

swap
void swap(array< _Ty, _Size > &_Left, array< _Ty, _Size > &_Right) _NOEXCEPT_OP(_NOEXCEPT_OP(_Left.swap(_Right)))
Definition: array:433

Concurrency::auto_partitioner::auto_partitioner
auto_partitioner()
Constructs a auto_partitioner object.
Definition: ppl.h:1621

_FINE_GRAIN_CHUNK_SIZE
#define _FINE_GRAIN_CHUNK_SIZE
Definition: ppl.h:4547

Concurrency::_Iterator_helper< _Random_iterator, std::random_access_iterator_tag >::_Store
void _Store(const value_type &_Elem, size_t _Index) const
Definition: ppl.h:3665

_SORT_MAX_RECURSION_DEPTH
#define _SORT_MAX_RECURSION_DEPTH
Definition: ppl.h:4550

Concurrency::CONCRT_EVENT_START
An event type that marks the beginning of a start/end event pair.
Definition: concrt.h:5460

Concurrency::_Worker_proxy::_Propagate_cancel
void _Propagate_cancel()
Definition: ppl.h:2094

Concurrency::_Parallel_for_each_impl
void _Parallel_for_each_impl(_Forward_iterator _First, const _Forward_iterator &_Last, const _Function &_Func, const auto_partitioner &, std::forward_iterator_tag)
Definition: ppl.h:2841

Concurrency::combinable::_M_buckets
_Node *volatile * _M_buckets
Definition: ppl.h:4515

Concurrency::_Parallel_transform_binary_helper::_M_output_helper
_Iterator_helper< _Output_iterator, typename std::iterator_traits< _Output_iterator >::iterator_category > _M_output_helper
Definition: ppl.h:3707

Concurrency::structured_task_group::structured_task_group
structured_task_group(cancellation_token _CancellationToken)
Constructs a new structured_task_group object.
Definition: ppl.h:236

Concurrency::task_group::run_and_wait
task_group_status run_and_wait(const _Function &_Func)
Schedules a task to be run inline on the calling context with the assistance of the task_group object...
Definition: ppl.h:796

Concurrency::details::_TaskCollection::_IsCanceling
_CONCRTIMP bool _IsCanceling()
Informs the caller whether or not the task collection is currently in the midst of a cancellation...

Concurrency::_Parallel_localized_chunk_helper::_M_fixed_helper
_Base _M_fixed_helper
Definition: ppl.h:2338

Concurrency::details::_UnrealizedChore::_GetRuntimeOwnsLifetime
bool _GetRuntimeOwnsLifetime() const
Definition: concrt.h:4232

Concurrency::_Parallel_reduce_forward_executor_helper::_Parallel_reduce_forward_executor_helper
_Parallel_reduce_forward_executor_helper(const _Parallel_reduce_forward_executor_helper &_Other)
Definition: ppl.h:3433

Concurrency::_AllocatedBufferHolder::_M_buffer
_Allocator::pointer _M_buffer
Definition: ppl.h:5231

Concurrency::_Median_of_nine
size_t _Median_of_nine(const _Random_iterator &_Begin, size_t _Size, const _Function &_Func, bool &_Potentially_equal)
Definition: ppl.h:4582

Concurrency::combinable::_DefaultInit
static _Ty _DefaultInit()
Definition: ppl.h:4202

Concurrency::parallel_radixsort
void parallel_radixsort(const _Random_iterator &_Begin, const _Random_iterator &_End)
Arranges elements in a specified range into an non descending order using a radix sorting algorithm...
Definition: ppl.h:5680

Concurrency::details::_StructuredTaskCollection::_Schedule
_CONCRTIMP void _Schedule(_UnrealizedChore *_PChore, location *_PLocation)
Schedules a chore that can potentially run in parallel. The chore is pushed onto the associated works...

false
#define false
Definition: stdbool.h:16

Concurrency::_Construct_buffer
_Allocator::pointer _Construct_buffer(size_t _N, _Allocator &_Alloc)
Definition: ppl.h:5168

Concurrency::details::_UnrealizedChore
Definition: concrt.h:4197

Concurrency::details::_UnrealizedChore::_OwningCollection
::Concurrency::details::_TaskCollectionBase * _OwningCollection() const
Definition: concrt.h:4218

Concurrency::_Parallel_chunk_helper::_M_last_iteration
const _Index_type _M_last_iteration
Definition: ppl.h:2269

Concurrency::_Parallel_fixed_chunk_helper
Definition: ppl.h:2277

Concurrency::simple_partitioner::_Size_type
unsigned long long _Size_type
Definition: ppl.h:1673

Concurrency::structured_task_group
The structured_task_group class represents a highly structured collection of parallel work...
Definition: ppl.h:205

Concurrency::combinable::operator=
combinable & operator=(const combinable &_Copy)
Assigns to a combinable object from another combinable object.
Definition: ppl.h:4280

_InterlockedCompareExchangePointer
void * _InterlockedCompareExchangePointer(void *volatile *, void *, void *)

Concurrency::_Parallel_reduce_fixed_worker::_M_fun
const _Functor & _M_fun
Definition: ppl.h:3330

Concurrency::combinable::combinable
combinable(_Function _FnInitialize)
Constructs a new combinable object.
Definition: ppl.h:4244

Concurrency::_Reduce_functor_helper::_Identity_value
const _Reduce_type & _Identity_value
Definition: ppl.h:3001

Concurrency::structured_task_group::wait
task_group_status wait()
Waits until all work on the structured_task_group has completed or is canceled.
Definition: ppl.h:349

Concurrency::details::_UnrealizedChore::_SetRuntimeOwnsLifetime
void _SetRuntimeOwnsLifetime(bool _FValue)
Definition: concrt.h:4225

Concurrency::_Worker_proxy::_M_pWorker_range
_Range< _Index_type > *volatile _M_pWorker_range
Definition: ppl.h:2118

Concurrency::_Parallel_invoke_impl
void _Parallel_invoke_impl(const _Function1 &_Func1, const _Function2 &_Func2)
Definition: ppl.h:908

Concurrency::_Parallel_localized_chunk_helper::_Parallel_localized_chunk_helper
_Parallel_localized_chunk_helper(_Index_type _Chunk_index, const _Random_iterator &_First, _Index_type _First_iteration, _Index_type _Last_iteration, const _Index_type &_Step, const _Function &_Func, affinity_partitioner &_Part)
Definition: ppl.h:2316

Concurrency::details::_TaskCollection
Task collections represent groups of work which step outside the strict structuring of the _Structure...
Definition: concrt.h:4702

Concurrency::task_handle::_M_function
_Function _M_function
Definition: ppl.h:137

Concurrency::_Iterator_helper
Definition: ppl.h:3581

Concurrency::simple_partitioner::~simple_partitioner
~simple_partitioner()
Destroys a simple_partitioner object.
Definition: ppl.h:1695

Concurrency::simple_partitioner::_M_chunk_size
_Size_type _M_chunk_size
Definition: ppl.h:1713

Concurrency::_Radix_key
size_t _Radix_key(const _Ty &_Val, size_t _Radix, _Function _Proj_func)
Definition: ppl.h:4718

Concurrency::task_group
The task_group class represents a collection of parallel work which can be waited on or canceled...
Definition: ppl.h:490

Concurrency::details::_TaskCollection::_RunAndWait
_CONCRTIMP _TaskCollectionStatus __stdcall _RunAndWait(_UnrealizedChore *_PChore=NULL)
A cancellation friendly wrapper with which to execute _PChore and then waits for all chores running i...

Concurrency::location
An abstraction of a physical location on hardware.
Definition: concrt.h:1825

Concurrency::_Parallel_reduce_fixed_worker::_M_end
const _Forward_iterator _M_end
Definition: ppl.h:3331

Concurrency::task_group::run
void run(const _Function &_Func, location &_Placement)
Schedules a task on the task_group object. If a task_handle object is passed as a parameter to run...
Definition: ppl.h:614

Concurrency::_Order_combinable::_Bucket::_Insert
void _Insert(_Bucket *_Item)
Definition: ppl.h:3034

Concurrency::location::current
static _CONCRTIMP location __cdecl current()
Returns a location object representing the most specific place the calling thread is executing...

Concurrency::task_group::task_group
task_group(cancellation_token _CancellationToken)
Constructs a new task_group object.
Definition: ppl.h:520

Concurrency::structured_task_group::is_canceling
bool is_canceling()
Informs the caller whether or not the task group is currently in the midst of a cancellation. This does not necessarily indicate that the cancel method was called on the structured_task_group object (although such certainly qualifies this method to return true). It may be the case that the structured_task_group object is executing inline and a task group further up in the work tree was canceled. In cases such as these where the runtime can determine ahead of time that cancellation will flow through this structured_task_group object, true will be returned as well.
Definition: ppl.h:463

Concurrency::_Iterator_helper< _Random_iterator, std::random_access_iterator_tag >::_Populate
size_t _Populate(_Random_iterator &_First, _Random_iterator _Last)
Definition: ppl.h:3641

Concurrency::affinity_partitioner::_M_num_chunks
unsigned int _M_num_chunks
Definition: ppl.h:1763

Concurrency::task_handle::operator()
void operator()() const
The function call operator that the runtime invokes to perform the work of the task handle...
Definition: ppl.h:126

Concurrency::_Iterator_helper::_Store
void _Store(const value_type &_Elem, size_t _Index) const
Definition: ppl.h:3616

Concurrency::_Range::_Send_range
void _Send_range(_Range< _Index_type > *_Helper_range)
Definition: ppl.h:1818

Concurrency::auto_partitioner::_Get_num_chunks
_Type _Get_num_chunks(_Type) const
Definition: ppl.h:1630

Concurrency::_Iterator_helper::_Load
std::iterator_traits< _Forward_iterator >::reference _Load(size_t _Index) const
Definition: ppl.h:3621

_MAX_NUM_TASKS_PER_CORE
#define _MAX_NUM_TASKS_PER_CORE
Definition: ppl.h:4541

Concurrency::task_group::cancel
void cancel()
Makes a best effort attempt to cancel the sub-tree of work rooted at this task group. Every task scheduled on the task group will get canceled transitively if possible.
Definition: ppl.h:812

Concurrency::details::_MallocaListHolder::_AddRawMallocaNode
_ElemType * _AddRawMallocaNode(void *_MallocaRet)
Definition: concrt.h:1124

Concurrency::_Worker_proxy::_Set_done
void _Set_done()
Definition: ppl.h:2048

Concurrency::_Worker_proxy::_Receive_range
bool _Receive_range(_Range< _Index_type > *_Helper_range)
Definition: ppl.h:1919

Concurrency::_Worker_proxy::_Verify_beacon_cancellation
bool _Verify_beacon_cancellation()
Definition: ppl.h:2067

Concurrency::_Parallel_reduce_forward_executor_helper::~_Parallel_reduce_forward_executor_helper
~_Parallel_reduce_forward_executor_helper()
Definition: ppl.h:3448

type_traits

Concurrency::_Range::_Steal_range
void _Steal_range(_Range< _Index_type > *_Helper_range)
Definition: ppl.h:1838

Concurrency::task_group_status
task_group_status
Describes the execution status of a task_group or structured_task_group object. A value of this type ...
Definition: pplinterface.h:102

Concurrency::_Destroy_buffer
void _Destroy_buffer(typename _Allocator::pointer _P, size_t _N, _Allocator &_Alloc)
Definition: ppl.h:5189

Concurrency::_Reduce_functor_helper::_Reduce_type
_Reduce_type _Reduce_type
Definition: ppl.h:3005

Concurrency::_Median_of_three
size_t _Median_of_three(const _Random_iterator &_Begin, size_t _A, size_t _B, size_t _C, const _Function &_Func, bool &_Potentially_equal)
Definition: ppl.h:4553

Concurrency::_Parallel_transform_unary_helper::_M_input_helper
_Iterator_helper< _Input_iterator, typename std::iterator_traits< _Input_iterator >::iterator_category > _M_input_helper
Definition: ppl.h:3758

Concurrency::_AllocatedBufferHolder::_Get_buffer
_Allocator::pointer _Get_buffer()
Definition: ppl.h:5223

Concurrency::_Parallel_reduce_forward_executor_helper::_Parallel_reduce_forward_executor_helper
_Parallel_reduce_forward_executor_helper(_Forward_iterator &_First, _Forward_iterator _Last, const _Function &_Func)
Definition: ppl.h:3413

Concurrency::_Parallel_transform_binary_helper
Definition: ppl.h:3681

stdexcept

Concurrency::task_group::run
void run(task_handle< _Function > &_Task_handle)
Schedules a task on the task_group object. If a task_handle object is passed as a parameter to run...
Definition: ppl.h:653

Concurrency::_Parallel_for_each_helper::operator()
void operator()() const
Definition: ppl.h:2783

Concurrency::task_group::run_and_wait
task_group_status run_and_wait(task_handle< _Function > &_Task_handle)
Schedules a task to be run inline on the calling context with the assistance of the task_group object...
Definition: ppl.h:757

Concurrency::simple_partitioner::simple_partitioner
simple_partitioner(_Size_type _Chunk_size)
Constructs a simple_partitioner object.
Definition: ppl.h:1683

Concurrency::_Order_combinable::_Order_combinable
_Order_combinable(const _Sym_fun &_Fun)
Definition: ppl.h:3061

Concurrency::static_partitioner::_Get_num_chunks
_Type _Get_num_chunks(_Type) const
Definition: ppl.h:1659

Concurrency::_Worker_proxy::_Is_helper_registered
bool _Is_helper_registered()
Definition: ppl.h:2038

_InterlockedDecrement
long __cdecl _InterlockedDecrement(long volatile *)

Concurrency::_Order_combinable::_M_fun
const _Sym_fun & _M_fun
Definition: ppl.h:3049

Concurrency::static_partitioner::static_partitioner
static_partitioner()
Constructs a static_partitioner object.
Definition: ppl.h:1648

_CONCRTIMP
#define _CONCRTIMP
Definition: crtdefs.h:48

Concurrency::details::_GetCombinableSize
_CONCRTIMP size_t __cdecl _GetCombinableSize()

Concurrency::_Parallel_transform_unary_helper::_M_output_helper
_Iterator_helper< _Output_iterator, typename std::iterator_traits< _Output_iterator >::iterator_category > _M_output_helper
Definition: ppl.h:3759

move
constexpr remove_reference< _Ty >::type && move(_Ty &&_Arg) _NOEXCEPT
Definition: type_traits:1290

Concurrency::_Parallel_reduce_fixed_worker
Definition: ppl.h:3315

Concurrency::_Parallel_fixed_chunk_helper::_M_last_iteration
const _Index_type _M_last_iteration
Definition: ppl.h:2305

Concurrency::_Parallel_chunk_helper::_M_parent_worker
_Worker_proxy< _Index_type > *const _M_parent_worker
Definition: ppl.h:2271

Concurrency::details::_CurrentScheduler::_GetNumberOfVirtualProcessors
static _CONCRTIMP unsigned int __cdecl _GetNumberOfVirtualProcessors()

concrt.h

Concurrency::_Radix_sort_default_function
Definition: ppl.h:5625

Concurrency::_Merge_chunks
void _Merge_chunks(_Random_iterator _Begin1, const _Random_iterator &_End1, _Random_buffer_iterator _Begin2, const _Random_buffer_iterator &_End2, _Random_output_iterator _Output, const _Function &_Func)
Definition: ppl.h:4662

Concurrency::_Worker_proxy::_M_context
::Concurrency::details::_Context _M_context
Definition: ppl.h:2113

Concurrency::_Iterator_helper< _Random_iterator, std::random_access_iterator_tag >::value_type
std::iterator_traits< _Random_iterator >::value_type value_type
Definition: ppl.h:3635

Concurrency::_Iterator_helper< _Random_iterator, std::random_access_iterator_tag >::_Populate
void _Populate(_Random_iterator &_First, size_t _Length)
Definition: ppl.h:3659

Concurrency::affinity_partitioner
The affinity_partitioner class is similar to the static_partitioner class, but it improves cache affi...
Definition: ppl.h:1723

Concurrency::_Parallel_for_impl
void _Parallel_for_impl(_Index_type _First, _Index_type _Last, _Index_type _Step, const _Function &_Func)
Definition: ppl.h:2509

Concurrency::_Order_combinable::_Bucket::_Value
char _Value[(sizeof(_Ty)/sizeof(char))]
Definition: ppl.h:3026

Concurrency::_AllocatedBufferHolder::_AllocatedBufferHolder
_AllocatedBufferHolder(size_t _Size, const _Allocator &_Alloc)
Definition: ppl.h:5212

Concurrency::_Parallel_chunk_helper::_M_first_iteration
const _Index_type _M_first_iteration
Definition: ppl.h:2268

Concurrency::combinable::combine
_Ty combine(_Function _FnCombine) const
Computes a final value from the set of thread-local sub-computations by calling the supplied combine ...
Definition: ppl.h:4390

Concurrency::_Binary_transform_impl_helper< std::random_access_iterator_tag, std::random_access_iterator_tag, std::random_access_iterator_tag >::_Parallel_transform_binary_impl
static void _Parallel_transform_binary_impl(_Random_input_iterator1 _Begin1, _Random_input_iterator1 _End1, _Random_input_iterator2 _Begin2, _Random_output_iterator &_Result, const _Binary_operator&_Binary_op, _Partitioner &&_Part)
Definition: ppl.h:3561

Concurrency::structured_task_group::cancel
void cancel()
Makes a best effort attempt to cancel the sub-tree of work rooted at this task group. Every task scheduled on the task group will get canceled transitively if possible.
Definition: ppl.h:444

Concurrency::_Worker_proxy
Definition: ppl.h:1896

Concurrency::static_partitioner::~static_partitioner
~static_partitioner()
Destroys a static_partitioner object.
Definition: ppl.h:1656

Concurrency::_Parallel_chunk_helper_invoke< _Random_iterator, _Index_type, _Function, false >::_Invoke
static void __cdecl _Invoke(const _Random_iterator &_First, _Index_type &_Index, const _Function &_Func)
Definition: ppl.h:1796

Concurrency::simple_partitioner::_Get_num_chunks
_Type _Get_num_chunks(_Type _Range_arg) const
Definition: ppl.h:1698

Concurrency::details::_MallocaArrayHolder
Definition: concrt.h:1044

Concurrency::combinable::_InitNew
void _InitNew()
Definition: ppl.h:4458

_InterlockedIncrement
long __cdecl _InterlockedIncrement(long volatile *)

Concurrency::_Parallel_fixed_chunk_helper::_Parallel_fixed_chunk_helper
_Parallel_fixed_chunk_helper(_Index_type, const _Random_iterator &_First, _Index_type _First_iteration, _Index_type _Last_iteration, const _Index_type &_Step, const _Function &_Func, const _Partitioner &)
Definition: ppl.h:2280

Concurrency::_Parallel_transform_binary_impl2
void _Parallel_transform_binary_impl2(_Input_iterator1 _First1, _Input_iterator1 _Last1, _Input_iterator2 _First2, _Output_iterator &_Result, const _Binary_operator&_Binary_op, task_group &_Tg)
Definition: ppl.h:3715

Concurrency::_Worker_proxy::_Is_beacon_signaled
bool _Is_beacon_signaled()
Definition: ppl.h:2062

Concurrency::_Iterator_helper::_Populate
size_t _Populate(_Forward_iterator &_First, _Forward_iterator _Last)
Definition: ppl.h:3594

Concurrency::_Iterator_helper::_Iterator_helper
_Iterator_helper()
Definition: ppl.h:3587

Concurrency::_Reduce_functor_helper::_Bucket_type
_Combinable_type::_Bucket _Bucket_type
Definition: ppl.h:3006

Concurrency::_Parallel_reduce_forward_executor_helper::_Worker_class
_Parallel_reduce_fixed_worker< _Forward_iterator, _Function > _Worker_class
Definition: ppl.h:3409

Concurrency::_Parallel_fixed_chunk_helper::_M_function
const _Function & _M_function
Definition: ppl.h:2302

SIZE_MAX
#define SIZE_MAX
Definition: limits.h:76

Concurrency::_Parallel_transform_unary_helper::_M_len
size_t _M_len
Definition: ppl.h:3761

Concurrency::_Radix_sort_default_function::operator()
size_t operator()(const _DataType &_Val) const
Definition: ppl.h:5627

Concurrency::task_handle
The task_handle class represents an individual parallel work item. It encapsulates the instructions a...
Definition: ppl.h:85

Concurrency::_Range::_Get_current_iteration
_Index_type _Get_current_iteration() const
Definition: ppl.h:1858

Concurrency::_Reduce_functor_helper::_Reduce_functor_helper
_Reduce_functor_helper(const _Reduce_type &_Identity, const _Sub_function &_Sub_fun, _Combinable_type &&_Comb)
Definition: ppl.h:3008

_Identity
Definition: xtr1common:326

_Value
_In_ int _Value
Definition: setjmp.h:173

Concurrency::_Parallel_for_each_helper::_M_function
const _Function & _M_function
Definition: ppl.h:2796

Concurrency::affinity_partitioner::~affinity_partitioner
~affinity_partitioner()
Destroys an affinity_partitioner object.
Definition: ppl.h:1739

cliext::_Last
_FwdIt _Last
Definition: algorithm:1936

Concurrency::structured_task_group::run
void run(task_handle< _Function > &_Task_handle, location &_Placement)
Schedules a task on the structured_task_group object. The caller manages the lifetime of the task_han...
Definition: ppl.h:323

Concurrency::combinable::combinable
combinable()
Constructs a new combinable object.
Definition: ppl.h:4219

Concurrency::_Parallel_reduce_impl
_Function::_Reduce_type _Parallel_reduce_impl(_Forward_iterator _First, const _Forward_iterator &_Last, const _Function &_Func, std::forward_iterator_tag)
Definition: ppl.h:3295

Concurrency::_Worker_proxy::_M_pHelper_range
_Range< _Index_type > *volatile _M_pHelper_range
Definition: ppl.h:2106

Concurrency::_Range::_Get_last_iteration
_Index_type _Get_last_iteration() const
Definition: ppl.h:1872

_Size
_Size
Definition: vcruntime_string.h:36

Concurrency::_Parallel_chunk_helper::_M_first
const _Random_iterator & _M_first
Definition: ppl.h:2264

Concurrency::_Parallel_chunk_helper::_M_step
const _Index_type & _M_step
Definition: ppl.h:2265

sort
void sort(_RanIt _First, _RanIt _Last, _Pr _Pred)
Definition: algorithm:2781

Concurrency::structured_task_group::_M_task_collection
::Concurrency::details::_StructuredTaskCollection _M_task_collection
Definition: ppl.h:474

Concurrency::_Order_combinable::~_Order_combinable
~_Order_combinable()
Definition: ppl.h:3067

Concurrency::_Parallel_transform_unary_impl2
void _Parallel_transform_unary_impl2(_Input_iterator _First, _Input_iterator _Last, _Output_iterator &_Result, const _Unary_operator&_Unary_op, task_group &_Tg)
Definition: ppl.h:3767

Concurrency::parallel_for
void parallel_for(_Index_type _First, _Index_type _Last, const _Function &_Func, affinity_partitioner &_Part)
parallel_for iterates over a range of indices and executes a user-supplied function at each iteration...
Definition: ppl.h:2747

Concurrency::combinable::_AddLocalItem
_Node * _AddLocalItem(unsigned long _Key, size_t _Index)
Definition: ppl.h:4501

Concurrency::_Parallel_reduce_forward_executor
void _Parallel_reduce_forward_executor(_Forward_iterator _First, _Forward_iterator _Last, const _Function &_Func, task_group &_Task_group)
Definition: ppl.h:3462

Concurrency::_Worker_proxy::_M_stop_iterating
volatile long _M_stop_iterating
Definition: ppl.h:2119

Concurrency::task_group::is_canceling
bool is_canceling()
Informs the caller whether or not the task group is currently in the midst of a cancellation. This does not necessarily indicate that the cancel method was called on the task_group object (although such certainly qualifies this method to return true). It may be the case that the task_group object is executing inline and a task group further up in the work tree was canceled. In cases such as these where the runtime can determine ahead of time that cancellation will flow through this task_group object, true will be returned as well.
Definition: ppl.h:831

Concurrency::_Parallel_transform_unary_impl
_Output_iterator _Parallel_transform_unary_impl(_Input_iterator _First, _Input_iterator _Last, _Output_iterator _Result, const _Unary_operator&_Unary_op, _Partitioner &&_Part)
Definition: ppl.h:3788

Concurrency::cancellation_token
The cancellation_token class represents the ability to determine whether some operation has been requ...
Definition: pplcancellation_token.h:616

Concurrency::details::_UnrealizedChore::_InternalAlloc
static _ChoreType * _InternalAlloc(const _Function &_Func)
Definition: concrt.h:4239

Concurrency::task_group::wait
task_group_status wait()
Waits until all work on the task_group object has either completed or been canceled.
Definition: ppl.h:719

Concurrency::task_handle::task_handle
task_handle(const _Function &_Func)
Constructs a new task_handle object. The work of the task is performed by invoking the function speci...
Definition: ppl.h:101

Concurrency::_Binary_transform_impl_helper::_Parallel_transform_binary_impl
static void _Parallel_transform_binary_impl(_Input_iterator1 _Begin1, _Input_iterator1 _End1, _Input_iterator2 _Begin2, _Output_iterator &_Result, const _Binary_operator&_Binary_op, const auto_partitioner &)
Definition: ppl.h:3548