// Copyright 2025, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#include <nvhpc/openacc_scan.hpp>

// In each algorithm below, _Index is a signed integral type suitable for
// indexing into any of the sequences that are passed to that algorithm.  It
// is always the difference_type of one of the algorithm's iterator types.

namespace std { namespace __stdpar { namespace __detail {

// Helper function to perform just one chunk of a parallel merge.
// Used by both merge and sort.
//
// Conceptually __input1[0, __size1) and __input2[0, __size2) are merged into
// a single output sequence of __size1 + __size2 elements.  This function
// produces only output positions [__chunk_start, __chunk_end) of that
// sequence, so independent callers can each handle a different chunk.
//
// Parameters:
//   __input1, __size1  first input sequence and its length
//   __input2, __size2  second input sequence and its length
//   __output           destination; indexed with output positions directly
//   __chunk_start      first output position written by this call
//   __chunk_end        one past the last output position written by this call
//   __cmp              comparison; __cmp(a, b) true means a is ordered before b
//
// NOTE(review): presumably both inputs are already sorted with respect to
// __cmp; nothing in this function checks that.

template <class _FIt1, class _FIt2, class _FIt3, class _Index, class _BF>
void __merge_one_chunk(_FIt1 __input1, _Index __size1, _FIt2 __input2,
                       _Index __size2, _FIt3 __output, _Index __chunk_start,
                       _Index __chunk_end, _BF __cmp) {
  // Binary search to find the start of the merging for this chunk
  // The search is over how many of the first __chunk_start output elements
  // come from input1.  That count can be no smaller than
  // __chunk_start - __size2 (input2 cannot supply more than __size2 elements)
  // and no larger than min(__chunk_start, __size1).
  _Index __begin_start = __detail::max(__chunk_start - __size2, 0);
  _Index __end_start = __detail::min(__chunk_start, __size1);
  while (__begin_start < __end_start) {
    _Index __mid = (__begin_start + __end_start) / 2;
    // Index into input2 that pairs with __mid so that the two indices
    // together account for __chunk_start elements.
    _Index __mid2 = __chunk_start - __mid - 1;
    // If input2[__mid2] is not ordered before input1[__mid], the split point
    // in input1 lies to the right of __mid; otherwise it is at or left of it.
    if (__mid2 >= 0 && __mid < __size1 &&
        !__cmp(__input2[__mid2], __input1[__mid])) {
      __begin_start = __mid + 1;
    } else {
      __end_start = __mid;
    }
  }
  // First element of each input that belongs to this chunk.
  _Index __start1 = __begin_start;
  _Index __start2 = __chunk_start - __start1;
  // Binary search to find the end of the merging for this chunk
  // Same search as above, but for output position __chunk_end.
  _Index __begin_end = __detail::max(__chunk_end - __size2, 0);
  _Index __end_end = __detail::min(__chunk_end, __size1);
  while (__begin_end < __end_end) {
    _Index __mid = (__begin_end + __end_end) / 2;
    _Index __mid2 = __chunk_end - __mid - 1;
    if (__mid2 >= 0 && __mid < __size1 &&
        !__cmp(__input2[__mid2], __input1[__mid])) {
      __begin_end = __mid + 1;
    } else {
      __end_end = __mid;
    }
  }
  // One past the last element of each input that belongs to this chunk.
  _Index __end1 = __begin_end;
  _Index __end2 = __chunk_end - __end1;
  // Merge [start1,end1) of input1 and [start2,end2) of input2 into
  // [chunk_start,chunk_end) of output.
  _Index __pos1 = __start1;
  _Index __pos2 = __start2;
  for (_Index __i = __chunk_start; __i < __chunk_end; ++__i) {
    // Take from input1 when input2 is exhausted for this chunk, or when
    // input2's next element is not ordered before input1's next element.
    // On ties the element from input1 is therefore emitted first.
    if (__pos2 >= __end2 ||
        (__pos1 < __end1 && !__cmp(__input2[__pos2], __input1[__pos1]))) {
      __output[__i] = __input1[__pos1];
      ++__pos1;
    } else {
      __output[__i] = __input2[__pos2];
      ++__pos2;
    }
  }
}

}}} // namespace std::__stdpar::__detail

namespace std { namespace __stdpar { namespace __openacc {

//========== adjacent_find ==========

// Parallel adjacent_find.  Returns an iterator to the first position i for
// which __f(*i, *(i + 1)) is true, or __last when no adjacent pair matches
// (including when the range is empty).
//
// The pairs are examined in fixed-size batches so the search can stop as soon
// as one batch reports a match; within a batch a min-reduction selects the
// lowest matching index.
template <class _FIt, class _BF>
_FIt adjacent_find(_FIt __first, _FIt __last, _BF __f) {
  using _Index = typename std::iterator_traits<_FIt>::difference_type;
  if (__first == __last) {
    return __last;
  }
  // Number of adjacent pairs; also used as the "nothing found" sentinel.
  _Index __num_pairs = std::distance(__first, __last) - 1;
  constexpr _Index __chunk_size = 1 << 20;
  _Index __found = __num_pairs;
  _Index __base = 0;
  while (__base < __num_pairs) {
    _Index __limit = __detail::min(__base + __chunk_size, __num_pairs);
    _Index __batch_len = __limit - __base;
    #pragma acc_stdpar parallel loop reduction(min : __found)
    for (_Index __k = 0; __k < __batch_len; ++__k) {
      _Index __pos = __base + __k;
      if (__found > __pos && __f(__first[__pos], __first[__pos + 1])) {
        __found = __pos;
      }
    }
    // A hit in this batch precedes anything in later batches.
    if (__found < __num_pairs) {
      return __first + __found;
    }
    __base += __chunk_size;
  }
  return __last;
}

//========== all_of ==========

// Parallel all_of.  Returns true when __f holds for every element of
// [__first, __last); an empty range yields true.
//
// Elements are processed in batches whose length is the value returned by
// __detail::__get_device_total_thread_count() shifted left by 7.  After each
// batch the host checks the reduced result and stops as soon as it is false.
template <class _FIt, class _UF>
bool all_of(_FIt __first, _FIt __last, _UF __f) {
  using _Index = typename std::iterator_traits<_FIt>::difference_type;
  _Index __total = std::distance(__first, __last);
  _Index __step = __detail::__get_device_total_thread_count() << 7;
  bool __all_true = true;
  _Index __base = 0;
  while (__all_true && __base < __total) {
    _Index __limit = __detail::min(__base + __step, __total);
    _Index __batch_len = __limit - __base;
    #pragma acc_stdpar parallel loop reduction(&& : __all_true)
    for (_Index __k = 0; __k < __batch_len; ++__k) {
      __all_true = __all_true && __f(__first[__base + __k]);
    }
    __base += __step;
  }
  return __all_true;
}

//========== any_of ==========

// Parallel any_of.  Returns true when __f holds for at least one element of
// [__first, __last); an empty range yields false.
//
// Elements are processed in batches whose length is the value returned by
// __detail::__get_device_total_thread_count() shifted left by 7.  After each
// batch the host checks the reduced result and stops as soon as it is true.
template <class _FIt, class _UF>
bool any_of(_FIt __first, _FIt __last, _UF __f) {
  using _Index = typename std::iterator_traits<_FIt>::difference_type;
  _Index __total = std::distance(__first, __last);
  _Index __step = __detail::__get_device_total_thread_count() << 7;
  bool __any_true = false;
  _Index __base = 0;
  while (!__any_true && __base < __total) {
    _Index __limit = __detail::min(__base + __step, __total);
    _Index __batch_len = __limit - __base;
    #pragma acc_stdpar parallel loop reduction(|| : __any_true)
    for (_Index __k = 0; __k < __batch_len; ++__k) {
      __any_true = __any_true || __f(__first[__base + __k]);
    }
    __base += __step;
  }
  return __any_true;
}

//========== copy ==========

// Parallel copy of [__first, __last) to the range starting at __d_first.
// Returns the iterator one past the last element written.
//
// The input is divided into the number of chunks reported by
// __detail::__iterations_for_reduce_or_scan; each parallel iteration copies
// one chunk sequentially.  When that helper reports zero chunks the work is
// delegated to the sequential std::copy.
template <class _FIt1, class _FIt2>
_FIt2 copy(_FIt1 __first, _FIt1 __last, _FIt2 __d_first) {
  using _Index = typename std::iterator_traits<_FIt1>::difference_type;
  _Index __n_elems = std::distance(__first, __last);
  _Index __n_tasks = __detail::__iterations_for_reduce_or_scan(__n_elems);
  if (__n_tasks != 0) {
    _Index __per_task = __n_elems / __n_tasks;
    _Index __extra = __n_elems % __n_tasks;
    #pragma acc_stdpar parallel loop
    for (_Index __t = 0; __t < __n_tasks; ++__t) {
      _Index __lo = __detail::__chunk_start(__t, __per_task, __extra);
      _Index __hi = __detail::__chunk_end(__t, __per_task, __extra);
      for (_Index __k = __lo; __k < __hi; ++__k) {
        __d_first[__k] = __first[__k];
      }
    }
    return __d_first + __n_elems;
  }
  return std::copy(__first, __last, __d_first);
}

//========== copy_if ==========

// Parallel copy_if.  Copies the elements of [__first, __last) for which __f
// is true to the range starting at __d_first, keeping their relative order,
// and returns the iterator one past the last element written.
//
// Three steps:
//   1. evaluate __f on every element and store the result as 0/1 flags,
//   2. exclusive-scan the flags to obtain each kept element's output index,
//   3. scatter the kept elements to those indices.
template <class _FIt1, class _FIt2, class _UF>
_FIt2 copy_if(_FIt1 __first, _FIt1 __last, _FIt2 __d_first, _UF __f) {
  using _Index = typename std::iterator_traits<_FIt1>::difference_type;
  _Index __n_elems = std::distance(__first, __last);
  if (__n_elems == 0) {
    return __d_first;
  }
  // Step 1: one flag per input element.
  _Index* __keep = new _Index[__n_elems];
  #pragma acc_stdpar parallel loop
  for (_Index __k = 0; __k < __n_elems; ++__k) {
    __keep[__k] = static_cast<_Index>(__f(__first[__k]));
  }
  // Step 2: running count of kept elements before each position.
  _Index* __dest = new _Index[__n_elems];
  __openacc::exclusive_scan(__keep, __keep + __n_elems, __dest,
                            static_cast<_Index>(0), std::plus<void>{});
  // Step 3: scatter.
  #pragma acc_stdpar parallel loop
  for (_Index __k = 0; __k < __n_elems; ++__k) {
    if (__keep[__k]) {
      __d_first[__dest[__k]] = __first[__k];
    }
  }
  // The exclusive scan does not include the last flag, so add it back to get
  // the total number of elements written.
  _Index __tail = __n_elems - 1;
  _Index __n_kept = __dest[__tail] + __keep[__tail];
  delete[] __dest;
  delete[] __keep;
  return __d_first + __n_kept;
}

//========== copy_n ==========

// Parallel copy_n.  Copies __count elements starting at __first to the range
// starting at __d_first.  Returns __d_first + __count, or __d_first unchanged
// when __count is not positive (in which case nothing is copied).
template <class _FIt1, class _Size, class _FIt2>
_FIt2 copy_n(_FIt1 __first, _Size __count, _FIt2 __d_first) {
  #pragma acc_stdpar parallel loop
  for (_Size __k = 0; __k < __count; ++__k) {
    __d_first[__k] = __first[__k];
  }
  if (__count > 0) {
    return __d_first + __count;
  }
  return __d_first;
}

//========== count ==========

// Parallel count.  Returns how many elements of [__first, __last) compare
// equal (operator==) to __value, accumulated with a sum reduction.
template <class _FIt, class _T>
typename std::iterator_traits<_FIt>::difference_type
count(_FIt __first, _FIt __last, _T const& __value) {
  using _Index = typename std::iterator_traits<_FIt>::difference_type;
  _Index __n_elems = std::distance(__first, __last);
  _Index __tally = 0;
  #pragma acc_stdpar parallel loop reduction(+ : __tally)
  for (_Index __k = 0; __k < __n_elems; ++__k) {
    __tally += (__first[__k] == __value) ? 1 : 0;
  }
  return __tally;
}

//========== count_if ==========

// Parallel count_if.  Returns how many elements of [__first, __last) satisfy
// the predicate __f, accumulated with a sum reduction.
template <class _FIt, class _UF>
typename std::iterator_traits<_FIt>::difference_type
count_if(_FIt __first, _FIt __last, _UF __f) {
  using _Index = typename std::iterator_traits<_FIt>::difference_type;
  _Index __n_elems = std::distance(__first, __last);
  _Index __tally = 0;
  #pragma acc_stdpar parallel loop reduction(+ : __tally)
  for (_Index __k = 0; __k < __n_elems; ++__k) {
    __tally += __f(__first[__k]) ? 1 : 0;
  }
  return __tally;
}

//========== fill ==========

// Parallel fill.  Assigns __value to every element of [__first, __last).
template <class _FIt, class _T>
void fill(_FIt __first, _FIt __last, _T const& __value) {
  using _Index = typename std::iterator_traits<_FIt>::difference_type;
  _Index __n_elems = std::distance(__first, __last);
  #pragma acc_stdpar parallel loop
  for (_Index __k = 0; __k < __n_elems; ++__k) {
    __first[__k] = __value;
  }
}

//========== fill_n ==========

// Parallel fill_n.  Assigns __value to the first __count elements starting at
// __first.  Returns __first + __count, or __first unchanged when __count is
// not positive (in which case nothing is written).
template <class _FIt, class _Size, class _T>
_FIt fill_n(_FIt __first, _Size __count, _T const& __value) {
  #pragma acc_stdpar parallel loop
  for (_Size __k = 0; __k < __count; ++__k) {
    __first[__k] = __value;
  }
  if (__count > 0) {
    return __first + __count;
  }
  return __first;
}

//========== find_if ==========

// Parallel find_if.  Returns an iterator to the first element of
// [__first, __last) for which __f is true, or __last when there is none.
//
// Elements are examined in fixed-size batches so the search can stop as soon
// as one batch reports a match; within a batch a min-reduction selects the
// lowest matching index.
template <class _FIt, class _UF>
_FIt find_if(_FIt __first, _FIt __last, _UF __f) {
  using _Index = typename std::iterator_traits<_FIt>::difference_type;
  _Index __n_elems = std::distance(__first, __last);
  constexpr _Index __chunk_size = 1 << 20;
  // __n_elems doubles as the "nothing found" sentinel.
  _Index __found = __n_elems;
  _Index __base = 0;
  while (__base < __n_elems) {
    _Index __limit = __detail::min(__base + __chunk_size, __n_elems);
    _Index __batch_len = __limit - __base;
    #pragma acc_stdpar parallel loop reduction(min : __found)
    for (_Index __k = 0; __k < __batch_len; ++__k) {
      _Index __pos = __base + __k;
      if (__found > __pos && __f(__first[__pos])) {
        __found = __pos;
      }
    }
    // A hit in this batch precedes anything in later batches.
    if (__found < __n_elems) {
      return __first + __found;
    }
    __base += __chunk_size;
  }
  return __last;
}

//========== for_each ==========

// Parallel for_each.  Invokes __f on every element of [__first, __last).
//
// The loop is written twice because the OpenACC directive differs: when the
// iterator is a raw pointer it is listed in a deviceptr clause, otherwise the
// plain parallel loop is used.
template <class _FIt, class _UF>
void for_each(_FIt __first, _FIt __last, _UF __f) {
  using _Index = typename std::iterator_traits<_FIt>::difference_type;
  _Index __n_elems = std::distance(__first, __last);
  if constexpr (std::is_pointer<_FIt>::value) {
    static_assert(std::is_pointer<_FIt>::value,
                  "internal error: unhandled OpenACC parallel loop variant");
    #pragma acc_stdpar parallel loop deviceptr(__first)
    for (_Index __k = 0; __k < __n_elems; ++__k) {
      __f(__first[__k]);
    }
  } else {
    #pragma acc_stdpar parallel loop
    for (_Index __k = 0; __k < __n_elems; ++__k) {
      __f(__first[__k]);
    }
  }
}

//========== for_each_n ==========

// Parallel for_each_n.  Invokes __f on the first __n elements starting at
// __first and returns __first + __n.
//
// The loop is written twice because the OpenACC directive differs: when the
// iterator is a raw pointer it is listed in a deviceptr clause, otherwise the
// plain parallel loop is used.
template <class _FIt, class _Size, class _UF>
_FIt for_each_n(_FIt __first, _Size __n, _UF __f) {
  if constexpr (std::is_pointer<_FIt>::value) {
    static_assert(std::is_pointer<_FIt>::value,
                  "internal error: unhandled OpenACC parallel loop variant");
    #pragma acc_stdpar parallel loop deviceptr(__first)
    for (_Size __k = 0; __k < __n; ++__k) {
      __f(__first[__k]);
    }
  } else {
    #pragma acc_stdpar parallel loop
    for (_Size __k = 0; __k < __n; ++__k) {
      __f(__first[__k]);
    }
  }
  return __first + __n;
}

//========== max_element ==========

// Parallel max_element.  Returns an iterator to the largest element of
// [__first, __last) according to __cmp, or __last for an empty range.  An
// element only replaces the current best when the best compares strictly
// less than it, so among equal maxima the earliest one is returned.
//
// The range is split into the number of chunks reported by
// __detail::__iterations_for_reduce_or_scan.  Each parallel iteration scans
// one chunk and records the index of its maximum; the per-chunk winners are
// then reduced sequentially on the host.  When the helper reports zero
// chunks the work is delegated to the sequential std::max_element.
//
// NOTE(review): a chunk's winner is initialised to the chunk's first index,
// which presumes no chunk is empty — confirm the helper never returns more
// chunks than there are elements.
template <class _FIt, class _BF>
_FIt max_element(_FIt __first, _FIt __last, _BF __cmp) {
  using _Index = typename std::iterator_traits<_FIt>::difference_type;
  if (__first == __last) {
    return __last;
  }
  _Index __n_elems = std::distance(__first, __last);
  _Index __n_tasks = __detail::__iterations_for_reduce_or_scan(__n_elems);
  if (__n_tasks == 0) {
    return std::max_element(__first, __last, __cmp);
  }
  _Index __per_task = __n_elems / __n_tasks;
  _Index __extra = __n_elems % __n_tasks;
  _Index* __winners = new _Index[__n_tasks];
  #pragma acc_stdpar parallel loop
  for (_Index __t = 0; __t < __n_tasks; ++__t) {
    _Index __lo = __detail::__chunk_start(__t, __per_task, __extra);
    _Index __hi = __detail::__chunk_end(__t, __per_task, __extra);
    _Index __best = __lo;
    for (_Index __k = __lo + 1; __k < __hi; ++__k) {
      if (__cmp(__first[__best], __first[__k])) {
        __best = __k;
      }
    }
    __winners[__t] = __best;
  }
  // Sequential reduction over the per-chunk winners, in chunk order.
  _Index __overall = __winners[0];
  for (_Index __t = 1; __t < __n_tasks; ++__t) {
    _Index __candidate = __winners[__t];
    if (__cmp(__first[__overall], __first[__candidate])) {
      __overall = __candidate;
    }
  }
  delete[] __winners;
  return __first + __overall;
}

//========== merge ==========

// Parallel merge of [__first1, __last1) and [__first2, __last2) into the
// range starting at __d_first, ordered by __cmp.  Returns the iterator one
// past the last element written.
//
// The output is split into the number of chunks reported by
// __detail::__iterations_for_reduce_or_scan (called with the combined
// length); each parallel iteration produces one output chunk through
// __detail::__merge_one_chunk.  When the helper reports zero chunks the work
// is delegated to the sequential std::merge.
template <class _FIt1, class _FIt2, class _FIt3, class _BF>
_FIt3 merge(_FIt1 __first1, _FIt1 __last1, _FIt2 __first2, _FIt2 __last2,
            _FIt3 __d_first, _BF __cmp) {
  using _Index = typename std::iterator_traits<_FIt3>::difference_type;
  _Index __len1 = std::distance(__first1, __last1);
  _Index __len2 = std::distance(__first2, __last2);
  _Index __n_out = __len1 + __len2;
  if (__n_out == 0) {
    return __d_first;
  }
  _Index __n_tasks = __detail::__iterations_for_reduce_or_scan(__n_out);
  if (__n_tasks == 0) {
    return std::merge(__first1, __last1, __first2, __last2, __d_first, __cmp);
  }
  _Index __per_task = __n_out / __n_tasks;
  _Index __extra = __n_out % __n_tasks;
  #pragma acc_stdpar parallel loop
  for (_Index __t = 0; __t < __n_tasks; ++__t) {
    _Index __lo = __detail::__chunk_start(__t, __per_task, __extra);
    _Index __hi = __detail::__chunk_end(__t, __per_task, __extra);
    __detail::__merge_one_chunk(__first1, __len1, __first2, __len2, __d_first,
                                __lo, __hi, __cmp);
  }
  return __d_first + __n_out;
}

//========== min_element ==========

// Parallel min_element.  Returns an iterator to the smallest element of
// [__first, __last) according to __cmp, or __last for an empty range.  An
// element only replaces the current best when it compares strictly less than
// the best, so among equal minima the earliest one is returned.
//
// The range is split into the number of chunks reported by
// __detail::__iterations_for_reduce_or_scan.  Each parallel iteration scans
// one chunk and records the index of its minimum; the per-chunk winners are
// then reduced sequentially on the host.  When the helper reports zero
// chunks the work is delegated to the sequential std::min_element.
//
// NOTE(review): a chunk's winner is initialised to the chunk's first index,
// which presumes no chunk is empty — confirm the helper never returns more
// chunks than there are elements.
template <class _FIt, class _BF>
_FIt min_element(_FIt __first, _FIt __last, _BF __cmp) {
  using _Index = typename std::iterator_traits<_FIt>::difference_type;
  if (__first == __last) {
    return __last;
  }
  _Index __n_elems = std::distance(__first, __last);
  _Index __n_tasks = __detail::__iterations_for_reduce_or_scan(__n_elems);
  if (__n_tasks == 0) {
    return std::min_element(__first, __last, __cmp);
  }
  _Index __per_task = __n_elems / __n_tasks;
  _Index __extra = __n_elems % __n_tasks;
  _Index* __winners = new _Index[__n_tasks];
  #pragma acc_stdpar parallel loop
  for (_Index __t = 0; __t < __n_tasks; ++__t) {
    _Index __lo = __detail::__chunk_start(__t, __per_task, __extra);
    _Index __hi = __detail::__chunk_end(__t, __per_task, __extra);
    _Index __best = __lo;
    for (_Index __k = __lo + 1; __k < __hi; ++__k) {
      if (__cmp(__first[__k], __first[__best])) {
        __best = __k;
      }
    }
    __winners[__t] = __best;
  }
  // Sequential reduction over the per-chunk winners, in chunk order.
  _Index __overall = __winners[0];
  for (_Index __t = 1; __t < __n_tasks; ++__t) {
    _Index __candidate = __winners[__t];
    if (__cmp(__first[__candidate], __first[__overall])) {
      __overall = __candidate;
    }
  }
  delete[] __winners;
  return __first + __overall;
}

//========== minmax_element ==========

// Parallel minmax_element.  Returns a pair {iterator to the smallest element,
// iterator to the largest element} of [__first, __last) according to __cmp,
// or {__first, __first} for an empty range.  The minimum is only replaced on
// a strictly smaller element and the maximum is replaced by any element that
// is not smaller, so ties resolve to the first minimum and the last maximum.
//
// The elements are processed two at a time: each pair is ordered with one
// comparison, then only the smaller one is tested against the running minimum
// and only the larger one against the running maximum.  The pairs are split
// into chunks that are scanned in parallel; the per-chunk results (plus the
// unpaired last element when the length is odd) are reduced on the host.
// When __detail::__iterations_for_reduce_or_scan reports zero chunks the work
// is delegated to the sequential std::minmax_element.
//
// NOTE(review): every chunk unconditionally reads its first pair, which
// presumes each chunk holds at least one pair — confirm the helper never
// returns more chunks than __input_size / 2.
template <class _FIt, class _BF>
std::pair<_FIt, _FIt> minmax_element(_FIt __first, _FIt __last, _BF __cmp) {
  if (__first == __last) {
    return std::make_pair(__first, __first);
  }
  using _Index = typename std::iterator_traits<_FIt>::difference_type;
  _Index __input_size = std::distance(__first, __last);
  _Index __num_pairs = __input_size / 2;
  // The chunk count is derived from the element count, not the pair count.
  _Index __num_chunks = __detail::__iterations_for_reduce_or_scan(__input_size);
  if (__num_chunks == 0) {
    return std::minmax_element(__first, __last, __cmp);
  }
  // Chunk size and leftover are measured in pairs.
  _Index __chunk_size = __num_pairs / __num_chunks;
  _Index __leftover = __num_pairs % __num_chunks;
  // If the input size is odd, the last element is its own chunk.
  _Index __num_partials = __num_chunks + __input_size % 2;
  _Index* __partial_min = new _Index[__num_partials];
  _Index* __partial_max = new _Index[__num_partials];
  // Find the min and max of each chunk.
  #pragma acc_stdpar parallel loop
  for (_Index __i = 0; __i < __num_chunks; ++__i) {
    _Index __min_idx;
    _Index __max_idx;
    // __chunk_size is measured in pairs, so times 2 to get the index
    // of the element.
    _Index __chunk_start =
        2 * __detail::__chunk_start(__i, __chunk_size, __leftover);
    _Index __chunk_end =
        2 * __detail::__chunk_end(__i, __chunk_size, __leftover);
    // Seed min/max from the chunk's first pair.  When the two elements are
    // equivalent the earlier one becomes the min and the later one the max.
    if (!__cmp(__first[__chunk_start + 1], __first[__chunk_start])) {
      __min_idx = __chunk_start;
      __max_idx = __chunk_start + 1;
    } else {
      __min_idx = __chunk_start + 1;
      __max_idx = __chunk_start;
    }
    // Remaining pairs of the chunk.
    for (_Index __j = __chunk_start + 2; __j < __chunk_end; __j += 2) {
      if (__cmp(__first[__j + 1], __first[__j])) {
        // Second element is strictly smaller: it is the min candidate and
        // the first element is the max candidate.
        if (__cmp(__first[__j + 1], __first[__min_idx])) {
          __min_idx = __j + 1;
        }
        if (!__cmp(__first[__j], __first[__max_idx])) {
          __max_idx = __j;
        }
      } else {
        // First element is not larger: it is the min candidate and the
        // second element is the max candidate.
        if (__cmp(__first[__j], __first[__min_idx])) {
          __min_idx = __j;
        }
        if (!__cmp(__first[__j + 1], __first[__max_idx])) {
          __max_idx = __j + 1;
        }
      }
    }
    __partial_min[__i] = __min_idx;
    __partial_max[__i] = __max_idx;
  }
  // The unpaired trailing element is both the min and the max of its own
  // one-element partial.
  if (__input_size % 2 != 0) {
    __partial_min[__num_chunks] = __input_size - 1;
    __partial_max[__num_chunks] = __input_size - 1;
  }
  // Find the min and max of all the chunks.
  // Partials are visited in index order, so the same tie rules apply: strict
  // less-than for the min, not-less-than for the max.
  _Index __min_idx = __partial_min[0];
  _Index __max_idx = __partial_max[0];
  for (_Index __i = 1; __i < __num_partials; ++__i) {
    if (__cmp(__first[__partial_min[__i]], __first[__min_idx])) {
      __min_idx = __partial_min[__i];
    }
    if (!__cmp(__first[__partial_max[__i]], __first[__max_idx])) {
      __max_idx = __partial_max[__i];
    }
  }
  delete[] __partial_min;
  delete[] __partial_max;
  return std::make_pair(__first + __min_idx, __first + __max_idx);
}

//========== mismatch ==========

// Parallel mismatch.  Compares [__first1, __last1) element by element with
// the range starting at __first2 and returns the pair of iterators at the
// first position where __f(*it1, *it2) is false.  When every position
// matches, the returned pair is {__last1, __first2 + length of the first
// range}.
//
// Positions are examined in fixed-size batches so the search can stop as soon
// as one batch reports a mismatch; within a batch a min-reduction selects the
// lowest mismatching index.
template <class _FIt1, class _FIt2, class _BF>
std::pair<_FIt1, _FIt2> mismatch(_FIt1 __first1, _FIt1 __last1, _FIt2 __first2,
                                 _BF __f) {
  using _Index = typename std::iterator_traits<_FIt1>::difference_type;
  _Index __n_elems = std::distance(__first1, __last1);
  constexpr _Index __chunk_size = 1 << 20;
  // __n_elems doubles as the "no mismatch" sentinel.
  _Index __found = __n_elems;
  for (_Index __base = 0; __base < __n_elems; __base += __chunk_size) {
    _Index __limit = __detail::min(__base + __chunk_size, __n_elems);
    _Index __batch_len = __limit - __base;
    #pragma acc_stdpar parallel loop reduction(min : __found)
    for (_Index __k = 0; __k < __batch_len; ++__k) {
      _Index __pos = __base + __k;
      if (__found > __pos && !__f(__first1[__pos], __first2[__pos])) {
        __found = __pos;
      }
    }
    // A mismatch in this batch precedes anything in later batches.
    if (__found < __n_elems) {
      break;
    }
  }
  return { __first1 + __found, __first2 + __found };
}

//========== replace ==========

// Parallel replace.  Overwrites with __new_value every element of
// [__first, __last) that compares equal (operator==) to __old_value.
template <class _FIt, class _T>
void replace(_FIt __first, _FIt __last, _T const& __old_value,
             _T const& __new_value) {
  using _Index = typename std::iterator_traits<_FIt>::difference_type;
  _Index __n_elems = std::distance(__first, __last);
  #pragma acc_stdpar parallel loop
  for (_Index __k = 0; __k < __n_elems; ++__k) {
    if (__first[__k] == __old_value) {
      __first[__k] = __new_value;
    }
  }
}

//========== replace_copy ==========

// Parallel replace_copy.  Copies [__first, __last) to the range starting at
// __d_first, writing __new_value in place of every element that compares
// equal (operator==) to __old_value.  The input is not modified.  Returns
// the iterator one past the last element written.
template <class _FIt1, class _FIt2, class _T>
_FIt2 replace_copy(_FIt1 __first, _FIt1 __last, _FIt2 __d_first,
                   _T const& __old_value, _T const& __new_value) {
  using _Index = typename std::iterator_traits<_FIt1>::difference_type;
  _Index __n_elems = std::distance(__first, __last);
  #pragma acc_stdpar parallel loop
  for (_Index __k = 0; __k < __n_elems; ++__k) {
    if (__first[__k] == __old_value) {
      __d_first[__k] = __new_value;
    } else {
      __d_first[__k] = __first[__k];
    }
  }
  return __d_first + __n_elems;
}

//========== replace_copy_if ==========

// Parallel replace_copy_if.  Copies [__first, __last) to the range starting
// at __d_first, writing __new_value in place of every element for which __f
// is true.  The input is not modified.  Returns the iterator one past the
// last element written.
template <class _FIt1, class _FIt2, class _UF, class _T>
_FIt2 replace_copy_if(_FIt1 __first, _FIt1 __last, _FIt2 __d_first, _UF __f,
                      _T const& __new_value) {
  using _Index = typename std::iterator_traits<_FIt1>::difference_type;
  _Index __n_elems = std::distance(__first, __last);
  #pragma acc_stdpar parallel loop
  for (_Index __k = 0; __k < __n_elems; ++__k) {
    if (__f(__first[__k])) {
      __d_first[__k] = __new_value;
    } else {
      __d_first[__k] = __first[__k];
    }
  }
  return __d_first + __n_elems;
}

//========== replace_if ==========

// Parallel replace_if.  Overwrites with __new_value every element of
// [__first, __last) for which __f is true.
template <class _FIt, class _UF, class _T>
void replace_if(_FIt __first, _FIt __last, _UF __f, _T const& __new_value) {
  using _Index = typename std::iterator_traits<_FIt>::difference_type;
  _Index __n_elems = std::distance(__first, __last);
  #pragma acc_stdpar parallel loop
  for (_Index __k = 0; __k < __n_elems; ++__k) {
    if (__f(__first[__k])) {
      __first[__k] = __new_value;
    }
  }
}

//========== reverse ==========

// Parallel reverse.  Reverses [__first, __last) in place by swapping element
// i from the front with element i from the back, for the first half of the
// range.  Ranges with fewer than two elements are left untouched.
template <class _BDIt> void reverse(_BDIt __first, _BDIt __last) {
  using _Index = typename std::iterator_traits<_BDIt>::difference_type;
  _Index __n_swaps = std::distance(__first, __last) / 2;
  if (__n_swaps != 0) {
    // Iterator to the last element (not one past it).
    _BDIt __back = __last - 1;
    #pragma acc_stdpar parallel loop
    for (_Index __k = 0; __k < __n_swaps; ++__k) {
      std::iter_swap(__first + __k, __back - __k);
    }
  }
}

//========== reverse_copy ==========

// Parallel reverse_copy.  Writes the elements of [__first, __last) in reverse
// order to the range starting at __d_first; the input is not modified.
// Returns the iterator one past the last element written (__d_first itself
// for an empty input).
template <class _BDIt, class _FIt>
_FIt reverse_copy(_BDIt __first, _BDIt __last, _FIt __d_first) {
  using _Index = typename std::iterator_traits<_BDIt>::difference_type;
  _Index __n_elems = std::distance(__first, __last);
  if (__n_elems == 0) {
    return __d_first;
  }
  // Iterator to the last element; output k reads k positions before it.
  _BDIt __back = __last - 1;
  #pragma acc_stdpar parallel loop
  for (_Index __k = 0; __k < __n_elems; ++__k) {
    __d_first[__k] = __back[-__k];
  }
  return __d_first + __n_elems;
}

//========== rotate_copy ==========

// Parallel rotate_copy.  Writes to the range starting at __d_first the
// elements of [__n_first, __last) followed by the elements of
// [__first, __n_first).  The input is not modified.  Returns the iterator
// one past the last element written.
template <class _FIt1, class _FIt2>
_FIt2 rotate_copy(_FIt1 __first, _FIt1 __n_first, _FIt1 __last,
                  _FIt2 __d_first) {
  using _Index = typename std::iterator_traits<_FIt1>::difference_type;
  _Index __n_elems = std::distance(__first, __last);
  _Index __tail_len = std::distance(__n_first, __last);
  #pragma acc_stdpar parallel loop
  for (_Index __k = 0; __k < __n_elems; ++__k) {
    // Offsets are relative to __n_first: the first __tail_len outputs read
    // forward from it, the rest wrap around to a negative offset that lands
    // in [__first, __n_first).
    _Index __offset = (__k < __tail_len) ? __k : __k - __n_elems;
    __d_first[__k] = __n_first[__offset];
  }
  return __d_first + __n_elems;
}

//========== search ==========

// Parallel search.  Returns an iterator to the start of the first occurrence
// of the sequence [__s_first, __s_last) inside [__first, __last), where
// elements are matched with __f(haystack_element, needle_element).  Returns
// __first when the needle is empty and __last when there is no occurrence
// (including when the needle is longer than the haystack).
//
// Candidate start positions are examined in fixed-size batches so the search
// can stop as soon as one batch reports a match; within a batch each
// iteration tests one start position sequentially and a min-reduction
// selects the lowest matching one.
template <class _FIt1, class _FIt2, class _BF>
_FIt1 search(_FIt1 __first, _FIt1 __last, _FIt2 __s_first, _FIt2 __s_last,
             _BF __f) {
  using _Index = typename std::iterator_traits<_FIt1>::difference_type;
  if (__s_first == __s_last) {
    return __first;
  }
  _Index __hay_len = std::distance(__first, __last);
  auto __needle_len = std::distance(__s_first, __s_last);
  if (__hay_len < __needle_len) {
    return __last;
  }
  constexpr _Index __chunk_size = 1 << 20;
  // Number of start positions at which the whole needle still fits; also
  // used as the "nothing found" sentinel.
  _Index __n_starts = __hay_len - __needle_len + 1;
  _Index __found = __n_starts;
  _Index __base = 0;
  while (__base < __n_starts) {
    _Index __limit = __detail::min(__base + __chunk_size, __n_starts);
    _Index __batch_len = __limit - __base;
    #pragma acc_stdpar parallel loop reduction(min : __found)
    for (_Index __k = 0; __k < __batch_len; ++__k) {
      _Index __pos = __base + __k;
      bool __matched = true;
      for (_Index __s = 0; __s < __needle_len; ++__s) {
        if (!__f(__first[__pos + __s], __s_first[__s])) {
          __matched = false;
          break;
        }
      }
      if (__matched && __found > __pos) {
        __found = __pos;
      }
    }
    // A hit in this batch precedes anything in later batches.
    if (__found < __n_starts) {
      return __first + __found;
    }
    __base += __chunk_size;
  }
  return __last;
}

//========== stable_sort ==========

// Parallel stable_sort of [__first, __last) ordered by __cmp.
//
// Two phases:
//   1. The range is cut into chunks of 32 elements and every chunk is sorted
//      independently, one chunk per parallel iteration, with a sequential
//      insertion sort.
//   2. Sorted runs are merged pairwise in rounds; the run length doubles each
//      round.  Each round reads from one buffer and writes to the other,
//      alternating between the caller's range and a temporary array of
//      __input_size elements.  If the final round wrote into the temporary
//      array, the result is moved back into the caller's range.
//
// Both the insertion sort (which only shifts elements that compare strictly
// after the one being inserted) and __detail::__merge_one_chunk (which takes
// from the first run on ties) keep equivalent elements in their original
// order.
//
// The temporary array is created with new _ValT[...], so _ValT must be
// default-constructible.
template <class _RIt, class _BF>
void stable_sort(_RIt __first, _RIt __last, _BF __cmp) {
  using _Index = typename std::iterator_traits<_RIt>::difference_type;
  using _ValT = typename std::iterator_traits<_RIt>::value_type;
  _Index __input_size = std::distance(__first, __last);
  if (__input_size <= 1) {
    return;
  }
  constexpr _Index __chunk_size = 32;
  _Index __num_chunks = __detail::__div_round_up(__input_size, __chunk_size);
  // Number of merge rounds needed to combine __num_chunks sorted runs when
  // the run length doubles every round.
  _Index __num_strides =
      static_cast<_Index>(std::ceil(std::log2(__num_chunks)));
  // Phase 1: sort each chunk in place.
  #pragma acc_stdpar parallel loop
  for (_Index __i = 0; __i < __num_chunks; ++__i) {
    _Index __chunk_start = __i * __chunk_size;
    // The last chunk may be shorter than __chunk_size.
    _Index __chunk_end =
        __detail::min(__chunk_start + __chunk_size, __input_size);
    // Sort just this chunk with a sequential insertion sort.
    #pragma acc_stdpar loop seq
    for (_Index __j = __chunk_start; __j < __chunk_end; ++__j) {
      // Lift element __j out, shift larger predecessors one slot to the
      // right, then drop it into the gap.
      _ValT __temp = std::move(__first[__j]);
      _Index __k = __j;
      #pragma acc_stdpar loop seq
      while (__k > __chunk_start && __cmp(__temp, __first[__k - 1])) {
        __first[__k] = std::move(__first[__k - 1]);
        --__k;
      }
      __first[__k] = std::move(__temp);
    }
  }
  // A single chunk is already fully sorted; no merging and no temporary
  // array are needed.
  if (__num_chunks == 1) {
    return;
  }
  _ValT* __temp_array = new _ValT[__input_size];
  // The loop below alternates between copying from first to temp_array vs
  // copying from temp_array to first.  __temp_is_out controls which way the
  // copying happens; true means temp_array is the output.
  bool __temp_is_out = true;
  // Target number of parallel iterations per round: at least one per
  // initial chunk.
  _Index __desired_threads = __detail::max(
      __detail::__iterations_for_reduce_or_scan(__input_size), __num_chunks);
  // Phase 2: merge rounds.  The direction flag is flipped after every round.
  for (_Index __round = 0; __round < __num_strides;
       ++__round, __temp_is_out = !__temp_is_out) {
    // NOTE(review): presumably __make_either_or(a, b, flag) yields a when
    // flag is true and b otherwise, so that __in is __first when
    // __temp_is_out is true — confirm against the helper's definition.
    auto __in =
        __detail::__make_either_or(__first, __temp_array, __temp_is_out);
    auto __out =
        __detail::__make_either_or(__first, __temp_array, !__temp_is_out);
    // Length of each already-sorted run going into this round.
    _Index __merge_size = __chunk_size << __round;
    // Two adjacent runs are merged into one run of length __stride.
    _Index __stride = __merge_size * 2;
    _Index __num_merges = __detail::__div_round_up(__input_size, __stride);
    // Each merge is split among several parallel iterations, each of which
    // produces __merge_chunk_size output elements.
    _Index __threads_per_merge =
        __detail::__div_round_up(__desired_threads, __num_merges);
    _Index __merge_chunk_size =
        __detail::__div_round_up(__stride, __threads_per_merge);
    // All merges but the last get __threads_per_merge iterations; the last
    // merge may cover fewer than __stride elements and gets only as many
    // iterations as its remaining length requires.
    _Index __actual_threads = (__num_merges - 1) * __threads_per_merge +
        __detail::__div_round_up(__input_size - ((__num_merges - 1) * __stride),
                                 __merge_chunk_size);
    #pragma acc_stdpar parallel loop
    for (_Index __i = 0; __i < __actual_threads; ++__i) {
      // Which merge this iteration belongs to, and which piece of it.
      _Index __merge_no = __i / __threads_per_merge;
      _Index __chunk_no = __i % __threads_per_merge;
      _Index __merge_offset = __merge_no * __stride;
      // Lengths of the two runs being merged, clamped at the end of the
      // input (the second run may be short or empty).
      _Index __size1 =
          __detail::min(__merge_size, __input_size - __merge_offset);
      _Index __size2 =
          __detail::min(__merge_size, __input_size - __merge_offset - __size1);
      // Output sub-range of this merge handled by this iteration, clamped
      // to the merge's total length.
      _Index __chunk_start =
          __detail::min(__chunk_no * __merge_chunk_size, __size1 + __size2);
      _Index __chunk_end = __detail::min((__chunk_no + 1) * __merge_chunk_size,
                                         __size1 + __size2);
      __detail::__merge_one_chunk(
          __in + __merge_offset, __size1, __in + __merge_offset + __size1,
          __size2, __out + __merge_offset, __chunk_start, __chunk_end, __cmp);
    }
  }
  // The flag was flipped once more after the last round, so a false value
  // here means the last round wrote into __temp_array; move the sorted data
  // back into the caller's range.
  if (!__temp_is_out) {
    #pragma acc_stdpar parallel loop vector_length(1024)
    for (_Index __i = 0; __i < __input_size; ++__i) {
      __first[__i] = std::move(__temp_array[__i]);
    }
  }
  delete[] __temp_array;
}

//========== swap_ranges ==========

// Parallel swap_ranges.  Exchanges each element of [__first1, __last1) with
// the element at the same offset in the range starting at __first2.  Returns
// the iterator one past the last element swapped in the second range.
template <class _FIt1, class _FIt2>
_FIt2 swap_ranges(_FIt1 __first1, _FIt1 __last1, _FIt2 __first2) {
  using _Index = typename std::iterator_traits<_FIt1>::difference_type;
  _Index __n_elems = std::distance(__first1, __last1);
  #pragma acc_stdpar parallel loop
  for (_Index __k = 0; __k < __n_elems; ++__k) {
    std::iter_swap(__first1 + __k, __first2 + __k);
  }
  return __first2 + __n_elems;
}

//========== transform ==========

// Parallel unary transform.  Stores __f(x) for every element x of
// [__first, __last) at the same offset in the range starting at __d_first
// and returns the iterator one past the last element written.
//
// The loop body is the same in every branch; only the OpenACC directive
// differs.  Each iterator that is a raw pointer is listed in a deviceptr
// clause, so there is one variant per combination of pointer / non-pointer
// input and output.
template <class _FIt1, class _FIt2, class _UF>
_FIt2 transform(_FIt1 __first, _FIt1 __last, _FIt2 __d_first, _UF __f) {
  using _Index = typename std::iterator_traits<_FIt1>::difference_type;
  constexpr bool __src_ptr = std::is_pointer<_FIt1>::value;
  constexpr bool __dst_ptr = std::is_pointer<_FIt2>::value;
  _Index __n_elems = std::distance(__first, __last);
  if constexpr (__src_ptr && __dst_ptr) {
    static_assert(std::is_pointer<_FIt1>::value &&
                      std::is_pointer<_FIt2>::value,
                  "internal error: unhandled OpenACC parallel loop variant");
    #pragma acc_stdpar parallel loop deviceptr(__first, __d_first)
    for (_Index __k = 0; __k < __n_elems; ++__k) {
      __d_first[__k] = __f(__first[__k]);
    }
  } else if constexpr (__src_ptr) {
    #pragma acc_stdpar parallel loop deviceptr(__first)
    for (_Index __k = 0; __k < __n_elems; ++__k) {
      __d_first[__k] = __f(__first[__k]);
    }
  } else if constexpr (__dst_ptr) {
    #pragma acc_stdpar parallel loop deviceptr(__d_first)
    for (_Index __k = 0; __k < __n_elems; ++__k) {
      __d_first[__k] = __f(__first[__k]);
    }
  } else {
    #pragma acc_stdpar parallel loop
    for (_Index __k = 0; __k < __n_elems; ++__k) {
      __d_first[__k] = __f(__first[__k]);
    }
  }
  return __d_first + __n_elems;
}

// Parallel binary transform.  For every offset i in [__first1, __last1),
// stores __f(__first1[i], __first2[i]) at __d_first[i] and returns the
// iterator one past the last element written.
//
// The loop body is the same in every branch; only the OpenACC directive
// differs.  Each iterator that is a raw pointer is listed in a deviceptr
// clause, so there is one variant for each of the eight pointer /
// non-pointer combinations of the three iterators.
template <class _FIt1, class _FIt2, class _FIt3, class _BF>
_FIt3 transform(_FIt1 __first1, _FIt1 __last1, _FIt2 __first2, _FIt3 __d_first,
                _BF __f) {
  using _Index = typename std::iterator_traits<_FIt1>::difference_type;
  constexpr bool __ptr1 = std::is_pointer<_FIt1>::value;
  constexpr bool __ptr2 = std::is_pointer<_FIt2>::value;
  constexpr bool __ptr3 = std::is_pointer<_FIt3>::value;
  _Index __n_elems = std::distance(__first1, __last1);
  if constexpr (__ptr1 && __ptr2 && __ptr3) {
    // All three are raw pointers.
    static_assert(std::is_pointer<_FIt1>::value &&
                      std::is_pointer<_FIt2>::value &&
                      std::is_pointer<_FIt3>::value,
                  "internal error: unhandled OpenACC parallel loop variant");
    #pragma acc_stdpar parallel loop deviceptr(__first1, __first2, __d_first)
    for (_Index __k = 0; __k < __n_elems; ++__k) {
      __d_first[__k] = __f(__first1[__k], __first2[__k]);
    }
  } else if constexpr (__ptr1 && __ptr2) {
    // Exactly two raw pointers: both inputs.
    #pragma acc_stdpar parallel loop deviceptr(__first1, __first2)
    for (_Index __k = 0; __k < __n_elems; ++__k) {
      __d_first[__k] = __f(__first1[__k], __first2[__k]);
    }
  } else if constexpr (__ptr1 && __ptr3) {
    // Exactly two raw pointers: first input and output.
    #pragma acc_stdpar parallel loop deviceptr(__first1, __d_first)
    for (_Index __k = 0; __k < __n_elems; ++__k) {
      __d_first[__k] = __f(__first1[__k], __first2[__k]);
    }
  } else if constexpr (__ptr2 && __ptr3) {
    // Exactly two raw pointers: second input and output.
    #pragma acc_stdpar parallel loop deviceptr(__first2, __d_first)
    for (_Index __k = 0; __k < __n_elems; ++__k) {
      __d_first[__k] = __f(__first1[__k], __first2[__k]);
    }
  } else if constexpr (__ptr1) {
    // Only the first input is a raw pointer.
    #pragma acc_stdpar parallel loop deviceptr(__first1)
    for (_Index __k = 0; __k < __n_elems; ++__k) {
      __d_first[__k] = __f(__first1[__k], __first2[__k]);
    }
  } else if constexpr (__ptr2) {
    // Only the second input is a raw pointer.
    #pragma acc_stdpar parallel loop deviceptr(__first2)
    for (_Index __k = 0; __k < __n_elems; ++__k) {
      __d_first[__k] = __f(__first1[__k], __first2[__k]);
    }
  } else if constexpr (__ptr3) {
    // Only the output is a raw pointer.
    #pragma acc_stdpar parallel loop deviceptr(__d_first)
    for (_Index __k = 0; __k < __n_elems; ++__k) {
      __d_first[__k] = __f(__first1[__k], __first2[__k]);
    }
  } else {
    // No raw pointers.
    #pragma acc_stdpar parallel loop
    for (_Index __k = 0; __k < __n_elems; ++__k) {
      __d_first[__k] = __f(__first1[__k], __first2[__k]);
    }
  }
  return __d_first + __n_elems;
}

}}} // namespace std::__stdpar::__openacc
