libstdc++: simd_builtin.h
1 // Simd Abi specific implementations -*- C++ -*-
2 
3 // Copyright (C) 2020-2021 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
26 #define _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
27 
28 #if __cplusplus >= 201703L
29 
30 #include <array>
31 #include <cmath>
32 #include <cstdlib>
33 
34 _GLIBCXX_SIMD_BEGIN_NAMESPACE
35 // _S_allbits{{{
36 template <typename _V>
37  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_allbits
38  = reinterpret_cast<_V>(~__vector_type_t<char, sizeof(_V) / sizeof(char)>());
39 
40 // }}}
41 // _S_signmask, _S_absmask{{{
42 template <typename _V, typename = _VectorTraits<_V>>
43  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_signmask
44  = __xor(_V() + 1, _V() - 1);
45 
46 template <typename _V, typename = _VectorTraits<_V>>
47  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_absmask
48  = __andnot(_S_signmask<_V>, _S_allbits<_V>);
49 
50 //}}}
51 // __vector_permute<Indices...>{{{
52 // Index == -1 requests zeroing of the output element
53 template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>>
54  constexpr _Tp
55  __vector_permute(_Tp __x)
56  {
57  static_assert(sizeof...(_Indices) == _TVT::_S_full_size);
58  return __make_vector<typename _TVT::value_type>(
59  (_Indices == -1 ? 0 : __x[_Indices == -1 ? 0 : _Indices])...);
60  }
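// Worked example (illustrative, not part of the original source): with
//   __vector_type_t<int, 4> __x = {10, 11, 12, 13};
// __vector_permute<3, 2, -1, 0>(__x) returns {13, 12, 0, 10}: each output
// element is __x[index], and index -1 produces a zero.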
61 
62 // }}}
63 // __vector_shuffle<Indices...>{{{
64 // Index == -1 requests zeroing of the output element
65 template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>>
66  constexpr _Tp
67  __vector_shuffle(_Tp __x, _Tp __y)
68  {
69  return _Tp{(_Indices == -1 ? 0
70  : _Indices < _TVT::_S_full_size
71  ? __x[_Indices]
72  : __y[_Indices - _TVT::_S_full_size])...};
73  }
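// Worked example (illustrative): with 4-element vectors __x = {x0, x1, x2, x3}
// and __y = {y0, y1, y2, y3}, __vector_shuffle<0, 4, 1, 5>(__x, __y) returns
// {x0, y0, x1, y1}; indices below _S_full_size select from __x, larger
// indices select __y[index - _S_full_size], and -1 produces a zero.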
74 
75 // }}}
76 // __make_wrapper{{{
77 template <typename _Tp, typename... _Args>
78  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, sizeof...(_Args)>
79  __make_wrapper(const _Args&... __args)
80  { return __make_vector<_Tp>(__args...); }
81 
82 // }}}
83 // __wrapper_bitcast{{{
84 template <typename _Tp, size_t _ToN = 0, typename _Up, size_t _M,
85  size_t _Np = _ToN != 0 ? _ToN : sizeof(_Up) * _M / sizeof(_Tp)>
86  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, _Np>
87  __wrapper_bitcast(_SimdWrapper<_Up, _M> __x)
88  {
89  static_assert(_Np > 1);
90  return __intrin_bitcast<__vector_type_t<_Tp, _Np>>(__x._M_data);
91  }
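// Worked example (illustrative): __wrapper_bitcast<int>(_SimdWrapper<float, 4>)
// yields a _SimdWrapper<int, 4> over the same 16 bytes; only the element type
// is reinterpreted, no values are converted.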
92 
93 // }}}
94 // __shift_elements_right{{{
95 // if (__shift % 2ⁿ == 0) => the low n Bytes are correct
96 template <unsigned __shift, typename _Tp, typename _TVT = _VectorTraits<_Tp>>
97  _GLIBCXX_SIMD_INTRINSIC _Tp
98  __shift_elements_right(_Tp __v)
99  {
100  [[maybe_unused]] const auto __iv = __to_intrin(__v);
101  static_assert(__shift <= sizeof(_Tp));
102  if constexpr (__shift == 0)
103  return __v;
104  else if constexpr (__shift == sizeof(_Tp))
105  return _Tp();
106 #if _GLIBCXX_SIMD_X86INTRIN // {{{
107  else if constexpr (__have_sse && __shift == 8
108  && _TVT::template _S_is<float, 4>)
109  return _mm_movehl_ps(__iv, __iv);
110  else if constexpr (__have_sse2 && __shift == 8
111  && _TVT::template _S_is<double, 2>)
112  return _mm_unpackhi_pd(__iv, __iv);
113  else if constexpr (__have_sse2 && sizeof(_Tp) == 16)
114  return reinterpret_cast<typename _TVT::type>(
115  _mm_srli_si128(reinterpret_cast<__m128i>(__iv), __shift));
116  else if constexpr (__shift == 16 && sizeof(_Tp) == 32)
117  {
118  /*if constexpr (__have_avx && _TVT::template _S_is<double, 4>)
119  return _mm256_permute2f128_pd(__iv, __iv, 0x81);
120  else if constexpr (__have_avx && _TVT::template _S_is<float, 8>)
121  return _mm256_permute2f128_ps(__iv, __iv, 0x81);
122  else if constexpr (__have_avx)
123  return reinterpret_cast<typename _TVT::type>(
124  _mm256_permute2f128_si256(__iv, __iv, 0x81));
125  else*/
126  return __zero_extend(__hi128(__v));
127  }
128  else if constexpr (__have_avx2 && sizeof(_Tp) == 32 && __shift < 16)
129  {
130  const auto __vll = __vector_bitcast<_LLong>(__v);
131  return reinterpret_cast<typename _TVT::type>(
132  _mm256_alignr_epi8(_mm256_permute2x128_si256(__vll, __vll, 0x81),
133  __vll, __shift));
134  }
135  else if constexpr (__have_avx && sizeof(_Tp) == 32 && __shift < 16)
136  {
137  const auto __vll = __vector_bitcast<_LLong>(__v);
138  return reinterpret_cast<typename _TVT::type>(
139  __concat(_mm_alignr_epi8(__hi128(__vll), __lo128(__vll), __shift),
140  _mm_srli_si128(__hi128(__vll), __shift)));
141  }
142  else if constexpr (sizeof(_Tp) == 32 && __shift > 16)
143  return __zero_extend(__shift_elements_right<__shift - 16>(__hi128(__v)));
144  else if constexpr (sizeof(_Tp) == 64 && __shift == 32)
145  return __zero_extend(__hi256(__v));
146  else if constexpr (__have_avx512f && sizeof(_Tp) == 64)
147  {
148  if constexpr (__shift >= 48)
149  return __zero_extend(
150  __shift_elements_right<__shift - 48>(__extract<3, 4>(__v)));
151  else if constexpr (__shift >= 32)
152  return __zero_extend(
153  __shift_elements_right<__shift - 32>(__hi256(__v)));
154  else if constexpr (__shift % 8 == 0)
155  return reinterpret_cast<typename _TVT::type>(
156  _mm512_alignr_epi64(__m512i(), __intrin_bitcast<__m512i>(__v),
157  __shift / 8));
158  else if constexpr (__shift % 4 == 0)
159  return reinterpret_cast<typename _TVT::type>(
160  _mm512_alignr_epi32(__m512i(), __intrin_bitcast<__m512i>(__v),
161  __shift / 4));
162  else if constexpr (__have_avx512bw && __shift < 16)
163  {
164  const auto __vll = __vector_bitcast<_LLong>(__v);
165  return reinterpret_cast<typename _TVT::type>(
166  _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __vll, 0xf9),
167  __vll, __shift));
168  }
169  else if constexpr (__have_avx512bw && __shift < 32)
170  {
171  const auto __vll = __vector_bitcast<_LLong>(__v);
172  return reinterpret_cast<typename _TVT::type>(
173  _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __m512i(), 0xee),
174  _mm512_shuffle_i32x4(__vll, __vll, 0xf9),
175  __shift - 16));
176  }
177  else
178  __assert_unreachable<_Tp>();
179  }
180  /*
181  } else if constexpr (__shift % 16 == 0 && sizeof(_Tp) == 64)
182  return __auto_bitcast(__extract<__shift / 16, 4>(__v));
183  */
184 #endif // _GLIBCXX_SIMD_X86INTRIN }}}
185  else
186  {
187  constexpr int __chunksize = __shift % 8 == 0 ? 8
188  : __shift % 4 == 0 ? 4
189  : __shift % 2 == 0 ? 2
190  : 1;
191  auto __w = __vector_bitcast<__int_with_sizeof_t<__chunksize>>(__v);
192  using _Up = decltype(__w);
193  return __intrin_bitcast<_Tp>(
194  __call_with_n_evaluations<(sizeof(_Tp) - __shift) / __chunksize>(
195  [](auto... __chunks) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
196  return _Up{__chunks...};
197  }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
198  return __w[__shift / __chunksize + __i];
199  }));
200  }
201  }
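// Worked example (illustrative): for a 16-byte vector of int {0, 1, 2, 3},
// __shift_elements_right<8> places the old elements 2 and 3 into lanes 0 and
// 1. Per the contract above only the low bytes are guaranteed; e.g. the SSE
// float path (_mm_movehl_ps) leaves copies rather than zeros in the high
// lanes, whereas _mm_srli_si128 zero-fills them.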
202 
203 // }}}
204 // __extract_part(_SimdWrapper<_Tp, _Np>) {{{
205 template <int _Index, int _Total, int _Combine, typename _Tp, size_t _Np>
206  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr
207  _SimdWrapper<_Tp, _Np / _Total * _Combine>
208  __extract_part(const _SimdWrapper<_Tp, _Np> __x)
209  {
210  if constexpr (_Index % 2 == 0 && _Total % 2 == 0 && _Combine % 2 == 0)
211  return __extract_part<_Index / 2, _Total / 2, _Combine / 2>(__x);
212  else
213  {
214  constexpr size_t __values_per_part = _Np / _Total;
215  constexpr size_t __values_to_skip = _Index * __values_per_part;
216  constexpr size_t __return_size = __values_per_part * _Combine;
217  using _R = __vector_type_t<_Tp, __return_size>;
218  static_assert((_Index + _Combine) * __values_per_part * sizeof(_Tp)
219  <= sizeof(__x),
220  "out of bounds __extract_part");
 221  // the following assertion would ensure that no "padding" is read
222  // static_assert(_Total >= _Index + _Combine, "_Total must be greater
223  // than _Index");
224 
225  // static_assert(__return_size * _Total == _Np, "_Np must be divisible
226  // by _Total");
227  if (__x._M_is_constprop())
228  return __generate_from_n_evaluations<__return_size, _R>(
229  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
230  return __x[__values_to_skip + __i];
231  });
232  if constexpr (_Index == 0 && _Total == 1)
233  return __x;
234  else if constexpr (_Index == 0)
235  return __intrin_bitcast<_R>(__as_vector(__x));
236 #if _GLIBCXX_SIMD_X86INTRIN // {{{
237  else if constexpr (sizeof(__x) == 32
238  && __return_size * sizeof(_Tp) <= 16)
239  {
240  constexpr size_t __bytes_to_skip = __values_to_skip * sizeof(_Tp);
241  if constexpr (__bytes_to_skip == 16)
242  return __vector_bitcast<_Tp, __return_size>(
243  __hi128(__as_vector(__x)));
244  else
245  return __vector_bitcast<_Tp, __return_size>(
246  _mm_alignr_epi8(__hi128(__vector_bitcast<_LLong>(__x)),
247  __lo128(__vector_bitcast<_LLong>(__x)),
248  __bytes_to_skip));
249  }
250 #endif // _GLIBCXX_SIMD_X86INTRIN }}}
251  else if constexpr (_Index > 0
252  && (__values_to_skip % __return_size != 0
253  || sizeof(_R) >= 8)
254  && (__values_to_skip + __return_size) * sizeof(_Tp)
255  <= 64
256  && sizeof(__x) >= 16)
257  return __intrin_bitcast<_R>(
258  __shift_elements_right<__values_to_skip * sizeof(_Tp)>(
259  __as_vector(__x)));
260  else
261  {
262  _R __r = {};
263  __builtin_memcpy(&__r,
264  reinterpret_cast<const char*>(&__x)
265  + sizeof(_Tp) * __values_to_skip,
266  __return_size * sizeof(_Tp));
267  return __r;
268  }
269  }
270  }
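// Worked example (illustrative): for a _SimdWrapper<int, 8> __x,
// __extract_part<1, 4, 1>(__x) splits __x into 4 parts of 2 elements and
// returns part 1, i.e. elements 2-3; __extract_part<1, 4, 2>(__x) combines
// two adjacent parts and returns elements 2-5.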
271 
272 // }}}
273 // __extract_part(_SimdWrapper<bool, _Np>) {{{
274 template <int _Index, int _Total, int _Combine = 1, size_t _Np>
275  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<bool, _Np / _Total * _Combine>
276  __extract_part(const _SimdWrapper<bool, _Np> __x)
277  {
278  static_assert(_Combine == 1, "_Combine != 1 not implemented");
279  static_assert(__have_avx512f && _Total >= 2 && _Index + _Combine <= _Total && _Index >= 0);
280  return __x._M_data >> (_Index * _Np / _Total);
281  }
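// Worked example (illustrative): for a _SimdWrapper<bool, 16> bitmask,
// __extract_part<1, 2>(__x) shifts the bits right by 8, returning the upper
// half of the mask as a _SimdWrapper<bool, 8>.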
282 
283 // }}}
284 
285 // __vector_convert {{{
286 // implementation requires an index sequence
287 template <typename _To, typename _From, size_t... _I>
288  _GLIBCXX_SIMD_INTRINSIC constexpr _To
289  __vector_convert(_From __a, index_sequence<_I...>)
290  {
291  using _Tp = typename _VectorTraits<_To>::value_type;
292  return _To{static_cast<_Tp>(__a[_I])...};
293  }
294 
295 template <typename _To, typename _From, size_t... _I>
296  _GLIBCXX_SIMD_INTRINSIC constexpr _To
297  __vector_convert(_From __a, _From __b, index_sequence<_I...>)
298  {
299  using _Tp = typename _VectorTraits<_To>::value_type;
300  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...};
301  }
302 
303 template <typename _To, typename _From, size_t... _I>
304  _GLIBCXX_SIMD_INTRINSIC constexpr _To
305  __vector_convert(_From __a, _From __b, _From __c, index_sequence<_I...>)
306  {
307  using _Tp = typename _VectorTraits<_To>::value_type;
308  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
309  static_cast<_Tp>(__c[_I])...};
310  }
311 
312 template <typename _To, typename _From, size_t... _I>
313  _GLIBCXX_SIMD_INTRINSIC constexpr _To
314  __vector_convert(_From __a, _From __b, _From __c, _From __d,
315  index_sequence<_I...>)
316  {
317  using _Tp = typename _VectorTraits<_To>::value_type;
318  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
319  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...};
320  }
321 
322 template <typename _To, typename _From, size_t... _I>
323  _GLIBCXX_SIMD_INTRINSIC constexpr _To
324  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
325  index_sequence<_I...>)
326  {
327  using _Tp = typename _VectorTraits<_To>::value_type;
328  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
329  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
330  static_cast<_Tp>(__e[_I])...};
331  }
332 
333 template <typename _To, typename _From, size_t... _I>
334  _GLIBCXX_SIMD_INTRINSIC constexpr _To
335  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
336  _From __f, index_sequence<_I...>)
337  {
338  using _Tp = typename _VectorTraits<_To>::value_type;
339  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
340  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
341  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...};
342  }
343 
344 template <typename _To, typename _From, size_t... _I>
345  _GLIBCXX_SIMD_INTRINSIC constexpr _To
346  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
347  _From __f, _From __g, index_sequence<_I...>)
348  {
349  using _Tp = typename _VectorTraits<_To>::value_type;
350  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
351  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
352  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
353  static_cast<_Tp>(__g[_I])...};
354  }
355 
356 template <typename _To, typename _From, size_t... _I>
357  _GLIBCXX_SIMD_INTRINSIC constexpr _To
358  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
359  _From __f, _From __g, _From __h, index_sequence<_I...>)
360  {
361  using _Tp = typename _VectorTraits<_To>::value_type;
362  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
363  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
364  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
365  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...};
366  }
367 
368 template <typename _To, typename _From, size_t... _I>
369  _GLIBCXX_SIMD_INTRINSIC constexpr _To
370  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
371  _From __f, _From __g, _From __h, _From __i,
372  index_sequence<_I...>)
373  {
374  using _Tp = typename _VectorTraits<_To>::value_type;
375  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
376  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
377  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
378  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
379  static_cast<_Tp>(__i[_I])...};
380  }
381 
382 template <typename _To, typename _From, size_t... _I>
383  _GLIBCXX_SIMD_INTRINSIC constexpr _To
384  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
385  _From __f, _From __g, _From __h, _From __i, _From __j,
386  index_sequence<_I...>)
387  {
388  using _Tp = typename _VectorTraits<_To>::value_type;
389  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
390  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
391  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
392  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
393  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...};
394  }
395 
396 template <typename _To, typename _From, size_t... _I>
397  _GLIBCXX_SIMD_INTRINSIC constexpr _To
398  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
399  _From __f, _From __g, _From __h, _From __i, _From __j,
400  _From __k, index_sequence<_I...>)
401  {
402  using _Tp = typename _VectorTraits<_To>::value_type;
403  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
404  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
405  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
406  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
407  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
408  static_cast<_Tp>(__k[_I])...};
409  }
410 
411 template <typename _To, typename _From, size_t... _I>
412  _GLIBCXX_SIMD_INTRINSIC constexpr _To
413  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
414  _From __f, _From __g, _From __h, _From __i, _From __j,
415  _From __k, _From __l, index_sequence<_I...>)
416  {
417  using _Tp = typename _VectorTraits<_To>::value_type;
418  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
419  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
420  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
421  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
422  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
423  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...};
424  }
425 
426 template <typename _To, typename _From, size_t... _I>
427  _GLIBCXX_SIMD_INTRINSIC constexpr _To
428  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
429  _From __f, _From __g, _From __h, _From __i, _From __j,
430  _From __k, _From __l, _From __m, index_sequence<_I...>)
431  {
432  using _Tp = typename _VectorTraits<_To>::value_type;
433  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
434  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
435  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
436  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
437  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
438  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
439  static_cast<_Tp>(__m[_I])...};
440  }
441 
442 template <typename _To, typename _From, size_t... _I>
443  _GLIBCXX_SIMD_INTRINSIC constexpr _To
444  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
445  _From __f, _From __g, _From __h, _From __i, _From __j,
446  _From __k, _From __l, _From __m, _From __n,
447  index_sequence<_I...>)
448  {
449  using _Tp = typename _VectorTraits<_To>::value_type;
450  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
451  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
452  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
453  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
454  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
455  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
456  static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...};
457  }
458 
459 template <typename _To, typename _From, size_t... _I>
460  _GLIBCXX_SIMD_INTRINSIC constexpr _To
461  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
462  _From __f, _From __g, _From __h, _From __i, _From __j,
463  _From __k, _From __l, _From __m, _From __n, _From __o,
464  index_sequence<_I...>)
465  {
466  using _Tp = typename _VectorTraits<_To>::value_type;
467  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
468  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
469  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
470  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
471  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
472  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
473  static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...,
474  static_cast<_Tp>(__o[_I])...};
475  }
476 
477 template <typename _To, typename _From, size_t... _I>
478  _GLIBCXX_SIMD_INTRINSIC constexpr _To
479  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
480  _From __f, _From __g, _From __h, _From __i, _From __j,
481  _From __k, _From __l, _From __m, _From __n, _From __o,
482  _From __p, index_sequence<_I...>)
483  {
484  using _Tp = typename _VectorTraits<_To>::value_type;
485  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
486  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
487  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
488  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
489  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
490  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
491  static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...,
492  static_cast<_Tp>(__o[_I])..., static_cast<_Tp>(__p[_I])...};
493  }
494 
495 // Defer actual conversion to the overload that takes an index sequence. Note
496 // that this function adds zeros or drops values off the end if you don't ensure
497 // matching width.
498 template <typename _To, typename... _From, size_t _FromSize>
499  _GLIBCXX_SIMD_INTRINSIC constexpr _To
500  __vector_convert(_SimdWrapper<_From, _FromSize>... __xs)
501  {
502 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
503  using _From0 = __first_of_pack_t<_From...>;
504  using _FW = _SimdWrapper<_From0, _FromSize>;
505  if (!_FW::_S_is_partial && !(... && __xs._M_is_constprop()))
506  {
507  if constexpr ((sizeof...(_From) & (sizeof...(_From) - 1))
508  == 0) // power-of-two number of arguments
509  return __convert_x86<_To>(__as_vector(__xs)...);
510  else // append zeros and recurse until the above branch is taken
511  return __vector_convert<_To>(__xs..., _FW{});
512  }
513  else
514 #endif
515  return __vector_convert<_To>(
516  __as_vector(__xs)...,
517  make_index_sequence<(sizeof...(__xs) == 1 ? std::min(
518  _VectorTraits<_To>::_S_full_size, int(_FromSize))
519  : _FromSize)>());
520  }
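// Worked example (illustrative): __vector_convert<__vector_type_t<int, 8>>
// called with two _SimdWrapper<float, 4> arguments has a power-of-two
// argument count, so it forwards either to __convert_x86 (under the PR85048
// workaround) or to the two-vector overload above with
// index_sequence<0, 1, 2, 3>, static_casting all eight floats into one int
// vector.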
521 
522 // }}}
523 // __convert function{{{
524 template <typename _To, typename _From, typename... _More>
525  _GLIBCXX_SIMD_INTRINSIC constexpr auto
526  __convert(_From __v0, _More... __vs)
527  {
 528  static_assert((true && ... && is_same_v<_From, _More>));
529  if constexpr (__is_vectorizable_v<_From>)
530  {
531  using _V = typename _VectorTraits<_To>::type;
532  using _Tp = typename _VectorTraits<_To>::value_type;
533  return _V{static_cast<_Tp>(__v0), static_cast<_Tp>(__vs)...};
534  }
535  else if constexpr (__is_vector_type_v<_From>)
536  return __convert<_To>(__as_wrapper(__v0), __as_wrapper(__vs)...);
537  else // _SimdWrapper arguments
538  {
539  constexpr size_t __input_size = _From::_S_size * (1 + sizeof...(_More));
540  if constexpr (__is_vectorizable_v<_To>)
541  return __convert<__vector_type_t<_To, __input_size>>(__v0, __vs...);
542  else if constexpr (!__is_vector_type_v<_To>)
543  return _To(__convert<typename _To::_BuiltinType>(__v0, __vs...));
544  else
545  {
546  static_assert(
547  sizeof...(_More) == 0
548  || _VectorTraits<_To>::_S_full_size >= __input_size,
549  "__convert(...) requires the input to fit into the output");
550  return __vector_convert<_To>(__v0, __vs...);
551  }
552  }
553  }
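// Worked example (illustrative): __convert<__vector_type_t<double, 4>>(__w)
// with a _SimdWrapper<int, 4> __w checks that the 4 input elements fit into
// the output and forwards to __vector_convert, which static_casts
// element-wise.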
554 
555 // }}}
556 // __convert_all{{{
557 // Converts __v into array<_To, N>, where N is _NParts if non-zero or
558 // otherwise deduced from _To such that N * #elements(_To) <= #elements(__v).
 559 // Note: this function may return fewer than all converted elements
 560 template <typename _To,
 561  size_t _NParts = 0, // allows converting fewer or more than all parts
 562  // (only the last _To may be partially filled)
 563  size_t _Offset = 0, // where to start, counted in elements (not bytes
 564  // or parts)
565  typename _From, typename _FromVT = _VectorTraits<_From>>
566  _GLIBCXX_SIMD_INTRINSIC auto
567  __convert_all(_From __v)
568  {
569  if constexpr (is_arithmetic_v<_To> && _NParts != 1)
570  {
571  static_assert(_Offset < _FromVT::_S_full_size);
572  constexpr auto _Np
573  = _NParts == 0 ? _FromVT::_S_partial_width - _Offset : _NParts;
574  return __generate_from_n_evaluations<_Np, array<_To, _Np>>(
575  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
576  return static_cast<_To>(__v[__i + _Offset]);
577  });
578  }
579  else
580  {
581  static_assert(__is_vector_type_v<_To>);
582  using _ToVT = _VectorTraits<_To>;
583  if constexpr (__is_vector_type_v<_From>)
584  return __convert_all<_To, _NParts>(__as_wrapper(__v));
585  else if constexpr (_NParts == 1)
586  {
587  static_assert(_Offset % _ToVT::_S_full_size == 0);
588  return array<_To, 1>{__vector_convert<_To>(
589  __extract_part<_Offset / _ToVT::_S_full_size,
590  __div_roundup(_FromVT::_S_partial_width,
591  _ToVT::_S_full_size)>(__v))};
592  }
593 #if _GLIBCXX_SIMD_X86INTRIN // {{{
594  else if constexpr (!__have_sse4_1 && _Offset == 0
595  && is_integral_v<typename _FromVT::value_type>
596  && sizeof(typename _FromVT::value_type)
597  < sizeof(typename _ToVT::value_type)
598  && !(sizeof(typename _FromVT::value_type) == 4
599  && is_same_v<typename _ToVT::value_type, double>))
600  {
601  using _ToT = typename _ToVT::value_type;
602  using _FromT = typename _FromVT::value_type;
603  constexpr size_t _Np
604  = _NParts != 0
605  ? _NParts
606  : (_FromVT::_S_partial_width / _ToVT::_S_full_size);
607  using _R = array<_To, _Np>;
 608  // __adjust reinterprets its input to have the requested number of
 609  // entries (__n is passed as a _SizeConstant) so that no unnecessary
 610  // intermediate conversions are requested and, more importantly, no
 611  // intermediate conversions are missing
612  [[maybe_unused]] auto __adjust
613  = [](auto __n,
614  auto __vv) -> _SimdWrapper<_FromT, decltype(__n)::value> {
615  return __vector_bitcast<_FromT, decltype(__n)::value>(__vv);
616  };
617  [[maybe_unused]] const auto __vi = __to_intrin(__v);
618  auto&& __make_array
619  = [](auto __x0, [[maybe_unused]] auto __x1) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
620  if constexpr (_Np == 1)
621  return _R{__intrin_bitcast<_To>(__x0)};
622  else
623  return _R{__intrin_bitcast<_To>(__x0),
624  __intrin_bitcast<_To>(__x1)};
625  };
626 
627  if constexpr (_Np == 0)
628  return _R{};
629  else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 2)
630  {
631  static_assert(is_integral_v<_FromT>);
632  static_assert(is_integral_v<_ToT>);
633  if constexpr (is_unsigned_v<_FromT>)
634  return __make_array(_mm_unpacklo_epi8(__vi, __m128i()),
635  _mm_unpackhi_epi8(__vi, __m128i()));
636  else
637  return __make_array(
638  _mm_srai_epi16(_mm_unpacklo_epi8(__vi, __vi), 8),
639  _mm_srai_epi16(_mm_unpackhi_epi8(__vi, __vi), 8));
640  }
641  else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 4)
642  {
643  static_assert(is_integral_v<_FromT>);
644  if constexpr (is_floating_point_v<_ToT>)
645  {
646  const auto __ints
647  = __convert_all<__vector_type16_t<int>, _Np>(
648  __adjust(_SizeConstant<_Np * 4>(), __v));
649  return __generate_from_n_evaluations<_Np, _R>(
650  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
651  return __vector_convert<_To>(__as_wrapper(__ints[__i]));
652  });
653  }
654  else if constexpr (is_unsigned_v<_FromT>)
655  return __make_array(_mm_unpacklo_epi16(__vi, __m128i()),
656  _mm_unpackhi_epi16(__vi, __m128i()));
657  else
658  return __make_array(
659  _mm_srai_epi32(_mm_unpacklo_epi16(__vi, __vi), 16),
660  _mm_srai_epi32(_mm_unpackhi_epi16(__vi, __vi), 16));
661  }
662  else if constexpr (sizeof(_FromT) == 4 && sizeof(_ToT) == 8
663  && is_integral_v<_FromT> && is_integral_v<_ToT>)
664  {
665  if constexpr (is_unsigned_v<_FromT>)
666  return __make_array(_mm_unpacklo_epi32(__vi, __m128i()),
667  _mm_unpackhi_epi32(__vi, __m128i()));
668  else
669  return __make_array(
670  _mm_unpacklo_epi32(__vi, _mm_srai_epi32(__vi, 31)),
671  _mm_unpackhi_epi32(__vi, _mm_srai_epi32(__vi, 31)));
672  }
684  else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) >= 4
685  && is_signed_v<_FromT>)
686  {
687  const __m128i __vv[2] = {_mm_unpacklo_epi8(__vi, __vi),
688  _mm_unpackhi_epi8(__vi, __vi)};
689  const __vector_type_t<int, 4> __vvvv[4] = {
690  __vector_bitcast<int>(_mm_unpacklo_epi16(__vv[0], __vv[0])),
691  __vector_bitcast<int>(_mm_unpackhi_epi16(__vv[0], __vv[0])),
692  __vector_bitcast<int>(_mm_unpacklo_epi16(__vv[1], __vv[1])),
693  __vector_bitcast<int>(_mm_unpackhi_epi16(__vv[1], __vv[1]))};
694  if constexpr (sizeof(_ToT) == 4)
695  return __generate_from_n_evaluations<_Np, _R>(
696  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
697  return __vector_convert<_To>(
698  _SimdWrapper<int, 4>(__vvvv[__i] >> 24));
699  });
700  else if constexpr (is_integral_v<_ToT>)
701  return __generate_from_n_evaluations<_Np, _R>(
702  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
703  const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31);
704  const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24);
705  return __vector_bitcast<_ToT>(
706  __i % 2 == 0 ? _mm_unpacklo_epi32(__sx32, __signbits)
707  : _mm_unpackhi_epi32(__sx32, __signbits));
708  });
709  else
710  return __generate_from_n_evaluations<_Np, _R>(
711  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
712  const _SimdWrapper<int, 4> __int4 = __vvvv[__i / 2] >> 24;
713  return __vector_convert<_To>(
714  __i % 2 == 0 ? __int4
715  : _SimdWrapper<int, 4>(
716  _mm_unpackhi_epi64(__to_intrin(__int4),
717  __to_intrin(__int4))));
718  });
719  }
720  else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 4)
721  {
722  const auto __shorts = __convert_all<__vector_type16_t<
723  conditional_t<is_signed_v<_FromT>, short, unsigned short>>>(
724  __adjust(_SizeConstant<(_Np + 1) / 2 * 8>(), __v));
725  return __generate_from_n_evaluations<_Np, _R>(
726  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
727  return __convert_all<_To>(__shorts[__i / 2])[__i % 2];
728  });
729  }
730  else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 8
731  && is_signed_v<_FromT> && is_integral_v<_ToT>)
732  {
733  const __m128i __vv[2] = {_mm_unpacklo_epi16(__vi, __vi),
734  _mm_unpackhi_epi16(__vi, __vi)};
735  const __vector_type16_t<int> __vvvv[4]
736  = {__vector_bitcast<int>(
737  _mm_unpacklo_epi32(_mm_srai_epi32(__vv[0], 16),
738  _mm_srai_epi32(__vv[0], 31))),
739  __vector_bitcast<int>(
740  _mm_unpackhi_epi32(_mm_srai_epi32(__vv[0], 16),
741  _mm_srai_epi32(__vv[0], 31))),
742  __vector_bitcast<int>(
743  _mm_unpacklo_epi32(_mm_srai_epi32(__vv[1], 16),
744  _mm_srai_epi32(__vv[1], 31))),
745  __vector_bitcast<int>(
746  _mm_unpackhi_epi32(_mm_srai_epi32(__vv[1], 16),
747  _mm_srai_epi32(__vv[1], 31)))};
748  return __generate_from_n_evaluations<_Np, _R>(
749  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
750  return __vector_bitcast<_ToT>(__vvvv[__i]);
751  });
752  }
753  else if constexpr (sizeof(_FromT) <= 2 && sizeof(_ToT) == 8)
754  {
755  const auto __ints
756  = __convert_all<__vector_type16_t<conditional_t<
757  is_signed_v<_FromT> || is_floating_point_v<_ToT>, int,
758  unsigned int>>>(
759  __adjust(_SizeConstant<(_Np + 1) / 2 * 4>(), __v));
760  return __generate_from_n_evaluations<_Np, _R>(
761  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
762  return __convert_all<_To>(__ints[__i / 2])[__i % 2];
763  });
764  }
765  else
766  __assert_unreachable<_To>();
767  }
768 #endif // _GLIBCXX_SIMD_X86INTRIN }}}
769  else if constexpr ((_FromVT::_S_partial_width - _Offset)
770  > _ToVT::_S_full_size)
771  {
 772  /*
 773  static_assert(
 774  (_FromVT::_S_partial_width & (_FromVT::_S_partial_width - 1)) == 0,
 775  "__convert_all only supports power-of-2 number of elements. "
 776  "Otherwise the return type cannot be array<_To, N>.");
 777  */
779  constexpr size_t _NTotal
780  = (_FromVT::_S_partial_width - _Offset) / _ToVT::_S_full_size;
781  constexpr size_t _Np = _NParts == 0 ? _NTotal : _NParts;
782  static_assert(
783  _Np <= _NTotal
784  || (_Np == _NTotal + 1
785  && (_FromVT::_S_partial_width - _Offset) % _ToVT::_S_full_size
786  > 0));
787  using _R = array<_To, _Np>;
788  if constexpr (_Np == 1)
789  return _R{__vector_convert<_To>(
790  __extract_part<_Offset, _FromVT::_S_partial_width,
791  _ToVT::_S_full_size>(__v))};
792  else
793  return __generate_from_n_evaluations<_Np, _R>(
794  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
795  auto __part
796  = __extract_part<__i * _ToVT::_S_full_size + _Offset,
797  _FromVT::_S_partial_width,
798  _ToVT::_S_full_size>(__v);
799  return __vector_convert<_To>(__part);
800  });
801  }
802  else if constexpr (_Offset == 0)
803  return array<_To, 1>{__vector_convert<_To>(__v)};
804  else
805  return array<_To, 1>{__vector_convert<_To>(
806  __extract_part<_Offset, _FromVT::_S_partial_width,
807  _FromVT::_S_partial_width - _Offset>(__v))};
808  }
809  }
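// Worked example (illustrative): with _From holding 8 shorts and
// _To = __vector_type16_t<int> (4 ints), __convert_all<_To>(__v) returns
// array<_To, 2>: element 0 holds the converted values 0-3, element 1 the
// values 4-7.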
810 
811 // }}}
812 
813 // _GnuTraits {{{
814 template <typename _Tp, typename _Mp, typename _Abi, size_t _Np>
815  struct _GnuTraits
816  {
817  using _IsValid = true_type;
818  using _SimdImpl = typename _Abi::_SimdImpl;
819  using _MaskImpl = typename _Abi::_MaskImpl;
820 
821  // simd and simd_mask member types {{{
822  using _SimdMember = _SimdWrapper<_Tp, _Np>;
823  using _MaskMember = _SimdWrapper<_Mp, _Np>;
824  static constexpr size_t _S_simd_align = alignof(_SimdMember);
825  static constexpr size_t _S_mask_align = alignof(_MaskMember);
826 
827  // }}}
828  // size metadata {{{
829  static constexpr size_t _S_full_size = _SimdMember::_S_full_size;
830  static constexpr bool _S_is_partial = _SimdMember::_S_is_partial;
831 
832  // }}}
833  // _SimdBase / base class for simd, providing extra conversions {{{
834  struct _SimdBase2
835  {
836  explicit
837  operator __intrinsic_type_t<_Tp, _Np>() const
838  { return __to_intrin(static_cast<const simd<_Tp, _Abi>*>(this)->_M_data); }
839 
840  explicit
841  operator __vector_type_t<_Tp, _Np>() const
842  { return __data(*static_cast<const simd<_Tp, _Abi>*>(this)); }
843  };
844 
845  struct _SimdBase1
846  {
847  explicit
848  operator __intrinsic_type_t<_Tp, _Np>() const
849  { return __data(*static_cast<const simd<_Tp, _Abi>*>(this)); }
850  };
851 
852  using _SimdBase = conditional_t<
853  is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
854  _SimdBase1, _SimdBase2>;
855 
856  // }}}
857  // _MaskBase {{{
858  struct _MaskBase2
859  {
860  explicit
861  operator __intrinsic_type_t<_Tp, _Np>() const
 862  { return static_cast<const simd_mask<_Tp, _Abi>*>(this)->_M_data.__intrin(); }
863 
864  explicit
865  operator __vector_type_t<_Tp, _Np>() const
866  { return static_cast<const simd_mask<_Tp, _Abi>*>(this)->_M_data._M_data; }
867  };
868 
869  struct _MaskBase1
870  {
871  explicit
872  operator __intrinsic_type_t<_Tp, _Np>() const
873  { return __data(*static_cast<const simd_mask<_Tp, _Abi>*>(this)); }
874  };
875 
876  using _MaskBase = conditional_t<
877  is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
878  _MaskBase1, _MaskBase2>;
879 
880  // }}}
881  // _MaskCastType {{{
882  // parameter type of one explicit simd_mask constructor
883  class _MaskCastType
884  {
885  using _Up = __intrinsic_type_t<_Tp, _Np>;
886  _Up _M_data;
887 
888  public:
889  _MaskCastType(_Up __x) : _M_data(__x) {}
890 
891  operator _MaskMember() const { return _M_data; }
892  };
893 
894  // }}}
895  // _SimdCastType {{{
896  // parameter type of one explicit simd constructor
897  class _SimdCastType1
898  {
899  using _Ap = __intrinsic_type_t<_Tp, _Np>;
900  _SimdMember _M_data;
901 
902  public:
903  constexpr
904  _SimdCastType1(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {}
905 
906  constexpr
907  operator _SimdMember() const { return _M_data; }
908  };
909 
910  class _SimdCastType2
911  {
912  using _Ap = __intrinsic_type_t<_Tp, _Np>;
913  using _Bp = __vector_type_t<_Tp, _Np>;
914  _SimdMember _M_data;
915 
916  public:
917  constexpr
918  _SimdCastType2(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {}
919 
920  constexpr
921  _SimdCastType2(_Bp __b) : _M_data(__b) {}
922 
923  constexpr
924  operator _SimdMember() const { return _M_data; }
925  };
926 
927  using _SimdCastType = conditional_t<
928  is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
929  _SimdCastType1, _SimdCastType2>;
930  //}}}
931  };
932 
933 // }}}
934 struct _CommonImplX86;
935 struct _CommonImplNeon;
936 struct _CommonImplBuiltin;
937 template <typename _Abi> struct _SimdImplBuiltin;
938 template <typename _Abi> struct _MaskImplBuiltin;
939 template <typename _Abi> struct _SimdImplX86;
940 template <typename _Abi> struct _MaskImplX86;
941 template <typename _Abi> struct _SimdImplNeon;
942 template <typename _Abi> struct _MaskImplNeon;
943 template <typename _Abi> struct _SimdImplPpc;
944 template <typename _Abi> struct _MaskImplPpc;
945 
946 // simd_abi::_VecBuiltin {{{
947 template <int _UsedBytes>
948  struct simd_abi::_VecBuiltin
949  {
950  template <typename _Tp>
951  static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp);
952 
953  // validity traits {{{
954  struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {};
955 
956  template <typename _Tp>
957  struct _IsValidSizeFor
958  : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1
959  && _UsedBytes % sizeof(_Tp) == 0
960  && _UsedBytes <= __vectorized_sizeof<_Tp>()
961  && (!__have_avx512f || _UsedBytes <= 32))> {};
962 
963  template <typename _Tp>
964  struct _IsValid : conjunction<_IsValidAbiTag, __is_vectorizable<_Tp>,
965  _IsValidSizeFor<_Tp>> {};
966 
967  template <typename _Tp>
968  static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value;
969 
970  // }}}
971  // _SimdImpl/_MaskImpl {{{
972 #if _GLIBCXX_SIMD_X86INTRIN
973  using _CommonImpl = _CommonImplX86;
974  using _SimdImpl = _SimdImplX86<_VecBuiltin<_UsedBytes>>;
975  using _MaskImpl = _MaskImplX86<_VecBuiltin<_UsedBytes>>;
976 #elif _GLIBCXX_SIMD_HAVE_NEON
977  using _CommonImpl = _CommonImplNeon;
978  using _SimdImpl = _SimdImplNeon<_VecBuiltin<_UsedBytes>>;
979  using _MaskImpl = _MaskImplNeon<_VecBuiltin<_UsedBytes>>;
980 #else
981  using _CommonImpl = _CommonImplBuiltin;
982 #ifdef __ALTIVEC__
983  using _SimdImpl = _SimdImplPpc<_VecBuiltin<_UsedBytes>>;
984  using _MaskImpl = _MaskImplPpc<_VecBuiltin<_UsedBytes>>;
985 #else
986  using _SimdImpl = _SimdImplBuiltin<_VecBuiltin<_UsedBytes>>;
987  using _MaskImpl = _MaskImplBuiltin<_VecBuiltin<_UsedBytes>>;
988 #endif
989 #endif
990 
991  // }}}
992  // __traits {{{
993  template <typename _Tp>
994  using _MaskValueType = __int_for_sizeof_t<_Tp>;
995 
996  template <typename _Tp>
997  using __traits
998  = conditional_t<_S_is_valid_v<_Tp>,
999  _GnuTraits<_Tp, _MaskValueType<_Tp>,
1000  _VecBuiltin<_UsedBytes>, _S_size<_Tp>>,
1001  _InvalidTraits>;
1002 
1003  //}}}
1004  // size metadata {{{
1005  template <typename _Tp>
1006  static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size;
1007 
1008  template <typename _Tp>
1009  static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial;
1010 
1011  // }}}
1012  // implicit masks {{{
1013  template <typename _Tp>
1014  using _MaskMember = _SimdWrapper<_MaskValueType<_Tp>, _S_size<_Tp>>;
1015 
1016  template <typename _Tp>
1017  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1018  _S_implicit_mask()
1019  {
1020  using _UV = typename _MaskMember<_Tp>::_BuiltinType;
1021  if constexpr (!_MaskMember<_Tp>::_S_is_partial)
1022  return ~_UV();
1023  else
1024  {
1025  constexpr auto __size = _S_size<_Tp>;
1026  _GLIBCXX_SIMD_USE_CONSTEXPR auto __r
1027  = __generate_vector<_UV>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
1028  { return __i < __size ? -1 : 0; });
1029  return __r;
1030  }
1031  }
1032 
1033  template <typename _Tp>
1034  _GLIBCXX_SIMD_INTRINSIC static constexpr __intrinsic_type_t<_Tp, _S_size<_Tp>>
1035  _S_implicit_mask_intrin()
1036  { return __to_intrin(__vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()._M_data)); }
1037 
1038  template <typename _TW, typename _TVT = _VectorTraits<_TW>>
1039  _GLIBCXX_SIMD_INTRINSIC static constexpr _TW
1040  _S_masked(_TW __x)
1041  {
1042  using _Tp = typename _TVT::value_type;
1043  if constexpr (!_MaskMember<_Tp>::_S_is_partial)
1044  return __x;
1045  else
1046  return __and(__as_vector(__x),
1047  __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()));
1048  }
1049 
1050  template <typename _TW, typename _TVT = _VectorTraits<_TW>>
1051  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
1052  __make_padding_nonzero(_TW __x)
1053  {
1054  using _Tp = typename _TVT::value_type;
1055  if constexpr (!_S_is_partial<_Tp>)
1056  return __x;
1057  else
1058  {
1059  _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask
1060  = __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>());
1061  if constexpr (is_integral_v<_Tp>)
1062  return __or(__x, ~__implicit_mask);
1063  else
1064  {
1065  _GLIBCXX_SIMD_USE_CONSTEXPR auto __one
1066  = __andnot(__implicit_mask,
1067  __vector_broadcast<_S_full_size<_Tp>>(_Tp(1)));
1068  // it's not enough to return `x | 1_in_padding` because the
1069  // padding in x might be inf or nan (independent of
1070  // __FINITE_MATH_ONLY__, because it's about padding bits)
1071  return __or(__and(__x, __implicit_mask), __one);
1072  }
1073  }
1074  }
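 // Worked example (illustrative): for simd<float> with 3 elements stored
 // in a 4-lane vector, the padding lane is blended to 1.f (integral types
 // get all-bits-set via __or), so that full-width operations such as
 // division or sqrt cannot trap or produce NaNs from indeterminate
 // padding bits.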
1075  // }}}
1076  };
1077 
1078 // }}}
1079 // simd_abi::_VecBltnBtmsk {{{
1080 template <int _UsedBytes>
1081  struct simd_abi::_VecBltnBtmsk
1082  {
1083  template <typename _Tp>
1084  static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp);
1085 
1086  // validity traits {{{
1087  struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {};
1088 
1089  template <typename _Tp>
1090  struct _IsValidSizeFor
1091  : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1
1092  && _UsedBytes % sizeof(_Tp) == 0 && _UsedBytes <= 64
1093  && (_UsedBytes > 32 || __have_avx512vl))> {};
1094 
 1095  // Bitmasks require at least AVX512F. If sizeof(_Tp) < 4, AVX512BW is
 1096  // also required.
1097  template <typename _Tp>
1098  struct _IsValid
1099  : conjunction<
1100  _IsValidAbiTag, __bool_constant<__have_avx512f>,
1101  __bool_constant<__have_avx512bw || (sizeof(_Tp) >= 4)>,
1102  __bool_constant<(__vectorized_sizeof<_Tp>() > sizeof(_Tp))>,
1103  _IsValidSizeFor<_Tp>> {};
1104 
1105  template <typename _Tp>
1106  static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value;
1107 
1108  // }}}
1109  // simd/_MaskImpl {{{
1110  #if _GLIBCXX_SIMD_X86INTRIN
1111  using _CommonImpl = _CommonImplX86;
1112  using _SimdImpl = _SimdImplX86<_VecBltnBtmsk<_UsedBytes>>;
1113  using _MaskImpl = _MaskImplX86<_VecBltnBtmsk<_UsedBytes>>;
1114  #else
1115  template <int>
1116  struct _MissingImpl;
1117 
1118  using _CommonImpl = _MissingImpl<_UsedBytes>;
1119  using _SimdImpl = _MissingImpl<_UsedBytes>;
1120  using _MaskImpl = _MissingImpl<_UsedBytes>;
1121  #endif
1122 
1123  // }}}
1124  // __traits {{{
1125  template <typename _Tp>
1126  using _MaskMember = _SimdWrapper<bool, _S_size<_Tp>>;
1127 
1128  template <typename _Tp>
1129  using __traits = conditional_t<
1130  _S_is_valid_v<_Tp>,
1131  _GnuTraits<_Tp, bool, _VecBltnBtmsk<_UsedBytes>, _S_size<_Tp>>,
1132  _InvalidTraits>;
1133 
1134  //}}}
1135  // size metadata {{{
1136  template <typename _Tp>
1137  static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size;
1138  template <typename _Tp>
1139  static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial;
1140 
1141  // }}}
1142  // implicit mask {{{
1143  private:
1144  template <typename _Tp>
1145  using _ImplicitMask = _SimdWrapper<bool, _S_size<_Tp>>;
1146 
1147  public:
1148  template <size_t _Np>
1149  _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_Np>
1150  __implicit_mask_n()
1151  {
1152  using _Tp = __bool_storage_member_type_t<_Np>;
1153  return _Np < sizeof(_Tp) * __CHAR_BIT__ ? _Tp((1ULL << _Np) - 1) : ~_Tp();
1154  }
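 // Worked example (illustrative): __implicit_mask_n<3>() yields 0b0111,
 // i.e. one mask bit per used lane; once _Np covers every bit of the mask
 // type the function returns ~_Tp() directly, avoiding an out-of-range
 // shift for _Np == 64.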
1155 
1156  template <typename _Tp>
1157  _GLIBCXX_SIMD_INTRINSIC static constexpr _ImplicitMask<_Tp>
1158  _S_implicit_mask()
1159  { return __implicit_mask_n<_S_size<_Tp>>(); }
1160 
1161  template <typename _Tp>
1162  _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_S_size<_Tp>>
1163  _S_implicit_mask_intrin()
1164  { return __implicit_mask_n<_S_size<_Tp>>(); }
1165 
1166  template <typename _Tp, size_t _Np>
1167  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1168  _S_masked(_SimdWrapper<_Tp, _Np> __x)
1169  {
1170  if constexpr (is_same_v<_Tp, bool>)
1171  if constexpr (_Np < 8 || (_Np & (_Np - 1)) != 0)
1172  return _MaskImpl::_S_bit_and(
1173  __x, _SimdWrapper<_Tp, _Np>(
1174  __bool_storage_member_type_t<_Np>((1ULL << _Np) - 1)));
1175  else
1176  return __x;
1177  else
1178  return _S_masked(__x._M_data);
1179  }
1180 
1181  template <typename _TV>
1182  _GLIBCXX_SIMD_INTRINSIC static constexpr _TV
1183  _S_masked(_TV __x)
1184  {
1185  using _Tp = typename _VectorTraits<_TV>::value_type;
1186  static_assert(
1187  !__is_bitmask_v<_TV>,
1188  "_VecBltnBtmsk::_S_masked cannot work on bitmasks, since it doesn't "
1189  "know the number of elements. Use _SimdWrapper<bool, N> instead.");
1190  if constexpr (_S_is_partial<_Tp>)
1191  {
1192  constexpr size_t _Np = _S_size<_Tp>;
1193  return __make_dependent_t<_TV, _CommonImpl>::_S_blend(
1194  _S_implicit_mask<_Tp>(), _SimdWrapper<_Tp, _Np>(),
1195  _SimdWrapper<_Tp, _Np>(__x));
1196  }
1197  else
1198  return __x;
1199  }
1200 
1201  template <typename _TV, typename _TVT = _VectorTraits<_TV>>
1202  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
1203  __make_padding_nonzero(_TV __x)
1204  {
1205  using _Tp = typename _TVT::value_type;
1206  if constexpr (!_S_is_partial<_Tp>)
1207  return __x;
1208  else
1209  {
1210  constexpr size_t _Np = _S_size<_Tp>;
1211  if constexpr (is_integral_v<typename _TVT::value_type>)
1212  return __x
1213  | __generate_vector<_Tp, _S_full_size<_Tp>>(
1214  [](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Tp {
1215  if (__i < _Np)
1216  return 0;
1217  else
1218  return 1;
1219  });
1220  else
1221  return __make_dependent_t<_TV, _CommonImpl>::_S_blend(
1222  _S_implicit_mask<_Tp>(),
1223  _SimdWrapper<_Tp, _Np>(
1224  __vector_broadcast<_S_full_size<_Tp>>(_Tp(1))),
1225  _SimdWrapper<_Tp, _Np>(__x))
1226  ._M_data;
1227  }
1228  }
1229 
1230  // }}}
1231  };
1232 
1233 //}}}
1234 // _CommonImplBuiltin {{{
1235 struct _CommonImplBuiltin
1236 {
1237  // _S_converts_via_decomposition{{{
1238  // This lists all cases where a __vector_convert needs to fall back to
1239  // conversion of individual scalars (i.e. decompose the input vector into
1240  // scalars, convert, compose output vector). In those cases, _S_masked_load &
1241  // _S_masked_store prefer to use the _S_bit_iteration implementation.
1242  template <typename _From, typename _To, size_t _ToSize>
1243  static inline constexpr bool __converts_via_decomposition_v
1244  = sizeof(_From) != sizeof(_To);
1245 
1246  // }}}
1247  // _S_load{{{
1248  template <typename _Tp, size_t _Np, size_t _Bytes = _Np * sizeof(_Tp)>
1249  _GLIBCXX_SIMD_INTRINSIC static __vector_type_t<_Tp, _Np>
1250  _S_load(const void* __p)
1251  {
1252  static_assert(_Np > 1);
1253  static_assert(_Bytes % sizeof(_Tp) == 0);
1254  using _Rp = __vector_type_t<_Tp, _Np>;
1255  if constexpr (sizeof(_Rp) == _Bytes)
1256  {
1257  _Rp __r;
1258  __builtin_memcpy(&__r, __p, _Bytes);
1259  return __r;
1260  }
1261  else
1262  {
1263 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90424
1264  using _Up = conditional_t<
1265  is_integral_v<_Tp>,
1266  conditional_t<_Bytes % 4 == 0,
1267  conditional_t<_Bytes % 8 == 0, long long, int>,
1268  conditional_t<_Bytes % 2 == 0, short, signed char>>,
1269  conditional_t<(_Bytes < 8 || _Np % 2 == 1 || _Np == 2), _Tp,
1270  double>>;
1271  using _V = __vector_type_t<_Up, _Np * sizeof(_Tp) / sizeof(_Up)>;
1272  if constexpr (sizeof(_V) != sizeof(_Rp))
1273  { // on i386 with 4 < _Bytes <= 8
1274  _Rp __r{};
1275  __builtin_memcpy(&__r, __p, _Bytes);
1276  return __r;
1277  }
1278  else
1279 #else // _GLIBCXX_SIMD_WORKAROUND_PR90424
1280  using _V = _Rp;
1281 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90424
1282  {
1283  _V __r{};
1284  static_assert(_Bytes <= sizeof(_V));
1285  __builtin_memcpy(&__r, __p, _Bytes);
1286  return reinterpret_cast<_Rp>(__r);
1287  }
1288  }
1289  }
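 // Worked example (illustrative): _S_load<int, 4, 12> reads exactly 12
 // bytes into a zero-initialized 16-byte vector, so a 3-element simd<int>
 // load never touches memory past the last element and lane 3 is zero.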
1290 
1291  // }}}
1292  // _S_store {{{
1293  template <size_t _Bytes>
1294  _GLIBCXX_SIMD_INTRINSIC static void
1295  _S_memcpy(char* __dst, const char* __src)
1296  {
1297  if constexpr (_Bytes > 0)
1298  {
1299  constexpr size_t _Ns = std::__bit_floor(_Bytes);
1300  __builtin_memcpy(__dst, __src, _Ns);
1301  _S_memcpy<_Bytes - _Ns>(__dst + _Ns, __src + _Ns);
1302  }
1303  }
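 // Worked example (illustrative): _S_memcpy<13> copies 8 + 4 + 1 bytes,
 // i.e. it recursively peels off std::__bit_floor sized chunks so that
 // every __builtin_memcpy has a power-of-two size the compiler can lower
 // to single loads/stores.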
1304 
1305  template <size_t _ReqBytes = 0, typename _TV>
1306  _GLIBCXX_SIMD_INTRINSIC static void
1307  _S_store(_TV __x, void* __addr)
1308  {
1309  constexpr size_t _Bytes = _ReqBytes == 0 ? sizeof(__x) : _ReqBytes;
1310  static_assert(sizeof(__x) >= _Bytes);
1311 
1312 #if !defined __clang__ && _GLIBCXX_SIMD_WORKAROUND_PR90424
1313  if constexpr (__is_vector_type_v<_TV>)
1314  _S_memcpy<_Bytes>(reinterpret_cast<char*>(__addr), reinterpret_cast<const char*>(&__x));
1315  else
1316 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90424
1317  __builtin_memcpy(__addr, &__x, _Bytes);
1318  }
1319 
1320  template <typename _Tp, size_t _Np>
1321  _GLIBCXX_SIMD_INTRINSIC static void
1322  _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr)
1323  { _S_store<_Np * sizeof(_Tp)>(__x._M_data, __addr); }
1324 
1325  // }}}
1326  // _S_store_bool_array(_BitMask) {{{
1327  template <size_t _Np, bool _Sanitized>
1328  _GLIBCXX_SIMD_INTRINSIC static constexpr void
1329  _S_store_bool_array(_BitMask<_Np, _Sanitized> __x, bool* __mem)
1330  {
1331  if constexpr (_Np == 1)
1332  __mem[0] = __x[0];
1333  else if (__builtin_is_constant_evaluated())
1334  {
1335  for (size_t __i = 0; __i < _Np; ++__i)
1336  __mem[__i] = __x[__i];
1337  }
1338  else if constexpr (_Np == 2)
1339  {
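 // Note: 0x81 == 2^0 + 2^7, so the multiplication copies mask bit 0 to
 // bit 0 and mask bit 1 to bit 8; masking with 0x0101 then leaves exactly
 // one bool byte per mask bit. The _Np == 3, 4-bit, and 7-bit cases below
 // use the same trick with multipliers of the form sum over k of 2^(7k).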
1340  short __bool2 = (__x._M_to_bits() * 0x81) & 0x0101;
1341  _S_store<_Np>(__bool2, __mem);
1342  }
1343  else if constexpr (_Np == 3)
1344  {
1345  int __bool3 = (__x._M_to_bits() * 0x4081) & 0x010101;
1346  _S_store<_Np>(__bool3, __mem);
1347  }
1348  else
1349  {
1350  __execute_n_times<__div_roundup(_Np, 4)>(
1351  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1352  constexpr int __offset = __i * 4;
1353  constexpr int __remaining = _Np - __offset;
1354  if constexpr (__remaining > 4 && __remaining <= 7)
1355  {
1356  const _ULLong __bool7
1357  = (__x.template _M_extract<__offset>()._M_to_bits()
1358  * 0x40810204081ULL)
1359  & 0x0101010101010101ULL;
1360  _S_store<__remaining>(__bool7, __mem + __offset);
1361  }
1362  else if constexpr (__remaining >= 4)
1363  {
1364  int __bits = __x.template _M_extract<__offset>()._M_to_bits();
1365  if constexpr (__remaining > 7)
1366  __bits &= 0xf;
1367  const int __bool4 = (__bits * 0x204081) & 0x01010101;
1368  _S_store<4>(__bool4, __mem + __offset);
1369  }
1370  });
1371  }
1372  }
1373 
1374  // }}}
1375  // _S_blend{{{
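 // Note: the GNU vector-extension ?: below selects element-wise; lanes
 // where __k is all-ones (-1) take __at1, zero lanes take __at0.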
1376  template <typename _Tp, size_t _Np>
1377  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
1378  _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
1379  _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
1380  { return __k._M_data ? __at1._M_data : __at0._M_data; }
1381 
1382  // }}}
1383 };
1384 
1385 // }}}
1386 // _SimdImplBuiltin {{{1
1387 template <typename _Abi>
1388  struct _SimdImplBuiltin
1389  {
1390  // member types {{{2
1391  template <typename _Tp>
1392  static constexpr size_t _S_max_store_size = 16;
1393 
1394  using abi_type = _Abi;
1395 
1396  template <typename _Tp>
1397  using _TypeTag = _Tp*;
1398 
1399  template <typename _Tp>
1400  using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
1401 
1402  template <typename _Tp>
1403  using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
1404 
1405  template <typename _Tp>
1406  static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;
1407 
1408  template <typename _Tp>
1409  static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;
1410 
1411  using _CommonImpl = typename _Abi::_CommonImpl;
1412  using _SuperImpl = typename _Abi::_SimdImpl;
1413  using _MaskImpl = typename _Abi::_MaskImpl;
1414 
1415  // _M_make_simd(_SimdWrapper/__intrinsic_type_t) {{{2
1416  template <typename _Tp, size_t _Np>
1417  _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi>
1418  _M_make_simd(_SimdWrapper<_Tp, _Np> __x)
1419  { return {__private_init, __x}; }
1420 
1421  template <typename _Tp, size_t _Np>
1422  _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi>
1423  _M_make_simd(__intrinsic_type_t<_Tp, _Np> __x)
1424  { return {__private_init, __vector_bitcast<_Tp>(__x)}; }
1425 
1426  // _S_broadcast {{{2
1427  template <typename _Tp>
1428  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp>
1429  _S_broadcast(_Tp __x) noexcept
1430  { return __vector_broadcast<_S_full_size<_Tp>>(__x); }
1431 
1432  // _S_generator {{{2
1433  template <typename _Fp, typename _Tp>
1434  inline static constexpr _SimdMember<_Tp>
1435  _S_generator(_Fp&& __gen, _TypeTag<_Tp>)
1436  {
1437  return __generate_vector<_Tp, _S_full_size<_Tp>>(
1438  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1439  if constexpr (__i < _S_size<_Tp>)
1440  return __gen(__i);
1441  else
1442  return 0;
1443  });
1444  }
1445 
1446  // _S_load {{{2
1447  template <typename _Tp, typename _Up>
1448  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp>
1449  _S_load(const _Up* __mem, _TypeTag<_Tp>) noexcept
1450  {
1451  constexpr size_t _Np = _S_size<_Tp>;
1452  constexpr size_t __max_load_size
1453  = (sizeof(_Up) >= 4 && __have_avx512f) || __have_avx512bw ? 64
1454  : (is_floating_point_v<_Up> && __have_avx) || __have_avx2 ? 32
1455  : 16;
1456  constexpr size_t __bytes_to_load = sizeof(_Up) * _Np;
1457  if (__builtin_is_constant_evaluated())
1458  return __generate_vector<_Tp, _S_full_size<_Tp>>(
1459  [&](auto __i) constexpr {
1460  return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0);
1461  });
1462  else if constexpr (sizeof(_Up) > 8 or __vectorized_sizeof<_Up>() <= sizeof(_Up))
1463  return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>(
1464  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1465  return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0);
1466  });
1467  else if constexpr (is_same_v<_Up, _Tp>)
1468  return _CommonImpl::template _S_load<_Tp, _S_full_size<_Tp>,
1469  _Np * sizeof(_Tp)>(__mem);
1470  else if constexpr (__bytes_to_load <= __max_load_size)
1471  return __convert<_SimdMember<_Tp>>(
1472  _CommonImpl::template _S_load<_Up, _Np>(__mem));
1473  else if constexpr (__bytes_to_load % __max_load_size == 0)
1474  {
1475  constexpr size_t __n_loads = __bytes_to_load / __max_load_size;
1476  constexpr size_t __elements_per_load = _Np / __n_loads;
1477  return __call_with_n_evaluations<__n_loads>(
1478  [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1479  return __convert<_SimdMember<_Tp>>(__uncvted...);
1480  }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1481  return _CommonImpl::template _S_load<_Up, __elements_per_load>(
1482  __mem + __i * __elements_per_load);
1483  });
1484  }
1485  else if constexpr (__bytes_to_load % (__max_load_size / 2) == 0
1486  && __max_load_size > 16)
1487  { // e.g. int[] -> <char, 12> with AVX2
1488  constexpr size_t __n_loads
1489  = __bytes_to_load / (__max_load_size / 2);
1490  constexpr size_t __elements_per_load = _Np / __n_loads;
1491  return __call_with_n_evaluations<__n_loads>(
1492  [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1493  return __convert<_SimdMember<_Tp>>(__uncvted...);
1494  }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1495  return _CommonImpl::template _S_load<_Up, __elements_per_load>(
1496  __mem + __i * __elements_per_load);
1497  });
1498  }
1499  else // e.g. int[] -> <char, 9>
1500  return __call_with_subscripts(
1501  __mem, make_index_sequence<_Np>(),
1502  [](auto... __args) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1503  return __vector_type_t<_Tp, _S_full_size<_Tp>>{static_cast<_Tp>(__args)...};
1504  });
1505  }
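 // Worked example (illustrative, assuming AVX2): loading simd<int> with 8
 // elements from a short* loads 16 bytes via _CommonImpl::_S_load and
 // widens once via __convert; only when __bytes_to_load exceeds
 // __max_load_size is the access split into multiple loads whose results
 // are converted together.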
1506 
1507  // _S_masked_load {{{2
1508  template <typename _Tp, size_t _Np, typename _Up>
1509  static constexpr inline _SimdWrapper<_Tp, _Np>
1510  _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
1511  const _Up* __mem) noexcept
1512  {
1513  _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
1514  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1515  __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
1516  });
1517  return __merge;
1518  }
1519 
1520  // _S_store {{{2
1521  template <typename _Tp, typename _Up>
1522  _GLIBCXX_SIMD_INTRINSIC static constexpr void
1523  _S_store(_SimdMember<_Tp> __v, _Up* __mem, _TypeTag<_Tp>) noexcept
1524  {
1525  // TODO: converting int -> "smaller int" can be optimized with AVX512
1526  constexpr size_t _Np = _S_size<_Tp>;
1527  constexpr size_t __max_store_size
1528  = _SuperImpl::template _S_max_store_size<_Up>;
1529  if (__builtin_is_constant_evaluated())
1530  {
1531  for (size_t __i = 0; __i < _Np; ++__i)
1532  __mem[__i] = __v[__i];
1533  }
1534  else if constexpr (sizeof(_Up) > 8 or __vectorized_sizeof<_Up>() <= sizeof(_Up))
1535  __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1536  __mem[__i] = __v[__i];
1537  });
1538  else if constexpr (is_same_v<_Up, _Tp>)
1539  _CommonImpl::_S_store(__v, __mem);
1540  else if constexpr (sizeof(_Up) * _Np <= __max_store_size)
1541  _CommonImpl::_S_store(_SimdWrapper<_Up, _Np>(__convert<_Up>(__v)),
1542  __mem);
1543  else
1544  {
1545  constexpr size_t __vsize = __max_store_size / sizeof(_Up);
1546  // round up to convert the last partial vector as well:
1547  constexpr size_t __stores = __div_roundup(_Np, __vsize);
1548  constexpr size_t __full_stores = _Np / __vsize;
1549  using _V = __vector_type_t<_Up, __vsize>;
1550  const array<_V, __stores> __converted
1551  = __convert_all<_V, __stores>(__v);
1552  __execute_n_times<__full_stores>(
1553  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1554  _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize);
1555  });
1556  if constexpr (__full_stores < __stores)
1557  _CommonImpl::template _S_store<(_Np - __full_stores * __vsize)
1558  * sizeof(_Up)>(
1559  __converted[__full_stores], __mem + __full_stores * __vsize);
1560  }
1561  }
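  // Illustration (editorial note): the mirror of _S_load. E.g. storing
  // simd<int, 12> to short* with SSE (__max_store_size == 16): __vsize ==
  // 8, __full_stores == 1, __stores == 2; after one full 16-byte store,
  // the remaining 4 shorts ((12 - 8) * sizeof(short) == 8 bytes) are
  // written via the byte-limited _S_store<8> at the end.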
1562 
1563  // _S_masked_store_nocvt {{{2
1564  template <typename _Tp, size_t _Np>
1565  _GLIBCXX_SIMD_INTRINSIC static constexpr void
1566  _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _MaskMember<_Tp> __k)
1567  {
1568  _BitOps::_S_bit_iteration(
1569  _MaskImpl::_S_to_bits(__k),
1570  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1571  __mem[__i] = __v[__i];
1572  });
1573  }
1574 
1575  // _S_masked_store {{{2
1576  template <typename _TW, typename _TVT = _VectorTraits<_TW>,
1577  typename _Tp = typename _TVT::value_type, typename _Up>
1578  static constexpr inline void
1579  _S_masked_store(const _TW __v, _Up* __mem, const _MaskMember<_Tp> __k) noexcept
1580  {
1581  constexpr size_t _TV_size = _S_size<_Tp>;
1582  [[maybe_unused]] const auto __vi = __to_intrin(__v);
1583  constexpr size_t __max_store_size
1584  = _SuperImpl::template _S_max_store_size<_Up>;
1585  if constexpr (
1586  is_same_v<_Tp, _Up>
1587  || (is_integral_v<_Tp> && is_integral_v<_Up>
1588  && sizeof(_Tp) == sizeof(_Up)))
1589  {
1590  // the conversion is bitwise or not needed at all: just reinterpret:
1591  const _MaskMember<_Up> __kk = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1592  if constexpr (__is_bitmask_v<decltype(__k)>)
1593  return _MaskMember<_Up>(__k._M_data);
1594  else
1595  return __wrapper_bitcast<__int_for_sizeof_t<_Up>>(__k);
1596  }();
1597  _SuperImpl::_S_masked_store_nocvt(__wrapper_bitcast<_Up>(__v),
1598  __mem, __kk);
1599  }
1600  else if constexpr (__vectorized_sizeof<_Up>() > sizeof(_Up)
1601  && !_CommonImpl::
1602  template __converts_via_decomposition_v<
1603  _Tp, _Up, __max_store_size>)
1604  { // conversion via decomposition is better handled by the
1605  // bit_iteration fallback below
1606 
1607  constexpr size_t _UW_size
1608  = std::min(_TV_size, __max_store_size / sizeof(_Up));
1609  static_assert(_UW_size <= _TV_size);
1610  using _UW = _SimdWrapper<_Up, _UW_size>;
1611  using _UV = __vector_type_t<_Up, _UW_size>;
1612  using _UAbi = simd_abi::deduce_t<_Up, _UW_size>;
1613  if constexpr (_UW_size == _TV_size) // one convert+store
1614  {
1615  const _UW __converted = __convert<_UW>(__v);
1616  _UAbi::_SimdImpl::_S_masked_store_nocvt(
1617  __converted, __mem,
1618  _UAbi::_MaskImpl::template _S_convert<
1619  __int_for_sizeof_t<_Up>>(__k));
1620  }
1621  else
1622  {
1623  static_assert(_UW_size * sizeof(_Up) == __max_store_size);
1624  constexpr size_t _NFullStores = _TV_size / _UW_size;
1625  constexpr size_t _NAllStores
1626  = __div_roundup(_TV_size, _UW_size);
1627  constexpr size_t _NParts = _S_full_size<_Tp> / _UW_size;
1628  const array<_UV, _NAllStores> __converted
1629  = __convert_all<_UV, _NAllStores>(__v);
1630  __execute_n_times<_NFullStores>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1631  _UAbi::_SimdImpl::_S_masked_store_nocvt(
1632  _UW(__converted[__i]), __mem + __i * _UW_size,
1633  _UAbi::_MaskImpl::template _S_convert<
1634  __int_for_sizeof_t<_Up>>(
1635  __extract_part<__i, _NParts>(__k.__as_full_vector())));
1636  });
1637  if constexpr (_NAllStores
1638  > _NFullStores) // one partial at the end
1639  _UAbi::_SimdImpl::_S_masked_store_nocvt(
1640  _UW(__converted[_NFullStores]),
1641  __mem + _NFullStores * _UW_size,
1642  _UAbi::_MaskImpl::template _S_convert<
1643  __int_for_sizeof_t<_Up>>(
1644  __extract_part<_NFullStores, _NParts>(
1645  __k.__as_full_vector())));
1646  }
1647  }
1648  else
1649  _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
1650  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1651  __mem[__i] = static_cast<_Up>(__v[__i]);
1652  });
1653  }
1654 
1655  // _S_complement {{{2
1656  template <typename _Tp, size_t _Np>
1657  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1658  _S_complement(_SimdWrapper<_Tp, _Np> __x) noexcept
1659  { return ~__x._M_data; }
1660 
1661  // _S_unary_minus {{{2
1662  template <typename _Tp, size_t _Np>
1663  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1664  _S_unary_minus(_SimdWrapper<_Tp, _Np> __x) noexcept
1665  {
1666  // GCC doesn't use the psign instructions, but pxor & psub seem to be
1667  // just as good a choice as pcmpeqd & psign. So meh.
1668  return -__x._M_data;
1669  }
1670 
1671  // arithmetic operators {{{2
1672  template <typename _Tp, size_t _Np>
1673  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1674  _S_plus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1675  { return __x._M_data + __y._M_data; }
1676 
1677  template <typename _Tp, size_t _Np>
1678  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1679  _S_minus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1680  { return __x._M_data - __y._M_data; }
1681 
1682  template <typename _Tp, size_t _Np>
1683  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1684  _S_multiplies(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1685  { return __x._M_data * __y._M_data; }
1686 
1687  template <typename _Tp, size_t _Np>
1688  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1689  _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1690  {
1691  // Note that division by 0 is always UB, so we must avoid it in the
1692  // padding elements of partial registers
1693  if constexpr (!_Abi::template _S_is_partial<_Tp>)
1694  return __x._M_data / __y._M_data;
1695  else
1696  return __x._M_data / _Abi::__make_padding_nonzero(__y._M_data);
1697  }
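  // Illustration (editorial note): with a partial register, e.g. 3 ints
  // in a 4-lane vector, the padding lane of __y defaults to 0, so a plain
  // __x._M_data / __y._M_data would divide by 0 in a lane the user never
  // observes -- still UB. __make_padding_nonzero substitutes a nonzero
  // value there, e.g. {7, 7, 7, 0} -> {7, 7, 7, 1}, keeping the hardware
  // division well-defined without changing the visible lanes.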
1698 
1699  template <typename _Tp, size_t _Np>
1700  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1701  _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1702  {
1703  if constexpr (!_Abi::template _S_is_partial<_Tp>)
1704  return __x._M_data % __y._M_data;
1705  else
1706  return __as_vector(__x)
1707  % _Abi::__make_padding_nonzero(__as_vector(__y));
1708  }
1709 
1710  template <typename _Tp, size_t _Np>
1711  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1712  _S_bit_and(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1713  { return __and(__x, __y); }
1714 
1715  template <typename _Tp, size_t _Np>
1716  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1717  _S_bit_or(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1718  { return __or(__x, __y); }
1719 
1720  template <typename _Tp, size_t _Np>
1721  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1722  _S_bit_xor(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1723  { return __xor(__x, __y); }
1724 
1725  template <typename _Tp, size_t _Np>
1726  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
1727  _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1728  { return __x._M_data << __y._M_data; }
1729 
1730  template <typename _Tp, size_t _Np>
1731  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
1732  _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1733  { return __x._M_data >> __y._M_data; }
1734 
1735  template <typename _Tp, size_t _Np>
1736  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1737  _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, int __y)
1738  { return __x._M_data << __y; }
1739 
1740  template <typename _Tp, size_t _Np>
1741  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1742  _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, int __y)
1743  { return __x._M_data >> __y; }
1744 
1745  // compares {{{2
1746  // _S_equal_to {{{3
1747  template <typename _Tp, size_t _Np>
1748  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1749  _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1750  { return __x._M_data == __y._M_data; }
1751 
1752  // _S_not_equal_to {{{3
1753  template <typename _Tp, size_t _Np>
1754  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1755  _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1756  { return __x._M_data != __y._M_data; }
1757 
1758  // _S_less {{{3
1759  template <typename _Tp, size_t _Np>
1760  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1761  _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1762  { return __x._M_data < __y._M_data; }
1763 
1764  // _S_less_equal {{{3
1765  template <typename _Tp, size_t _Np>
1766  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1767  _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1768  { return __x._M_data <= __y._M_data; }
1769 
1770  // _S_negate {{{2
1771  template <typename _Tp, size_t _Np>
1772  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1773  _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept
1774  { return !__x._M_data; }
1775 
1776  // _S_min, _S_max, _S_minmax {{{2
1777  template <typename _Tp, size_t _Np>
1778  _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1779  _SimdWrapper<_Tp, _Np>
1780  _S_min(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1781  { return __a._M_data < __b._M_data ? __a._M_data : __b._M_data; }
1782 
1783  template <typename _Tp, size_t _Np>
1784  _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1785  _SimdWrapper<_Tp, _Np>
1786  _S_max(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1787  { return __a._M_data > __b._M_data ? __a._M_data : __b._M_data; }
1788 
1789  template <typename _Tp, size_t _Np>
1790  _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1791  pair<_SimdWrapper<_Tp, _Np>, _SimdWrapper<_Tp, _Np>>
1792  _S_minmax(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1793  {
1794  return {__a._M_data < __b._M_data ? __a._M_data : __b._M_data,
1795  __a._M_data < __b._M_data ? __b._M_data : __a._M_data};
1796  }
1797 
1798  // reductions {{{2
1799  template <size_t _Np, size_t... _Is, size_t... _Zeros, typename _Tp,
1800  typename _BinaryOperation>
1801  _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
1802  _S_reduce_partial(index_sequence<_Is...>, index_sequence<_Zeros...>,
1803  simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
1804  {
1805  using _V = __vector_type_t<_Tp, _Np / 2>;
1806  static_assert(sizeof(_V) <= sizeof(__x));
1807  // _S_full_size is the size of the smallest native SIMD register that
1808  // can store _Np/2 elements:
1809  using _FullSimd = __deduced_simd<_Tp, _VectorTraits<_V>::_S_full_size>;
1810  using _HalfSimd = __deduced_simd<_Tp, _Np / 2>;
1811  const auto __xx = __as_vector(__x);
1812  return _HalfSimd::abi_type::_SimdImpl::_S_reduce(
1813  static_cast<_HalfSimd>(__as_vector(__binary_op(
1814  static_cast<_FullSimd>(__intrin_bitcast<_V>(__xx)),
1815  static_cast<_FullSimd>(__intrin_bitcast<_V>(
1816  __vector_permute<(_Np / 2 + _Is)..., (int(_Zeros * 0) - 1)...>(
1817  __xx)))))),
1818  __binary_op);
1819  }
1820 
1821  template <typename _Tp, typename _BinaryOperation>
1822  _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
1823  _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
1824  {
1825  constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
1826  if constexpr (_Np == 1)
1827  return __x[0];
1828  else if constexpr (_Np == 2)
1829  return __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]),
1830  simd<_Tp, simd_abi::scalar>(__x[1]))[0];
1831  else if (__builtin_is_constant_evaluated())
1832  {
1833  simd<_Tp, simd_abi::scalar> __acc = __x[0];
1834  for (size_t __i = 1; __i < _Np; ++__i)
1835  __acc = __binary_op(__acc, simd<_Tp, simd_abi::scalar>(__x[__i]));
1836  return __acc[0];
1837  }
1838  else if constexpr (_Abi::template _S_is_partial<_Tp>) //{{{
1839  {
1840  [[maybe_unused]] constexpr auto __full_size
1841  = _Abi::template _S_full_size<_Tp>;
1842  if constexpr (_Np == 3)
1843  return __binary_op(
1844  __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]),
1845  simd<_Tp, simd_abi::scalar>(__x[1])),
1846  simd<_Tp, simd_abi::scalar>(__x[2]))[0];
1847  else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>,
1848  plus<>>)
1849  {
1850  using _Ap = simd_abi::deduce_t<_Tp, __full_size>;
1851  return _Ap::_SimdImpl::_S_reduce(
1852  simd<_Tp, _Ap>(__private_init,
1853  _Abi::_S_masked(__as_vector(__x))),
1854  __binary_op);
1855  }
1856  else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>,
1857  multiplies<>>)
1858  {
1859  using _Ap = simd_abi::deduce_t<_Tp, __full_size>;
1860  using _TW = _SimdWrapper<_Tp, __full_size>;
1861  _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask_full
1862  = _Abi::template _S_implicit_mask<_Tp>().__as_full_vector();
1863  _GLIBCXX_SIMD_USE_CONSTEXPR _TW __one
1864  = __vector_broadcast<__full_size>(_Tp(1));
1865  const _TW __x_full = __data(__x).__as_full_vector();
1866  const _TW __x_padded_with_ones
1867  = _Ap::_CommonImpl::_S_blend(__implicit_mask_full, __one,
1868  __x_full);
1869  return _Ap::_SimdImpl::_S_reduce(
1870  simd<_Tp, _Ap>(__private_init, __x_padded_with_ones),
1871  __binary_op);
1872  }
1873  else if constexpr (_Np & 1)
1874  {
1875  using _Ap = simd_abi::deduce_t<_Tp, _Np - 1>;
1876  return __binary_op(
1877  simd<_Tp, simd_abi::scalar>(_Ap::_SimdImpl::_S_reduce(
1878  simd<_Tp, _Ap>(
1879  __intrin_bitcast<__vector_type_t<_Tp, _Np - 1>>(
1880  __as_vector(__x))),
1881  __binary_op)),
1882  simd<_Tp, simd_abi::scalar>(__x[_Np - 1]))[0];
1883  }
1884  else
1885  return _S_reduce_partial<_Np>(
1886  make_index_sequence<_Np / 2>(),
1887  make_index_sequence<__full_size - _Np / 2>(), __x, __binary_op);
1888  } //}}}
1889  else if constexpr (sizeof(__x) == 16) //{{{
1890  {
1891  if constexpr (_Np == 16)
1892  {
1893  const auto __y = __data(__x);
1894  __x = __binary_op(
1895  _M_make_simd<_Tp, _Np>(
1896  __vector_permute<0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
1897  7, 7>(__y)),
1898  _M_make_simd<_Tp, _Np>(
1899  __vector_permute<8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
1900  14, 14, 15, 15>(__y)));
1901  }
1902  if constexpr (_Np >= 8)
1903  {
1904  const auto __y = __vector_bitcast<short>(__data(__x));
1905  __x = __binary_op(
1906  _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1907  __vector_permute<0, 0, 1, 1, 2, 2, 3, 3>(__y))),
1908  _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1909  __vector_permute<4, 4, 5, 5, 6, 6, 7, 7>(__y))));
1910  }
1911  if constexpr (_Np >= 4)
1912  {
1913  using _Up = conditional_t<is_floating_point_v<_Tp>, float, int>;
1914  const auto __y = __vector_bitcast<_Up>(__data(__x));
1915  __x = __binary_op(__x,
1916  _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1917  __vector_permute<3, 2, 1, 0>(__y))));
1918  }
1919  using _Up = conditional_t<is_floating_point_v<_Tp>, double, _LLong>;
1920  const auto __y = __vector_bitcast<_Up>(__data(__x));
1921  __x = __binary_op(__x, _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1922  __vector_permute<1, 1>(__y))));
1923  return __x[0];
1924  } //}}}
1925  else
1926  {
1927  static_assert(sizeof(__x) > __min_vector_size<_Tp>);
1928  static_assert((_Np & (_Np - 1)) == 0); // _Np must be a power of 2
1929  using _Ap = simd_abi::deduce_t<_Tp, _Np / 2>;
1930  using _V = simd<_Tp, _Ap>;
1931  return _Ap::_SimdImpl::_S_reduce(
1932  __binary_op(_V(__private_init, __extract<0, 2>(__as_vector(__x))),
1933  _V(__private_init,
1934  __extract<1, 2>(__as_vector(__x)))),
1935  static_cast<_BinaryOperation&&>(__binary_op));
1936  }
1937  }
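  // Illustration (editorial note): for a full power-of-two register the
  // reduction halves until 16 bytes remain, then pairs lanes via
  // __vector_permute. E.g. reduce(+) over <int, 8> {1,...,8}:
  // {1,2,3,4} + {5,6,7,8} = {6,8,10,12}, plus its reverse = {18,18,18,18},
  // plus the swapped 64-bit halves -> lane 0 holds 36. User-facing sketch
  // (assuming <experimental/simd> is included):
  //   namespace stdx = std::experimental;
  //   stdx::fixed_size_simd<int, 8> v([](int i) { return i + 1; }); // 1..8
  //   int sum = stdx::reduce(v, std::plus<>());                     // 36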
1938 
1939  // math {{{2
1940  // frexp, modf and copysign implemented in simd_math.h
1941 #define _GLIBCXX_SIMD_MATH_FALLBACK(__name) \
1942  template <typename _Tp, typename... _More> \
1943  static _Tp \
1944  _S_##__name(const _Tp& __x, const _More&... __more) \
1945  { \
1946  return __generate_vector<_Tp>( \
1947  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1948  return __name(__x[__i], __more[__i]...); \
1949  }); \
1950  }
1951 
1952 #define _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET(__name) \
1953  template <typename _Tp, typename... _More> \
1954  static typename _Tp::mask_type \
1955  _S_##__name(const _Tp& __x, const _More&... __more) \
1956  { \
1957  return __generate_vector<_Tp>( \
1958  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1959  return __name(__x[__i], __more[__i]...); \
1960  }); \
1961  }
1962 
1963 #define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name) \
1964  template <typename _Tp, typename... _More> \
1965  static auto \
1966  _S_##__name(const _Tp& __x, const _More&... __more) \
1967  { \
1968  return __fixed_size_storage_t<_RetTp, \
1969  _VectorTraits<_Tp>::_S_partial_width>:: \
1970  _S_generate([&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1971  return __meta._S_generator( \
1972  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1973  return __name(__x[__meta._S_offset + __i], \
1974  __more[__meta._S_offset + __i]...); \
1975  }, \
1976  static_cast<_RetTp*>(nullptr)); \
1977  }); \
1978  }
1979 
1980  _GLIBCXX_SIMD_MATH_FALLBACK(acos)
1981  _GLIBCXX_SIMD_MATH_FALLBACK(asin)
1982  _GLIBCXX_SIMD_MATH_FALLBACK(atan)
1983  _GLIBCXX_SIMD_MATH_FALLBACK(atan2)
1984  _GLIBCXX_SIMD_MATH_FALLBACK(cos)
1985  _GLIBCXX_SIMD_MATH_FALLBACK(sin)
1986  _GLIBCXX_SIMD_MATH_FALLBACK(tan)
1987  _GLIBCXX_SIMD_MATH_FALLBACK(acosh)
1988  _GLIBCXX_SIMD_MATH_FALLBACK(asinh)
1989  _GLIBCXX_SIMD_MATH_FALLBACK(atanh)
1990  _GLIBCXX_SIMD_MATH_FALLBACK(cosh)
1991  _GLIBCXX_SIMD_MATH_FALLBACK(sinh)
1992  _GLIBCXX_SIMD_MATH_FALLBACK(tanh)
1993  _GLIBCXX_SIMD_MATH_FALLBACK(exp)
1994  _GLIBCXX_SIMD_MATH_FALLBACK(exp2)
1995  _GLIBCXX_SIMD_MATH_FALLBACK(expm1)
1996  _GLIBCXX_SIMD_MATH_FALLBACK(ldexp)
1997  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(int, ilogb)
1998  _GLIBCXX_SIMD_MATH_FALLBACK(log)
1999  _GLIBCXX_SIMD_MATH_FALLBACK(log10)
2000  _GLIBCXX_SIMD_MATH_FALLBACK(log1p)
2001  _GLIBCXX_SIMD_MATH_FALLBACK(log2)
2002  _GLIBCXX_SIMD_MATH_FALLBACK(logb)
2003 
2004  // modf implemented in simd_math.h
2005  _GLIBCXX_SIMD_MATH_FALLBACK(scalbn)
2006  _GLIBCXX_SIMD_MATH_FALLBACK(scalbln)
2007  _GLIBCXX_SIMD_MATH_FALLBACK(cbrt)
2008  _GLIBCXX_SIMD_MATH_FALLBACK(fabs)
2009  _GLIBCXX_SIMD_MATH_FALLBACK(pow)
2010  _GLIBCXX_SIMD_MATH_FALLBACK(sqrt)
2011  _GLIBCXX_SIMD_MATH_FALLBACK(erf)
2012  _GLIBCXX_SIMD_MATH_FALLBACK(erfc)
2013  _GLIBCXX_SIMD_MATH_FALLBACK(lgamma)
2014  _GLIBCXX_SIMD_MATH_FALLBACK(tgamma)
2015 
2016  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lrint)
2017  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llrint)
2018 
2019  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lround)
2020  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llround)
2021 
2022  _GLIBCXX_SIMD_MATH_FALLBACK(fmod)
2023  _GLIBCXX_SIMD_MATH_FALLBACK(remainder)
2024 
2025  template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2026  static _Tp
2027  _S_remquo(const _Tp __x, const _Tp __y,
2028  __fixed_size_storage_t<int, _TVT::_S_partial_width>* __z)
2029  {
2030  return __generate_vector<_Tp>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2031  int __tmp;
2032  auto __r = remquo(__x[__i], __y[__i], &__tmp);
2033  __z->_M_set(__i, __tmp);
2034  return __r;
2035  });
2036  }
2037 
2038  // copysign in simd_math.h
2039  _GLIBCXX_SIMD_MATH_FALLBACK(nextafter)
2040  _GLIBCXX_SIMD_MATH_FALLBACK(fdim)
2041  _GLIBCXX_SIMD_MATH_FALLBACK(fmax)
2042  _GLIBCXX_SIMD_MATH_FALLBACK(fmin)
2043  _GLIBCXX_SIMD_MATH_FALLBACK(fma)
2044 
2045  template <typename _Tp, size_t _Np>
2046  static constexpr _MaskMember<_Tp>
2047  _S_isgreater(_SimdWrapper<_Tp, _Np> __x,
2048  _SimdWrapper<_Tp, _Np> __y) noexcept
2049  {
2050  using _Ip = __int_for_sizeof_t<_Tp>;
2051  const auto __xn = __vector_bitcast<_Ip>(__x);
2052  const auto __yn = __vector_bitcast<_Ip>(__y);
2053  const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
2054  const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
2055  return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
2056  __xp > __yp);
2057  }
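  // Illustration (editorial note): IEEE-754 values of equal sign order
  // the same way as their bit patterns. Mapping negatives
  // n -> -(n & __finite_max_v) turns the sign-magnitude encoding into
  // ordinary two's complement, so one integer compare implements the
  // non-trapping isgreater; NaNs are masked out via _S_isunordered. E.g.
  // for float: bits(2.f) = 0x40000000, bits(-1.f) = 0xBF800000 ->
  // -0x3F800000, and 0x40000000 > -0x3F800000 matches 2.f > -1.f. The
  // next three functions apply the same mapping to >=, <, and <=.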
2058 
2059  template <typename _Tp, size_t _Np>
2060  static constexpr _MaskMember<_Tp>
2061  _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x,
2062  _SimdWrapper<_Tp, _Np> __y) noexcept
2063  {
2064  using _Ip = __int_for_sizeof_t<_Tp>;
2065  const auto __xn = __vector_bitcast<_Ip>(__x);
2066  const auto __yn = __vector_bitcast<_Ip>(__y);
2067  const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
2068  const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
2069  return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
2070  __xp >= __yp);
2071  }
2072 
2073  template <typename _Tp, size_t _Np>
2074  static constexpr _MaskMember<_Tp>
2075  _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) noexcept
2076  {
2077  using _Ip = __int_for_sizeof_t<_Tp>;
2078  const auto __xn = __vector_bitcast<_Ip>(__x);
2079  const auto __yn = __vector_bitcast<_Ip>(__y);
2080  const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
2081  const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
2082  return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
2083  __xp < __yp);
2084  }
2085 
2086  template <typename _Tp, size_t _Np>
2087  static constexpr _MaskMember<_Tp>
2088  _S_islessequal(_SimdWrapper<_Tp, _Np> __x,
2089  _SimdWrapper<_Tp, _Np> __y) noexcept
2090  {
2091  using _Ip = __int_for_sizeof_t<_Tp>;
2092  const auto __xn = __vector_bitcast<_Ip>(__x);
2093  const auto __yn = __vector_bitcast<_Ip>(__y);
2094  const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
2095  const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
2096  return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
2097  __xp <= __yp);
2098  }
2099 
2100  template <typename _Tp, size_t _Np>
2101  static constexpr _MaskMember<_Tp>
2102  _S_islessgreater(_SimdWrapper<_Tp, _Np> __x,
2103  _SimdWrapper<_Tp, _Np> __y) noexcept
2104  {
2105  return __andnot(_SuperImpl::_S_isunordered(__x, __y),
2106  _SuperImpl::_S_not_equal_to(__x, __y));
2107  }
2108 
2109 #undef _GLIBCXX_SIMD_MATH_FALLBACK
2110 #undef _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET
2111 #undef _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET
2112  // _S_abs {{{3
2113  template <typename _Tp, size_t _Np>
2114  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2115  _S_abs(_SimdWrapper<_Tp, _Np> __x) noexcept
2116  {
2117  // if (__builtin_is_constant_evaluated())
2118  // {
2119  // return __x._M_data < 0 ? -__x._M_data : __x._M_data;
2120  // }
2121  if constexpr (is_floating_point_v<_Tp>)
2122  // `v < 0 ? -v : v` cannot compile to the efficient implementation of
2123  // masking the sign bit off because it must consider v == -0
2124 
2125  // ~(-0.) & v would be easy, but breaks with -fno-signed-zeros
2126  return __and(_S_absmask<__vector_type_t<_Tp, _Np>>, __x._M_data);
2127  else
2128  return __x._M_data < 0 ? -__x._M_data : __x._M_data;
2129  }
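  // Illustration (editorial note): _S_signmask is derived as
  // (_V() + 1) ^ (_V() - 1), i.e. exactly the sign bit, without ever
  // spelling -0.0 (which -fno-signed-zeros lets the compiler drop), and
  // _S_absmask is its complement. The single AND then clears just the
  // sign: abs(-3.5f) has bits 0xC0600000 & 0x7FFFFFFF == 0x40600000 ==
  // 3.5f, and abs(-0.f) correctly yields +0.f.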
2130 
2131  // }}}3
2132  // _S_plus_minus {{{
2133  // Returns __x + __y - __y without -fassociative-math optimizing to __x.
2134  // - _TV must be __vector_type_t<floating-point type, N>.
2135  // - _UV must be _TV or floating-point type.
2136  template <typename _TV, typename _UV>
2137  _GLIBCXX_SIMD_INTRINSIC static constexpr _TV
2138  _S_plus_minus(_TV __x, _UV __y) noexcept
2139  {
2140 #if defined __i386__ && !defined __SSE_MATH__
2141  if constexpr (sizeof(__x) == 8)
2142  { // operations on __x would use the FPU
2143  static_assert(is_same_v<_TV, __vector_type_t<float, 2>>);
2144  const auto __x4 = __vector_bitcast<float, 4>(__x);
2145  if constexpr (is_same_v<_TV, _UV>)
2146  return __vector_bitcast<float, 2>(
2147  _S_plus_minus(__x4, __vector_bitcast<float, 4>(__y)));
2148  else
2149  return __vector_bitcast<float, 2>(_S_plus_minus(__x4, __y));
2150  }
2151 #endif
2152 #if !defined __clang__ && __GCC_IEC_559 == 0
2153  if (__builtin_is_constant_evaluated()
2154  || (__builtin_constant_p(__x) && __builtin_constant_p(__y)))
2155  return (__x + __y) - __y;
2156  else
2157  return [&] {
2158  __x += __y;
2159  if constexpr(__have_sse)
2160  {
2161  if constexpr (sizeof(__x) >= 16)
2162  asm("" : "+x"(__x));
2163  else if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>)
2164  asm("" : "+x"(__x[0]), "+x"(__x[1]));
2165  else
2166  __assert_unreachable<_TV>();
2167  }
2168  else if constexpr(__have_neon)
2169  asm("" : "+w"(__x));
2170  else if constexpr (__have_power_vmx)
2171  {
2172  if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>)
2173  asm("" : "+fgr"(__x[0]), "+fgr"(__x[1]));
2174  else
2175  asm("" : "+v"(__x));
2176  }
2177  else
2178  asm("" : "+g"(__x));
2179  return __x - __y;
2180  }();
2181 #else
2182  return (__x + __y) - __y;
2183 #endif
2184  }
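  // Illustration (editorial note): __x + __y - __y is the carrier of the
  // magic-number rounding trick used by _S_nearbyint/_S_trunc below. With
  // __y = 2^52 (double), 1.5 + 2^52 rounds (ties-to-even) to 2^52 + 2, and
  // subtracting 2^52 yields 2.0 -- but only if both operations really
  // happen. Under -fassociative-math, (x + y) - y folds to plain x, so the
  // empty asm statements above make the intermediate sum opaque.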
2185 
2186  // }}}
2187  // _S_nearbyint {{{3
2188  template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2189  _GLIBCXX_SIMD_INTRINSIC static _Tp
2190  _S_nearbyint(_Tp __x_) noexcept
2191  {
2192  using value_type = typename _TVT::value_type;
2193  using _V = typename _TVT::type;
2194  const _V __x = __x_;
2195  const _V __absx = __and(__x, _S_absmask<_V>);
2196  static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<value_type>);
2197  _GLIBCXX_SIMD_USE_CONSTEXPR _V __shifter_abs
2198  = _V() + (1ull << (__digits_v<value_type> - 1));
2199  const _V __shifter = __or(__and(_S_signmask<_V>, __x), __shifter_abs);
2200  const _V __shifted = _S_plus_minus(__x, __shifter);
2201  return __absx < __shifter_abs ? __shifted : __x;
2202  }
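  // Worked example (editorial note): for float, __digits_v<float> == 24,
  // so __shifter_abs == 2^23. ORing in x's sign bit gives ±2^23 with x's
  // sign, hence the add/sub cancels all fraction bits in the current
  // rounding mode: nearbyint(-2.5f) = (-2.5f - 2^23) + 2^23 == -2.f
  // (ties-to-even). Inputs with |x| >= 2^23 are already integral and pass
  // through unchanged via the final select.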
2203 
2204  // _S_rint {{{3
2205  template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2206  _GLIBCXX_SIMD_INTRINSIC static _Tp
2207  _S_rint(_Tp __x) noexcept
2208  { return _SuperImpl::_S_nearbyint(__x); }
2209 
2210  // _S_trunc {{{3
2211  template <typename _Tp, size_t _Np>
2212  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2213  _S_trunc(_SimdWrapper<_Tp, _Np> __x)
2214  {
2215  using _V = __vector_type_t<_Tp, _Np>;
2216  const _V __absx = __and(__x._M_data, _S_absmask<_V>);
2217  static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<_Tp>);
2218  constexpr _Tp __shifter = 1ull << (__digits_v<_Tp> - 1);
2219  _V __truncated = _S_plus_minus(__absx, __shifter);
2220  __truncated -= __truncated > __absx ? _V() + 1 : _V();
2221  return __absx < __shifter ? __or(__xor(__absx, __x._M_data), __truncated)
2222  : __x._M_data;
2223  }
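  // Illustration (editorial note): the shifter trick rounds |x| to
  // nearest, which may round up; the compare-and-subtract repairs that to
  // truncation. E.g. |x| = 2.7f rounds to 3.f, then 3.f > 2.7f subtracts
  // 1 -> 2.f. __xor(__absx, __x) is x's sign bit (0 for positive x),
  // which __or plants back onto the truncated magnitude.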
2224 
2225  // _S_round {{{3
2226  template <typename _Tp, size_t _Np>
2227  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2228  _S_round(_SimdWrapper<_Tp, _Np> __x)
2229  {
2230  const auto __abs_x = _SuperImpl::_S_abs(__x);
2231  const auto __t_abs = _SuperImpl::_S_trunc(__abs_x)._M_data;
2232  const auto __r_abs // round(abs(x)) =
2233  = __t_abs + (__abs_x._M_data - __t_abs >= _Tp(.5) ? _Tp(1) : 0);
2234  return __or(__xor(__abs_x._M_data, __x._M_data), __r_abs);
2235  }
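  // Illustration (editorial note): round() rounds halves away from zero,
  // so after truncating |x| a 1 is added exactly when the discarded
  // fraction is >= .5, and the sign bit is OR'd back: round(-2.5) -> |x| =
  // 2.5, trunc 2, fraction .5 -> 3, with sign restored -> -3 (contrast
  // nearbyint's ties-to-even result of -2).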
2236 
2237  // _S_floor {{{3
2238  template <typename _Tp, size_t _Np>
2239  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2240  _S_floor(_SimdWrapper<_Tp, _Np> __x)
2241  {
2242  const auto __y = _SuperImpl::_S_trunc(__x)._M_data;
2243  const auto __negative_input
2244  = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0));
2245  const auto __mask
2246  = __andnot(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input);
2247  return __or(__andnot(__mask, __y),
2248  __and(__mask, __y - __vector_broadcast<_Np, _Tp>(1)));
2249  }
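  // Illustration (editorial note): trunc differs from floor only for
  // negative non-integral inputs, where the result must be one smaller.
  // __mask selects exactly those lanes: x = -2.3 -> trunc -2, mask set,
  // result -3; x = 2.3 or x = -2.0 -> mask clear, the trunc result
  // stands. _S_ceil below is the mirror image for positive non-integral
  // inputs.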
2250 
2251  // _S_ceil {{{3
2252  template <typename _Tp, size_t _Np>
2253  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2254  _S_ceil(_SimdWrapper<_Tp, _Np> __x)
2255  {
2256  const auto __y = _SuperImpl::_S_trunc(__x)._M_data;
2257  const auto __negative_input
2258  = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0));
2259  const auto __inv_mask
2260  = __or(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input);
2261  return __or(__and(__inv_mask, __y),
2262  __andnot(__inv_mask, __y + __vector_broadcast<_Np, _Tp>(1)));
2263  }
2264 
2265  // _S_isnan {{{3
2266  template <typename _Tp, size_t _Np>
2267  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2268  _S_isnan([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
2269  {
2270 #if __FINITE_MATH_ONLY__
2271  return {}; // false
2272 #elif !defined __SUPPORT_SNAN__
2273  return ~(__x._M_data == __x._M_data);
2274 #elif defined __STDC_IEC_559__
2275  using _Ip = __int_for_sizeof_t<_Tp>;
2276  const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
2277  const auto __infn
2278  = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__infinity_v<_Tp>));
2279  return __infn < __absn;
2280 #else
2281 #error "Not implemented: how to support SNaN but non-IEC559 floating-point?"
2282 #endif
2283  }
2284 
2285  // _S_isfinite {{{3
2286  template <typename _Tp, size_t _Np>
2287  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2288  _S_isfinite([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
2289  {
2290 #if __FINITE_MATH_ONLY__
2291  using _UV = typename _MaskMember<_Tp>::_BuiltinType;
2292  _GLIBCXX_SIMD_USE_CONSTEXPR _UV __alltrue = ~_UV();
2293  return __alltrue;
2294 #else
2295  // if all exponent bits are set, __x is either inf or NaN
2296  using _Ip = __int_for_sizeof_t<_Tp>;
2297  const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
2298  const auto __maxn
2299  = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>));
2300  return __absn <= __maxn;
2301 #endif
2302  }
2303 
2304  // _S_isunordered {{{3
2305  template <typename _Tp, size_t _Np>
2306  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2307  _S_isunordered(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2308  { return __or(_S_isnan(__x), _S_isnan(__y)); }
2309 
2310  // _S_signbit {{{3
2311  template <typename _Tp, size_t _Np>
2312  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2313  _S_signbit(_SimdWrapper<_Tp, _Np> __x)
2314  {
2315  using _Ip = __int_for_sizeof_t<_Tp>;
2316  return __vector_bitcast<_Ip>(__x) < 0;
2317  // Arithmetic right shift (SRA) would also work (instead of compare), but
2318  // 64-bit SRA isn't available on x86 before AVX512. And in general,
2319  // compares are more likely to be efficient than SRA.
2320  }
2321 
2322  // _S_isinf {{{3
2323  template <typename _Tp, size_t _Np>
2324  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2325  _S_isinf([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
2326  {
2327 #if __FINITE_MATH_ONLY__
2328  return {}; // false
2329 #else
2330  return _SuperImpl::template _S_equal_to<_Tp, _Np>(_SuperImpl::_S_abs(__x),
2331  __vector_broadcast<_Np>(
2332  __infinity_v<_Tp>));
2333  // alternative:
2334  // compare to inf using the corresponding integer type
2335  /*
2336  return
2337  __vector_bitcast<_Tp>(__vector_bitcast<__int_for_sizeof_t<_Tp>>(
2338  _S_abs(__x)._M_data)
2339  ==
2340  __vector_bitcast<__int_for_sizeof_t<_Tp>>(__vector_broadcast<_Np>(
2341  __infinity_v<_Tp>)));
2342  */
2343 #endif
2344  }
2345 
2346  // _S_isnormal {{{3
2347  template <typename _Tp, size_t _Np>
2348  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2349  _S_isnormal(_SimdWrapper<_Tp, _Np> __x)
2350  {
2351  using _Ip = __int_for_sizeof_t<_Tp>;
2352  const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
2353  const auto __minn
2354  = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__norm_min_v<_Tp>));
2355 #if __FINITE_MATH_ONLY__
2356  return __absn >= __minn;
2357 #else
2358  const auto __maxn
2359  = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>));
2360  return __minn <= __absn && __absn <= __maxn;
2361 #endif
2362  }
2363 
2364  // _S_fpclassify {{{3
2365  template <typename _Tp, size_t _Np>
2366  _GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t<int, _Np>
2367  _S_fpclassify(_SimdWrapper<_Tp, _Np> __x)
2368  {
2369  using _I = __int_for_sizeof_t<_Tp>;
2370  const auto __xn
2371  = __vector_bitcast<_I>(__to_intrin(_SuperImpl::_S_abs(__x)));
2372  constexpr size_t _NI = sizeof(__xn) / sizeof(_I);
2373  _GLIBCXX_SIMD_USE_CONSTEXPR auto __minn
2374  = __vector_bitcast<_I>(__vector_broadcast<_NI>(__norm_min_v<_Tp>));
2375 
2376  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_normal
2377  = __vector_broadcast<_NI, _I>(FP_NORMAL);
2378 #if !__FINITE_MATH_ONLY__
2379  _GLIBCXX_SIMD_USE_CONSTEXPR auto __infn
2380  = __vector_bitcast<_I>(__vector_broadcast<_NI>(__infinity_v<_Tp>));
2381  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_nan
2382  = __vector_broadcast<_NI, _I>(FP_NAN);
2383  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_infinite
2384  = __vector_broadcast<_NI, _I>(FP_INFINITE);
2385 #endif
2386 #ifndef __FAST_MATH__
2387  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_subnormal
2388  = __vector_broadcast<_NI, _I>(FP_SUBNORMAL);
2389 #endif
2390  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_zero
2391  = __vector_broadcast<_NI, _I>(FP_ZERO);
2392 
2393  __vector_type_t<_I, _NI>
2394  __tmp = __xn < __minn
2395  #ifdef __FAST_MATH__
2396  ? __fp_zero
2397  #else
2398  ? (__xn == 0 ? __fp_zero : __fp_subnormal)
2399  #endif
2400  #if __FINITE_MATH_ONLY__
2401  : __fp_normal;
2402  #else
2403  : (__xn < __infn ? __fp_normal
2404  : (__xn == __infn ? __fp_infinite : __fp_nan));
2405  #endif
2406 
2407  if constexpr (sizeof(_I) == sizeof(int))
2408  {
2409  using _FixedInt = __fixed_size_storage_t<int, _Np>;
2410  const auto __as_int = __vector_bitcast<int, _Np>(__tmp);
2411  if constexpr (_FixedInt::_S_tuple_size == 1)
2412  return {__as_int};
2413  else if constexpr (_FixedInt::_S_tuple_size == 2
2414  && is_same_v<
2415  typename _FixedInt::_SecondType::_FirstAbi,
2416  simd_abi::scalar>)
2417  return {__extract<0, 2>(__as_int), __as_int[_Np - 1]};
2418  else if constexpr (_FixedInt::_S_tuple_size == 2)
2419  return {__extract<0, 2>(__as_int),
2420  __auto_bitcast(__extract<1, 2>(__as_int))};
2421  else
2422  __assert_unreachable<_Tp>();
2423  }
2424  else if constexpr (_Np == 2 && sizeof(_I) == 8
2425  && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 2)
2426  {
2427  const auto __aslong = __vector_bitcast<_LLong>(__tmp);
2428  return {int(__aslong[0]), {int(__aslong[1])}};
2429  }
2430 #if _GLIBCXX_SIMD_X86INTRIN
2431  else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 32
2432  && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2433  return {_mm_packs_epi32(__to_intrin(__lo128(__tmp)),
2434  __to_intrin(__hi128(__tmp)))};
2435  else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 64
2436  && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2437  return {_mm512_cvtepi64_epi32(__to_intrin(__tmp))};
2438 #endif // _GLIBCXX_SIMD_X86INTRIN
2439  else if constexpr (__fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2440  return {__call_with_subscripts<_Np>(__vector_bitcast<_LLong>(__tmp),
2441  [](auto... __l) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2442  return __make_wrapper<int>(__l...);
2443  })};
2444  else
2445  __assert_unreachable<_Tp>();
2446  }
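  // Illustration (editorial note): classification is by magnitude on the
  // integer view of |x|, e.g. for double:
  //   |x| < DBL_MIN : x == 0 ? FP_ZERO : FP_SUBNORMAL (always FP_ZERO
  //                   under __FAST_MATH__, which assumes no subnormals)
  //   |x| < inf     : FP_NORMAL
  //   otherwise     : |x| == inf ? FP_INFINITE : FP_NAN
  // The branches after the classification merely repack the wide integer
  // result into __fixed_size_storage_t<int, _Np>.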
2447 
2448  // _S_increment & _S_decrement{{{2
2449  template <typename _Tp, size_t _Np>
2450  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2451  _S_increment(_SimdWrapper<_Tp, _Np>& __x)
2452  { __x = __x._M_data + 1; }
2453 
2454  template <typename _Tp, size_t _Np>
2455  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2456  _S_decrement(_SimdWrapper<_Tp, _Np>& __x)
2457  { __x = __x._M_data - 1; }
2458 
2459  // smart_reference access {{{2
2460  template <typename _Tp, size_t _Np, typename _Up>
2461  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2462  _S_set(_SimdWrapper<_Tp, _Np>& __v, int __i, _Up&& __x) noexcept
2463  { __v._M_set(__i, static_cast<_Up&&>(__x)); }
2464 
2465  // _S_masked_assign{{{2
2466  template <typename _Tp, typename _K, size_t _Np>
2467  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2468  _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2469  __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs)
2470  {
2471  if (__k._M_is_constprop_none_of())
2472  return;
2473  else if (__k._M_is_constprop_all_of())
2474  __lhs = __rhs;
2475  else
2476  __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs);
2477  }
2478 
2479  template <typename _Tp, typename _K, size_t _Np>
2480  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2481  _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2482  __type_identity_t<_Tp> __rhs)
2483  {
2484  if (__k._M_is_constprop_none_of())
2485  return;
2486  else if (__k._M_is_constprop_all_of())
2487  __lhs = __vector_broadcast<_Np>(__rhs);
2488  else if (__builtin_constant_p(__rhs) && __rhs == 0)
2489  {
2490  if constexpr (!is_same_v<bool, _K>)
2491  // the __andnot optimization only makes sense if __k._M_data is a
2492  // vector register
2493  __lhs._M_data
2494  = __andnot(__vector_bitcast<_Tp>(__k), __lhs._M_data);
2495  else
2496  // for AVX512/__mmask, a _mm512_maskz_mov is best
2497  __lhs
2498  = _CommonImpl::_S_blend(__k, __lhs, _SimdWrapper<_Tp, _Np>());
2499  }
2500  else
2501  __lhs = _CommonImpl::_S_blend(__k, __lhs,
2502  _SimdWrapper<_Tp, _Np>(
2503  __vector_broadcast<_Np>(__rhs)));
2504  }
2505 
2506  // _S_masked_cassign {{{2
2507  template <typename _Op, typename _Tp, typename _K, size_t _Np>
2508  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2509  _S_masked_cassign(const _SimdWrapper<_K, _Np> __k,
2510  _SimdWrapper<_Tp, _Np>& __lhs,
2511  const __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs,
2512  _Op __op)
2513  {
2514  if (__k._M_is_constprop_none_of())
2515  return;
2516  else if (__k._M_is_constprop_all_of())
2517  __lhs = __op(_SuperImpl{}, __lhs, __rhs);
2518  else
2519  __lhs = _CommonImpl::_S_blend(__k, __lhs,
2520  __op(_SuperImpl{}, __lhs, __rhs));
2521  }
2522 
2523  template <typename _Op, typename _Tp, typename _K, size_t _Np>
2524  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2525  _S_masked_cassign(const _SimdWrapper<_K, _Np> __k,
2526  _SimdWrapper<_Tp, _Np>& __lhs,
2527  const __type_identity_t<_Tp> __rhs, _Op __op)
2528  { _S_masked_cassign(__k, __lhs, __vector_broadcast<_Np>(__rhs), __op); }
2529 
2530  // _S_masked_unary {{{2
2531  template <template <typename> class _Op, typename _Tp, typename _K,
2532  size_t _Np>
2533  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2534  _S_masked_unary(const _SimdWrapper<_K, _Np> __k,
2535  const _SimdWrapper<_Tp, _Np> __v)
2536  {
2537  if (__k._M_is_constprop_none_of())
2538  return __v;
2539  auto __vv = _M_make_simd(__v);
2540  _Op<decltype(__vv)> __op;
2541  if (__k._M_is_constprop_all_of())
2542  return __data(__op(__vv));
2543  else if constexpr (is_same_v<_Op<void>, __increment<void>>)
2544  {
2545  static_assert(not std::is_same_v<_K, bool>);
2546  if constexpr (is_integral_v<_Tp>)
2547  // Take a shortcut knowing that __k is an integer vector with values -1 or 0.
2548  return __v._M_data - __vector_bitcast<_Tp>(__k._M_data);
2549  else if constexpr (not __have_avx2)
2550  return __v._M_data
2551  + __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast(
2552  _K, _Tp(1)));
2553  // starting with AVX2 it is more efficient to blend after add
2554  }
2555  else if constexpr (is_same_v<_Op<void>, __decrement<void>>)
2556  {
2557  static_assert(not std::is_same_v<_K, bool>);
2558  if constexpr (is_integral_v<_Tp>)
2559  // Take a shortcut knowing that __k is an integer vector with values -1 or 0.
2560  return __v._M_data + __vector_bitcast<_Tp>(__k._M_data);
2561  else if constexpr (not __have_avx2)
2562  return __v._M_data
2563  - __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast(
2564  _K, _Tp(1)));
2565  // starting with AVX2 it is more efficient to blend after sub
2566  }
2567  return _CommonImpl::_S_blend(__k, __v, __data(__op(__vv)));
2568  }
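  // Illustration (editorial note): an integer mask lane is -1 when true,
  // so for the increment case __v - __k adds 1 exactly in the selected
  // lanes, e.g. __v = {5, 5}, __k = {-1, 0} -> {6, 5}; decrement adds __k
  // instead. The pre-AVX2 floating-point variant masks the bit pattern of
  // _Tp(1) with __k and adds (or subtracts) that, avoiding the blend that
  // only becomes cheap with AVX2.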
2569 
2570  //}}}2
2571  };
2572 
2573 // _MaskImplBuiltinMixin {{{1
2574 struct _MaskImplBuiltinMixin
2575 {
2576  template <typename _Tp>
2577  using _TypeTag = _Tp*;
2578 
2579  // _S_to_maskvector {{{
2580  template <typename _Up, size_t _ToN = 1>
2581  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2582  _S_to_maskvector(bool __x)
2583  {
2584  static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2585  return __x ? __vector_type_t<_Up, _ToN>{~_Up()}
2586  : __vector_type_t<_Up, _ToN>{};
2587  }
2588 
2589  template <typename _Up, size_t _UpN = 0, size_t _Np, bool _Sanitized,
2590  size_t _ToN = _UpN == 0 ? _Np : _UpN>
2591  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2592  _S_to_maskvector(_BitMask<_Np, _Sanitized> __x)
2593  {
2594  static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2595  return __generate_vector<__vector_type_t<_Up, _ToN>>(
2596  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2597  if constexpr (__i < _Np)
2598  return __x[__i] ? ~_Up() : _Up();
2599  else
2600  return _Up();
2601  });
2602  }
2603 
2604  template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np,
2605  size_t _ToN = _UpN == 0 ? _Np : _UpN>
2606  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2607  _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x)
2608  {
2609  static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2610  using _TW = _SimdWrapper<_Tp, _Np>;
2611  using _UW = _SimdWrapper<_Up, _ToN>;
2612  if constexpr (sizeof(_Up) == sizeof(_Tp) && sizeof(_TW) == sizeof(_UW))
2613  return __wrapper_bitcast<_Up, _ToN>(__x);
2614  else if constexpr (is_same_v<_Tp, bool>) // bits -> vector
2615  return _S_to_maskvector<_Up, _ToN>(_BitMask<_Np>(__x._M_data));
2616  else
2617  { // vector -> vector
2618  /* [[maybe_unused]] const auto __y = __vector_bitcast<_Up>(__x._M_data);
2619  if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
2620  && sizeof(__y) == 16)
2621  return __vector_permute<1, 3, -1, -1>(__y);
2622  else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
2623  && sizeof(__y) == 16)
2624  return __vector_permute<1, 3, 5, 7, -1, -1, -1, -1>(__y);
2625  else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
2626  && sizeof(__y) == 16)
2627  return __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(__y);
2628  else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
2629  && sizeof(__y) == 16)
2630  return __vector_permute<1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1,
2631  -1, -1, -1, -1>(__y);
2632  else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
2633  && sizeof(__y) == 16)
2634  return __vector_permute<3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1,
2635  -1, -1, -1, -1, -1>(__y);
2636  else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
2637  && sizeof(__y) == 16)
2638  return __vector_permute<7, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2639  -1, -1, -1, -1, -1>(__y);
2640  else */
2641  {
2642  return __generate_vector<__vector_type_t<_Up, _ToN>>(
2643  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2644  if constexpr (__i < _Np)
2645  return _Up(__x[__i.value]);
2646  else
2647  return _Up();
2648  });
2649  }
2650  }
2651  }
2652 
2653  // }}}
2654  // _S_to_bits {{{
2655  template <typename _Tp, size_t _Np>
2656  _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
2657  _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
2658  {
2659  static_assert(!is_same_v<_Tp, bool>);
2660  static_assert(_Np <= __CHAR_BIT__ * sizeof(_ULLong));
2661  using _Up = make_unsigned_t<__int_for_sizeof_t<_Tp>>;
2662  const auto __bools
2663  = __vector_bitcast<_Up>(__x) >> (sizeof(_Up) * __CHAR_BIT__ - 1);
2664  _ULLong __r = 0;
2665  __execute_n_times<_Np>(
2666  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2667  __r |= _ULLong(__bools[__i.value]) << __i;
2668  });
2669  return __r;
2670  }
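  // Illustration (editorial note): every true lane of a vector mask is
  // all-ones, so its most significant bit is set. Shifting each unsigned
  // lane right by (lane bits - 1) leaves 0 or 1, which the loop packs
  // LSB-first: <int, 4> mask {-1, 0, -1, -1} -> {1, 0, 1, 1} -> 0b1101.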
2671 
2672  // }}}
2673 };
2674 
2675 // _MaskImplBuiltin {{{1
2676 template <typename _Abi>
2677  struct _MaskImplBuiltin : _MaskImplBuiltinMixin
2678  {
2679  using _MaskImplBuiltinMixin::_S_to_bits;
2680  using _MaskImplBuiltinMixin::_S_to_maskvector;
2681 
2682  // member types {{{
2683  template <typename _Tp>
2684  using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
2685 
2686  template <typename _Tp>
2687  using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
2688 
2689  using _SuperImpl = typename _Abi::_MaskImpl;
2690  using _CommonImpl = typename _Abi::_CommonImpl;
2691 
2692  template <typename _Tp>
2693  static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;
2694 
2695  // }}}
2696  // _S_broadcast {{{
2697  template <typename _Tp>
2698  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2699  _S_broadcast(bool __x)
2700  { return __x ? _Abi::template _S_implicit_mask<_Tp>() : _MaskMember<_Tp>(); }
2701 
2702  // }}}
2703  // _S_load {{{
2704  template <typename _Tp>
2705  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2706  _S_load(const bool* __mem)
2707  {
2708  using _I = __int_for_sizeof_t<_Tp>;
2709  if (not __builtin_is_constant_evaluated())
2710  if constexpr (sizeof(_Tp) == sizeof(bool))
2711  {
2712  const auto __bools
2713  = _CommonImpl::template _S_load<_I, _S_size<_Tp>>(__mem);
2714  // bool is {0, 1}, everything else is UB
2715  return __bools > 0;
2716  }
2717  return __generate_vector<_I, _S_size<_Tp>>(
2718  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2719  return __mem[__i] ? ~_I() : _I();
2720  });
2721  }
2722 
2723  // }}}
2724  // _S_convert {{{
2725  template <typename _Tp, size_t _Np, bool _Sanitized>
2726  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2727  _S_convert(_BitMask<_Np, _Sanitized> __x)
2728  {
2729  if constexpr (__is_builtin_bitmask_abi<_Abi>())
2730  return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_to_bits());
2731  else
2732  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2733  _S_size<_Tp>>(
2734  __x._M_sanitized());
2735  }
2736 
2737  template <typename _Tp, size_t _Np>
2738  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2739  _S_convert(_SimdWrapper<bool, _Np> __x)
2740  {
2741  if constexpr (__is_builtin_bitmask_abi<_Abi>())
2742  return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_data);
2743  else
2744  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2745  _S_size<_Tp>>(
2746  _BitMask<_Np>(__x._M_data)._M_sanitized());
2747  }
2748 
2749  template <typename _Tp, typename _Up, size_t _Np>
2750  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2751  _S_convert(_SimdWrapper<_Up, _Np> __x)
2752  {
2753  if constexpr (__is_builtin_bitmask_abi<_Abi>())
2754  return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(
2755  _SuperImpl::_S_to_bits(__x));
2756  else
2757  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2758  _S_size<_Tp>>(__x);
2759  }
2760 
2761  template <typename _Tp, typename _Up, typename _UAbi>
2762  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2763  _S_convert(simd_mask<_Up, _UAbi> __x)
2764  {
2765  if constexpr (__is_builtin_bitmask_abi<_Abi>())
2766  {
2767  using _R = _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>;
2768  if constexpr (__is_builtin_bitmask_abi<_UAbi>()) // bits -> bits
2769  return _R(__data(__x));
2770  else if constexpr (__is_scalar_abi<_UAbi>()) // bool -> bits
2771  return _R(__data(__x));
2772  else if constexpr (__is_fixed_size_abi_v<_UAbi>) // bitset -> bits
2773  return _R(__data(__x)._M_to_bits());
2774  else // vector -> bits
2775  return _R(_UAbi::_MaskImpl::_S_to_bits(__data(__x))._M_to_bits());
2776  }
2777  else
2778  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2779  _S_size<_Tp>>(
2780  __data(__x));
2781  }
2782 
2783  // }}}
2784  // _S_masked_load {{{2
2785  template <typename _Tp, size_t _Np>
2786  static inline _SimdWrapper<_Tp, _Np>
2787  _S_masked_load(_SimdWrapper<_Tp, _Np> __merge,
2788  _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept
2789  {
2790  // AVX(2) has 32/64 bit maskload, but nothing at 8 bit granularity
2791  auto __tmp = __wrapper_bitcast<__int_for_sizeof_t<_Tp>>(__merge);
2792  _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__mask),
2793  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2794  __tmp._M_set(__i, -__mem[__i]);
2795  });
2796  __merge = __wrapper_bitcast<_Tp>(__tmp);
2797  return __merge;
2798  }
2799 
2800  // _S_store {{{2
2801  template <typename _Tp, size_t _Np>
2802  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2803  _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept
2804  {
2805  __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2806  __mem[__i] = __v[__i];
2807  });
2808  }
2809 
2810  // _S_masked_store {{{2
2811  template <typename _Tp, size_t _Np>
2812  static inline void
2813  _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
2814  const _SimdWrapper<_Tp, _Np> __k) noexcept
2815  {
2816  _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__k),
2817  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2818  __mem[__i] = __v[__i];
2819  });
2820  }
2821 
2822  // _S_from_bitmask{{{2
2823  template <size_t _Np, typename _Tp>
2824  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2825  _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>)
2826  { return _SuperImpl::template _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits); }
2827 
2828  // logical and bitwise operators {{{2
2829  template <typename _Tp, size_t _Np>
2830  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2831  _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2832  { return __and(__x._M_data, __y._M_data); }
2833 
2834  template <typename _Tp, size_t _Np>
2835  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2836  _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2837  { return __or(__x._M_data, __y._M_data); }
2838 
2839  template <typename _Tp, size_t _Np>
2840  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2841  _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x)
2842  {
2843  if constexpr (_Abi::template _S_is_partial<_Tp>)
2844  return __andnot(__x, __wrapper_bitcast<_Tp>(
2845  _Abi::template _S_implicit_mask<_Tp>()));
2846  else
2847  return __not(__x._M_data);
2848  }
2849 
2850  template <typename _Tp, size_t _Np>
2851  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2852  _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2853  { return __and(__x._M_data, __y._M_data); }
2854 
2855  template <typename _Tp, size_t _Np>
2856  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2857  _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2858  { return __or(__x._M_data, __y._M_data); }
2859 
2860  template <typename _Tp, size_t _Np>
2861  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2862  _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2863  { return __xor(__x._M_data, __y._M_data); }
2864 
2865  // smart_reference access {{{2
2866  template <typename _Tp, size_t _Np>
2867  static constexpr void
2868  _S_set(_SimdWrapper<_Tp, _Np>& __k, int __i, bool __x) noexcept
2869  {
2870  if constexpr (is_same_v<_Tp, bool>)
2871  __k._M_set(__i, __x);
2872  else
2873  {
2874  static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
2875  if (__builtin_is_constant_evaluated())
2876  {
2877  __k = __generate_from_n_evaluations<_Np,
2878  __vector_type_t<_Tp, _Np>>(
2879  [&](auto __j) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2880  if (__i == static_cast<int>(__j))
2881  return _Tp(-__x);
2882  else
2883  return __k[+__j];
2884  });
2885  }
2886  else
2887  __k._M_data[__i] = -__x;
2888  }
2889  }
2890 
2891  // _S_masked_assign{{{2
2892  template <typename _Tp, size_t _Np>
2893  _GLIBCXX_SIMD_INTRINSIC static void
2894  _S_masked_assign(_SimdWrapper<_Tp, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2895  __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs)
2896  { __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs); }
2897 
2898  template <typename _Tp, size_t _Np>
2899  _GLIBCXX_SIMD_INTRINSIC static void
2900  _S_masked_assign(_SimdWrapper<_Tp, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, bool __rhs)
2901  {
2902  if (__builtin_constant_p(__rhs))
2903  {
2904  if (__rhs == false)
2905  __lhs = __andnot(__k, __lhs);
2906  else
2907  __lhs = __or(__k, __lhs);
2908  return;
2909  }
2910  __lhs = _CommonImpl::_S_blend(__k, __lhs,
2911  __data(simd_mask<_Tp, _Abi>(__rhs)));
2912  }
2913 
2914  //}}}2
2915  // _S_all_of {{{
2916  template <typename _Tp>
2917  _GLIBCXX_SIMD_INTRINSIC static bool
2918  _S_all_of(simd_mask<_Tp, _Abi> __k)
2919  {
2920  return __call_with_subscripts(
2921  __data(__k), make_index_sequence<_S_size<_Tp>>(),
2922  [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
2923  { return (... && !(__ent == 0)); });
2924  }
2925 
2926  // }}}
2927  // _S_any_of {{{
2928  template <typename _Tp>
2929  _GLIBCXX_SIMD_INTRINSIC static bool
2930  _S_any_of(simd_mask<_Tp, _Abi> __k)
2931  {
2932  return __call_with_subscripts(
2933  __data(__k), make_index_sequence<_S_size<_Tp>>(),
2934  [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
2935  { return (... || !(__ent == 0)); });
2936  }
2937 
2938  // }}}
2939  // _S_none_of {{{
2940  template <typename _Tp>
2941  _GLIBCXX_SIMD_INTRINSIC static bool
2942  _S_none_of(simd_mask<_Tp, _Abi> __k)
2943  {
2944  return __call_with_subscripts(
2945  __data(__k), make_index_sequence<_S_size<_Tp>>(),
2946  [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
2947  { return (... && (__ent == 0)); });
2948  }
2949 
2950  // }}}
2951  // _S_some_of {{{
2952  template <typename _Tp>
2953  _GLIBCXX_SIMD_INTRINSIC static bool
2954  _S_some_of(simd_mask<_Tp, _Abi> __k)
2955  {
2956  const int __n_true = _SuperImpl::_S_popcount(__k);
2957  return __n_true > 0 && __n_true < int(_S_size<_Tp>);
2958  }
2959 
2960  // }}}
2961  // _S_popcount {{{
2962  template <typename _Tp>
2963  _GLIBCXX_SIMD_INTRINSIC static int
2964  _S_popcount(simd_mask<_Tp, _Abi> __k)
2965  {
2966  using _I = __int_for_sizeof_t<_Tp>;
2967  if constexpr (is_default_constructible_v<simd<_I, _Abi>>)
2968  return -reduce(
2969  simd<_I, _Abi>(__private_init, __wrapper_bitcast<_I>(__data(__k))));
2970  else
2971  return -reduce(__bit_cast<rebind_simd_t<_I, simd<_Tp, _Abi>>>(
2972  simd<_Tp, _Abi>(__private_init, __data(__k))));
2973  }
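  // Illustration (editorial note): reinterpreted as integers, true lanes
  // are -1, so summing all lanes yields minus the number of true lanes:
  //   mask {true, false, true} -> {-1, 0, -1} -> reduce == -2 -> 2.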
2974 
2975  // }}}
2976  // _S_find_first_set {{{
2977  template <typename _Tp>
2978  _GLIBCXX_SIMD_INTRINSIC static int
2979  _S_find_first_set(simd_mask<_Tp, _Abi> __k)
2980  { return std::__countr_zero(_SuperImpl::_S_to_bits(__data(__k))._M_to_bits()); }
2981 
2982  // }}}
2983  // _S_find_last_set {{{
2984  template <typename _Tp>
2985  _GLIBCXX_SIMD_INTRINSIC static int
2986  _S_find_last_set(simd_mask<_Tp, _Abi> __k)
2987  { return std::__bit_width(_SuperImpl::_S_to_bits(__data(__k))._M_to_bits()) - 1; }
2988 
2989  // }}}
2990  };
2991 
2992 //}}}1
2993 _GLIBCXX_SIMD_END_NAMESPACE
2994 #endif // __cplusplus >= 201703L
2995 #endif // _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
2996 
2997 // vim: foldmethod=marker foldmarker={{{,}}} sw=2 noet ts=8 sts=2 tw=80