arm_neon.h
1 /***
2 * arm_neon.h - declarations/definitions for ARM NEON specific intrinsics
3 *
4 * Copyright (c) Microsoft Corporation. All rights reserved.
5 *
6 *Purpose:
7 * This include file contains the declarations for ARM NEON intrinsic functions
8 *
9 ****/
10 
11 #pragma once
12 
13 #include <stdint.h>
14 #include <sal.h>
15 
16 #if !defined (_M_ARM)
17 #error This header is specific to ARM targets
18 #endif /* !defined (_M_ARM) */
19 
20 
21 #if defined (__cplusplus)
22 extern "C" {
23 #endif /* defined (__cplusplus) */
24 
25 
27 //
28 #if !defined (_ADVSIMD_ALIGN)
29 #if defined (__midl)
30 #define _ADVSIMD_ALIGN(x)
31 #else /* defined (__midl) */
32 #define _ADVSIMD_ALIGN(x) __declspec(align(x))
33 #endif /* defined (__midl) */
34 #endif /* !defined (_ADVSIMD_ALIGN) */
35 
36 #ifndef DUMMYNEONSTRUCT
37 #define DUMMYNEONSTRUCT s
38 #endif /* DUMMYNEONSTRUCT */
39 
41 //
42 // ARM Advanced SIMD 64bit type
43 //
44 typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __n64
45 {
46  unsigned __int64 n64_u64[1];
47  unsigned __int32 n64_u32[2];
48  unsigned __int16 n64_u16[4];
49  unsigned __int8 n64_u8[8];
50  __int64 n64_i64[1];
51  __int32 n64_i32[2];
52  __int16 n64_i16[4];
53  __int8 n64_i8[8];
54  float n64_f32[2];
55 } __n64;
56 
57 
59 //
60 // ARM Advanced SIMD 128bit type
61 //
62 typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __n128
63 {
64  unsigned __int64 n128_u64[2];
65  unsigned __int32 n128_u32[4];
66  unsigned __int16 n128_u16[8];
67  unsigned __int8 n128_u8[16];
68  __int64 n128_i64[2];
69  __int32 n128_i32[4];
70  __int16 n128_i16[8];
71  __int8 n128_i8[16];
72  float n128_f32[4];
73 
74  struct
75  {
76  __n64 low64;
77  __n64 high64;
78  } DUMMYNEONSTRUCT;
79 
80 } __n128;
81 
82 typedef struct __n64x2
83 {
84  __n64 val[2];
85 } __n64x2;
86 
87 typedef struct __n64x3
88 {
89  __n64 val[3];
90 } __n64x3;
91 
92 typedef struct __n64x4
93 {
94  __n64 val[4];
95 } __n64x4;
96 
97 typedef struct __n128x2
98 {
99  __n128 val[2];
100 } __n128x2;
101 
102 typedef struct __n128x3
103 {
104  __n128 val[3];
105 } __n128x3;
106 
107 typedef struct __n128x4
108 {
109  __n128 val[4];
110 } __n128x4;
111 
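// Usage sketch (illustrative, not part of the original header): because __n64
// and __n128 are unions, one register-sized value can be viewed under any of
// the lane layouts above without any conversion code being generated.
static unsigned __int8 demo_n64_low_byte(float f0, float f1)
{
    __n64 d;
    d.n64_f32[0] = f0;      // write the two lanes as 32-bit floats
    d.n64_f32[1] = f1;
    return d.n64_u8[0];     // read the same storage back as unsigned bytes
}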
113 //
114 typedef unsigned __int8 poly8_t;
115 typedef unsigned __int16 poly16_t;
116 
117 typedef float float32_t;
118 
119 #if !defined(_ARM_USE_NEW_NEON_INTRINSICS)
120 // Once a version is determined, this should default based on _MSC_FULL_VER
121 #define _ARM_USE_DEPRECATED_NEON_INTRINSICS
122 #endif
123 
124 #if defined(_ARM_USE_DEPRECATED_NEON_INTRINSICS)
125 
127 //
128 __inline _Post_equal_to_(p) __n64 *__int8ToN64(_Pre_notnull_ _Const_ int8_t *p) { return (__n64 *)p; }
129 __inline _Post_equal_to_(p) __n64 *__int16ToN64(_Pre_notnull_ _Const_ int16_t *p) { return (__n64 *)p; }
130 __inline _Post_equal_to_(p) __n64 *__int32ToN64(_Pre_notnull_ _Const_ int32_t *p) { return (__n64 *)p; }
131 __inline _Post_equal_to_(p) __n64 *__int64ToN64(_Pre_notnull_ _Const_ int64_t *p) { return (__n64 *)p; }
132 __inline _Post_equal_to_(p) __n64 *__uint8ToN64(_Pre_notnull_ _Const_ uint8_t *p) { return (__n64 *)p; }
133 __inline _Post_equal_to_(p) __n64 *__uint16ToN64(_Pre_notnull_ _Const_ uint16_t *p) { return (__n64 *)p; }
134 __inline _Post_equal_to_(p) __n64 *__uint32ToN64(_Pre_notnull_ _Const_ uint32_t *p) { return (__n64 *)p; }
135 __inline _Post_equal_to_(p) __n64 *__uint64ToN64(_Pre_notnull_ _Const_ uint64_t *p) { return (__n64 *)p; }
136 __inline _Post_equal_to_(p) __n64 *__poly8ToN64(_Pre_notnull_ _Const_ poly8_t *p) { return (__n64 *)p; }
137 __inline _Post_equal_to_(p) __n64 *__poly16ToN64(_Pre_notnull_ _Const_ poly16_t *p) { return (__n64 *)p; }
138 __inline _Post_equal_to_(p) __n64 *__float32ToN64(_Pre_notnull_ _Const_ float32_t *p) { return (__n64 *)p; }
139 
140 __inline _Post_equal_to_(p) const __n64 *__int8ToN64_c(_Pre_notnull_ _Const_ const int8_t *p) { return (const __n64 *)p; }
141 __inline _Post_equal_to_(p) const __n64 *__int16ToN64_c(_Pre_notnull_ _Const_ const int16_t *p) { return (const __n64 *)p; }
142 __inline _Post_equal_to_(p) const __n64 *__int32ToN64_c(_Pre_notnull_ _Const_ const int32_t *p) { return (const __n64 *)p; }
143 __inline _Post_equal_to_(p) const __n64 *__int64ToN64_c(_Pre_notnull_ _Const_ const int64_t *p) { return (const __n64 *)p; }
144 __inline _Post_equal_to_(p) const __n64 *__uint8ToN64_c(_Pre_notnull_ _Const_ const uint8_t *p) { return (const __n64 *)p; }
145 __inline _Post_equal_to_(p) const __n64 *__uint16ToN64_c(_Pre_notnull_ _Const_ const uint16_t *p) { return (const __n64 *)p; }
146 __inline _Post_equal_to_(p) const __n64 *__uint32ToN64_c(_Pre_notnull_ _Const_ const uint32_t *p) { return (const __n64 *)p; }
147 __inline _Post_equal_to_(p) const __n64 *__uint64ToN64_c(_Pre_notnull_ _Const_ const uint64_t *p) { return (const __n64 *)p; }
148 __inline _Post_equal_to_(p) const __n64 *__poly8ToN64_c(_Pre_notnull_ _Const_ const poly8_t *p) { return (const __n64 *)p; }
149 __inline _Post_equal_to_(p) const __n64 *__poly16ToN64_c(_Pre_notnull_ _Const_ const poly16_t *p) { return (const __n64 *)p; }
150 __inline _Post_equal_to_(p) const __n64 *__float32ToN64_c(_Pre_notnull_ _Const_ const float32_t *p) { return (const __n64 *)p; }
151 
152 __inline int32_t __int8ToInt32(int8_t i) { return (int32_t)i; }
153 __inline int32_t __int16ToInt32(int16_t i) { return (int32_t)i; }
154 __inline int32_t __int32ToInt32(int32_t i) { return (int32_t)i; }
155 __inline int64_t __int64ToInt64(int64_t i) { return (int64_t)i; }
156 
157 __inline int32_t __uint8ToInt32(uint8_t i) { return (int32_t)i; }
158 __inline int32_t __uint16ToInt32(uint16_t i) { return (int32_t)i; }
159 __inline int32_t __uint32ToInt32(uint32_t i) { return (int32_t)i; }
160 __inline int64_t __uint64ToInt64(uint64_t i) { return (int64_t)i; }
161 
162 __inline int32_t __poly8ToInt32(poly8_t i) { return (int32_t)i; }
163 __inline int32_t __poly16ToInt32(poly16_t i) { return (int32_t)i; }
164 
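// Usage sketch (illustrative, not part of the original header): the *ToN64
// helpers reinterpret an element pointer as a pointer to a D-register-sized
// block while keeping the element type visible to the compiler and to SAL;
// the *ToInt32/*ToInt64 helpers widen scalar arguments to the integer width
// the VDUP/VMOV encodings below expect.
static const __n64 *demo_as_d_register(const int16_t *src)
{
    return __int16ToN64_c(src);   // same address, viewed as one 64-bit Neon block
}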
166 //
167 #define vshll_n_s8(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) <= 8, "invalid shift amount"), ((shift_amount) == 8) ? __internal_vshll_n_t2_s8((Dm)) : __internal_vshll_n_t1_s8((Dm), (shift_amount)) )
168 #define vshll_n_s16(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) <= 16, "invalid shift amount"), ((shift_amount) == 16) ? __internal_vshll_n_t2_s16((Dm)) : __internal_vshll_n_t1_s16((Dm), (shift_amount)) )
169 #define vshll_n_s32(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) <= 32, "invalid shift amount"), ((shift_amount) == 32) ? __internal_vshll_n_t2_s32((Dm)) : __internal_vshll_n_t1_s32((Dm), (shift_amount)) )
170 #define vshll_n_u8(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) <= 8, "invalid shift amount"), ((shift_amount) == 8) ? __internal_vshll_n_t2_u8((Dm)) : __internal_vshll_n_t1_u8((Dm), (shift_amount)) )
171 #define vshll_n_u16(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) <= 16, "invalid shift amount"), ((shift_amount) == 16) ? __internal_vshll_n_t2_u16((Dm)) : __internal_vshll_n_t1_u16((Dm), (shift_amount)) )
172 #define vshll_n_u32(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) <= 32, "invalid shift amount"), ((shift_amount) == 32) ? __internal_vshll_n_t2_u32((Dm)) : __internal_vshll_n_t1_u32((Dm), (shift_amount)) )
173 
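// Usage sketch (illustrative, not part of the original header): the shift
// amount must be a compile-time constant; the macro selects the special
// "shift by element width" encoding when shift_amount equals the lane size
// and the general long-shift form otherwise. The __n128 return type here is
// an assumption about the elided __internal_vshll_n_* prototypes.
static __n128 demo_vshll(__n64 bytes)
{
    return vshll_n_s8(bytes, 3);   // each s8 lane widened to s16, shifted left by 3
}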
174 #endif
175 
176 
178 //
179 // { +++ auto-generated code begins (explicit types)
180 
185 typedef __n64 int8x8_t;
189 typedef __n64 int16x4_t;
193 typedef __n64 int32x2_t;
197 typedef __n64 int64x1_t;
201 typedef __n64 poly8x8_t;
209 typedef __n64 uint8x8_t;
269 
270 // } +++ auto-generated code ends (explicit types)
271 
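// Note (illustrative, not part of the original header): the element-typed
// vector names are plain aliases of the __n64/__n128 containers, so values
// move between the two spellings with no conversion.
static int8x8_t demo_as_int8x8(__n64 d)
{
    return d;   // int8x8_t is typedef'd to __n64 above
}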
272 #if defined(_ARM_USE_DEPRECATED_NEON_INTRINSICS)
273 
275 //
276 // { +++ auto-generated code begins (prototypes)
277 
278 __n64x2 __neon_DdDm_acc2(unsigned int _Enc, __n64, __n64);
279 __n64x2 __neon_Dx2Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
280 __n64x2 __neon_Dx2Adr_acc(unsigned int _Enc, __n64x2, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
281 __n64x3 __neon_Dx3Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
282 __n64x3 __neon_Dx3Adr_acc(unsigned int _Enc, __n64x3, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
283 __n64x4 __neon_Dx4Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
284 __n64x4 __neon_Dx4Adr_acc(unsigned int _Enc, __n64x4, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
285 __n64 __neon_DdDm(unsigned int _Enc, __n64);
286 __n64 __neon_DdDx2Dm(unsigned int _Enc, __n64x2, __n64);
287 __n64 __neon_DdDx2Dm_acc(unsigned int _Enc, __n64, __n64x2, __n64);
288 __n64 __neon_DdDx3Dm(unsigned int _Enc, __n64x3, __n64);
289 __n64 __neon_DdDx3Dm_acc(unsigned int _Enc, __n64, __n64x3, __n64);
290 __n64 __neon_DdDx4Dm(unsigned int _Enc, __n64x4, __n64);
291 __n64 __neon_DdDx4Dm_acc(unsigned int _Enc, __n64, __n64x4, __n64);
292 __n64 __neon_DdDm_acc(unsigned int _Enc, __n64, __n64);
293 __n64 __neon_DdDnDm(unsigned int _Enc, __n64, __n64);
294 __n64 __neon_DdDnDm_acc(unsigned int _Enc, __n64, __n64, __n64);
295 __n64 __neon_DdDnDmx(unsigned int _Enc, __n64, __n64);
296 __n64 __neon_DdDnDmx_acc(unsigned int _Enc, __n64, __n64, __n64);
297 __n64 __neon_DdDnFt(unsigned int, __n64, float);
298 __n64 __neon_DdDnFt_acc(unsigned int, __n64, __n64, float);
299 __n64 __neon_DdFt(unsigned int _Enc, float);
300 __n64 __neon_DdFt_acc(unsigned int _Enc, __n64, float);
301 __n64 __neon_D1Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
302 __n64 __neon_D1Adr_acc(unsigned int _Enc, __n64, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
303 __n64 __neon_DdQm(unsigned int _Enc, __n128);
304 __n64 __neon_DdQm_high(unsigned int _Enc, __n128);
305 __n64 __neon_DdQm_low(unsigned int _Enc, __n128);
306 __n64 __neon_DdQnQm(unsigned int _Enc, __n128, __n128);
307 __n64 __neon_DdRt(unsigned int _Enc, int);
308 __n64 __neon_DdRtRt2(unsigned int _Enc, __int64);
309 __n64 __neon_DdRtRt2_acc(unsigned int _Enc, __n64, __int64);
310 __n64 __neon_DdRt_acc(unsigned int _Enc, __n64, int);
311 float __neon_FtDn(unsigned int _Enc, __n64);
312 float __neon_FtQn(unsigned int _Enc, __n128);
313 __n128x2 __neon_Qx2Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
314 __n128x2 __neon_Qx2Adr_acc(unsigned int _Enc, __n128x2, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
315 __n128x2 __neon_QdQm_acc2(unsigned int _Enc, __n128, __n128);
316 __n128x2 __neon_QdQm_acc3(unsigned int _Enc, __n128, __n128);
317 __n128x3 __neon_Qx3Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
318 __n128x3 __neon_Qx3Adr_acc(unsigned int _Enc, __n128x3, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
319 __n128x4 __neon_Qx4Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
320 __n128x4 __neon_Qx4Adr_acc(unsigned int _Enc, __n128x4, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
321 __n128 __neon_QdDm(unsigned int _Enc, __n64);
322 __n128 __neon_QdDnDm(unsigned int _Enc, __n64, __n64);
323 __n128 __neon_QdDnDm_acc(unsigned int _Enc, __n128, __n64, __n64);
324 __n128 __neon_QdDnDm_merge(unsigned int _Enc, __n64, __n64);
325 __n128 __neon_QdDnDmx(unsigned int _Enc, __n64, __n64);
326 __n128 __neon_QdDnDmx_acc(unsigned int _Enc, __n128, __n64, __n64);
327 __n128 __neon_QdFt(unsigned int _Enc, float);
328 __n128 __neon_QdFt_acc(unsigned int _Enc, __n128, float);
329 __n128 __neon_Q1Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
330 __n128 __neon_Q1Adr_acc(unsigned int _Enc, __n128, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
331 __n128 __neon_QdQm(unsigned int _Enc, __n128);
332 __n128 __neon_QdQm_acc(unsigned int _Enc, __n128, __n128);
333 __n128 __neon_QdQnDm(unsigned int _Enc, __n128, __n64);
334 __n128 __neon_QdQnDmx(unsigned int _Enc, __n128, __n64);
335 __n128 __neon_QdQnDmx_acc(unsigned int _Enc, __n128, __n128, __n64);
336 __n128 __neon_QdQnFt(unsigned int, __n128, float);
337 __n128 __neon_QdQnFt_acc(unsigned int, __n128, __n128, float);
338 __n128 __neon_QdQnQm(unsigned int _Enc, __n128, __n128);
339 __n128 __neon_QdQnQm_acc(unsigned int _Enc, __n128, __n128, __n128);
340 __n128 __neon_QdRt(unsigned int _Enc, int);
341 __n128 __neon_QdRtRt2_acc(unsigned int _Enc, __n128, __int64);
342 __n128 __neon_QdRtRt2_dup(unsigned int _Enc, __int64);
343 __n128 __neon_QdRt_acc(unsigned int _Enc, __n128, int);
344 __int64 __neon_RtRt2Dm(unsigned int _Enc, __n64);
345 __int64 __neon_RtRt2Qm(unsigned int _Enc, __n128);
346 int __neon_RtDn(unsigned int _Enc, __n64);
347 int __neon_RtQn(unsigned int _Enc, __n128);
348 void __neon_AdrD1(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n64);
349 void __neon_AdrDx2(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n64x2);
350 void __neon_AdrDx2x(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n64x2);
351 void __neon_AdrDx3(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n64x3);
352 void __neon_AdrDx3x(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n64x3);
353 void __neon_AdrDx4(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n64x4);
354 void __neon_AdrDx4x(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n64x4);
355 void __neon_AdrQ1(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n128);
356 void __neon_AdrQx2(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n128x2);
357 void __neon_AdrQx2x(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n128x2);
358 void __neon_AdrQx3(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n128x3);
359 void __neon_AdrQx3x(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n128x3);
360 void __neon_AdrQx4(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n128x4);
361 void __neon_AdrQx4x(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n128x4);
362 
363 // } +++ auto-generated code ends (prototypes)
364 
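// Usage sketch (illustrative, not part of the original header): each vector
// macro below expands to one of these __neon_* workers, passing the fixed
// instruction encoding as _Enc; the compiler recognizes the worker and emits
// that single Neon instruction.
static __n64 demo_encoded_add(__n64 a, __n64 b)
{
    return __neon_DdDnDm(0xf2000d00, a, b);   // identical to the vadd_f32 expansion below
}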
365 #endif
366 
367 #if defined (__cplusplus)
368 }
369 #endif /* defined (__cplusplus) */
370 
371 #if defined(_ARM_USE_DEPRECATED_NEON_INTRINSICS)
372 
374 //
375 // VLDx/VSTx alignment specifications
376 //
377 
378 
379 #define _NEON_ALIGN16(a) \
380  ( \
381  ((a) == 8) ? 0 : \
382  ((a) == 16) ? 1 : \
383  -1)
384 
385 #define _NEON_ALIGN32(a) \
386  ( \
387  ((a) == 8) ? 0 : \
388  ((a) == 32) ? 1 : \
389  -1)
390 
391 #define _NEON_ALIGN64(a) \
392  ( \
393  ((a) == 8) ? 0 : \
394  ((a) == 64) ? 1 : \
395  -1)
396 
397 #define _NEON_ALIGN64_128(a) \
398  ( \
399  ((a) == 8) ? 0 : \
400  ((a) == 64) ? 1 : \
401  ((a) == 128) ? 2 : \
402  -1)
403 
404 
405 #define _NEON_ALIGN64_128_256(a) \
406  ( \
407  ((a) == 8) ? 0 : \
408  ((a) == 64) ? 1 : \
409  ((a) == 128) ? 2 : \
410  ((a) == 256) ? 3 : \
411  -1)
412 
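// Worked example (illustrative, not part of the original header): these
// macros map a bit-alignment hint, as taken by the _ex load/store intrinsics,
// onto the align field of the VLDx/VSTx encodings; 8 means "no alignment
// claimed" and unsupported values become -1 so a surrounding __static_assert
// can reject them.
static int demo_align_field(void)
{
    return _NEON_ALIGN64_128(128);   // 8 -> 0, 64 -> 1, 128 -> 2, anything else -> -1
}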
413 
415 //
416 // { +++ auto-generated code begins (encoding macros)
417 
418 #define _NENC_0(x) ((x) & 0x1)
419 #define _NENC_11_8(x) (((x) << 8) & 0xf00)
420 #define _NENC_12(x) (((x) << 12) & 0x1000)
421 #define _NENC_16(x) (((x) << 16) & 0x10000)
422 #define _NENC_18_16(x) (((x) << 16) & 0x70000)
423 #define _NENC_19(x) (((x) << 19) & 0x80000)
424 #define _NENC_19_16(x) (((x) << 16) & 0xf0000)
425 #define _NENC_19_17(x) (((x) << 17) & 0xe0000)
426 #define _NENC_19_18(x) (((x) << 18) & 0xc0000)
427 #define _NENC_20_16(x) (((x) << 16) & 0x1f0000)
428 #define _NENC_21(x) (((x) << 21) & 0x200000)
429 #define _NENC_21_16(x) (((x) << 16) & 0x3f0000)
430 #define _NENC_21x6(x) (((x) << 6) & 0x40 | ((x) << 20) & 0x200000)
431 #define _NENC_21x6_5(x) (((x) << 5) & 0x60 | ((x) << 19) & 0x200000)
432 #define _NENC_4(x) (((x) << 4) & 0x10)
433 #define _NENC_5(x) (((x) << 5) & 0x20)
434 #define _NENC_5_4(x) (((x) << 4) & 0x30)
435 #define _NENC_5x3(x) (((x) << 3) & 0x8 | ((x) << 4) & 0x20)
436 #define _NENC_7(x) (((x) << 7) & 0x80)
437 #define _NENC_7_5(x) (((x) << 5) & 0xe0)
438 #define _NENC_7_6(x) (((x) << 6) & 0xc0)
439 
440 // } +++ auto-generated code ends (encoding macros)
441 
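// Worked example (illustrative, not part of the original header): the _NENC_*
// macros shift a constant operand (lane index, shift count, alignment code)
// into the opcode bit field named by the macro suffix.
static unsigned int demo_lane_encoding(void)
{
    return 0xf3b40c00 | _NENC_19(1);   // lane 1 placed in bit 19, as vdup_lane_f32 does below
}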
442 
444 //
445 // { +++ auto-generated code begins (Neon macros)
446 
447 // AES
448 #define aesd_p8(Qm) ( __neon_QdQm( 0xf3b00340, (Qm)) )
449 #define aesd_s8(Qm) ( __neon_QdQm( 0xf3b00340, (Qm)) )
450 #define aesd_u8(Qm) ( __neon_QdQm( 0xf3b00340, (Qm)) )
451 #define aese_p8(Qm) ( __neon_QdQm( 0xf3b00300, (Qm)) )
452 #define aese_s8(Qm) ( __neon_QdQm( 0xf3b00300, (Qm)) )
453 #define aese_u8(Qm) ( __neon_QdQm( 0xf3b00300, (Qm)) )
454 #define aesimc_p8(Qm) ( __neon_QdQm( 0xf3b003c0, (Qm)) )
455 #define aesimc_s8(Qm) ( __neon_QdQm( 0xf3b003c0, (Qm)) )
456 #define aesimc_u8(Qm) ( __neon_QdQm( 0xf3b003c0, (Qm)) )
457 #define aesmc_p8(Qm) ( __neon_QdQm( 0xf3b00380, (Qm)) )
458 #define aesmc_s8(Qm) ( __neon_QdQm( 0xf3b00380, (Qm)) )
459 #define aesmc_u8(Qm) ( __neon_QdQm( 0xf3b00380, (Qm)) )
460 
461 // SHA (2-operand)
462 #define sha1h_f32(Qm) ( __neon_QdQm( 0xf3b902c0, (Qm)) )
463 #define sha1h_s32(Qm) ( __neon_QdQm( 0xf3b902c0, (Qm)) )
464 #define sha1h_u32(Qm) ( __neon_QdQm( 0xf3b902c0, (Qm)) )
465 #define sha1su1_f32(Qm) ( __neon_QdQm( 0xf3ba0380, (Qm)) )
466 #define sha1su1_s32(Qm) ( __neon_QdQm( 0xf3ba0380, (Qm)) )
467 #define sha1su1_u32(Qm) ( __neon_QdQm( 0xf3ba0380, (Qm)) )
468 #define sha256su0_f32(Qm) ( __neon_QdQm( 0xf3ba03c0, (Qm)) )
469 #define sha256su0_s32(Qm) ( __neon_QdQm( 0xf3ba03c0, (Qm)) )
470 #define sha256su0_u32(Qm) ( __neon_QdQm( 0xf3ba03c0, (Qm)) )
471 
472 // SHA (3-operand)
473 #define sha1c_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2000c40, (Qn), (Qm)) )
474 #define sha1c_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2000c40, (Qn), (Qm)) )
475 #define sha1c_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2000c40, (Qn), (Qm)) )
476 #define sha1m_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2200c40, (Qn), (Qm)) )
477 #define sha1m_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200c40, (Qn), (Qm)) )
478 #define sha1m_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2200c40, (Qn), (Qm)) )
479 #define sha1p_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2100c40, (Qn), (Qm)) )
480 #define sha1p_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2100c40, (Qn), (Qm)) )
481 #define sha1p_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2100c40, (Qn), (Qm)) )
482 #define sha1su0_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2300c40, (Qn), (Qm)) )
483 #define sha1su0_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2300c40, (Qn), (Qm)) )
484 #define sha1su0_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2300c40, (Qn), (Qm)) )
485 #define sha256h_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3000c40, (Qn), (Qm)) )
486 #define sha256h_s32(Qn, Qm) ( __neon_QdQnQm( 0xf3000c40, (Qn), (Qm)) )
487 #define sha256h_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3000c40, (Qn), (Qm)) )
488 #define sha256h2_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3100c40, (Qn), (Qm)) )
489 #define sha256h2_s32(Qn, Qm) ( __neon_QdQnQm( 0xf3100c40, (Qn), (Qm)) )
490 #define sha256h2_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3100c40, (Qn), (Qm)) )
491 #define sha256su1_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3200c40, (Qn), (Qm)) )
492 #define sha256su1_s32(Qn, Qm) ( __neon_QdQnQm( 0xf3200c40, (Qn), (Qm)) )
493 #define sha256su1_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200c40, (Qn), (Qm)) )
494 
495 // VABA, VABAL
496 #define vaba_s16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2100710, (Dd), (Dn), (Dm)) )
497 #define vaba_s32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2200710, (Dd), (Dn), (Dm)) )
498 #define vaba_s8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2000710, (Dd), (Dn), (Dm)) )
499 #define vaba_u16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100710, (Dd), (Dn), (Dm)) )
500 #define vaba_u32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200710, (Dd), (Dn), (Dm)) )
501 #define vaba_u8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3000710, (Dd), (Dn), (Dm)) )
502 #define vabal_s16(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2900500, (Qd), (Dn), (Dm)) )
503 #define vabal_s32(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2a00500, (Qd), (Dn), (Dm)) )
504 #define vabal_s8(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2800500, (Qd), (Dn), (Dm)) )
505 #define vabal_u16(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf3900500, (Qd), (Dn), (Dm)) )
506 #define vabal_u32(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf3a00500, (Qd), (Dn), (Dm)) )
507 #define vabal_u8(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf3800500, (Qd), (Dn), (Dm)) )
508 #define vabaq_s16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2100750, (Qd), (Qn), (Qm)) )
509 #define vabaq_s32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2200750, (Qd), (Qn), (Qm)) )
510 #define vabaq_s8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2000750, (Qd), (Qn), (Qm)) )
511 #define vabaq_u16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100750, (Qd), (Qn), (Qm)) )
512 #define vabaq_u32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200750, (Qd), (Qn), (Qm)) )
513 #define vabaq_u8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3000750, (Qd), (Qn), (Qm)) )
514 
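// Usage sketch (illustrative, not part of the original header): the _acc
// workers carry the destination register in as the first argument, so VABA
// accumulates the absolute differences into Dd.
static __n64 demo_vaba(__n64 acc, __n64 a, __n64 b)
{
    return vaba_u8(acc, a, b);   // acc[i] += |a[i] - b[i]| for each of the 8 byte lanes
}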
515 // VABD (floating point)
516 #define vabd_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3200d00, (Dn), (Dm)) )
517 #define vabdq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3200d40, (Qn), (Qm)) )
518 
519 // VABD[L] (integer)
520 #define vabd_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100700, (Dn), (Dm)) )
521 #define vabd_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200700, (Dn), (Dm)) )
522 #define vabd_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000700, (Dn), (Dm)) )
523 #define vabd_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100700, (Dn), (Dm)) )
524 #define vabd_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200700, (Dn), (Dm)) )
525 #define vabd_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000700, (Dn), (Dm)) )
526 #define vabdl_s16(Dn, Dm) ( __neon_QdDnDm( 0xf2900700, (Dn), (Dm)) )
527 #define vabdl_s32(Dn, Dm) ( __neon_QdDnDm( 0xf2a00700, (Dn), (Dm)) )
528 #define vabdl_s8(Dn, Dm) ( __neon_QdDnDm( 0xf2800700, (Dn), (Dm)) )
529 #define vabdl_u16(Dn, Dm) ( __neon_QdDnDm( 0xf3900700, (Dn), (Dm)) )
530 #define vabdl_u32(Dn, Dm) ( __neon_QdDnDm( 0xf3a00700, (Dn), (Dm)) )
531 #define vabdl_u8(Dn, Dm) ( __neon_QdDnDm( 0xf3800700, (Dn), (Dm)) )
532 #define vabdq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100740, (Qn), (Qm)) )
533 #define vabdq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200740, (Qn), (Qm)) )
534 #define vabdq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000740, (Qn), (Qm)) )
535 #define vabdq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100740, (Qn), (Qm)) )
536 #define vabdq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200740, (Qn), (Qm)) )
537 #define vabdq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000740, (Qn), (Qm)) )
538 
539 // VABS, VNEG
540 #define vabs_f32(Dm) ( __neon_DdDm( 0xf3b90700, (Dm)) )
541 #define vabs_s16(Dm) ( __neon_DdDm( 0xf3b50300, (Dm)) )
542 #define vabs_s32(Dm) ( __neon_DdDm( 0xf3b90300, (Dm)) )
543 #define vabs_s8(Dm) ( __neon_DdDm( 0xf3b10300, (Dm)) )
544 #define vneg_f32(Dm) ( __neon_DdDm( 0xf3b90780, (Dm)) )
545 #define vneg_s16(Dm) ( __neon_DdDm( 0xf3b50380, (Dm)) )
546 #define vneg_s32(Dm) ( __neon_DdDm( 0xf3b90380, (Dm)) )
547 #define vneg_s8(Dm) ( __neon_DdDm( 0xf3b10380, (Dm)) )
548 #define vabsq_f32(Qm) ( __neon_QdQm( 0xf3b90740, (Qm)) )
549 #define vabsq_s16(Qm) ( __neon_QdQm( 0xf3b50340, (Qm)) )
550 #define vabsq_s32(Qm) ( __neon_QdQm( 0xf3b90340, (Qm)) )
551 #define vabsq_s8(Qm) ( __neon_QdQm( 0xf3b10340, (Qm)) )
552 #define vnegq_f32(Qm) ( __neon_QdQm( 0xf3b907c0, (Qm)) )
553 #define vnegq_s16(Qm) ( __neon_QdQm( 0xf3b503c0, (Qm)) )
554 #define vnegq_s32(Qm) ( __neon_QdQm( 0xf3b903c0, (Qm)) )
555 #define vnegq_s8(Qm) ( __neon_QdQm( 0xf3b103c0, (Qm)) )
556 
557 // VACGE, VACGT, VACLE, VACLT
558 #define vacge_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3000e10, (Dn), (Dm)) )
559 #define vacgt_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3200e10, (Dn), (Dm)) )
560 #define vacle_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3000e10, (Dm), (Dn)) )
561 #define vaclt_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3200e10, (Dm), (Dn)) )
562 #define vacgeq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3000e50, (Qn), (Qm)) )
563 #define vacgtq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3200e50, (Qn), (Qm)) )
564 #define vacleq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3000e50, (Qm), (Qn)) )
565 #define vacltq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3200e50, (Qm), (Qn)) )
566 
567 // VADD
568 #define vadd_f32(Dn, Dm) ( __neon_DdDnDm( 0xf2000d00, (Dn), (Dm)) )
569 #define vadd_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100800, (Dn), (Dm)) )
570 #define vadd_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200800, (Dn), (Dm)) )
571 #define vadd_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2300800, (Dn), (Dm)) )
572 #define vadd_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000800, (Dn), (Dm)) )
573 #define vadd_u16(Dn, Dm) ( __neon_DdDnDm( 0xf2100800, (Dn), (Dm)) )
574 #define vadd_u32(Dn, Dm) ( __neon_DdDnDm( 0xf2200800, (Dn), (Dm)) )
575 #define vadd_u64(Dn, Dm) ( __neon_DdDnDm( 0xf2300800, (Dn), (Dm)) )
576 #define vadd_u8(Dn, Dm) ( __neon_DdDnDm( 0xf2000800, (Dn), (Dm)) )
577 #define vaddq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2000d40, (Qn), (Qm)) )
578 #define vaddq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100840, (Qn), (Qm)) )
579 #define vaddq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200840, (Qn), (Qm)) )
580 #define vaddq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2300840, (Qn), (Qm)) )
581 #define vaddq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000840, (Qn), (Qm)) )
582 #define vaddq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf2100840, (Qn), (Qm)) )
583 #define vaddq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2200840, (Qn), (Qm)) )
584 #define vaddq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf2300840, (Qn), (Qm)) )
585 #define vaddq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf2000840, (Qn), (Qm)) )
586 
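// Usage sketch (illustrative, not part of the original header): the element
// type only selects the encoding; the same macro shape covers the D (64-bit)
// and Q (128-bit) forms.
static __n64 demo_vadd(__n64 a, __n64 b)
{
    return vadd_f32(a, b);   // one VADD.F32 adds both single-precision lanes
}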
587 // VADDHN, VRADDHN
588 #define vaddhn_s16(Qn, Qm) ( __neon_DdQnQm( 0xf2800400, (Qn), (Qm)) )
589 #define vaddhn_s32(Qn, Qm) ( __neon_DdQnQm( 0xf2900400, (Qn), (Qm)) )
590 #define vaddhn_s64(Qn, Qm) ( __neon_DdQnQm( 0xf2a00400, (Qn), (Qm)) )
591 #define vaddhn_u16(Qn, Qm) ( __neon_DdQnQm( 0xf2800400, (Qn), (Qm)) )
592 #define vaddhn_u32(Qn, Qm) ( __neon_DdQnQm( 0xf2900400, (Qn), (Qm)) )
593 #define vaddhn_u64(Qn, Qm) ( __neon_DdQnQm( 0xf2a00400, (Qn), (Qm)) )
594 #define vraddhn_s16(Qn, Qm) ( __neon_DdQnQm( 0xf3800400, (Qn), (Qm)) )
595 #define vraddhn_s32(Qn, Qm) ( __neon_DdQnQm( 0xf3900400, (Qn), (Qm)) )
596 #define vraddhn_s64(Qn, Qm) ( __neon_DdQnQm( 0xf3a00400, (Qn), (Qm)) )
597 #define vraddhn_u16(Qn, Qm) ( __neon_DdQnQm( 0xf3800400, (Qn), (Qm)) )
598 #define vraddhn_u32(Qn, Qm) ( __neon_DdQnQm( 0xf3900400, (Qn), (Qm)) )
599 #define vraddhn_u64(Qn, Qm) ( __neon_DdQnQm( 0xf3a00400, (Qn), (Qm)) )
600 
601 // VADDL, VADDW
602 #define vaddl_s16(Dn, Dm) ( __neon_QdDnDm( 0xf2900000, (Dn), (Dm)) )
603 #define vaddl_s32(Dn, Dm) ( __neon_QdDnDm( 0xf2a00000, (Dn), (Dm)) )
604 #define vaddl_s8(Dn, Dm) ( __neon_QdDnDm( 0xf2800000, (Dn), (Dm)) )
605 #define vaddl_u16(Dn, Dm) ( __neon_QdDnDm( 0xf3900000, (Dn), (Dm)) )
606 #define vaddl_u32(Dn, Dm) ( __neon_QdDnDm( 0xf3a00000, (Dn), (Dm)) )
607 #define vaddl_u8(Dn, Dm) ( __neon_QdDnDm( 0xf3800000, (Dn), (Dm)) )
608 #define vaddw_s16(Qn, Dm) ( __neon_QdQnDm( 0xf2900100, (Qn), (Dm)) )
609 #define vaddw_s32(Qn, Dm) ( __neon_QdQnDm( 0xf2a00100, (Qn), (Dm)) )
610 #define vaddw_s8(Qn, Dm) ( __neon_QdQnDm( 0xf2800100, (Qn), (Dm)) )
611 #define vaddw_u16(Qn, Dm) ( __neon_QdQnDm( 0xf3900100, (Qn), (Dm)) )
612 #define vaddw_u32(Qn, Dm) ( __neon_QdQnDm( 0xf3a00100, (Qn), (Dm)) )
613 #define vaddw_u8(Qn, Dm) ( __neon_QdQnDm( 0xf3800100, (Qn), (Dm)) )
614 
615 // VAND, VORR
616 #define vand_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2000110, (Dn), (Dm)) )
617 #define vand_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2000110, (Dn), (Dm)) )
618 #define vand_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2000110, (Dn), (Dm)) )
619 #define vand_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000110, (Dn), (Dm)) )
620 #define vand_u16(Dn, Dm) ( __neon_DdDnDm( 0xf2000110, (Dn), (Dm)) )
621 #define vand_u32(Dn, Dm) ( __neon_DdDnDm( 0xf2000110, (Dn), (Dm)) )
622 #define vand_u64(Dn, Dm) ( __neon_DdDnDm( 0xf2000110, (Dn), (Dm)) )
623 #define vand_u8(Dn, Dm) ( __neon_DdDnDm( 0xf2000110, (Dn), (Dm)) )
624 #define vorr_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2200110, (Dn), (Dm)) )
625 #define vorr_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200110, (Dn), (Dm)) )
626 #define vorr_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2200110, (Dn), (Dm)) )
627 #define vorr_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2200110, (Dn), (Dm)) )
628 #define vorr_u16(Dn, Dm) ( __neon_DdDnDm( 0xf2200110, (Dn), (Dm)) )
629 #define vorr_u32(Dn, Dm) ( __neon_DdDnDm( 0xf2200110, (Dn), (Dm)) )
630 #define vorr_u64(Dn, Dm) ( __neon_DdDnDm( 0xf2200110, (Dn), (Dm)) )
631 #define vorr_u8(Dn, Dm) ( __neon_DdDnDm( 0xf2200110, (Dn), (Dm)) )
632 #define vandq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2000150, (Qn), (Qm)) )
633 #define vandq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2000150, (Qn), (Qm)) )
634 #define vandq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2000150, (Qn), (Qm)) )
635 #define vandq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000150, (Qn), (Qm)) )
636 #define vandq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf2000150, (Qn), (Qm)) )
637 #define vandq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2000150, (Qn), (Qm)) )
638 #define vandq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf2000150, (Qn), (Qm)) )
639 #define vandq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf2000150, (Qn), (Qm)) )
640 #define vorrq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2200150, (Qn), (Qm)) )
641 #define vorrq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200150, (Qn), (Qm)) )
642 #define vorrq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2200150, (Qn), (Qm)) )
643 #define vorrq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2200150, (Qn), (Qm)) )
644 #define vorrq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf2200150, (Qn), (Qm)) )
645 #define vorrq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2200150, (Qn), (Qm)) )
646 #define vorrq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf2200150, (Qn), (Qm)) )
647 #define vorrq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf2200150, (Qn), (Qm)) )
648 
649 // VBIF, VBIT, VBSL
650 #define vbif_f32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
651 #define vbif_p16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
652 #define vbif_p8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
653 #define vbif_s16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
654 #define vbif_s32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
655 #define vbif_s64(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
656 #define vbif_s8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
657 #define vbif_u16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
658 #define vbif_u32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
659 #define vbif_u64(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
660 #define vbif_u8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
661 #define vbit_f32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
662 #define vbit_p16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
663 #define vbit_p8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
664 #define vbit_s16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
665 #define vbit_s32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
666 #define vbit_s64(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
667 #define vbit_s8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
668 #define vbit_u16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
669 #define vbit_u32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
670 #define vbit_u64(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
671 #define vbit_u8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
672 #define vbsl_f32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
673 #define vbsl_p16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
674 #define vbsl_p8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
675 #define vbsl_s16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
676 #define vbsl_s32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
677 #define vbsl_s64(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
678 #define vbsl_s8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
679 #define vbsl_u16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
680 #define vbsl_u32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
681 #define vbsl_u64(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
682 #define vbsl_u8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
683 #define vbifq_f32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
684 #define vbifq_p16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
685 #define vbifq_p8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
686 #define vbifq_s16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
687 #define vbifq_s32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
688 #define vbifq_s64(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
689 #define vbifq_s8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
690 #define vbifq_u16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
691 #define vbifq_u32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
692 #define vbifq_u64(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
693 #define vbifq_u8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
694 #define vbitq_f32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
695 #define vbitq_p16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
696 #define vbitq_p8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
697 #define vbitq_s16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
698 #define vbitq_s32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
699 #define vbitq_s64(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
700 #define vbitq_s8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
701 #define vbitq_u16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
702 #define vbitq_u32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
703 #define vbitq_u64(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
704 #define vbitq_u8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
705 #define vbslq_f32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
706 #define vbslq_p16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
707 #define vbslq_p8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
708 #define vbslq_s16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
709 #define vbslq_s32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
710 #define vbslq_s64(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
711 #define vbslq_s8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
712 #define vbslq_u16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
713 #define vbslq_u32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
714 #define vbslq_u64(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
715 #define vbslq_u8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
716 
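// Usage sketch (illustrative, not part of the original header): for VBSL the
// first operand is the bit mask; result bits come from the second operand
// where the mask is 1 and from the third where it is 0.
static __n64 demo_vbsl(__n64 mask, __n64 if_set, __n64 if_clear)
{
    return vbsl_u8(mask, if_set, if_clear);
}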
717 // VCEQ (immediate #0)
718 #define vceq_z_f32_ex(Dm) ( __neon_DdDm( 0xf3b90500, (Dm)) )
719 #define vceq_z_s16_ex(Dm) ( __neon_DdDm( 0xf3b50100, (Dm)) )
720 #define vceq_z_s32_ex(Dm) ( __neon_DdDm( 0xf3b90100, (Dm)) )
721 #define vceq_z_s8_ex(Dm) ( __neon_DdDm( 0xf3b10100, (Dm)) )
722 #define vceq_z_u16_ex(Dm) ( __neon_DdDm( 0xf3b50100, (Dm)) )
723 #define vceq_z_u32_ex(Dm) ( __neon_DdDm( 0xf3b90100, (Dm)) )
724 #define vceq_z_u8_ex(Dm) ( __neon_DdDm( 0xf3b10100, (Dm)) )
725 #define vceqq_z_f32_ex(Qm) ( __neon_QdQm( 0xf3b90540, (Qm)) )
726 #define vceqq_z_s16_ex(Qm) ( __neon_QdQm( 0xf3b50140, (Qm)) )
727 #define vceqq_z_s32_ex(Qm) ( __neon_QdQm( 0xf3b90140, (Qm)) )
728 #define vceqq_z_s8_ex(Qm) ( __neon_QdQm( 0xf3b10140, (Qm)) )
729 #define vceqq_z_u16_ex(Qm) ( __neon_QdQm( 0xf3b50140, (Qm)) )
730 #define vceqq_z_u32_ex(Qm) ( __neon_QdQm( 0xf3b90140, (Qm)) )
731 #define vceqq_z_u8_ex(Qm) ( __neon_QdQm( 0xf3b10140, (Qm)) )
732 
733 // VCEQ (register)
734 #define vceq_f32(Dn, Dm) ( __neon_DdDnDm( 0xf2000e00, (Dn), (Dm)) )
735 #define vceq_p8(Dn, Dm) ( __neon_DdDnDm( 0xf3000810, (Dn), (Dm)) )
736 #define vceq_s16(Dn, Dm) ( __neon_DdDnDm( 0xf3100810, (Dn), (Dm)) )
737 #define vceq_s32(Dn, Dm) ( __neon_DdDnDm( 0xf3200810, (Dn), (Dm)) )
738 #define vceq_s8(Dn, Dm) ( __neon_DdDnDm( 0xf3000810, (Dn), (Dm)) )
739 #define vceq_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100810, (Dn), (Dm)) )
740 #define vceq_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200810, (Dn), (Dm)) )
741 #define vceq_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000810, (Dn), (Dm)) )
742 #define vceqq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2000e40, (Qn), (Qm)) )
743 #define vceqq_p8(Qn, Qm) ( __neon_QdQnQm( 0xf3000850, (Qn), (Qm)) )
744 #define vceqq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf3100850, (Qn), (Qm)) )
745 #define vceqq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf3200850, (Qn), (Qm)) )
746 #define vceqq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf3000850, (Qn), (Qm)) )
747 #define vceqq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100850, (Qn), (Qm)) )
748 #define vceqq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200850, (Qn), (Qm)) )
749 #define vceqq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000850, (Qn), (Qm)) )
750 
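// Usage sketch (illustrative, not part of the original header): the compares
// produce lane masks (all ones where the condition holds, zero elsewhere),
// which combine naturally with VBSL above.
static __n64 demo_vceq(__n64 a, __n64 b)
{
    return vceq_s16(a, b);   // each 16-bit lane -> 0xffff where a == b, else 0
}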
751 // VCGE (immediate #0)
752 #define vcge_z_f32_ex(Dm) ( __neon_DdDm( 0xf3b90480, (Dm)) )
753 #define vcge_z_s16_ex(Dm) ( __neon_DdDm( 0xf3b50080, (Dm)) )
754 #define vcge_z_s32_ex(Dm) ( __neon_DdDm( 0xf3b90080, (Dm)) )
755 #define vcge_z_s8_ex(Dm) ( __neon_DdDm( 0xf3b10080, (Dm)) )
756 #define vcgeq_z_f32_ex(Qm) ( __neon_QdQm( 0xf3b904c0, (Qm)) )
757 #define vcgeq_z_s16_ex(Qm) ( __neon_QdQm( 0xf3b500c0, (Qm)) )
758 #define vcgeq_z_s32_ex(Qm) ( __neon_QdQm( 0xf3b900c0, (Qm)) )
759 #define vcgeq_z_s8_ex(Qm) ( __neon_QdQm( 0xf3b100c0, (Qm)) )
760 
761 // VCGE, VCLE (register)
762 #define vcge_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3000e00, (Dn), (Dm)) )
763 #define vcge_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100310, (Dn), (Dm)) )
764 #define vcge_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200310, (Dn), (Dm)) )
765 #define vcge_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000310, (Dn), (Dm)) )
766 #define vcge_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100310, (Dn), (Dm)) )
767 #define vcge_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200310, (Dn), (Dm)) )
768 #define vcge_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000310, (Dn), (Dm)) )
769 #define vcle_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3000e00, (Dm), (Dn)) )
770 #define vcle_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100310, (Dm), (Dn)) )
771 #define vcle_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200310, (Dm), (Dn)) )
772 #define vcle_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000310, (Dm), (Dn)) )
773 #define vcle_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100310, (Dm), (Dn)) )
774 #define vcle_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200310, (Dm), (Dn)) )
775 #define vcle_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000310, (Dm), (Dn)) )
776 #define vcgeq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3000e40, (Qn), (Qm)) )
777 #define vcgeq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100350, (Qn), (Qm)) )
778 #define vcgeq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200350, (Qn), (Qm)) )
779 #define vcgeq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000350, (Qn), (Qm)) )
780 #define vcgeq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100350, (Qn), (Qm)) )
781 #define vcgeq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200350, (Qn), (Qm)) )
782 #define vcgeq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000350, (Qn), (Qm)) )
783 #define vcleq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3000e40, (Qm), (Qn)) )
784 #define vcleq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100350, (Qm), (Qn)) )
785 #define vcleq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200350, (Qm), (Qn)) )
786 #define vcleq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000350, (Qm), (Qn)) )
787 #define vcleq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100350, (Qm), (Qn)) )
788 #define vcleq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200350, (Qm), (Qn)) )
789 #define vcleq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000350, (Qm), (Qn)) )
790 
791 // VCGT (immediate #0)
792 #define vcgt_z_f32_ex(Dm) ( __neon_DdDm( 0xf3b90400, (Dm)) )
793 #define vcgt_z_s16_ex(Dm) ( __neon_DdDm( 0xf3b50000, (Dm)) )
794 #define vcgt_z_s32_ex(Dm) ( __neon_DdDm( 0xf3b90000, (Dm)) )
795 #define vcgt_z_s8_ex(Dm) ( __neon_DdDm( 0xf3b10000, (Dm)) )
796 #define vcgtq_z_f32_ex(Qm) ( __neon_QdQm( 0xf3b90440, (Qm)) )
797 #define vcgtq_z_s16_ex(Qm) ( __neon_QdQm( 0xf3b50040, (Qm)) )
798 #define vcgtq_z_s32_ex(Qm) ( __neon_QdQm( 0xf3b90040, (Qm)) )
799 #define vcgtq_z_s8_ex(Qm) ( __neon_QdQm( 0xf3b10040, (Qm)) )
800 
801 // VCGT, VCLT (register)
802 #define vcgt_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3200e00, (Dn), (Dm)) )
803 #define vcgt_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100300, (Dn), (Dm)) )
804 #define vcgt_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200300, (Dn), (Dm)) )
805 #define vcgt_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000300, (Dn), (Dm)) )
806 #define vcgt_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100300, (Dn), (Dm)) )
807 #define vcgt_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200300, (Dn), (Dm)) )
808 #define vcgt_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000300, (Dn), (Dm)) )
809 #define vclt_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3200e00, (Dm), (Dn)) )
810 #define vclt_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100300, (Dm), (Dn)) )
811 #define vclt_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200300, (Dm), (Dn)) )
812 #define vclt_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000300, (Dm), (Dn)) )
813 #define vclt_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100300, (Dm), (Dn)) )
814 #define vclt_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200300, (Dm), (Dn)) )
815 #define vclt_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000300, (Dm), (Dn)) )
816 #define vcgtq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3200e40, (Qn), (Qm)) )
817 #define vcgtq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100340, (Qn), (Qm)) )
818 #define vcgtq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200340, (Qn), (Qm)) )
819 #define vcgtq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000340, (Qn), (Qm)) )
820 #define vcgtq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100340, (Qn), (Qm)) )
821 #define vcgtq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200340, (Qn), (Qm)) )
822 #define vcgtq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000340, (Qn), (Qm)) )
823 #define vcltq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3200e40, (Qm), (Qn)) )
824 #define vcltq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100340, (Qm), (Qn)) )
825 #define vcltq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200340, (Qm), (Qn)) )
826 #define vcltq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000340, (Qm), (Qn)) )
827 #define vcltq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100340, (Qm), (Qn)) )
828 #define vcltq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200340, (Qm), (Qn)) )
829 #define vcltq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000340, (Qm), (Qn)) )
830 
831 // VCLE (immediate #0)
832 #define vcle_z_f32_ex(Dm) ( __neon_DdDm( 0xf3b90580, (Dm)) )
833 #define vcle_z_s16_ex(Dm) ( __neon_DdDm( 0xf3b50180, (Dm)) )
834 #define vcle_z_s32_ex(Dm) ( __neon_DdDm( 0xf3b90180, (Dm)) )
835 #define vcle_z_s8_ex(Dm) ( __neon_DdDm( 0xf3b10180, (Dm)) )
836 #define vcleq_z_f32_ex(Qm) ( __neon_QdQm( 0xf3b905c0, (Qm)) )
837 #define vcleq_z_s16_ex(Qm) ( __neon_QdQm( 0xf3b501c0, (Qm)) )
838 #define vcleq_z_s32_ex(Qm) ( __neon_QdQm( 0xf3b901c0, (Qm)) )
839 #define vcleq_z_s8_ex(Qm) ( __neon_QdQm( 0xf3b101c0, (Qm)) )
840 
841 // VCLS, VCLZ
842 #define vcls_s16(Dm) ( __neon_DdDm( 0xf3b40400, (Dm)) )
843 #define vcls_s32(Dm) ( __neon_DdDm( 0xf3b80400, (Dm)) )
844 #define vcls_s8(Dm) ( __neon_DdDm( 0xf3b00400, (Dm)) )
845 #define vclz_s16(Dm) ( __neon_DdDm( 0xf3b40480, (Dm)) )
846 #define vclz_s32(Dm) ( __neon_DdDm( 0xf3b80480, (Dm)) )
847 #define vclz_s8(Dm) ( __neon_DdDm( 0xf3b00480, (Dm)) )
848 #define vclz_u16(Dm) ( __neon_DdDm( 0xf3b40480, (Dm)) )
849 #define vclz_u32(Dm) ( __neon_DdDm( 0xf3b80480, (Dm)) )
850 #define vclz_u8(Dm) ( __neon_DdDm( 0xf3b00480, (Dm)) )
851 #define vclsq_s16(Qm) ( __neon_QdQm( 0xf3b40440, (Qm)) )
852 #define vclsq_s32(Qm) ( __neon_QdQm( 0xf3b80440, (Qm)) )
853 #define vclsq_s8(Qm) ( __neon_QdQm( 0xf3b00440, (Qm)) )
854 #define vclzq_s16(Qm) ( __neon_QdQm( 0xf3b404c0, (Qm)) )
855 #define vclzq_s32(Qm) ( __neon_QdQm( 0xf3b804c0, (Qm)) )
856 #define vclzq_s8(Qm) ( __neon_QdQm( 0xf3b004c0, (Qm)) )
857 #define vclzq_u16(Qm) ( __neon_QdQm( 0xf3b404c0, (Qm)) )
858 #define vclzq_u32(Qm) ( __neon_QdQm( 0xf3b804c0, (Qm)) )
859 #define vclzq_u8(Qm) ( __neon_QdQm( 0xf3b004c0, (Qm)) )
860 
861 // VCLT (immediate #0)
862 #define vclt_z_f32_ex(Dm) ( __neon_DdDm( 0xf3b90600, (Dm)) )
863 #define vclt_z_s16_ex(Dm) ( __neon_DdDm( 0xf3b50200, (Dm)) )
864 #define vclt_z_s32_ex(Dm) ( __neon_DdDm( 0xf3b90200, (Dm)) )
865 #define vclt_z_s8_ex(Dm) ( __neon_DdDm( 0xf3b10200, (Dm)) )
866 #define vcltq_z_f32_ex(Qm) ( __neon_QdQm( 0xf3b90640, (Qm)) )
867 #define vcltq_z_s16_ex(Qm) ( __neon_QdQm( 0xf3b50240, (Qm)) )
868 #define vcltq_z_s32_ex(Qm) ( __neon_QdQm( 0xf3b90240, (Qm)) )
869 #define vcltq_z_s8_ex(Qm) ( __neon_QdQm( 0xf3b10240, (Qm)) )
870 
871 // VCNT
872 #define vcnt_p8(Dm) ( __neon_DdDm( 0xf3b00500, (Dm)) )
873 #define vcnt_s8(Dm) ( __neon_DdDm( 0xf3b00500, (Dm)) )
874 #define vcnt_u8(Dm) ( __neon_DdDm( 0xf3b00500, (Dm)) )
875 #define vcntq_p8(Qm) ( __neon_QdQm( 0xf3b00540, (Qm)) )
876 #define vcntq_s8(Qm) ( __neon_QdQm( 0xf3b00540, (Qm)) )
877 #define vcntq_u8(Qm) ( __neon_QdQm( 0xf3b00540, (Qm)) )
878 
879 // VCOMBINE (combine 2x64bit into a 128bit register)
880 #define vcombine_f32(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
881 #define vcombine_p16(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
882 #define vcombine_p8(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
883 #define vcombine_s16(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
884 #define vcombine_s32(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
885 #define vcombine_s64(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
886 #define vcombine_s8(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
887 #define vcombine_u16(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
888 #define vcombine_u32(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
889 #define vcombine_u64(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
890 #define vcombine_u8(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
891 
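// Usage sketch (illustrative, not part of the original header): vcombine
// packs two D registers into one Q register without going through memory.
static __n128 demo_vcombine(__n64 lo, __n64 hi)
{
    return vcombine_u8(lo, hi);   // lo -> low64, hi -> high64 of the result
}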
892 // VCREATE (ARM core register pair to Neon 64bit register)
893 #define vcreate_f32(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
894 #define vcreate_p16(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
895 #define vcreate_p8(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
896 #define vcreate_s16(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
897 #define vcreate_s32(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
898 #define vcreate_s64(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
899 #define vcreate_s8(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
900 #define vcreate_u16(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
901 #define vcreate_u32(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
902 #define vcreate_u64(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
903 #define vcreate_u8(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
904 
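// Usage sketch (illustrative, not part of the original header): vcreate moves
// a 64-bit value from an ARM core register pair into a D register; lane 0
// takes the least significant byte.
static __n64 demo_vcreate(uint64_t packed)
{
    return vcreate_u8(packed);
}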
905 // VCVT (between floating-point and fixed-point)
906 #define vcvt_n_f32_s32(Dm, fbits) ( __static_assert((fbits) >= 1 && (fbits) <= 32, "invalid fbits value"), __neon_DdDm( 0xf2800e10 | _NENC_21_16(64 - (fbits)), (Dm)) )
907 #define vcvt_n_f32_u32(Dm, fbits) ( __static_assert((fbits) >= 1 && (fbits) <= 32, "invalid fbits value"), __neon_DdDm( 0xf3800e10 | _NENC_21_16(64 - (fbits)), (Dm)) )
908 #define vcvt_n_s32_f32(Dm, fbits) ( __static_assert((fbits) >= 1 && (fbits) <= 32, "invalid fbits value"), __neon_DdDm( 0xf2800f10 | _NENC_21_16(64 - (fbits)), (Dm)) )
909 #define vcvt_n_u32_f32(Dm, fbits) ( __static_assert((fbits) >= 1 && (fbits) <= 32, "invalid fbits value"), __neon_DdDm( 0xf3800f10 | _NENC_21_16(64 - (fbits)), (Dm)) )
910 #define vcvtq_n_f32_s32(Qm, fbits) ( __static_assert((fbits) >= 1 && (fbits) <= 32, "invalid fbits value"), __neon_QdQm( 0xf2800e50 | _NENC_21_16(64 - (fbits)), (Qm)) )
911 #define vcvtq_n_f32_u32(Qm, fbits) ( __static_assert((fbits) >= 1 && (fbits) <= 32, "invalid fbits value"), __neon_QdQm( 0xf3800e50 | _NENC_21_16(64 - (fbits)), (Qm)) )
912 #define vcvtq_n_s32_f32(Qm, fbits) ( __static_assert((fbits) >= 1 && (fbits) <= 32, "invalid fbits value"), __neon_QdQm( 0xf2800f50 | _NENC_21_16(64 - (fbits)), (Qm)) )
913 #define vcvtq_n_u32_f32(Qm, fbits) ( __static_assert((fbits) >= 1 && (fbits) <= 32, "invalid fbits value"), __neon_QdQm( 0xf3800f50 | _NENC_21_16(64 - (fbits)), (Qm)) )
914 
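// Usage sketch (illustrative, not part of the original header): fbits must be
// a constant in [1, 32] and gives the number of fractional bits on the
// fixed-point side of the conversion.
static __n64 demo_vcvt_fixed(__n64 floats)
{
    return vcvt_n_s32_f32(floats, 16);   // each float lane scaled by 2^16 into a Q16.16 s32
}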
915 // VCVT (between floating-point and integer with directed rounding)
916 #define vcvta_s32_f32(Dm) ( __neon_DdDm( 0xf3bb0000, (Dm)) )
917 #define vcvta_u32_f32(Dm) ( __neon_DdDm( 0xf3bb0080, (Dm)) )
918 #define vcvtm_s32_f32(Dm) ( __neon_DdDm( 0xf3bb0300, (Dm)) )
919 #define vcvtm_u32_f32(Dm) ( __neon_DdDm( 0xf3bb0380, (Dm)) )
920 #define vcvtn_s32_f32(Dm) ( __neon_DdDm( 0xf3bb0100, (Dm)) )
921 #define vcvtn_u32_f32(Dm) ( __neon_DdDm( 0xf3bb0180, (Dm)) )
922 #define vcvtp_s32_f32(Dm) ( __neon_DdDm( 0xf3bb0200, (Dm)) )
923 #define vcvtp_u32_f32(Dm) ( __neon_DdDm( 0xf3bb0280, (Dm)) )
924 #define vcvtaq_s32_f32(Qm) ( __neon_QdQm( 0xf3bb0040, (Qm)) )
925 #define vcvtaq_u32_f32(Qm) ( __neon_QdQm( 0xf3bb00c0, (Qm)) )
926 #define vcvtmq_s32_f32(Qm) ( __neon_QdQm( 0xf3bb0340, (Qm)) )
927 #define vcvtmq_u32_f32(Qm) ( __neon_QdQm( 0xf3bb03c0, (Qm)) )
928 #define vcvtnq_s32_f32(Qm) ( __neon_QdQm( 0xf3bb0140, (Qm)) )
929 #define vcvtnq_u32_f32(Qm) ( __neon_QdQm( 0xf3bb01c0, (Qm)) )
930 #define vcvtpq_s32_f32(Qm) ( __neon_QdQm( 0xf3bb0240, (Qm)) )
931 #define vcvtpq_u32_f32(Qm) ( __neon_QdQm( 0xf3bb02c0, (Qm)) )
932 
933 // VCVT (between floating-point and integer)
934 #define vcvt_f32_s32(Dm) ( __neon_DdDm( 0xf3bb0600, (Dm)) )
935 #define vcvt_f32_u32(Dm) ( __neon_DdDm( 0xf3bb0680, (Dm)) )
936 #define vcvt_s32_f32(Dm) ( __neon_DdDm( 0xf3bb0700, (Dm)) )
937 #define vcvt_u32_f32(Dm) ( __neon_DdDm( 0xf3bb0780, (Dm)) )
938 #define vcvtq_f32_s32(Qm) ( __neon_QdQm( 0xf3bb0640, (Qm)) )
939 #define vcvtq_f32_u32(Qm) ( __neon_QdQm( 0xf3bb06c0, (Qm)) )
940 #define vcvtq_s32_f32(Qm) ( __neon_QdQm( 0xf3bb0740, (Qm)) )
941 #define vcvtq_u32_f32(Qm) ( __neon_QdQm( 0xf3bb07c0, (Qm)) )
942 
943 // VDUP (scalar)
944 #define vdup_lane_f32(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDm( 0xf3b40c00 | _NENC_19(lane), (Dm)) )
945 #define vdup_lane_p16(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDm( 0xf3b20c00 | _NENC_19_18(lane), (Dm)) )
946 #define vdup_lane_p8(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_DdDm( 0xf3b10c00 | _NENC_19_17(lane), (Dm)) )
947 #define vdup_lane_s16(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDm( 0xf3b20c00 | _NENC_19_18(lane), (Dm)) )
948 #define vdup_lane_s32(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDm( 0xf3b40c00 | _NENC_19(lane), (Dm)) )
949 #define vdup_lane_s8(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_DdDm( 0xf3b10c00 | _NENC_19_17(lane), (Dm)) )
950 #define vdup_lane_u16(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDm( 0xf3b20c00 | _NENC_19_18(lane), (Dm)) )
951 #define vdup_lane_u32(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDm( 0xf3b40c00 | _NENC_19(lane), (Dm)) )
952 #define vdup_lane_u8(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_DdDm( 0xf3b10c00 | _NENC_19_17(lane), (Dm)) )
953 #define vdupq_lane_f32(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDm( 0xf3b40c40 | _NENC_19(lane), (Dm)) )
954 #define vdupq_lane_p16(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDm( 0xf3b20c40 | _NENC_19_18(lane), (Dm)) )
955 #define vdupq_lane_p8(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_QdDm( 0xf3b10c40 | _NENC_19_17(lane), (Dm)) )
956 #define vdupq_lane_s16(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDm( 0xf3b20c40 | _NENC_19_18(lane), (Dm)) )
957 #define vdupq_lane_s32(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDm( 0xf3b40c40 | _NENC_19(lane), (Dm)) )
958 #define vdupq_lane_s8(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_QdDm( 0xf3b10c40 | _NENC_19_17(lane), (Dm)) )
959 #define vdupq_lane_u16(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDm( 0xf3b20c40 | _NENC_19_18(lane), (Dm)) )
960 #define vdupq_lane_u32(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDm( 0xf3b40c40 | _NENC_19(lane), (Dm)) )
961 #define vdupq_lane_u8(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_QdDm( 0xf3b10c40 | _NENC_19_17(lane), (Dm)) )
962 
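// Usage sketch (illustrative, not part of the original header): the lane
// index must be a constant; every destination lane receives the selected
// source lane.
static __n64 demo_vdup_lane(__n64 v)
{
    return vdup_lane_s16(v, 1);   // all four 16-bit lanes take the value of lane 1
}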
963 // VDUP, VMOV (ARM core register to Neon register)
964 #define vdup_n_f32(Ft) ( __neon_DdFt( 0xee800b10, (Ft)) )
965 #define vmov_n_f32(Ft) ( __neon_DdFt( 0xee800b10, (Ft)) )
966 #define vdup_n_p16(Rt) ( __neon_DdRt( 0xee800b30, __poly16ToInt32(Rt)) )
967 #define vdup_n_p8(Rt) ( __neon_DdRt( 0xeec00b10, __poly8ToInt32(Rt)) )
968 #define vdup_n_s16(Rt) ( __neon_DdRt( 0xee800b30, __int16ToInt32(Rt)) )
969 #define vdup_n_s32(Rt) ( __neon_DdRt( 0xee800b10, __int32ToInt32(Rt)) )
970 #define vdup_n_s8(Rt) ( __neon_DdRt( 0xeec00b10, __int8ToInt32(Rt)) )
971 #define vdup_n_u16(Rt) ( __neon_DdRt( 0xee800b30, __uint16ToInt32(Rt)) )
972 #define vdup_n_u32(Rt) ( __neon_DdRt( 0xee800b10, __uint32ToInt32(Rt)) )
973 #define vdup_n_u8(Rt) ( __neon_DdRt( 0xeec00b10, __uint8ToInt32(Rt)) )
974 #define vmov_n_p16(Rt) ( __neon_DdRt( 0xee800b30, __poly16ToInt32(Rt)) )
975 #define vmov_n_p8(Rt) ( __neon_DdRt( 0xeec00b10, __poly8ToInt32(Rt)) )
976 #define vmov_n_s16(Rt) ( __neon_DdRt( 0xee800b30, __int16ToInt32(Rt)) )
977 #define vmov_n_s32(Rt) ( __neon_DdRt( 0xee800b10, __int32ToInt32(Rt)) )
978 #define vmov_n_s8(Rt) ( __neon_DdRt( 0xeec00b10, __int8ToInt32(Rt)) )
979 #define vmov_n_u16(Rt) ( __neon_DdRt( 0xee800b30, __uint16ToInt32(Rt)) )
980 #define vmov_n_u32(Rt) ( __neon_DdRt( 0xee800b10, __uint32ToInt32(Rt)) )
981 #define vmov_n_u8(Rt) ( __neon_DdRt( 0xeec00b10, __uint8ToInt32(Rt)) )
982 #define vdupq_n_f32(Ft) ( __neon_QdFt( 0xeea00b10, (Ft)) )
983 #define vmovq_n_f32(Ft) ( __neon_QdFt( 0xeea00b10, (Ft)) )
984 #define vdupq_n_p16(Rt) ( __neon_QdRt( 0xeea00b30, __poly16ToInt32(Rt)) )
985 #define vdupq_n_p8(Rt) ( __neon_QdRt( 0xeee00b10, __poly8ToInt32(Rt)) )
986 #define vdupq_n_s16(Rt) ( __neon_QdRt( 0xeea00b30, __int16ToInt32(Rt)) )
987 #define vdupq_n_s32(Rt) ( __neon_QdRt( 0xeea00b10, __int32ToInt32(Rt)) )
988 #define vdupq_n_s8(Rt) ( __neon_QdRt( 0xeee00b10, __int8ToInt32(Rt)) )
989 #define vdupq_n_u16(Rt) ( __neon_QdRt( 0xeea00b30, __uint16ToInt32(Rt)) )
990 #define vdupq_n_u32(Rt) ( __neon_QdRt( 0xeea00b10, __uint32ToInt32(Rt)) )
991 #define vdupq_n_u8(Rt) ( __neon_QdRt( 0xeee00b10, __uint8ToInt32(Rt)) )
992 #define vmovq_n_p16(Rt) ( __neon_QdRt( 0xeea00b30, __poly16ToInt32(Rt)) )
993 #define vmovq_n_p8(Rt) ( __neon_QdRt( 0xeee00b10, __poly8ToInt32(Rt)) )
994 #define vmovq_n_s16(Rt) ( __neon_QdRt( 0xeea00b30, __int16ToInt32(Rt)) )
995 #define vmovq_n_s32(Rt) ( __neon_QdRt( 0xeea00b10, __int32ToInt32(Rt)) )
996 #define vmovq_n_s8(Rt) ( __neon_QdRt( 0xeee00b10, __int8ToInt32(Rt)) )
997 #define vmovq_n_u16(Rt) ( __neon_QdRt( 0xeea00b30, __uint16ToInt32(Rt)) )
998 #define vmovq_n_u32(Rt) ( __neon_QdRt( 0xeea00b10, __uint32ToInt32(Rt)) )
999 #define vmovq_n_u8(Rt) ( __neon_QdRt( 0xeee00b10, __uint8ToInt32(Rt)) )
1000 
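// Usage sketch (illustrative, not part of the original header): the _n forms
// broadcast a scalar held in an ARM core or VFP register into every lane.
static __n128 demo_vdupq_n(float x)
{
    return vdupq_n_f32(x);   // all four float lanes set to x
}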
1001 // VDUP.64, VMOV.64 (ARM core register pair to Neon registers)
1002 #define vdup_n_s64(R64t) ( __neon_DdRtRt2( 0xec400b10, __int64ToInt64(R64t)) )
1003 #define vdup_n_u64(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
1004 #define vmov_n_s64(R64t) ( __neon_DdRtRt2( 0xec400b10, __int64ToInt64(R64t)) )
1005 #define vmov_n_u64(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
1006 #define vdupq_n_s64(R64t) ( __neon_QdRtRt2_dup( 0xec400b10, __int64ToInt64(R64t)) )
1007 #define vdupq_n_u64(R64t) ( __neon_QdRtRt2_dup( 0xec400b10, __uint64ToInt64(R64t)) )
1008 #define vmovq_n_s64(R64t) ( __neon_QdRtRt2_dup( 0xec400b10, __int64ToInt64(R64t)) )
1009 #define vmovq_n_u64(R64t) ( __neon_QdRtRt2_dup( 0xec400b10, __uint64ToInt64(R64t)) )
1010 
1011 // VEOR, VBIC, VORN
1012 #define vbic_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100110, (Dn), (Dm)) )
1013 #define vbic_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2100110, (Dn), (Dm)) )
1014 #define vbic_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2100110, (Dn), (Dm)) )
1015 #define vbic_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2100110, (Dn), (Dm)) )
1016 #define vbic_u16(Dn, Dm) ( __neon_DdDnDm( 0xf2100110, (Dn), (Dm)) )
1017 #define vbic_u32(Dn, Dm) ( __neon_DdDnDm( 0xf2100110, (Dn), (Dm)) )
1018 #define vbic_u64(Dn, Dm) ( __neon_DdDnDm( 0xf2100110, (Dn), (Dm)) )
1019 #define vbic_u8(Dn, Dm) ( __neon_DdDnDm( 0xf2100110, (Dn), (Dm)) )
1020 #define veor_s16(Dn, Dm) ( __neon_DdDnDm( 0xf3000110, (Dn), (Dm)) )
1021 #define veor_s32(Dn, Dm) ( __neon_DdDnDm( 0xf3000110, (Dn), (Dm)) )
1022 #define veor_s64(Dn, Dm) ( __neon_DdDnDm( 0xf3000110, (Dn), (Dm)) )
1023 #define veor_s8(Dn, Dm) ( __neon_DdDnDm( 0xf3000110, (Dn), (Dm)) )
1024 #define veor_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3000110, (Dn), (Dm)) )
1025 #define veor_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3000110, (Dn), (Dm)) )
1026 #define veor_u64(Dn, Dm) ( __neon_DdDnDm( 0xf3000110, (Dn), (Dm)) )
1027 #define veor_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000110, (Dn), (Dm)) )
1028 #define vorn_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2300110, (Dn), (Dm)) )
1029 #define vorn_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2300110, (Dn), (Dm)) )
1030 #define vorn_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2300110, (Dn), (Dm)) )
1031 #define vorn_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2300110, (Dn), (Dm)) )
1032 #define vorn_u16(Dn, Dm) ( __neon_DdDnDm( 0xf2300110, (Dn), (Dm)) )
1033 #define vorn_u32(Dn, Dm) ( __neon_DdDnDm( 0xf2300110, (Dn), (Dm)) )
1034 #define vorn_u64(Dn, Dm) ( __neon_DdDnDm( 0xf2300110, (Dn), (Dm)) )
1035 #define vorn_u8(Dn, Dm) ( __neon_DdDnDm( 0xf2300110, (Dn), (Dm)) )
1036 #define vbicq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100150, (Qn), (Qm)) )
1037 #define vbicq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2100150, (Qn), (Qm)) )
1038 #define vbicq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2100150, (Qn), (Qm)) )
1039 #define vbicq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2100150, (Qn), (Qm)) )
1040 #define vbicq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf2100150, (Qn), (Qm)) )
1041 #define vbicq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2100150, (Qn), (Qm)) )
1042 #define vbicq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf2100150, (Qn), (Qm)) )
1043 #define vbicq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf2100150, (Qn), (Qm)) )
1044 #define veorq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf3000150, (Qn), (Qm)) )
1045 #define veorq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf3000150, (Qn), (Qm)) )
1046 #define veorq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf3000150, (Qn), (Qm)) )
1047 #define veorq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf3000150, (Qn), (Qm)) )
1048 #define veorq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3000150, (Qn), (Qm)) )
1049 #define veorq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3000150, (Qn), (Qm)) )
1050 #define veorq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf3000150, (Qn), (Qm)) )
1051 #define veorq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000150, (Qn), (Qm)) )
1052 #define vornq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2300150, (Qn), (Qm)) )
1053 #define vornq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2300150, (Qn), (Qm)) )
1054 #define vornq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2300150, (Qn), (Qm)) )
1055 #define vornq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2300150, (Qn), (Qm)) )
1056 #define vornq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf2300150, (Qn), (Qm)) )
1057 #define vornq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2300150, (Qn), (Qm)) )
1058 #define vornq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf2300150, (Qn), (Qm)) )
1059 #define vornq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf2300150, (Qn), (Qm)) )
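// Usage sketch (illustrative, not part of the original header): these are plain
// bitwise operations on the whole register, so the element-type suffix only
// selects the argument/result type. Given __n64 a, b:
//   __n64 x = veor_u8(a, b);   // a ^ b   (exclusive OR)
//   __n64 y = vbic_u8(a, b);   // a & ~b  (bit clear)
//   __n64 z = vorn_u8(a, b);   // a | ~b  (OR NOT)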
1060 
1061 // VEXT
1062 #define vext_f32(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 2, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8((pos) * 4), (Dn), (Dm)) )
1063 #define vext_p16(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 4, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8((pos) * 2), (Dn), (Dm)) )
1064 #define vext_p8(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 8, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8(pos), (Dn), (Dm)) )
1065 #define vext_s16(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 4, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8((pos) * 2), (Dn), (Dm)) )
1066 #define vext_s32(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 2, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8((pos) * 4), (Dn), (Dm)) )
1067 #define vext_s64(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 1, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8((pos) * 8), (Dn), (Dm)) )
1068 #define vext_s8(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 8, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8(pos), (Dn), (Dm)) )
1069 #define vext_u16(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 4, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8((pos) * 2), (Dn), (Dm)) )
1070 #define vext_u32(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 2, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8((pos) * 4), (Dn), (Dm)) )
1071 #define vext_u64(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 1, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8((pos) * 8), (Dn), (Dm)) )
1072 #define vext_u8(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 8, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8(pos), (Dn), (Dm)) )
1073 #define vextq_f32(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 4, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8((pos) * 4), (Qn), (Qm)) )
1074 #define vextq_p16(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 8, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8((pos) * 2), (Qn), (Qm)) )
1075 #define vextq_p8(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 16, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8(pos), (Qn), (Qm)) )
1076 #define vextq_s16(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 8, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8((pos) * 2), (Qn), (Qm)) )
1077 #define vextq_s32(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 4, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8((pos) * 4), (Qn), (Qm)) )
1078 #define vextq_s64(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 2, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8((pos) * 8), (Qn), (Qm)) )
1079 #define vextq_s8(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 16, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8(pos), (Qn), (Qm)) )
1080 #define vextq_u16(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 8, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8((pos) * 2), (Qn), (Qm)) )
1081 #define vextq_u32(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 4, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8((pos) * 4), (Qn), (Qm)) )
1082 #define vextq_u64(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 2, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8((pos) * 8), (Qn), (Qm)) )
1083 #define vextq_u8(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 16, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8(pos), (Qn), (Qm)) )
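// Usage sketch (illustrative, not part of the original header): vext extracts a
// contiguous window from the concatenation {Dn, Dm}, starting pos elements into
// Dn; the __static_assert guard rejects out-of-range positions at compile time.
// With __n64 a = {a0..a7} and b = {b0..b7}:
//   __n64 w = vext_u8(a, b, 3);   // {a3, a4, a5, a6, a7, b0, b1, b2}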
1084 
1085 // VGET (access the 64bit high/low part of a 128bit register)
1086 #define vget_high_f32(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1087 #define vget_high_p16(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1088 #define vget_high_p8(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1089 #define vget_high_s16(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1090 #define vget_high_s32(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1091 #define vget_high_s64(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1092 #define vget_high_s8(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1093 #define vget_high_u16(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1094 #define vget_high_u32(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1095 #define vget_high_u64(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1096 #define vget_high_u8(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1097 #define vget_low_f32(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1098 #define vget_low_p16(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1099 #define vget_low_p8(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1100 #define vget_low_s16(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1101 #define vget_low_s32(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1102 #define vget_low_s64(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1103 #define vget_low_s8(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1104 #define vget_low_u16(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1105 #define vget_low_u32(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1106 #define vget_low_u64(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1107 #define vget_low_u8(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
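// Usage sketch (illustrative, not part of the original header): vget_low and
// vget_high split a Q register into its two D halves without going through
// memory. Given __n128 q of eight 16-bit lanes:
//   __n64 lo = vget_low_s16(q);    // lanes 0..3 of q
//   __n64 hi = vget_high_s16(q);   // lanes 4..7 of q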
1108 
1109 // VHADD, VRHADD, VHSUB
1110 #define vhadd_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100000, (Dn), (Dm)) )
1111 #define vhadd_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200000, (Dn), (Dm)) )
1112 #define vhadd_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000000, (Dn), (Dm)) )
1113 #define vhadd_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100000, (Dn), (Dm)) )
1114 #define vhadd_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200000, (Dn), (Dm)) )
1115 #define vhadd_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000000, (Dn), (Dm)) )
1116 #define vhsub_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100200, (Dn), (Dm)) )
1117 #define vhsub_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200200, (Dn), (Dm)) )
1118 #define vhsub_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000200, (Dn), (Dm)) )
1119 #define vhsub_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100200, (Dn), (Dm)) )
1120 #define vhsub_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200200, (Dn), (Dm)) )
1121 #define vhsub_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000200, (Dn), (Dm)) )
1122 #define vrhadd_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100100, (Dn), (Dm)) )
1123 #define vrhadd_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200100, (Dn), (Dm)) )
1124 #define vrhadd_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000100, (Dn), (Dm)) )
1125 #define vrhadd_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100100, (Dn), (Dm)) )
1126 #define vrhadd_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200100, (Dn), (Dm)) )
1127 #define vrhadd_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000100, (Dn), (Dm)) )
1128 #define vhaddq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100040, (Qn), (Qm)) )
1129 #define vhaddq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200040, (Qn), (Qm)) )
1130 #define vhaddq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000040, (Qn), (Qm)) )
1131 #define vhaddq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100040, (Qn), (Qm)) )
1132 #define vhaddq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200040, (Qn), (Qm)) )
1133 #define vhaddq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000040, (Qn), (Qm)) )
1134 #define vhsubq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100240, (Qn), (Qm)) )
1135 #define vhsubq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200240, (Qn), (Qm)) )
1136 #define vhsubq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000240, (Qn), (Qm)) )
1137 #define vhsubq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100240, (Qn), (Qm)) )
1138 #define vhsubq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200240, (Qn), (Qm)) )
1139 #define vhsubq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000240, (Qn), (Qm)) )
1140 #define vrhaddq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100140, (Qn), (Qm)) )
1141 #define vrhaddq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200140, (Qn), (Qm)) )
1142 #define vrhaddq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000140, (Qn), (Qm)) )
1143 #define vrhaddq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100140, (Qn), (Qm)) )
1144 #define vrhaddq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200140, (Qn), (Qm)) )
1145 #define vrhaddq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000140, (Qn), (Qm)) )
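// Usage sketch (illustrative, not part of the original header): the halving
// forms compute the full-precision per-lane sum or difference and then shift
// right by one, so they cannot overflow; vrhadd additionally adds 1 before the
// shift (rounding). Per lane:
//   vhadd_s16(a, b)   -> (a[i] + b[i]) >> 1
//   vrhadd_s16(a, b)  -> (a[i] + b[i] + 1) >> 1
//   vhsub_s16(a, b)   -> (a[i] - b[i]) >> 1
//   __n64 avg = vrhadd_u8(a, b);   // rounded average of corresponding bytes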
1146 
1147 // VLD1 (multiple single elements)
1148 #define vld1_f32(pcD) ( __neon_D1Adr( 0xf420078f, __float32ToN64_c(pcD)) )
1149 #define vld1_p16(pcD) ( __neon_D1Adr( 0xf420074f, __poly16ToN64_c(pcD)) )
1150 #define vld1_p8(pcD) ( __neon_D1Adr( 0xf420070f, __poly8ToN64_c(pcD)) )
1151 #define vld1_s16(pcD) ( __neon_D1Adr( 0xf420074f, __int16ToN64_c(pcD)) )
1152 #define vld1_s32(pcD) ( __neon_D1Adr( 0xf420078f, __int32ToN64_c(pcD)) )
1153 #define vld1_s64(pcD) ( __neon_D1Adr( 0xf42007cf, __int64ToN64_c(pcD)) )
1154 #define vld1_s8(pcD) ( __neon_D1Adr( 0xf420070f, __int8ToN64_c(pcD)) )
1155 #define vld1_u16(pcD) ( __neon_D1Adr( 0xf420074f, __uint16ToN64_c(pcD)) )
1156 #define vld1_u32(pcD) ( __neon_D1Adr( 0xf420078f, __uint32ToN64_c(pcD)) )
1157 #define vld1_u64(pcD) ( __neon_D1Adr( 0xf42007cf, __uint64ToN64_c(pcD)) )
1158 #define vld1_u8(pcD) ( __neon_D1Adr( 0xf420070f, __uint8ToN64_c(pcD)) )
1159 #define vld1_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf420078f | _NENC_5_4(_NEON_ALIGN64(align)), __float32ToN64_c(pcD)) )
1160 #define vld1_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf420074f | _NENC_5_4(_NEON_ALIGN64(align)), __poly16ToN64_c(pcD)) )
1161 #define vld1_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf420070f | _NENC_5_4(_NEON_ALIGN64(align)), __poly8ToN64_c(pcD)) )
1162 #define vld1_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf420074f | _NENC_5_4(_NEON_ALIGN64(align)), __int16ToN64_c(pcD)) )
1163 #define vld1_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf420078f | _NENC_5_4(_NEON_ALIGN64(align)), __int32ToN64_c(pcD)) )
1164 #define vld1_s64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf42007cf | _NENC_5_4(_NEON_ALIGN64(align)), __int64ToN64_c(pcD)) )
1165 #define vld1_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf420070f | _NENC_5_4(_NEON_ALIGN64(align)), __int8ToN64_c(pcD)) )
1166 #define vld1_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf420074f | _NENC_5_4(_NEON_ALIGN64(align)), __uint16ToN64_c(pcD)) )
1167 #define vld1_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf420078f | _NENC_5_4(_NEON_ALIGN64(align)), __uint32ToN64_c(pcD)) )
1168 #define vld1_u64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf42007cf | _NENC_5_4(_NEON_ALIGN64(align)), __uint64ToN64_c(pcD)) )
1169 #define vld1_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf420070f | _NENC_5_4(_NEON_ALIGN64(align)), __uint8ToN64_c(pcD)) )
1170 #define vld1q_f32(pcD) ( __neon_Q1Adr( 0xf4200a8f, __float32ToN64_c(pcD)) )
1171 #define vld1q_p16(pcD) ( __neon_Q1Adr( 0xf4200a4f, __poly16ToN64_c(pcD)) )
1172 #define vld1q_p8(pcD) ( __neon_Q1Adr( 0xf4200a0f, __poly8ToN64_c(pcD)) )
1173 #define vld1q_s16(pcD) ( __neon_Q1Adr( 0xf4200a4f, __int16ToN64_c(pcD)) )
1174 #define vld1q_s32(pcD) ( __neon_Q1Adr( 0xf4200a8f, __int32ToN64_c(pcD)) )
1175 #define vld1q_s64(pcD) ( __neon_Q1Adr( 0xf4200acf, __int64ToN64_c(pcD)) )
1176 #define vld1q_s8(pcD) ( __neon_Q1Adr( 0xf4200a0f, __int8ToN64_c(pcD)) )
1177 #define vld1q_u16(pcD) ( __neon_Q1Adr( 0xf4200a4f, __uint16ToN64_c(pcD)) )
1178 #define vld1q_u32(pcD) ( __neon_Q1Adr( 0xf4200a8f, __uint32ToN64_c(pcD)) )
1179 #define vld1q_u64(pcD) ( __neon_Q1Adr( 0xf4200acf, __uint64ToN64_c(pcD)) )
1180 #define vld1q_u8(pcD) ( __neon_Q1Adr( 0xf4200a0f, __uint8ToN64_c(pcD)) )
1181 #define vld1q_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200a8f | _NENC_5_4(_NEON_ALIGN64_128(align)), __float32ToN64_c(pcD)) )
1182 #define vld1q_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200a4f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly16ToN64_c(pcD)) )
1183 #define vld1q_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200a0f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly8ToN64_c(pcD)) )
1184 #define vld1q_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200a4f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int16ToN64_c(pcD)) )
1185 #define vld1q_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200a8f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int32ToN64_c(pcD)) )
1186 #define vld1q_s64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __int64ToN64_c(pcD)) )
1187 #define vld1q_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200a0f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int8ToN64_c(pcD)) )
1188 #define vld1q_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200a4f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint16ToN64_c(pcD)) )
1189 #define vld1q_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200a8f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint32ToN64_c(pcD)) )
1190 #define vld1q_u64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint64ToN64_c(pcD)) )
1191 #define vld1q_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200a0f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint8ToN64_c(pcD)) )
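// Usage sketch (illustrative, not part of the original header): vld1/vld1q load
// 8 or 16 consecutive elements into a D or Q register; the _ex variants take an
// additional alignment hint, validated at compile time by the _NEON_ALIGN64 /
// _NEON_ALIGN64_128 checks and encoded into bits 5:4 of the instruction:
//   __n64  d = vld1_u8(src);      // src:  const uint8_t*,  loads 8 bytes
//   __n128 q = vld1q_f32(fsrc);   // fsrc: const float32_t*, loads 4 floats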
1192 
1193 // VLD1 (single element to all lanes)
1194 #define vld1_dup_f32(pcD) ( __neon_D1Adr( 0xf4a00c8f, __float32ToN64_c(pcD)) )
1195 #define vld1_dup_p16(pcD) ( __neon_D1Adr( 0xf4a00c4f, __poly16ToN64_c(pcD)) )
1196 #define vld1_dup_p8(pcD) ( __neon_D1Adr( 0xf4a00c0f, __poly8ToN64_c(pcD)) )
1197 #define vld1_dup_s16(pcD) ( __neon_D1Adr( 0xf4a00c4f, __int16ToN64_c(pcD)) )
1198 #define vld1_dup_s32(pcD) ( __neon_D1Adr( 0xf4a00c8f, __int32ToN64_c(pcD)) )
1199 #define vld1_dup_s8(pcD) ( __neon_D1Adr( 0xf4a00c0f, __int8ToN64_c(pcD)) )
1200 #define vld1_dup_u16(pcD) ( __neon_D1Adr( 0xf4a00c4f, __uint16ToN64_c(pcD)) )
1201 #define vld1_dup_u32(pcD) ( __neon_D1Adr( 0xf4a00c8f, __uint32ToN64_c(pcD)) )
1202 #define vld1_dup_u8(pcD) ( __neon_D1Adr( 0xf4a00c0f, __uint8ToN64_c(pcD)) )
1203 #define vld1q_dup_f32(pcD) ( __neon_Q1Adr( 0xf4a00caf, __float32ToN64_c(pcD)) )
1204 #define vld1q_dup_p16(pcD) ( __neon_Q1Adr( 0xf4a00c6f, __poly16ToN64_c(pcD)) )
1205 #define vld1q_dup_p8(pcD) ( __neon_Q1Adr( 0xf4a00c2f, __poly8ToN64_c(pcD)) )
1206 #define vld1q_dup_s16(pcD) ( __neon_Q1Adr( 0xf4a00c6f, __int16ToN64_c(pcD)) )
1207 #define vld1q_dup_s32(pcD) ( __neon_Q1Adr( 0xf4a00caf, __int32ToN64_c(pcD)) )
1208 #define vld1q_dup_s8(pcD) ( __neon_Q1Adr( 0xf4a00c2f, __int8ToN64_c(pcD)) )
1209 #define vld1q_dup_u16(pcD) ( __neon_Q1Adr( 0xf4a00c6f, __uint16ToN64_c(pcD)) )
1210 #define vld1q_dup_u32(pcD) ( __neon_Q1Adr( 0xf4a00caf, __uint32ToN64_c(pcD)) )
1211 #define vld1q_dup_u8(pcD) ( __neon_Q1Adr( 0xf4a00c2f, __uint8ToN64_c(pcD)) )
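// Usage sketch (illustrative, not part of the original header): the _dup forms
// load a single element from memory and replicate it across every lane:
//   __n128 q = vld1q_dup_f32(pf);   // all four float lanes = *pf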
1212 
1213 // VLD1 (single element to all lanes, aligned)
1214 #define vld1_dup_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_D1Adr( 0xf4a00c8f | _NENC_4(_NEON_ALIGN32(align)), __float32ToN64_c(pcD)) )
1215 #define vld1_dup_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_D1Adr( 0xf4a00c4f | _NENC_4(_NEON_ALIGN16(align)), __poly16ToN64_c(pcD)) )
1216 #define vld1_dup_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_D1Adr( 0xf4a00c4f | _NENC_4(_NEON_ALIGN16(align)), __int16ToN64_c(pcD)) )
1217 #define vld1_dup_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_D1Adr( 0xf4a00c8f | _NENC_4(_NEON_ALIGN32(align)), __int32ToN64_c(pcD)) )
1218 #define vld1_dup_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_D1Adr( 0xf4a00c4f | _NENC_4(_NEON_ALIGN16(align)), __uint16ToN64_c(pcD)) )
1219 #define vld1_dup_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_D1Adr( 0xf4a00c8f | _NENC_4(_NEON_ALIGN32(align)), __uint32ToN64_c(pcD)) )
1220 #define vld1q_dup_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4a00caf | _NENC_4(_NEON_ALIGN32(align)), __float32ToN64_c(pcD)) )
1221 #define vld1q_dup_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4a00c6f | _NENC_4(_NEON_ALIGN16(align)), __poly16ToN64_c(pcD)) )
1222 #define vld1q_dup_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4a00c6f | _NENC_4(_NEON_ALIGN16(align)), __int16ToN64_c(pcD)) )
1223 #define vld1q_dup_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4a00caf | _NENC_4(_NEON_ALIGN32(align)), __int32ToN64_c(pcD)) )
1224 #define vld1q_dup_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4a00c6f | _NENC_4(_NEON_ALIGN16(align)), __uint16ToN64_c(pcD)) )
1225 #define vld1q_dup_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4a00caf | _NENC_4(_NEON_ALIGN32(align)), __uint32ToN64_c(pcD)) )
1226 
1227 // VLD1 (single element to one lane)
1228 #define vld1_lane_f32(pcD, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_D1Adr_acc( 0xf4a0080f | _NENC_7(lane), (Dd), __float32ToN64_c(pcD)) )
1229 #define vld1_lane_p16(pcD, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_D1Adr_acc( 0xf4a0040f | _NENC_7_6(lane), (Dd), __poly16ToN64_c(pcD)) )
1230 #define vld1_lane_p8(pcD, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_D1Adr_acc( 0xf4a0000f | _NENC_7_5(lane), (Dd), __poly8ToN64_c(pcD)) )
1231 #define vld1_lane_s16(pcD, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_D1Adr_acc( 0xf4a0040f | _NENC_7_6(lane), (Dd), __int16ToN64_c(pcD)) )
1232 #define vld1_lane_s32(pcD, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_D1Adr_acc( 0xf4a0080f | _NENC_7(lane), (Dd), __int32ToN64_c(pcD)) )
1233 #define vld1_lane_s8(pcD, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_D1Adr_acc( 0xf4a0000f | _NENC_7_5(lane), (Dd), __int8ToN64_c(pcD)) )
1234 #define vld1_lane_u16(pcD, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_D1Adr_acc( 0xf4a0040f | _NENC_7_6(lane), (Dd), __uint16ToN64_c(pcD)) )
1235 #define vld1_lane_u32(pcD, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_D1Adr_acc( 0xf4a0080f | _NENC_7(lane), (Dd), __uint32ToN64_c(pcD)) )
1236 #define vld1_lane_u8(pcD, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_D1Adr_acc( 0xf4a0000f | _NENC_7_5(lane), (Dd), __uint8ToN64_c(pcD)) )
1237 #define vld1q_lane_f32(pcD, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Q1Adr_acc( 0xf4a0080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Qd), __float32ToN64_c(pcD)) )
1238 #define vld1q_lane_p16(pcD, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Q1Adr_acc( 0xf4a0040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Qd), __poly16ToN64_c(pcD)) )
1239 #define vld1q_lane_p8(pcD, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), __neon_Q1Adr_acc( 0xf4a0000f | _NENC_7_5((lane) % 8) | _NENC_12((lane) >= 8 ? 1 : 0), (Qd), __poly8ToN64_c(pcD)) )
1240 #define vld1q_lane_s16(pcD, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Q1Adr_acc( 0xf4a0040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Qd), __int16ToN64_c(pcD)) )
1241 #define vld1q_lane_s32(pcD, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Q1Adr_acc( 0xf4a0080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Qd), __int32ToN64_c(pcD)) )
1242 #define vld1q_lane_s8(pcD, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), __neon_Q1Adr_acc( 0xf4a0000f | _NENC_7_5((lane) % 8) | _NENC_12((lane) >= 8 ? 1 : 0), (Qd), __int8ToN64_c(pcD)) )
1243 #define vld1q_lane_u16(pcD, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Q1Adr_acc( 0xf4a0040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Qd), __uint16ToN64_c(pcD)) )
1244 #define vld1q_lane_u32(pcD, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Q1Adr_acc( 0xf4a0080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Qd), __uint32ToN64_c(pcD)) )
1245 #define vld1q_lane_u8(pcD, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), __neon_Q1Adr_acc( 0xf4a0000f | _NENC_7_5((lane) % 8) | _NENC_12((lane) >= 8 ? 1 : 0), (Qd), __uint8ToN64_c(pcD)) )
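// Usage sketch (illustrative, not part of the original header): the _lane forms
// load one element into the selected lane and keep the remaining lanes of the
// existing vector (note the pointer comes first, then the vector, then the lane):
//   __n128 q2 = vld1q_lane_f32(pf, q, 2);   // lane 2 = *pf, lanes 0, 1, 3 kept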
1246 
1247 // VLD1 (single element to one lane, aligned)
1248 #define vld1_lane_f32_ex(pcD, Dd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_D1Adr_acc( 0xf4a0080f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), (Dd), __float32ToN64_c(pcD)) )
1249 #define vld1_lane_p16_ex(pcD, Dd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_D1Adr_acc( 0xf4a0040f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN16(align)), (Dd), __poly16ToN64_c(pcD)) )
1250 #define vld1_lane_s16_ex(pcD, Dd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_D1Adr_acc( 0xf4a0040f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN16(align)), (Dd), __int16ToN64_c(pcD)) )
1251 #define vld1_lane_s32_ex(pcD, Dd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_D1Adr_acc( 0xf4a0080f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), (Dd), __int32ToN64_c(pcD)) )
1252 #define vld1_lane_u16_ex(pcD, Dd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_D1Adr_acc( 0xf4a0040f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN16(align)), (Dd), __uint16ToN64_c(pcD)) )
1253 #define vld1_lane_u32_ex(pcD, Dd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_D1Adr_acc( 0xf4a0080f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), (Dd), __uint32ToN64_c(pcD)) )
1254 #define vld1q_lane_f32_ex(pcD, Qd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Q1Adr_acc( 0xf4a0080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), (Qd), __float32ToN64_c(pcD)) )
1255 #define vld1q_lane_p16_ex(pcD, Qd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Q1Adr_acc( 0xf4a0040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN16(align)), (Qd), __poly16ToN64_c(pcD)) )
1256 #define vld1q_lane_s16_ex(pcD, Qd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Q1Adr_acc( 0xf4a0040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN16(align)), (Qd), __int16ToN64_c(pcD)) )
1257 #define vld1q_lane_s32_ex(pcD, Qd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Q1Adr_acc( 0xf4a0080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), (Qd), __int32ToN64_c(pcD)) )
1258 #define vld1q_lane_u16_ex(pcD, Qd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Q1Adr_acc( 0xf4a0040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN16(align)), (Qd), __uint16ToN64_c(pcD)) )
1259 #define vld1q_lane_u32_ex(pcD, Qd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Q1Adr_acc( 0xf4a0080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), (Qd), __uint32ToN64_c(pcD)) )
1260 
1261 // VLD2 (multiple 2-element structures)
1262 #define vld2_f32(pcD) ( __neon_Dx2Adr( 0xf420088f, __float32ToN64_c(pcD)) )
1263 #define vld2_p16(pcD) ( __neon_Dx2Adr( 0xf420084f, __poly16ToN64_c(pcD)) )
1264 #define vld2_p8(pcD) ( __neon_Dx2Adr( 0xf420080f, __poly8ToN64_c(pcD)) )
1265 #define vld2_s16(pcD) ( __neon_Dx2Adr( 0xf420084f, __int16ToN64_c(pcD)) )
1266 #define vld2_s32(pcD) ( __neon_Dx2Adr( 0xf420088f, __int32ToN64_c(pcD)) )
1267 #define vld2_s8(pcD) ( __neon_Dx2Adr( 0xf420080f, __int8ToN64_c(pcD)) )
1268 #define vld2_u16(pcD) ( __neon_Dx2Adr( 0xf420084f, __uint16ToN64_c(pcD)) )
1269 #define vld2_u32(pcD) ( __neon_Dx2Adr( 0xf420088f, __uint32ToN64_c(pcD)) )
1270 #define vld2_u8(pcD) ( __neon_Dx2Adr( 0xf420080f, __uint8ToN64_c(pcD)) )
1271 #define vld2_s64(pcD) ( __neon_Dx2Adr( 0xf4200acf, __int64ToN64_c(pcD)) )
1272 #define vld2_u64(pcD) ( __neon_Dx2Adr( 0xf4200acf, __uint64ToN64_c(pcD)) )
1273 #define vld2_s64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4200acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __int64ToN64_c(pcD)) )
1274 #define vld2_u64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4200acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint64ToN64_c(pcD)) )
1275 #define vld2_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf420088f | _NENC_5_4(_NEON_ALIGN64_128(align)), __float32ToN64_c(pcD)) )
1276 #define vld2_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf420084f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly16ToN64_c(pcD)) )
1277 #define vld2_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf420080f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly8ToN64_c(pcD)) )
1278 #define vld2_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf420084f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int16ToN64_c(pcD)) )
1279 #define vld2_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf420088f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int32ToN64_c(pcD)) )
1280 #define vld2_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf420080f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int8ToN64_c(pcD)) )
1281 #define vld2_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf420084f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint16ToN64_c(pcD)) )
1282 #define vld2_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf420088f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint32ToN64_c(pcD)) )
1283 #define vld2_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf420080f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint8ToN64_c(pcD)) )
1284 #define vld2q_f32(pcD) ( __neon_Qx2Adr( 0xf420098f, __float32ToN64_c(pcD)) )
1285 #define vld2q_p16(pcD) ( __neon_Qx2Adr( 0xf420094f, __poly16ToN64_c(pcD)) )
1286 #define vld2q_p8(pcD) ( __neon_Qx2Adr( 0xf420090f, __poly8ToN64_c(pcD)) )
1287 #define vld2q_s16(pcD) ( __neon_Qx2Adr( 0xf420094f, __int16ToN64_c(pcD)) )
1288 #define vld2q_s32(pcD) ( __neon_Qx2Adr( 0xf420098f, __int32ToN64_c(pcD)) )
1289 #define vld2q_s8(pcD) ( __neon_Qx2Adr( 0xf420090f, __int8ToN64_c(pcD)) )
1290 #define vld2q_u16(pcD) ( __neon_Qx2Adr( 0xf420094f, __uint16ToN64_c(pcD)) )
1291 #define vld2q_u32(pcD) ( __neon_Qx2Adr( 0xf420098f, __uint32ToN64_c(pcD)) )
1292 #define vld2q_u8(pcD) ( __neon_Qx2Adr( 0xf420090f, __uint8ToN64_c(pcD)) )
1293 #define vld2q_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx2Adr( 0xf420098f | _NENC_5_4(_NEON_ALIGN64_128(align)), __float32ToN64_c(pcD)) )
1294 #define vld2q_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx2Adr( 0xf420094f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly16ToN64_c(pcD)) )
1295 #define vld2q_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx2Adr( 0xf420090f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly8ToN64_c(pcD)) )
1296 #define vld2q_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx2Adr( 0xf420094f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int16ToN64_c(pcD)) )
1297 #define vld2q_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx2Adr( 0xf420098f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int32ToN64_c(pcD)) )
1298 #define vld2q_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx2Adr( 0xf420090f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int8ToN64_c(pcD)) )
1299 #define vld2q_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx2Adr( 0xf420094f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint16ToN64_c(pcD)) )
1300 #define vld2q_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx2Adr( 0xf420098f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint32ToN64_c(pcD)) )
1301 #define vld2q_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx2Adr( 0xf420090f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint8ToN64_c(pcD)) )
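// Usage sketch (illustrative, not part of the original header): vld2 performs a
// de-interleaving load; consecutive element pairs are split across the two
// registers of the returned structure. For interleaved 8-bit samples
// {x0, y0, x1, y1, ...}:
//   __n64x2 xy = vld2_u8(src);   // xy.val[0] = {x0..x7}, xy.val[1] = {y0..y7}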
1302 
1303 // VLD2 (single 2-element structure to all lanes)
1304 #define vld2_dup_f32(pcD) ( __neon_Dx2Adr( 0xf4a00d8f, __float32ToN64_c(pcD)) )
1305 #define vld2_dup_p16(pcD) ( __neon_Dx2Adr( 0xf4a00d4f, __poly16ToN64_c(pcD)) )
1306 #define vld2_dup_p8(pcD) ( __neon_Dx2Adr( 0xf4a00d0f, __poly8ToN64_c(pcD)) )
1307 #define vld2_dup_s16(pcD) ( __neon_Dx2Adr( 0xf4a00d4f, __int16ToN64_c(pcD)) )
1308 #define vld2_dup_s32(pcD) ( __neon_Dx2Adr( 0xf4a00d8f, __int32ToN64_c(pcD)) )
1309 #define vld2_dup_s8(pcD) ( __neon_Dx2Adr( 0xf4a00d0f, __int8ToN64_c(pcD)) )
1310 #define vld2_dup_u16(pcD) ( __neon_Dx2Adr( 0xf4a00d4f, __uint16ToN64_c(pcD)) )
1311 #define vld2_dup_u32(pcD) ( __neon_Dx2Adr( 0xf4a00d8f, __uint32ToN64_c(pcD)) )
1312 #define vld2_dup_u8(pcD) ( __neon_Dx2Adr( 0xf4a00d0f, __uint8ToN64_c(pcD)) )
1313 #define vld2_dup_s64(pcD) ( __neon_Dx2Adr( 0xf4200acf, __int64ToN64_c(pcD)) )
1314 #define vld2_dup_u64(pcD) ( __neon_Dx2Adr( 0xf4200acf, __uint64ToN64_c(pcD)) )
1315 
1316 // VLD2 (single 2-element structure to all lanes, aligned)
1317 #define vld2_dup_s64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4200acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __int64ToN64_c(pcD)) )
1318 #define vld2_dup_u64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4200acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint64ToN64_c(pcD)) )
1319 #define vld2_dup_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4a00d8f | _NENC_4(_NEON_ALIGN64(align)), __float32ToN64_c(pcD)) )
1320 #define vld2_dup_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4a00d4f | _NENC_4(_NEON_ALIGN32(align)), __poly16ToN64_c(pcD)) )
1321 #define vld2_dup_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4a00d0f | _NENC_4(_NEON_ALIGN16(align)), __poly8ToN64_c(pcD)) )
1322 #define vld2_dup_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4a00d4f | _NENC_4(_NEON_ALIGN32(align)), __int16ToN64_c(pcD)) )
1323 #define vld2_dup_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4a00d8f | _NENC_4(_NEON_ALIGN64(align)), __int32ToN64_c(pcD)) )
1324 #define vld2_dup_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4a00d0f | _NENC_4(_NEON_ALIGN16(align)), __int8ToN64_c(pcD)) )
1325 #define vld2_dup_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4a00d4f | _NENC_4(_NEON_ALIGN32(align)), __uint16ToN64_c(pcD)) )
1326 #define vld2_dup_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4a00d8f | _NENC_4(_NEON_ALIGN64(align)), __uint32ToN64_c(pcD)) )
1327 #define vld2_dup_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4a00d0f | _NENC_4(_NEON_ALIGN16(align)), __uint8ToN64_c(pcD)) )
1328 
1329 // VLD2 (single 2-element structure to one lane)
1330 #define vld2_lane_f32(pcD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_Dx2Adr_acc( 0xf4a0090f | _NENC_7(lane), (D2), __float32ToN64_c(pcD)) )
1331 #define vld2_lane_p16(pcD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Dx2Adr_acc( 0xf4a0050f | _NENC_7_6(lane), (D2), __poly16ToN64_c(pcD)) )
1332 #define vld2_lane_p8(pcD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Dx2Adr_acc( 0xf4a0010f | _NENC_7_5(lane), (D2), __poly8ToN64_c(pcD)) )
1333 #define vld2_lane_s16(pcD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Dx2Adr_acc( 0xf4a0050f | _NENC_7_6(lane), (D2), __int16ToN64_c(pcD)) )
1334 #define vld2_lane_s32(pcD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_Dx2Adr_acc( 0xf4a0090f | _NENC_7(lane), (D2), __int32ToN64_c(pcD)) )
1335 #define vld2_lane_s8(pcD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Dx2Adr_acc( 0xf4a0010f | _NENC_7_5(lane), (D2), __int8ToN64_c(pcD)) )
1336 #define vld2_lane_u16(pcD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Dx2Adr_acc( 0xf4a0050f | _NENC_7_6(lane), (D2), __uint16ToN64_c(pcD)) )
1337 #define vld2_lane_u32(pcD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_Dx2Adr_acc( 0xf4a0090f | _NENC_7(lane), (D2), __uint32ToN64_c(pcD)) )
1338 #define vld2_lane_u8(pcD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Dx2Adr_acc( 0xf4a0010f | _NENC_7_5(lane), (D2), __uint8ToN64_c(pcD)) )
1339 #define vld2q_lane_f32(pcD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Qx2Adr_acc( 0xf4a0094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Q2), __float32ToN64_c(pcD)) )
1340 #define vld2q_lane_p16(pcD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Qx2Adr_acc( 0xf4a0052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Q2), __poly16ToN64_c(pcD)) )
1341 #define vld2q_lane_s16(pcD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Qx2Adr_acc( 0xf4a0052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Q2), __int16ToN64_c(pcD)) )
1342 #define vld2q_lane_s32(pcD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Qx2Adr_acc( 0xf4a0094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Q2), __int32ToN64_c(pcD)) )
1343 #define vld2q_lane_u16(pcD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Qx2Adr_acc( 0xf4a0052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Q2), __uint16ToN64_c(pcD)) )
1344 #define vld2q_lane_u32(pcD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Qx2Adr_acc( 0xf4a0094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Q2), __uint32ToN64_c(pcD)) )
1345 
1346 // VLD2 (single 2-element structure to one lane, aligned)
1347 #define vld2_lane_f32_ex(pcD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx2Adr_acc( 0xf4a0090f | _NENC_7(lane) | _NENC_4(_NEON_ALIGN64(align)), (D2), __float32ToN64_c(pcD)) )
1348 #define vld2_lane_p16_ex(pcD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx2Adr_acc( 0xf4a0050f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN32(align)), (D2), __poly16ToN64_c(pcD)) )
1349 #define vld2_lane_p8_ex(pcD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Dx2Adr_acc( 0xf4a0010f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN16(align)), (D2), __poly8ToN64_c(pcD)) )
1350 #define vld2_lane_s16_ex(pcD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx2Adr_acc( 0xf4a0050f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN32(align)), (D2), __int16ToN64_c(pcD)) )
1351 #define vld2_lane_s32_ex(pcD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx2Adr_acc( 0xf4a0090f | _NENC_7(lane) | _NENC_4(_NEON_ALIGN64(align)), (D2), __int32ToN64_c(pcD)) )
1352 #define vld2_lane_s8_ex(pcD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Dx2Adr_acc( 0xf4a0010f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN16(align)), (D2), __int8ToN64_c(pcD)) )
1353 #define vld2_lane_u16_ex(pcD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx2Adr_acc( 0xf4a0050f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN32(align)), (D2), __uint16ToN64_c(pcD)) )
1354 #define vld2_lane_u32_ex(pcD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx2Adr_acc( 0xf4a0090f | _NENC_7(lane) | _NENC_4(_NEON_ALIGN64(align)), (D2), __uint32ToN64_c(pcD)) )
1355 #define vld2_lane_u8_ex(pcD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Dx2Adr_acc( 0xf4a0010f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN16(align)), (D2), __uint8ToN64_c(pcD)) )
1356 #define vld2q_lane_f32_ex(pcD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx2Adr_acc( 0xf4a0094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), (Q2), __float32ToN64_c(pcD)) )
1357 #define vld2q_lane_p16_ex(pcD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Qx2Adr_acc( 0xf4a0052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN32(align)), (Q2), __poly16ToN64_c(pcD)) )
1358 #define vld2q_lane_s16_ex(pcD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Qx2Adr_acc( 0xf4a0052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN32(align)), (Q2), __int16ToN64_c(pcD)) )
1359 #define vld2q_lane_s32_ex(pcD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx2Adr_acc( 0xf4a0094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), (Q2), __int32ToN64_c(pcD)) )
1360 #define vld2q_lane_u16_ex(pcD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Qx2Adr_acc( 0xf4a0052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN32(align)), (Q2), __uint16ToN64_c(pcD)) )
1361 #define vld2q_lane_u32_ex(pcD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx2Adr_acc( 0xf4a0094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), (Q2), __uint32ToN64_c(pcD)) )
1362 
1363 // VLD3 (multiple 3-element structures)
1364 #define vld3_f32(pcD) ( __neon_Dx3Adr( 0xf420048f, __float32ToN64_c(pcD)) )
1365 #define vld3_p16(pcD) ( __neon_Dx3Adr( 0xf420044f, __poly16ToN64_c(pcD)) )
1366 #define vld3_p8(pcD) ( __neon_Dx3Adr( 0xf420040f, __poly8ToN64_c(pcD)) )
1367 #define vld3_s16(pcD) ( __neon_Dx3Adr( 0xf420044f, __int16ToN64_c(pcD)) )
1368 #define vld3_s32(pcD) ( __neon_Dx3Adr( 0xf420048f, __int32ToN64_c(pcD)) )
1369 #define vld3_s8(pcD) ( __neon_Dx3Adr( 0xf420040f, __int8ToN64_c(pcD)) )
1370 #define vld3_u16(pcD) ( __neon_Dx3Adr( 0xf420044f, __uint16ToN64_c(pcD)) )
1371 #define vld3_u32(pcD) ( __neon_Dx3Adr( 0xf420048f, __uint32ToN64_c(pcD)) )
1372 #define vld3_u8(pcD) ( __neon_Dx3Adr( 0xf420040f, __uint8ToN64_c(pcD)) )
1373 #define vld3_s64(pcD) ( __neon_Dx3Adr( 0xf42006cf, __int64ToN64_c(pcD)) )
1374 #define vld3_u64(pcD) ( __neon_Dx3Adr( 0xf42006cf, __uint64ToN64_c(pcD)) )
1375 #define vld3_s64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf42006cf | _NENC_5_4(_NEON_ALIGN64(align)), __int64ToN64_c(pcD)) )
1376 #define vld3_u64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf42006cf | _NENC_5_4(_NEON_ALIGN64(align)), __uint64ToN64_c(pcD)) )
1377 #define vld3_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf420048f | _NENC_5_4(_NEON_ALIGN64(align)), __float32ToN64_c(pcD)) )
1378 #define vld3_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf420044f | _NENC_5_4(_NEON_ALIGN64(align)), __poly16ToN64_c(pcD)) )
1379 #define vld3_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf420040f | _NENC_5_4(_NEON_ALIGN64(align)), __poly8ToN64_c(pcD)) )
1380 #define vld3_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf420044f | _NENC_5_4(_NEON_ALIGN64(align)), __int16ToN64_c(pcD)) )
1381 #define vld3_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf420048f | _NENC_5_4(_NEON_ALIGN64(align)), __int32ToN64_c(pcD)) )
1382 #define vld3_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf420040f | _NENC_5_4(_NEON_ALIGN64(align)), __int8ToN64_c(pcD)) )
1383 #define vld3_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf420044f | _NENC_5_4(_NEON_ALIGN64(align)), __uint16ToN64_c(pcD)) )
1384 #define vld3_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf420048f | _NENC_5_4(_NEON_ALIGN64(align)), __uint32ToN64_c(pcD)) )
1385 #define vld3_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf420040f | _NENC_5_4(_NEON_ALIGN64(align)), __uint8ToN64_c(pcD)) )
1386 #define vld3q_f32(pcD) ( __neon_Qx3Adr( 0xf420058f, __float32ToN64_c(pcD)) )
1387 #define vld3q_p16(pcD) ( __neon_Qx3Adr( 0xf420054f, __poly16ToN64_c(pcD)) )
1388 #define vld3q_p8(pcD) ( __neon_Qx3Adr( 0xf420050f, __poly8ToN64_c(pcD)) )
1389 #define vld3q_s16(pcD) ( __neon_Qx3Adr( 0xf420054f, __int16ToN64_c(pcD)) )
1390 #define vld3q_s32(pcD) ( __neon_Qx3Adr( 0xf420058f, __int32ToN64_c(pcD)) )
1391 #define vld3q_s8(pcD) ( __neon_Qx3Adr( 0xf420050f, __int8ToN64_c(pcD)) )
1392 #define vld3q_u16(pcD) ( __neon_Qx3Adr( 0xf420054f, __uint16ToN64_c(pcD)) )
1393 #define vld3q_u32(pcD) ( __neon_Qx3Adr( 0xf420058f, __uint32ToN64_c(pcD)) )
1394 #define vld3q_u8(pcD) ( __neon_Qx3Adr( 0xf420050f, __uint8ToN64_c(pcD)) )
1395 #define vld3q_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx3Adr( 0xf420058f | _NENC_5_4(_NEON_ALIGN64(align)), __float32ToN64_c(pcD)) )
1396 #define vld3q_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx3Adr( 0xf420054f | _NENC_5_4(_NEON_ALIGN64(align)), __poly16ToN64_c(pcD)) )
1397 #define vld3q_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx3Adr( 0xf420050f | _NENC_5_4(_NEON_ALIGN64(align)), __poly8ToN64_c(pcD)) )
1398 #define vld3q_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx3Adr( 0xf420054f | _NENC_5_4(_NEON_ALIGN64(align)), __int16ToN64_c(pcD)) )
1399 #define vld3q_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx3Adr( 0xf420058f | _NENC_5_4(_NEON_ALIGN64(align)), __int32ToN64_c(pcD)) )
1400 #define vld3q_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx3Adr( 0xf420050f | _NENC_5_4(_NEON_ALIGN64(align)), __int8ToN64_c(pcD)) )
1401 #define vld3q_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx3Adr( 0xf420054f | _NENC_5_4(_NEON_ALIGN64(align)), __uint16ToN64_c(pcD)) )
1402 #define vld3q_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx3Adr( 0xf420058f | _NENC_5_4(_NEON_ALIGN64(align)), __uint32ToN64_c(pcD)) )
1403 #define vld3q_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx3Adr( 0xf420050f | _NENC_5_4(_NEON_ALIGN64(align)), __uint8ToN64_c(pcD)) )
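// Usage sketch (illustrative, not part of the original header): vld3
// de-interleaves groups of three, e.g. splitting packed 24-bit RGB pixels into
// separate R, G and B planes:
//   __n64x3 rgb = vld3_u8(px);   // val[0] = R0..R7, val[1] = G0..G7, val[2] = B0..B7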
1404 
1405 // VLD3 (single 3-element structure to all lanes)
1406 #define vld3_dup_f32(pcD) ( __neon_Dx3Adr( 0xf4a00e8f, __float32ToN64_c(pcD)) )
1407 #define vld3_dup_p16(pcD) ( __neon_Dx3Adr( 0xf4a00e4f, __poly16ToN64_c(pcD)) )
1408 #define vld3_dup_p8(pcD) ( __neon_Dx3Adr( 0xf4a00e0f, __poly8ToN64_c(pcD)) )
1409 #define vld3_dup_s16(pcD) ( __neon_Dx3Adr( 0xf4a00e4f, __int16ToN64_c(pcD)) )
1410 #define vld3_dup_s32(pcD) ( __neon_Dx3Adr( 0xf4a00e8f, __int32ToN64_c(pcD)) )
1411 #define vld3_dup_s8(pcD) ( __neon_Dx3Adr( 0xf4a00e0f, __int8ToN64_c(pcD)) )
1412 #define vld3_dup_u16(pcD) ( __neon_Dx3Adr( 0xf4a00e4f, __uint16ToN64_c(pcD)) )
1413 #define vld3_dup_u32(pcD) ( __neon_Dx3Adr( 0xf4a00e8f, __uint32ToN64_c(pcD)) )
1414 #define vld3_dup_u8(pcD) ( __neon_Dx3Adr( 0xf4a00e0f, __uint8ToN64_c(pcD)) )
1415 #define vld3_dup_s64(pcD) ( __neon_Dx3Adr( 0xf42006cf, __int64ToN64_c(pcD)) )
1416 #define vld3_dup_u64(pcD) ( __neon_Dx3Adr( 0xf42006cf, __uint64ToN64_c(pcD)) )
1417 
1418 // VLD3 (single 3-element structure to one lane)
1419 #define vld3_lane_f32(pcD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_Dx3Adr_acc( 0xf4a00a0f | _NENC_7(lane), (D3), __float32ToN64_c(pcD)) )
1420 #define vld3_lane_p16(pcD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Dx3Adr_acc( 0xf4a0060f | _NENC_7_6(lane), (D3), __poly16ToN64_c(pcD)) )
1421 #define vld3_lane_p8(pcD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Dx3Adr_acc( 0xf4a0020f | _NENC_7_5(lane), (D3), __poly8ToN64_c(pcD)) )
1422 #define vld3_lane_s16(pcD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Dx3Adr_acc( 0xf4a0060f | _NENC_7_6(lane), (D3), __int16ToN64_c(pcD)) )
1423 #define vld3_lane_s32(pcD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_Dx3Adr_acc( 0xf4a00a0f | _NENC_7(lane), (D3), __int32ToN64_c(pcD)) )
1424 #define vld3_lane_s8(pcD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Dx3Adr_acc( 0xf4a0020f | _NENC_7_5(lane), (D3), __int8ToN64_c(pcD)) )
1425 #define vld3_lane_u16(pcD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Dx3Adr_acc( 0xf4a0060f | _NENC_7_6(lane), (D3), __uint16ToN64_c(pcD)) )
1426 #define vld3_lane_u32(pcD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_Dx3Adr_acc( 0xf4a00a0f | _NENC_7(lane), (D3), __uint32ToN64_c(pcD)) )
1427 #define vld3_lane_u8(pcD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Dx3Adr_acc( 0xf4a0020f | _NENC_7_5(lane), (D3), __uint8ToN64_c(pcD)) )
1428 #define vld3q_lane_f32(pcD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Qx3Adr_acc( 0xf4a00a4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Q3), __float32ToN64_c(pcD)) )
1429 #define vld3q_lane_p16(pcD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Qx3Adr_acc( 0xf4a0062f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Q3), __poly16ToN64_c(pcD)) )
1430 #define vld3q_lane_s16(pcD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Qx3Adr_acc( 0xf4a0062f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Q3), __int16ToN64_c(pcD)) )
1431 #define vld3q_lane_s32(pcD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Qx3Adr_acc( 0xf4a00a4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Q3), __int32ToN64_c(pcD)) )
1432 #define vld3q_lane_u16(pcD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Qx3Adr_acc( 0xf4a0062f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Q3), __uint16ToN64_c(pcD)) )
1433 #define vld3q_lane_u32(pcD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Qx3Adr_acc( 0xf4a00a4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Q3), __uint32ToN64_c(pcD)) )
1434 
1435 // VLD4 (multiple 4-element structures)
1436 #define vld4_f32(pcD) ( __neon_Dx4Adr( 0xf420008f, __float32ToN64_c(pcD)) )
1437 #define vld4_p16(pcD) ( __neon_Dx4Adr( 0xf420004f, __poly16ToN64_c(pcD)) )
1438 #define vld4_p8(pcD) ( __neon_Dx4Adr( 0xf420000f, __poly8ToN64_c(pcD)) )
1439 #define vld4_s16(pcD) ( __neon_Dx4Adr( 0xf420004f, __int16ToN64_c(pcD)) )
1440 #define vld4_s32(pcD) ( __neon_Dx4Adr( 0xf420008f, __int32ToN64_c(pcD)) )
1441 #define vld4_s8(pcD) ( __neon_Dx4Adr( 0xf420000f, __int8ToN64_c(pcD)) )
1442 #define vld4_u16(pcD) ( __neon_Dx4Adr( 0xf420004f, __uint16ToN64_c(pcD)) )
1443 #define vld4_u32(pcD) ( __neon_Dx4Adr( 0xf420008f, __uint32ToN64_c(pcD)) )
1444 #define vld4_u8(pcD) ( __neon_Dx4Adr( 0xf420000f, __uint8ToN64_c(pcD)) )
1445 #define vld4_s64(pcD) ( __neon_Dx4Adr( 0xf42002cf, __int64ToN64_c(pcD)) )
1446 #define vld4_u64(pcD) ( __neon_Dx4Adr( 0xf42002cf, __uint64ToN64_c(pcD)) )
1447 #define vld4_s64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf42002cf | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int64ToN64_c(pcD)) )
1448 #define vld4_u64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf42002cf | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint64ToN64_c(pcD)) )
1449 #define vld4_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf420008f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __float32ToN64_c(pcD)) )
1450 #define vld4_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf420004f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __poly16ToN64_c(pcD)) )
1451 #define vld4_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf420000f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __poly8ToN64_c(pcD)) )
1452 #define vld4_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf420004f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int16ToN64_c(pcD)) )
1453 #define vld4_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf420008f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int32ToN64_c(pcD)) )
1454 #define vld4_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf420000f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int8ToN64_c(pcD)) )
1455 #define vld4_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf420004f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint16ToN64_c(pcD)) )
1456 #define vld4_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf420008f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint32ToN64_c(pcD)) )
1457 #define vld4_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf420000f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint8ToN64_c(pcD)) )
1458 #define vld4q_f32(pcD) ( __neon_Qx4Adr( 0xf420018f, __float32ToN64_c(pcD)) )
1459 #define vld4q_p16(pcD) ( __neon_Qx4Adr( 0xf420014f, __poly16ToN64_c(pcD)) )
1460 #define vld4q_p8(pcD) ( __neon_Qx4Adr( 0xf420010f, __poly8ToN64_c(pcD)) )
1461 #define vld4q_s16(pcD) ( __neon_Qx4Adr( 0xf420014f, __int16ToN64_c(pcD)) )
1462 #define vld4q_s32(pcD) ( __neon_Qx4Adr( 0xf420018f, __int32ToN64_c(pcD)) )
1463 #define vld4q_s8(pcD) ( __neon_Qx4Adr( 0xf420010f, __int8ToN64_c(pcD)) )
1464 #define vld4q_u16(pcD) ( __neon_Qx4Adr( 0xf420014f, __uint16ToN64_c(pcD)) )
1465 #define vld4q_u32(pcD) ( __neon_Qx4Adr( 0xf420018f, __uint32ToN64_c(pcD)) )
1466 #define vld4q_u8(pcD) ( __neon_Qx4Adr( 0xf420010f, __uint8ToN64_c(pcD)) )
1467 #define vld4q_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Qx4Adr( 0xf420018f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __float32ToN64_c(pcD)) )
1468 #define vld4q_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Qx4Adr( 0xf420014f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __poly16ToN64_c(pcD)) )
1469 #define vld4q_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Qx4Adr( 0xf420010f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __poly8ToN64_c(pcD)) )
1470 #define vld4q_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Qx4Adr( 0xf420014f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int16ToN64_c(pcD)) )
1471 #define vld4q_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Qx4Adr( 0xf420018f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int32ToN64_c(pcD)) )
1472 #define vld4q_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Qx4Adr( 0xf420010f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int8ToN64_c(pcD)) )
1473 #define vld4q_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Qx4Adr( 0xf420014f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint16ToN64_c(pcD)) )
1474 #define vld4q_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Qx4Adr( 0xf420018f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint32ToN64_c(pcD)) )
1475 #define vld4q_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Qx4Adr( 0xf420010f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint8ToN64_c(pcD)) )
1476 
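// Illustrative usage sketch (not part of the original header): vld4_u8 reads 32
// interleaved bytes and de-interleaves them, e.g. packed RGBA pixels. Given a
// hypothetical const uint8_t *src:
//   __n64x4 px = vld4_u8(src);   // px.val[0]=R0..R7, val[1]=G0..G7, val[2]=B0..B7, val[3]=A0..A7
// The *_ex variants additionally encode a pointer-alignment hint (64/128/256 bits).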
1477 // VLD4 (single 4-element structure to all lanes)
1478 #define vld4_dup_f32(pcD) ( __neon_Dx4Adr( 0xf4a00f8f, __float32ToN64_c(pcD)) )
1479 #define vld4_dup_p16(pcD) ( __neon_Dx4Adr( 0xf4a00f4f, __poly16ToN64_c(pcD)) )
1480 #define vld4_dup_p8(pcD) ( __neon_Dx4Adr( 0xf4a00f0f, __poly8ToN64_c(pcD)) )
1481 #define vld4_dup_s16(pcD) ( __neon_Dx4Adr( 0xf4a00f4f, __int16ToN64_c(pcD)) )
1482 #define vld4_dup_s32(pcD) ( __neon_Dx4Adr( 0xf4a00f8f, __int32ToN64_c(pcD)) )
1483 #define vld4_dup_s8(pcD) ( __neon_Dx4Adr( 0xf4a00f0f, __int8ToN64_c(pcD)) )
1484 #define vld4_dup_u16(pcD) ( __neon_Dx4Adr( 0xf4a00f4f, __uint16ToN64_c(pcD)) )
1485 #define vld4_dup_u32(pcD) ( __neon_Dx4Adr( 0xf4a00f8f, __uint32ToN64_c(pcD)) )
1486 #define vld4_dup_u8(pcD) ( __neon_Dx4Adr( 0xf4a00f0f, __uint8ToN64_c(pcD)) )
1487 #define vld4_dup_s64(pcD) ( __neon_Dx4Adr( 0xf42002cf, __int64ToN64_c(pcD)) )
1488 #define vld4_dup_u64(pcD) ( __neon_Dx4Adr( 0xf42002cf, __uint64ToN64_c(pcD)) )
1489 
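// Illustrative usage sketch (not part of the original header): the _dup forms read one
// 4-element structure and broadcast each element to every lane, e.g. splatting a
// hypothetical constant RGBA color held at const uint8_t *rgba:
//   __n64x4 c = vld4_dup_u8(rgba);   // c.val[0] = eight copies of R, val[1] = eight copies of G, ...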
1490 // VLD4 (single 4-element structure to all lanes, aligned)
1491 #define vld4_dup_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf4a00f0f | _NENC_7_6(_NEON_ALIGN64_128(align) > 1 ? 3 : 2) | _NENC_4(_NEON_ALIGN64_128(align) > 0 ? 1 : 0), __float32ToN64_c(pcD)) )
1492 #define vld4_dup_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf4a00f4f | _NENC_4(_NEON_ALIGN64(align)), __poly16ToN64_c(pcD)) )
1493 #define vld4_dup_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf4a00f0f | _NENC_4(_NEON_ALIGN32(align)), __poly8ToN64_c(pcD)) )
1494 #define vld4_dup_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf4a00f4f | _NENC_4(_NEON_ALIGN64(align)), __int16ToN64_c(pcD)) )
1495 #define vld4_dup_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf4a00f0f | _NENC_7_6(_NEON_ALIGN64_128(align) > 1 ? 3 : 2) | _NENC_4(_NEON_ALIGN64_128(align) > 0 ? 1 : 0), __int32ToN64_c(pcD)) )
1496 #define vld4_dup_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf4a00f0f | _NENC_4(_NEON_ALIGN32(align)), __int8ToN64_c(pcD)) )
1497 #define vld4_dup_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf4a00f4f | _NENC_4(_NEON_ALIGN64(align)), __uint16ToN64_c(pcD)) )
1498 #define vld4_dup_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf4a00f0f | _NENC_7_6(_NEON_ALIGN64_128(align) > 1 ? 3 : 2) | _NENC_4(_NEON_ALIGN64_128(align) > 0 ? 1 : 0), __uint32ToN64_c(pcD)) )
1499 #define vld4_dup_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf4a00f0f | _NENC_4(_NEON_ALIGN32(align)), __uint8ToN64_c(pcD)) )
1500 
1501 // VLD4 (single 4-element structure to one lane)
1502 #define vld4_lane_f32(pcD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_Dx4Adr_acc( 0xf4a00b0f | _NENC_7(lane), (D4), __float32ToN64_c(pcD)) )
1503 #define vld4_lane_p16(pcD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Dx4Adr_acc( 0xf4a0070f | _NENC_7_6(lane), (D4), __poly16ToN64_c(pcD)) )
1504 #define vld4_lane_p8(pcD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Dx4Adr_acc( 0xf4a0030f | _NENC_7_5(lane), (D4), __poly8ToN64_c(pcD)) )
1505 #define vld4_lane_s16(pcD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Dx4Adr_acc( 0xf4a0070f | _NENC_7_6(lane), (D4), __int16ToN64_c(pcD)) )
1506 #define vld4_lane_s32(pcD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_Dx4Adr_acc( 0xf4a00b0f | _NENC_7(lane), (D4), __int32ToN64_c(pcD)) )
1507 #define vld4_lane_s8(pcD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Dx4Adr_acc( 0xf4a0030f | _NENC_7_5(lane), (D4), __int8ToN64_c(pcD)) )
1508 #define vld4_lane_u16(pcD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Dx4Adr_acc( 0xf4a0070f | _NENC_7_6(lane), (D4), __uint16ToN64_c(pcD)) )
1509 #define vld4_lane_u32(pcD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_Dx4Adr_acc( 0xf4a00b0f | _NENC_7(lane), (D4), __uint32ToN64_c(pcD)) )
1510 #define vld4_lane_u8(pcD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Dx4Adr_acc( 0xf4a0030f | _NENC_7_5(lane), (D4), __uint8ToN64_c(pcD)) )
1511 #define vld4q_lane_f32(pcD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Qx4Adr_acc( 0xf4a00b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Q4), __float32ToN64_c(pcD)) )
1512 #define vld4q_lane_p16(pcD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Qx4Adr_acc( 0xf4a0072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Q4), __poly16ToN64_c(pcD)) )
1513 #define vld4q_lane_s16(pcD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Qx4Adr_acc( 0xf4a0072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Q4), __int16ToN64_c(pcD)) )
1514 #define vld4q_lane_s32(pcD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Qx4Adr_acc( 0xf4a00b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Q4), __int32ToN64_c(pcD)) )
1515 #define vld4q_lane_u16(pcD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Qx4Adr_acc( 0xf4a0072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Q4), __uint16ToN64_c(pcD)) )
1516 #define vld4q_lane_u32(pcD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Qx4Adr_acc( 0xf4a00b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Q4), __uint32ToN64_c(pcD)) )
1517 
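// Illustrative usage sketch (not part of the original header): the _lane forms replace a
// single lane of an existing register set and leave the other lanes untouched. With a
// hypothetical const uint8_t *p and an __n64x4 accumulator acc:
//   acc = vld4_lane_u8(p, acc, 3);   // load one 4-byte structure into lane 3 of each D register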
1518 // VLD4 (single 4-element structure to one lane, aligned)
1519 #define vld4_lane_f32_ex(pcD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx4Adr_acc( 0xf4a00b0f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN64_128(align)), (D4), __float32ToN64_c(pcD)) )
1520 #define vld4_lane_p16_ex(pcD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx4Adr_acc( 0xf4a0070f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN64(align)), (D4), __poly16ToN64_c(pcD)) )
1521 #define vld4_lane_p8_ex(pcD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx4Adr_acc( 0xf4a0030f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN32(align)), (D4), __poly8ToN64_c(pcD)) )
1522 #define vld4_lane_s16_ex(pcD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx4Adr_acc( 0xf4a0070f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN64(align)), (D4), __int16ToN64_c(pcD)) )
1523 #define vld4_lane_s32_ex(pcD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx4Adr_acc( 0xf4a00b0f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN64_128(align)), (D4), __int32ToN64_c(pcD)) )
1524 #define vld4_lane_s8_ex(pcD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx4Adr_acc( 0xf4a0030f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN32(align)), (D4), __int8ToN64_c(pcD)) )
1525 #define vld4_lane_u16_ex(pcD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx4Adr_acc( 0xf4a0070f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN64(align)), (D4), __uint16ToN64_c(pcD)) )
1526 #define vld4_lane_u32_ex(pcD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx4Adr_acc( 0xf4a00b0f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN64_128(align)), (D4), __uint32ToN64_c(pcD)) )
1527 #define vld4_lane_u8_ex(pcD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx4Adr_acc( 0xf4a0030f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN32(align)), (D4), __uint8ToN64_c(pcD)) )
1528 #define vld4q_lane_f32_ex(pcD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx4Adr_acc( 0xf4a00b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN64_128(align)), (Q4), __float32ToN64_c(pcD)) )
1529 #define vld4q_lane_p16_ex(pcD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx4Adr_acc( 0xf4a0072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), (Q4), __poly16ToN64_c(pcD)) )
1530 #define vld4q_lane_s16_ex(pcD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx4Adr_acc( 0xf4a0072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), (Q4), __int16ToN64_c(pcD)) )
1531 #define vld4q_lane_s32_ex(pcD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx4Adr_acc( 0xf4a00b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN64_128(align)), (Q4), __int32ToN64_c(pcD)) )
1532 #define vld4q_lane_u16_ex(pcD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx4Adr_acc( 0xf4a0072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), (Q4), __uint16ToN64_c(pcD)) )
1533 #define vld4q_lane_u32_ex(pcD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx4Adr_acc( 0xf4a00b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN64_128(align)), (Q4), __uint32ToN64_c(pcD)) )
1534 
1535 // VMAX, VMIN (floating point)
1536 #define vmax_f32(Dn, Dm) ( __neon_DdDnDm( 0xf2000f00, (Dn), (Dm)) )
1537 #define vmaxnm_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3000f10, (Dn), (Dm)) )
1538 #define vmin_f32(Dn, Dm) ( __neon_DdDnDm( 0xf2200f00, (Dn), (Dm)) )
1539 #define vminnm_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3200f10, (Dn), (Dm)) )
1540 #define vmaxq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2000f40, (Qn), (Qm)) )
1541 #define vmaxnmq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3000f50, (Qn), (Qm)) )
1542 #define vminq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2200f40, (Qn), (Qm)) )
1543 #define vminnmq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3200f50, (Qn), (Qm)) )
1544 
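// Illustrative usage sketch (not part of the original header): per-lane clamping with
// hypothetical __n64 values x, lo and hi:
//   __n64 clamped = vmax_f32(lo, vmin_f32(x, hi));   // clamp each lane of x into [lo, hi]
// The vmaxnm/vminnm forms (ARMv8 VMAXNM/VMINNM) return the numeric operand when exactly
// one input lane is NaN, matching IEEE 754-2008 maxNum/minNum.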
1545 // VMAX, VMIN (integer)
1546 #define vmax_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100600, (Dn), (Dm)) )
1547 #define vmax_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200600, (Dn), (Dm)) )
1548 #define vmax_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000600, (Dn), (Dm)) )
1549 #define vmax_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100600, (Dn), (Dm)) )
1550 #define vmax_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200600, (Dn), (Dm)) )
1551 #define vmax_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000600, (Dn), (Dm)) )
1552 #define vmin_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100610, (Dn), (Dm)) )
1553 #define vmin_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200610, (Dn), (Dm)) )
1554 #define vmin_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000610, (Dn), (Dm)) )
1555 #define vmin_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100610, (Dn), (Dm)) )
1556 #define vmin_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200610, (Dn), (Dm)) )
1557 #define vmin_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000610, (Dn), (Dm)) )
1558 #define vmaxq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100640, (Qn), (Qm)) )
1559 #define vmaxq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200640, (Qn), (Qm)) )
1560 #define vmaxq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000640, (Qn), (Qm)) )
1561 #define vmaxq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100640, (Qn), (Qm)) )
1562 #define vmaxq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200640, (Qn), (Qm)) )
1563 #define vmaxq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000640, (Qn), (Qm)) )
1564 #define vminq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100650, (Qn), (Qm)) )
1565 #define vminq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200650, (Qn), (Qm)) )
1566 #define vminq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000650, (Qn), (Qm)) )
1567 #define vminq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100650, (Qn), (Qm)) )
1568 #define vminq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200650, (Qn), (Qm)) )
1569 #define vminq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000650, (Qn), (Qm)) )
1570 
1571 // VMLA, VMLS (by scalar)
1572 #define vmla_lane_f32(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2a00140 | _NENC_5(lane), (Dd), (Dn), (Dm)) )
1573 #define vmla_lane_s16(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2900040 | _NENC_5x3(lane), (Dd), (Dn), (Dm)) )
1574 #define vmla_lane_s32(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2a00040 | _NENC_5(lane), (Dd), (Dn), (Dm)) )
1575 #define vmla_lane_u16(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2900040 | _NENC_5x3(lane), (Dd), (Dn), (Dm)) )
1576 #define vmla_lane_u32(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2a00040 | _NENC_5(lane), (Dd), (Dn), (Dm)) )
1577 #define vmls_lane_f32(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2a00540 | _NENC_5(lane), (Dd), (Dn), (Dm)) )
1578 #define vmls_lane_s16(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2900440 | _NENC_5x3(lane), (Dd), (Dn), (Dm)) )
1579 #define vmls_lane_s32(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2a00440 | _NENC_5(lane), (Dd), (Dn), (Dm)) )
1580 #define vmls_lane_u16(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2900440 | _NENC_5x3(lane), (Dd), (Dn), (Dm)) )
1581 #define vmls_lane_u32(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2a00440 | _NENC_5(lane), (Dd), (Dn), (Dm)) )
1582 #define vmlaq_lane_f32(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3a00140 | _NENC_5(lane), (Qd), (Qn), (Dm)) )
1583 #define vmlaq_lane_s16(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3900040 | _NENC_5x3(lane), (Qd), (Qn), (Dm)) )
1584 #define vmlaq_lane_s32(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3a00040 | _NENC_5(lane), (Qd), (Qn), (Dm)) )
1585 #define vmlaq_lane_u16(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3900040 | _NENC_5x3(lane), (Qd), (Qn), (Dm)) )
1586 #define vmlaq_lane_u32(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3a00040 | _NENC_5(lane), (Qd), (Qn), (Dm)) )
1587 #define vmlsq_lane_f32(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3a00540 | _NENC_5(lane), (Qd), (Qn), (Dm)) )
1588 #define vmlsq_lane_s16(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3900440 | _NENC_5x3(lane), (Qd), (Qn), (Dm)) )
1589 #define vmlsq_lane_s32(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3a00440 | _NENC_5(lane), (Qd), (Qn), (Dm)) )
1590 #define vmlsq_lane_u16(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3900440 | _NENC_5x3(lane), (Qd), (Qn), (Dm)) )
1591 #define vmlsq_lane_u32(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3a00440 | _NENC_5(lane), (Qd), (Qn), (Dm)) )
1592 
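// Illustrative usage sketch (not part of the original header): a 4x4 * 4x1 matrix-vector
// product, assuming hypothetical __n128 columns col0..col3 and __n64 halves vlo = {v0, v1}
// and vhi = {v2, v3} of the input vector:
//   __n128 acc = vmulq_lane_f32(col0, vlo, 0);   // acc  = col0 * v0
//   acc = vmlaq_lane_f32(acc, col1, vlo, 1);     // acc += col1 * v1
//   acc = vmlaq_lane_f32(acc, col2, vhi, 0);     // acc += col2 * v2
//   acc = vmlaq_lane_f32(acc, col3, vhi, 1);     // acc += col3 * v3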
1593 // VMLA, VMLS (float, by scalar)
1594 #define vmla_n_f32(Dd, Dn, Ft) ( __neon_DdDnFt_acc( 0xf2a00140, (Dd), (Dn), (Ft)) )
1595 #define vmls_n_f32(Dd, Dn, Ft) ( __neon_DdDnFt_acc( 0xf2a00540, (Dd), (Dn), (Ft)) )
1596 #define vmlaq_n_f32(Qd, Qn, Ft) ( __neon_QdQnFt_acc( 0xf3a00140, (Qd), (Qn), (Ft)) )
1597 #define vmlsq_n_f32(Qd, Qn, Ft) ( __neon_QdQnFt_acc( 0xf3a00540, (Qd), (Qn), (Ft)) )
1598 
1599 // VMLA, VMLS (floating point)
1600 #define vmla_f32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2000d10, (Dd), (Dn), (Dm)) )
1601 #define vmls_f32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2200d10, (Dd), (Dn), (Dm)) )
1602 #define vmlaq_f32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2000d50, (Qd), (Qn), (Qm)) )
1603 #define vmlsq_f32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2200d50, (Qd), (Qn), (Qm)) )
1604 
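// Illustrative usage sketch (not part of the original header), with hypothetical __n128
// values acc, a and b:
//   acc = vmlaq_f32(acc, a, b);   // acc[i] += a[i] * b[i]
// VMLA.F32 rounds the multiply and the accumulate separately (it is not a fused
// multiply-add), which can matter for tightly toleranced numerical code.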
1605 // VMLA, VMLS (integer)
1606 #define vmla_s16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2100900, (Dd), (Dn), (Dm)) )
1607 #define vmla_s32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2200900, (Dd), (Dn), (Dm)) )
1608 #define vmla_s8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2000900, (Dd), (Dn), (Dm)) )
1609 #define vmla_u16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2100900, (Dd), (Dn), (Dm)) )
1610 #define vmla_u32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2200900, (Dd), (Dn), (Dm)) )
1611 #define vmla_u8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2000900, (Dd), (Dn), (Dm)) )
1612 #define vmls_s16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100900, (Dd), (Dn), (Dm)) )
1613 #define vmls_s32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200900, (Dd), (Dn), (Dm)) )
1614 #define vmls_s8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3000900, (Dd), (Dn), (Dm)) )
1615 #define vmls_u16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100900, (Dd), (Dn), (Dm)) )
1616 #define vmls_u32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200900, (Dd), (Dn), (Dm)) )
1617 #define vmls_u8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3000900, (Dd), (Dn), (Dm)) )
1618 #define vmlaq_s16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2100940, (Qd), (Qn), (Qm)) )
1619 #define vmlaq_s32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2200940, (Qd), (Qn), (Qm)) )
1620 #define vmlaq_s8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2000940, (Qd), (Qn), (Qm)) )
1621 #define vmlaq_u16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2100940, (Qd), (Qn), (Qm)) )
1622 #define vmlaq_u32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2200940, (Qd), (Qn), (Qm)) )
1623 #define vmlaq_u8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2000940, (Qd), (Qn), (Qm)) )
1624 #define vmlsq_s16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100940, (Qd), (Qn), (Qm)) )
1625 #define vmlsq_s32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200940, (Qd), (Qn), (Qm)) )
1626 #define vmlsq_s8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3000940, (Qd), (Qn), (Qm)) )
1627 #define vmlsq_u16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100940, (Qd), (Qn), (Qm)) )
1628 #define vmlsq_u32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200940, (Qd), (Qn), (Qm)) )
1629 #define vmlsq_u8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3000940, (Qd), (Qn), (Qm)) )
1630 
1631 // VMLAL, VMLSL
1632 #define vmlal_s16(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2900800, (Qd), (Dn), (Dm)) )
1633 #define vmlal_s32(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2a00800, (Qd), (Dn), (Dm)) )
1634 #define vmlal_s8(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2800800, (Qd), (Dn), (Dm)) )
1635 #define vmlal_u16(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf3900800, (Qd), (Dn), (Dm)) )
1636 #define vmlal_u32(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf3a00800, (Qd), (Dn), (Dm)) )
1637 #define vmlal_u8(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf3800800, (Qd), (Dn), (Dm)) )
1638 #define vmlsl_s16(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2900a00, (Qd), (Dn), (Dm)) )
1639 #define vmlsl_s32(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2a00a00, (Qd), (Dn), (Dm)) )
1640 #define vmlsl_s8(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2800a00, (Qd), (Dn), (Dm)) )
1641 #define vmlsl_u16(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf3900a00, (Qd), (Dn), (Dm)) )
1642 #define vmlsl_u32(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf3a00a00, (Qd), (Dn), (Dm)) )
1643 #define vmlsl_u8(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf3800a00, (Qd), (Dn), (Dm)) )
1644 
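// Illustrative usage sketch (not part of the original header): widening multiply-accumulate
// keeps 16-bit products in 32-bit lanes, a common dot-product/FIR pattern. With hypothetical
// __n64 sample/coefficient pairs x0,c0 and x1,c1:
//   __n128 acc = vmull_s16(x0, c0);   // s16 * s16 -> s32 products
//   acc = vmlal_s16(acc, x1, c1);     // acc[i] += x1[i] * c1[i], accumulated in 32 bits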
1645 // VMLAL, VMLSL (by scalar)
1646 #define vmlal_lane_s16(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDnDmx_acc( 0xf2900240 | _NENC_5x3(lane), (Qd), (Dn), (Dm)) )
1647 #define vmlal_lane_s32(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDnDmx_acc( 0xf2a00240 | _NENC_5(lane), (Qd), (Dn), (Dm)) )
1648 #define vmlal_lane_u16(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDnDmx_acc( 0xf3900240 | _NENC_5x3(lane), (Qd), (Dn), (Dm)) )
1649 #define vmlal_lane_u32(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDnDmx_acc( 0xf3a00240 | _NENC_5(lane), (Qd), (Dn), (Dm)) )
1650 #define vmlsl_lane_s16(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDnDmx_acc( 0xf2900640 | _NENC_5x3(lane), (Qd), (Dn), (Dm)) )
1651 #define vmlsl_lane_s32(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDnDmx_acc( 0xf2a00640 | _NENC_5(lane), (Qd), (Dn), (Dm)) )
1652 #define vmlsl_lane_u16(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDnDmx_acc( 0xf3900640 | _NENC_5x3(lane), (Qd), (Dn), (Dm)) )
1653 #define vmlsl_lane_u32(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDnDmx_acc( 0xf3a00640 | _NENC_5(lane), (Qd), (Dn), (Dm)) )
1654 
1655 // VMOV (ARM core register to scalar)
1656 #define vset_lane_f32(Ft, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdFt_acc( 0xee000b10 | _NENC_21(lane), (Dd), (Ft)) )
1657 #define vset_lane_p16(Rt, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdRt_acc( 0xee000b30 | _NENC_21x6(lane), (Dd), __poly16ToInt32(Rt)) )
1658 #define vset_lane_p8(Rt, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_DdRt_acc( 0xee400b10 | _NENC_21x6_5(lane), (Dd), __poly8ToInt32(Rt)) )
1659 #define vset_lane_s16(Rt, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdRt_acc( 0xee000b30 | _NENC_21x6(lane), (Dd), __int16ToInt32(Rt)) )
1660 #define vset_lane_s32(Rt, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdRt_acc( 0xee000b10 | _NENC_21(lane), (Dd), __int32ToInt32(Rt)) )
1661 #define vset_lane_s8(Rt, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_DdRt_acc( 0xee400b10 | _NENC_21x6_5(lane), (Dd), __int8ToInt32(Rt)) )
1662 #define vset_lane_u16(Rt, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdRt_acc( 0xee000b30 | _NENC_21x6(lane), (Dd), __uint16ToInt32(Rt)) )
1663 #define vset_lane_u32(Rt, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdRt_acc( 0xee000b10 | _NENC_21(lane), (Dd), __uint32ToInt32(Rt)) )
1664 #define vset_lane_u8(Rt, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_DdRt_acc( 0xee400b10 | _NENC_21x6_5(lane), (Dd), __uint8ToInt32(Rt)) )
1665 
1666 // VMOV (scalar to ARM core register)
1667 #define vget_lane_f32(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_FtDn( 0xee100b10 | _NENC_21(lane), (Dm)) )
1668 #define vget_lane_p16(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), (poly16_t)__neon_RtDn( 0xee900b30 | _NENC_21x6(lane), (Dm)) )
1669 #define vget_lane_p8(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), (poly8_t)__neon_RtDn( 0xeed00b10 | _NENC_21x6_5(lane), (Dm)) )
1670 #define vget_lane_s16(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), (int16_t)__neon_RtDn( 0xee100b30 | _NENC_21x6(lane), (Dm)) )
1671 #define vget_lane_s8(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), (int8_t)__neon_RtDn( 0xee500b10 | _NENC_21x6_5(lane), (Dm)) )
1672 #define vget_lane_s32(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), (int32_t)__neon_RtDn( 0xee100b10 | _NENC_21(lane), (Dm)) )
1673 #define vget_lane_u16(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), (uint16_t)__neon_RtDn( 0xee900b30 | _NENC_21x6(lane), (Dm)) )
1674 #define vget_lane_u8(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), (uint8_t)__neon_RtDn( 0xeed00b10 | _NENC_21x6_5(lane), (Dm)) )
1675 #define vget_lane_u32(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), (uint32_t)__neon_RtDn( 0xee100b10 | _NENC_21(lane), (Dm)) )
1676 
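// Illustrative usage sketch (not part of the original header): moving scalars between NEON
// lanes and ARM core/VFP registers, with a hypothetical __n64 value d:
//   float x1 = vget_lane_f32(d, 1);        // read lane 1
//   d = vset_lane_f32(x1 * 2.0f, d, 0);    // write a scalar back into lane 0
// The 64-bit forms below transfer a whole lane through an ARM core register pair.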
1677 // VMOV.64 (ARM core register pair to scalar)
1678 #define vset_lane_s64(R64t, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 1, "invalid lane index"), __neon_DdRtRt2_acc( 0xec400b10, (Dd), __int64ToInt64(R64t)) )
1679 #define vset_lane_u64(R64t, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 1, "invalid lane index"), __neon_DdRtRt2_acc( 0xec400b10, (Dd), __uint64ToInt64(R64t)) )
1680 #define vsetq_lane_s64(R64t, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdRtRt2_acc( 0xec400b10 | _NENC_0(lane), (Qd), __int64ToInt64(R64t)) )
1681 #define vsetq_lane_u64(R64t, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdRtRt2_acc( 0xec400b10 | _NENC_0(lane), (Qd), __uint64ToInt64(R64t)) )
1682 
1683 // VMOV.64 (scalar to ARM core register pair)
1684 #define vget_lane_s64(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 1, "invalid lane index"), (int64_t)__neon_RtRt2Dm( 0xec500b10, (Dm)) )
1685 #define vget_lane_u64(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 1, "invalid lane index"), (uint64_t)__neon_RtRt2Dm( 0xec500b10, (Dm)) )
1686 #define vgetq_lane_s64(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), (int64_t)__neon_RtRt2Qm( 0xec500b10 | _NENC_0(lane), (Qm)) )
1687 #define vgetq_lane_u64(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), (uint64_t)__neon_RtRt2Qm( 0xec500b10 | _NENC_0(lane), (Qm)) )
1688 
1689 // VMOV.Q (ARM core register to scalar)
1690 #define vsetq_lane_f32(Ft, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdFt_acc( 0xee000b10 | _NENC_16((lane) >= 2 ? 1 : 0) | _NENC_21((lane) % 2), (Qd), (Ft)) )
1691 #define vsetq_lane_p16(Rt, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_QdRt_acc( 0xee000b30 | _NENC_16((lane) >= 4 ? 1 : 0) | _NENC_21x6((lane) % 4), (Qd), __poly16ToInt32(Rt)) )
1692 #define vsetq_lane_p8(Rt, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), __neon_QdRt_acc( 0xee400b10 | _NENC_16((lane) >= 8 ? 1 : 0) | _NENC_21x6_5((lane) % 8), (Qd), __poly8ToInt32(Rt)) )
1693 #define vsetq_lane_s16(Rt, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_QdRt_acc( 0xee000b30 | _NENC_16((lane) >= 4 ? 1 : 0) | _NENC_21x6((lane) % 4), (Qd), __int16ToInt32(Rt)) )
1694 #define vsetq_lane_s32(Rt, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdRt_acc( 0xee000b10 | _NENC_16((lane) >= 2 ? 1 : 0) | _NENC_21((lane) % 2), (Qd), __int32ToInt32(Rt)) )
1695 #define vsetq_lane_s8(Rt, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), __neon_QdRt_acc( 0xee400b10 | _NENC_16((lane) >= 8 ? 1 : 0) | _NENC_21x6_5((lane) % 8), (Qd), __int8ToInt32(Rt)) )
1696 #define vsetq_lane_u16(Rt, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_QdRt_acc( 0xee000b30 | _NENC_16((lane) >= 4 ? 1 : 0) | _NENC_21x6((lane) % 4), (Qd), __uint16ToInt32(Rt)) )
1697 #define vsetq_lane_u32(Rt, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdRt_acc( 0xee000b10 | _NENC_16((lane) >= 2 ? 1 : 0) | _NENC_21((lane) % 2), (Qd), __uint32ToInt32(Rt)) )
1698 #define vsetq_lane_u8(Rt, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), __neon_QdRt_acc( 0xee400b10 | _NENC_16((lane) >= 8 ? 1 : 0) | _NENC_21x6_5((lane) % 8), (Qd), __uint8ToInt32(Rt)) )
1699 
1700 // VMOV.Q (scalar to ARM core register)
1701 #define vgetq_lane_f32(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_FtQn( 0xee100b10 | _NENC_16((lane) >= 2 ? 1 : 0) | _NENC_21((lane) % 2), (Qm)) )
1702 #define vgetq_lane_p16(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), (poly16_t)__neon_RtQn( 0xee900b30 | _NENC_16((lane) >= 4 ? 1 : 0) | _NENC_21x6((lane) % 4), (Qm)) )
1703 #define vgetq_lane_p8(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), (poly8_t)__neon_RtQn( 0xeed00b10 | _NENC_16((lane) >= 8 ? 1 : 0) | _NENC_21x6_5((lane) % 8), (Qm)) )
1704 #define vgetq_lane_s16(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), (int16_t)__neon_RtQn( 0xee100b30 | _NENC_16((lane) >= 4 ? 1 : 0) | _NENC_21x6((lane) % 4), (Qm)) )
1705 #define vgetq_lane_s8(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), (int8_t)__neon_RtQn( 0xee500b10 | _NENC_16((lane) >= 8 ? 1 : 0) | _NENC_21x6_5((lane) % 8), (Qm)) )
1706 #define vgetq_lane_s32(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), (int32_t)__neon_RtQn( 0xee100b10 | _NENC_16((lane) >= 2 ? 1 : 0) | _NENC_21((lane) % 2), (Qm)) )
1707 #define vgetq_lane_u16(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), (uint16_t)__neon_RtQn( 0xee900b30 | _NENC_16((lane) >= 4 ? 1 : 0) | _NENC_21x6((lane) % 4), (Qm)) )
1708 #define vgetq_lane_u8(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), (uint8_t)__neon_RtQn( 0xeed00b10 | _NENC_16((lane) >= 8 ? 1 : 0) | _NENC_21x6_5((lane) % 8), (Qm)) )
1709 #define vgetq_lane_u32(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), (uint32_t)__neon_RtQn( 0xee100b10 | _NENC_16((lane) >= 2 ? 1 : 0) | _NENC_21((lane) % 2), (Qm)) )
1710 
1711 // VMOVL
1712 #define vmovl_s16(Dm) ( __neon_QdDm( 0xf2900a10, (Dm)) )
1713 #define vmovl_s32(Dm) ( __neon_QdDm( 0xf2a00a10, (Dm)) )
1714 #define vmovl_s8(Dm) ( __neon_QdDm( 0xf2880a10, (Dm)) )
1715 #define vmovl_u16(Dm) ( __neon_QdDm( 0xf3900a10, (Dm)) )
1716 #define vmovl_u32(Dm) ( __neon_QdDm( 0xf3a00a10, (Dm)) )
1717 #define vmovl_u8(Dm) ( __neon_QdDm( 0xf3880a10, (Dm)) )
1718 
1719 // VMOVN
1720 #define vmovn_s16(Qm) ( __neon_DdQm( 0xf3b20200, (Qm)) )
1721 #define vmovn_s32(Qm) ( __neon_DdQm( 0xf3b60200, (Qm)) )
1722 #define vmovn_s64(Qm) ( __neon_DdQm( 0xf3ba0200, (Qm)) )
1723 #define vmovn_u16(Qm) ( __neon_DdQm( 0xf3b20200, (Qm)) )
1724 #define vmovn_u32(Qm) ( __neon_DdQm( 0xf3b60200, (Qm)) )
1725 #define vmovn_u64(Qm) ( __neon_DdQm( 0xf3ba0200, (Qm)) )
1726 
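// Illustrative usage sketch (not part of the original header): a widen/process/narrow round
// trip, with hypothetical __n64 pixels and __n128 bias:
//   __n128 w = vmovl_u8(pixels);   // zero-extend eight u8 lanes to u16
//   w = vqaddq_u16(w, bias);       // 16-bit arithmetic without u8 overflow
//   __n64 out = vqmovn_u16(w);     // saturate back down to eight u8 lanes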
1727 // VMUL
1728 #define vmul_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3000d10, (Dn), (Dm)) )
1729 #define vmul_p8(Dn, Dm) ( __neon_DdDnDm( 0xf3000910, (Dn), (Dm)) )
1730 #define vmul_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100910, (Dn), (Dm)) )
1731 #define vmul_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200910, (Dn), (Dm)) )
1732 #define vmul_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000910, (Dn), (Dm)) )
1733 #define vmul_u16(Dn, Dm) ( __neon_DdDnDm( 0xf2100910, (Dn), (Dm)) )
1734 #define vmul_u32(Dn, Dm) ( __neon_DdDnDm( 0xf2200910, (Dn), (Dm)) )
1735 #define vmul_u8(Dn, Dm) ( __neon_DdDnDm( 0xf2000910, (Dn), (Dm)) )
1736 #define vmulq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3000d50, (Qn), (Qm)) )
1737 #define vmulq_p8(Qn, Qm) ( __neon_QdQnQm( 0xf3000950, (Qn), (Qm)) )
1738 #define vmulq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100950, (Qn), (Qm)) )
1739 #define vmulq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200950, (Qn), (Qm)) )
1740 #define vmulq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000950, (Qn), (Qm)) )
1741 #define vmulq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf2100950, (Qn), (Qm)) )
1742 #define vmulq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2200950, (Qn), (Qm)) )
1743 #define vmulq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf2000950, (Qn), (Qm)) )
1744 
1745 // VMUL (by scalar - float)
1746 #define vmul_n_f32(Dn, Ft) ( __neon_DdDnFt( 0xf2a00940, (Dn), (Ft)) )
1747 #define vmulq_n_f32(Qn, Ft) ( __neon_QdQnFt( 0xf3a00940, (Qn), (Ft)) )
1748 
1749 // VMUL (by scalar)
1750 #define vmul_lane_f32(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx( 0xf2a00940 | _NENC_5(lane), (Dn), (Dm)) )
1751 #define vmul_lane_s16(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDnDmx( 0xf2900840 | _NENC_5x3(lane), (Dn), (Dm)) )
1752 #define vmul_lane_s32(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx( 0xf2a00840 | _NENC_5(lane), (Dn), (Dm)) )
1753 #define vmul_lane_u16(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDnDmx( 0xf2900840 | _NENC_5x3(lane), (Dn), (Dm)) )
1754 #define vmul_lane_u32(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx( 0xf2a00840 | _NENC_5(lane), (Dn), (Dm)) )
1755 #define vmulq_lane_f32(Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx( 0xf3a00940 | _NENC_5(lane), (Qn), (Dm)) )
1756 #define vmulq_lane_s16(Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdQnDmx( 0xf3900840 | _NENC_5x3(lane), (Qn), (Dm)) )
1757 #define vmulq_lane_s32(Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx( 0xf3a00840 | _NENC_5(lane), (Qn), (Dm)) )
1758 #define vmulq_lane_u16(Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdQnDmx( 0xf3900840 | _NENC_5x3(lane), (Qn), (Dm)) )
1759 #define vmulq_lane_u32(Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx( 0xf3a00840 | _NENC_5(lane), (Qn), (Dm)) )
1760 
1761 // VMULL
1762 #define vmull_p64(Dn, Dm) ( __neon_QdDnDm( 0xf2a00e00, (Dn), (Dm)) )
1763 #define vmull_p8(Dn, Dm) ( __neon_QdDnDm( 0xf2800e00, (Dn), (Dm)) )
1764 #define vmull_s16(Dn, Dm) ( __neon_QdDnDm( 0xf2900c00, (Dn), (Dm)) )
1765 #define vmull_s32(Dn, Dm) ( __neon_QdDnDm( 0xf2a00c00, (Dn), (Dm)) )
1766 #define vmull_s8(Dn, Dm) ( __neon_QdDnDm( 0xf2800c00, (Dn), (Dm)) )
1767 #define vmull_u16(Dn, Dm) ( __neon_QdDnDm( 0xf3900c00, (Dn), (Dm)) )
1768 #define vmull_u32(Dn, Dm) ( __neon_QdDnDm( 0xf3a00c00, (Dn), (Dm)) )
1769 #define vmull_u8(Dn, Dm) ( __neon_QdDnDm( 0xf3800c00, (Dn), (Dm)) )
1770 
1771 // VMULL (by scalar)
1772 #define vmull_lane_s16(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDnDmx( 0xf2900a40 | _NENC_5x3(lane), (Dn), (Dm)) )
1773 #define vmull_lane_s32(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDnDmx( 0xf2a00a40 | _NENC_5(lane), (Dn), (Dm)) )
1774 #define vmull_lane_u16(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDnDmx( 0xf3900a40 | _NENC_5x3(lane), (Dn), (Dm)) )
1775 #define vmull_lane_u32(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDnDmx( 0xf3a00a40 | _NENC_5(lane), (Dn), (Dm)) )
1776 
1777 // VMVN
1778 #define vmvn_p16(Dm) ( __neon_DdDm( 0xf3b00580, (Dm)) )
1779 #define vmvn_p8(Dm) ( __neon_DdDm( 0xf3b00580, (Dm)) )
1780 #define vmvn_s16(Dm) ( __neon_DdDm( 0xf3b00580, (Dm)) )
1781 #define vmvn_s32(Dm) ( __neon_DdDm( 0xf3b00580, (Dm)) )
1782 #define vmvn_s8(Dm) ( __neon_DdDm( 0xf3b00580, (Dm)) )
1783 #define vmvn_u16(Dm) ( __neon_DdDm( 0xf3b00580, (Dm)) )
1784 #define vmvn_u32(Dm) ( __neon_DdDm( 0xf3b00580, (Dm)) )
1785 #define vmvn_u8(Dm) ( __neon_DdDm( 0xf3b00580, (Dm)) )
1786 #define vmvnq_p16(Qm) ( __neon_QdQm( 0xf3b005c0, (Qm)) )
1787 #define vmvnq_p8(Qm) ( __neon_QdQm( 0xf3b005c0, (Qm)) )
1788 #define vmvnq_s16(Qm) ( __neon_QdQm( 0xf3b005c0, (Qm)) )
1789 #define vmvnq_s32(Qm) ( __neon_QdQm( 0xf3b005c0, (Qm)) )
1790 #define vmvnq_s8(Qm) ( __neon_QdQm( 0xf3b005c0, (Qm)) )
1791 #define vmvnq_u16(Qm) ( __neon_QdQm( 0xf3b005c0, (Qm)) )
1792 #define vmvnq_u32(Qm) ( __neon_QdQm( 0xf3b005c0, (Qm)) )
1793 #define vmvnq_u8(Qm) ( __neon_QdQm( 0xf3b005c0, (Qm)) )
1794 
1795 // VPADAL
1796 #define vpadal_s16(Dd, Dm) ( __neon_DdDm_acc( 0xf3b40600, (Dd), (Dm)) )
1797 #define vpadal_s32(Dd, Dm) ( __neon_DdDm_acc( 0xf3b80600, (Dd), (Dm)) )
1798 #define vpadal_s8(Dd, Dm) ( __neon_DdDm_acc( 0xf3b00600, (Dd), (Dm)) )
1799 #define vpadal_u16(Dd, Dm) ( __neon_DdDm_acc( 0xf3b40680, (Dd), (Dm)) )
1800 #define vpadal_u32(Dd, Dm) ( __neon_DdDm_acc( 0xf3b80680, (Dd), (Dm)) )
1801 #define vpadal_u8(Dd, Dm) ( __neon_DdDm_acc( 0xf3b00680, (Dd), (Dm)) )
1802 #define vpadalq_s16(Qd, Qm) ( __neon_QdQm_acc( 0xf3b40640, (Qd), (Qm)) )
1803 #define vpadalq_s32(Qd, Qm) ( __neon_QdQm_acc( 0xf3b80640, (Qd), (Qm)) )
1804 #define vpadalq_s8(Qd, Qm) ( __neon_QdQm_acc( 0xf3b00640, (Qd), (Qm)) )
1805 #define vpadalq_u16(Qd, Qm) ( __neon_QdQm_acc( 0xf3b406c0, (Qd), (Qm)) )
1806 #define vpadalq_u32(Qd, Qm) ( __neon_QdQm_acc( 0xf3b806c0, (Qd), (Qm)) )
1807 #define vpadalq_u8(Qd, Qm) ( __neon_QdQm_acc( 0xf3b006c0, (Qd), (Qm)) )
1808 
1809 // VPADD (floating point)
1810 #define vpadd_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3000d00, (Dn), (Dm)) )
1811 
1812 // VPADD (integer)
1813 #define vpadd_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100b10, (Dn), (Dm)) )
1814 #define vpadd_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200b10, (Dn), (Dm)) )
1815 #define vpadd_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000b10, (Dn), (Dm)) )
1816 #define vpadd_u16(Dn, Dm) ( __neon_DdDnDm( 0xf2100b10, (Dn), (Dm)) )
1817 #define vpadd_u32(Dn, Dm) ( __neon_DdDnDm( 0xf2200b10, (Dn), (Dm)) )
1818 #define vpadd_u8(Dn, Dm) ( __neon_DdDnDm( 0xf2000b10, (Dn), (Dm)) )
1819 
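// Illustrative usage sketch (not part of the original header): VPADD adds adjacent lane
// pairs, so applying it twice reduces four floats to one. With hypothetical __n64 values a, b:
//   __n64 s = vpadd_f32(a, b);   // s = { a0+a1, b0+b1 }
//   s = vpadd_f32(s, s);         // both lanes now hold a0+a1+b0+b1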
1820 // VPADDL
1821 #define vpaddl_s16(Dm) ( __neon_DdDm( 0xf3b40200, (Dm)) )
1822 #define vpaddl_s32(Dm) ( __neon_DdDm( 0xf3b80200, (Dm)) )
1823 #define vpaddl_s8(Dm) ( __neon_DdDm( 0xf3b00200, (Dm)) )
1824 #define vpaddl_u16(Dm) ( __neon_DdDm( 0xf3b40280, (Dm)) )
1825 #define vpaddl_u32(Dm) ( __neon_DdDm( 0xf3b80280, (Dm)) )
1826 #define vpaddl_u8(Dm) ( __neon_DdDm( 0xf3b00280, (Dm)) )
1827 #define vpaddlq_s16(Qm) ( __neon_QdQm( 0xf3b40240, (Qm)) )
1828 #define vpaddlq_s32(Qm) ( __neon_QdQm( 0xf3b80240, (Qm)) )
1829 #define vpaddlq_s8(Qm) ( __neon_QdQm( 0xf3b00240, (Qm)) )
1830 #define vpaddlq_u16(Qm) ( __neon_QdQm( 0xf3b402c0, (Qm)) )
1831 #define vpaddlq_u32(Qm) ( __neon_QdQm( 0xf3b802c0, (Qm)) )
1832 #define vpaddlq_u8(Qm) ( __neon_QdQm( 0xf3b002c0, (Qm)) )
1833 
1834 // VPMAX, VPMIN (floating point)
1835 #define vpmax_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3000f00, (Dn), (Dm)) )
1836 #define vpmin_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3200f00, (Dn), (Dm)) )
1837 
1838 // VPMAX, VPMIN (integer)
1839 #define vpmax_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100a00, (Dn), (Dm)) )
1840 #define vpmax_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200a00, (Dn), (Dm)) )
1841 #define vpmax_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000a00, (Dn), (Dm)) )
1842 #define vpmax_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100a00, (Dn), (Dm)) )
1843 #define vpmax_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200a00, (Dn), (Dm)) )
1844 #define vpmax_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000a00, (Dn), (Dm)) )
1845 #define vpmin_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100a10, (Dn), (Dm)) )
1846 #define vpmin_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200a10, (Dn), (Dm)) )
1847 #define vpmin_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000a10, (Dn), (Dm)) )
1848 #define vpmin_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100a10, (Dn), (Dm)) )
1849 #define vpmin_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200a10, (Dn), (Dm)) )
1850 #define vpmin_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000a10, (Dn), (Dm)) )
1851 
1852 // VQABS, VQNEG
1853 #define vqabs_s16(Dm) ( __neon_DdDm( 0xf3b40700, (Dm)) )
1854 #define vqabs_s32(Dm) ( __neon_DdDm( 0xf3b80700, (Dm)) )
1855 #define vqabs_s8(Dm) ( __neon_DdDm( 0xf3b00700, (Dm)) )
1856 #define vqneg_s16(Dm) ( __neon_DdDm( 0xf3b40780, (Dm)) )
1857 #define vqneg_s32(Dm) ( __neon_DdDm( 0xf3b80780, (Dm)) )
1858 #define vqneg_s8(Dm) ( __neon_DdDm( 0xf3b00780, (Dm)) )
1859 #define vqabsq_s16(Qm) ( __neon_QdQm( 0xf3b40740, (Qm)) )
1860 #define vqabsq_s32(Qm) ( __neon_QdQm( 0xf3b80740, (Qm)) )
1861 #define vqabsq_s8(Qm) ( __neon_QdQm( 0xf3b00740, (Qm)) )
1862 #define vqnegq_s16(Qm) ( __neon_QdQm( 0xf3b407c0, (Qm)) )
1863 #define vqnegq_s32(Qm) ( __neon_QdQm( 0xf3b807c0, (Qm)) )
1864 #define vqnegq_s8(Qm) ( __neon_QdQm( 0xf3b007c0, (Qm)) )
1865 
1866 // VQADD
1867 #define vqadd_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100010, (Dn), (Dm)) )
1868 #define vqadd_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200010, (Dn), (Dm)) )
1869 #define vqadd_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2300010, (Dn), (Dm)) )
1870 #define vqadd_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000010, (Dn), (Dm)) )
1871 #define vqadd_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100010, (Dn), (Dm)) )
1872 #define vqadd_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200010, (Dn), (Dm)) )
1873 #define vqadd_u64(Dn, Dm) ( __neon_DdDnDm( 0xf3300010, (Dn), (Dm)) )
1874 #define vqadd_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000010, (Dn), (Dm)) )
1875 #define vqaddq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100050, (Qn), (Qm)) )
1876 #define vqaddq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200050, (Qn), (Qm)) )
1877 #define vqaddq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2300050, (Qn), (Qm)) )
1878 #define vqaddq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000050, (Qn), (Qm)) )
1879 #define vqaddq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100050, (Qn), (Qm)) )
1880 #define vqaddq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200050, (Qn), (Qm)) )
1881 #define vqaddq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf3300050, (Qn), (Qm)) )
1882 #define vqaddq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000050, (Qn), (Qm)) )
1883 
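// Illustrative usage sketch (not part of the original header): saturating adds clamp instead
// of wrapping, e.g. when mixing two hypothetical __n128 blocks of signed 16-bit audio samples:
//   __n128 mixed = vqaddq_s16(voiceA, voiceB);   // lanes clamp to [-32768, 32767]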
1884 // VQDMLAL, VQDMLSL
1885 #define vqdmlal_s16(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2900900, (Qd), (Dn), (Dm)) )
1886 #define vqdmlal_s32(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2a00900, (Qd), (Dn), (Dm)) )
1887 #define vqdmlsl_s16(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2900b00, (Qd), (Dn), (Dm)) )
1888 #define vqdmlsl_s32(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2a00b00, (Qd), (Dn), (Dm)) )
1889 
1890 // VQDMLAL, VQDMLSL (by scalar)
1891 #define vqdmlal_lane_s16(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDnDmx_acc( 0xf2900340 | _NENC_5x3(lane), (Qd), (Dn), (Dm)) )
1892 #define vqdmlal_lane_s32(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDnDmx_acc( 0xf2a00340 | _NENC_5(lane), (Qd), (Dn), (Dm)) )
1893 #define vqdmlsl_lane_s16(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDnDmx_acc( 0xf2900740 | _NENC_5x3(lane), (Qd), (Dn), (Dm)) )
1894 #define vqdmlsl_lane_s32(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDnDmx_acc( 0xf2a00740 | _NENC_5(lane), (Qd), (Dn), (Dm)) )
1895 
1896 // VQDMULH (by scalar)
1897 #define vqdmulh_lane_s16(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDnDmx( 0xf2900c40 | _NENC_5x3(lane), (Dn), (Dm)) )
1898 #define vqdmulh_lane_s32(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx( 0xf2a00c40 | _NENC_5(lane), (Dn), (Dm)) )
1899 #define vqrdmulh_lane_s16(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDnDmx( 0xf2900d40 | _NENC_5x3(lane), (Dn), (Dm)) )
1900 #define vqrdmulh_lane_s32(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx( 0xf2a00d40 | _NENC_5(lane), (Dn), (Dm)) )
1901 #define vqdmulhq_lane_s16(Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdQnDmx( 0xf3900c40 | _NENC_5x3(lane), (Qn), (Dm)) )
1902 #define vqdmulhq_lane_s32(Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx( 0xf3a00c40 | _NENC_5(lane), (Qn), (Dm)) )
1903 #define vqrdmulhq_lane_s16(Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdQnDmx( 0xf3900d40 | _NENC_5x3(lane), (Qn), (Dm)) )
1904 #define vqrdmulhq_lane_s32(Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx( 0xf3a00d40 | _NENC_5(lane), (Qn), (Dm)) )
1905 
1906 // VQDMULH, VQRDMULH
1907 #define vqdmulh_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100b00, (Dn), (Dm)) )
1908 #define vqdmulh_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200b00, (Dn), (Dm)) )
1909 #define vqrdmulh_s16(Dn, Dm) ( __neon_DdDnDm( 0xf3100b00, (Dn), (Dm)) )
1910 #define vqrdmulh_s32(Dn, Dm) ( __neon_DdDnDm( 0xf3200b00, (Dn), (Dm)) )
1911 #define vqdmulhq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100b40, (Qn), (Qm)) )
1912 #define vqdmulhq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200b40, (Qn), (Qm)) )
1913 #define vqrdmulhq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf3100b40, (Qn), (Qm)) )
1914 #define vqrdmulhq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf3200b40, (Qn), (Qm)) )
1915 
1916 // VQDMULL
1917 #define vqdmull_s16(Dn, Dm) ( __neon_QdDnDm( 0xf2900d00, (Dn), (Dm)) )
1918 #define vqdmull_s32(Dn, Dm) ( __neon_QdDnDm( 0xf2a00d00, (Dn), (Dm)) )
1919 
1920 // VQDMULL (by scalar)
1921 #define vqdmull_lane_s16(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDnDmx( 0xf2900b40 | _NENC_5x3(lane), (Dn), (Dm)) )
1922 #define vqdmull_lane_s32(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDnDmx( 0xf2a00b40 | _NENC_5(lane), (Dn), (Dm)) )
1923 
1924 // VQMOVN, VQMOVUN
1925 #define vqmovn_s16(Qm) ( __neon_DdQm( 0xf3b20280, (Qm)) )
1926 #define vqmovn_s32(Qm) ( __neon_DdQm( 0xf3b60280, (Qm)) )
1927 #define vqmovn_s64(Qm) ( __neon_DdQm( 0xf3ba0280, (Qm)) )
1928 #define vqmovn_u16(Qm) ( __neon_DdQm( 0xf3b202c0, (Qm)) )
1929 #define vqmovn_u32(Qm) ( __neon_DdQm( 0xf3b602c0, (Qm)) )
1930 #define vqmovn_u64(Qm) ( __neon_DdQm( 0xf3ba02c0, (Qm)) )
1931 #define vqmovun_s16(Qm) ( __neon_DdQm( 0xf3b20240, (Qm)) )
1932 #define vqmovun_s32(Qm) ( __neon_DdQm( 0xf3b60240, (Qm)) )
1933 #define vqmovun_s64(Qm) ( __neon_DdQm( 0xf3ba0240, (Qm)) )
1934 
1935 // VQSHL, VQSHLU (immediate)
1936 #define vqshl_n_s16(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_DdDm( 0xf2900710 | _NENC_19_16(shift_amount), (Dm)) )
1937 #define vqshl_n_s32(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_DdDm( 0xf2a00710 | _NENC_20_16(shift_amount), (Dm)) )
1938 #define vqshl_n_s64(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_DdDm( 0xf2800790 | _NENC_21_16(shift_amount), (Dm)) )
1939 #define vqshl_n_s8(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_DdDm( 0xf2880710 | _NENC_18_16(shift_amount), (Dm)) )
1940 #define vqshl_n_u16(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_DdDm( 0xf3900710 | _NENC_19_16(shift_amount), (Dm)) )
1941 #define vqshl_n_u32(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_DdDm( 0xf3a00710 | _NENC_20_16(shift_amount), (Dm)) )
1942 #define vqshl_n_u64(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_DdDm( 0xf3800790 | _NENC_21_16(shift_amount), (Dm)) )
1943 #define vqshl_n_u8(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_DdDm( 0xf3880710 | _NENC_18_16(shift_amount), (Dm)) )
1944 #define vqshlu_n_s16(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_DdDm( 0xf3900610 | _NENC_19_16(shift_amount), (Dm)) )
1945 #define vqshlu_n_s32(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_DdDm( 0xf3a00610 | _NENC_20_16(shift_amount), (Dm)) )
1946 #define vqshlu_n_s64(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_DdDm( 0xf3800690 | _NENC_21_16(shift_amount), (Dm)) )
1947 #define vqshlu_n_s8(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_DdDm( 0xf3880610 | _NENC_18_16(shift_amount), (Dm)) )
1948 #define vqshlq_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_QdQm( 0xf2900750 | _NENC_19_16(shift_amount), (Qm)) )
1949 #define vqshlq_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_QdQm( 0xf2a00750 | _NENC_20_16(shift_amount), (Qm)) )
1950 #define vqshlq_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_QdQm( 0xf28007d0 | _NENC_21_16(shift_amount), (Qm)) )
1951 #define vqshlq_n_s8(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_QdQm( 0xf2880750 | _NENC_18_16(shift_amount), (Qm)) )
1952 #define vqshlq_n_u16(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_QdQm( 0xf3900750 | _NENC_19_16(shift_amount), (Qm)) )
1953 #define vqshlq_n_u32(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_QdQm( 0xf3a00750 | _NENC_20_16(shift_amount), (Qm)) )
1954 #define vqshlq_n_u64(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_QdQm( 0xf38007d0 | _NENC_21_16(shift_amount), (Qm)) )
1955 #define vqshlq_n_u8(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_QdQm( 0xf3880750 | _NENC_18_16(shift_amount), (Qm)) )
1956 #define vqshluq_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_QdQm( 0xf3900650 | _NENC_19_16(shift_amount), (Qm)) )
1957 #define vqshluq_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_QdQm( 0xf3a00650 | _NENC_20_16(shift_amount), (Qm)) )
1958 #define vqshluq_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_QdQm( 0xf38006d0 | _NENC_21_16(shift_amount), (Qm)) )
1959 #define vqshluq_n_s8(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_QdQm( 0xf3880650 | _NENC_18_16(shift_amount), (Qm)) )
1960 
1961 // VQSHRN, VQSHRUN, VQRSHRN, VQRSHRUN (immediate)
1962 #define vqrshrn_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf2880950 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
1963 #define vqrshrn_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf2900950 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
1964 #define vqrshrn_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf2a00950 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
1965 #define vqrshrn_n_u16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf3880950 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
1966 #define vqrshrn_n_u32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf3900950 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
1967 #define vqrshrn_n_u64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf3a00950 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
1968 #define vqrshrun_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf3880850 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
1969 #define vqrshrun_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf3900850 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
1970 #define vqrshrun_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf3a00850 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
1971 #define vqshrn_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf2880910 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
1972 #define vqshrn_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf2900910 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
1973 #define vqshrn_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf2a00910 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
1974 #define vqshrn_n_u16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf3880910 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
1975 #define vqshrn_n_u32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf3900910 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
1976 #define vqshrn_n_u64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf3a00910 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
1977 #define vqshrun_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf3880810 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
1978 #define vqshrun_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf3900810 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
1979 #define vqshrun_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf3a00810 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
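// Illustrative usage sketch (not part of the original header). The narrowing shifts
// take a Q (128-bit) source and produce a D (64-bit) result with lanes of half the
// width: each lane is shifted right by a constant, optionally rounded (VQRSHRN) and/or
// converted to unsigned (VQSHRUN), then saturated to the narrower type. The helper
// name below is hypothetical.
__inline __n64 __example_pack_s16_to_u8(__n128 q)
{
    // Scale eight signed 16-bit lanes down by 8 bits with rounding and saturate
    // to eight unsigned 8-bit lanes.
    return vqrshrun_n_s16(q, 8);
}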
1980 
1981 // VQSUB
1982 #define vqsub_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100210, (Dn), (Dm)) )
1983 #define vqsub_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200210, (Dn), (Dm)) )
1984 #define vqsub_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2300210, (Dn), (Dm)) )
1985 #define vqsub_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000210, (Dn), (Dm)) )
1986 #define vqsub_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100210, (Dn), (Dm)) )
1987 #define vqsub_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200210, (Dn), (Dm)) )
1988 #define vqsub_u64(Dn, Dm) ( __neon_DdDnDm( 0xf3300210, (Dn), (Dm)) )
1989 #define vqsub_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000210, (Dn), (Dm)) )
1990 #define vqsubq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100250, (Qn), (Qm)) )
1991 #define vqsubq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200250, (Qn), (Qm)) )
1992 #define vqsubq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2300250, (Qn), (Qm)) )
1993 #define vqsubq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000250, (Qn), (Qm)) )
1994 #define vqsubq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100250, (Qn), (Qm)) )
1995 #define vqsubq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200250, (Qn), (Qm)) )
1996 #define vqsubq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf3300250, (Qn), (Qm)) )
1997 #define vqsubq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000250, (Qn), (Qm)) )
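// Illustrative usage sketch (not part of the original header). VQSUB subtracts
// lane-wise with saturation, so results clamp to the element type's range rather than
// wrapping. The helper name below is hypothetical.
__inline __n64 __example_saturating_diff_u8(__n64 a, __n64 b)
{
    // Each unsigned 8-bit lane: a[i] - b[i], clamped at 0 instead of wrapping.
    return vqsub_u8(a, b);
}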
1998 
1999 // VRECPE, VRSQRTE
2000 #define vrecpe_f32(Dm) ( __neon_DdDm( 0xf3bb0500, (Dm)) )
2001 #define vrecpe_u32(Dm) ( __neon_DdDm( 0xf3bb0400, (Dm)) )
2002 #define vrsqrte_f32(Dm) ( __neon_DdDm( 0xf3bb0580, (Dm)) )
2003 #define vrsqrte_u32(Dm) ( __neon_DdDm( 0xf3bb0480, (Dm)) )
2004 #define vrecpeq_f32(Qm) ( __neon_QdQm( 0xf3bb0540, (Qm)) )
2005 #define vrecpeq_u32(Qm) ( __neon_QdQm( 0xf3bb0440, (Qm)) )
2006 #define vrsqrteq_f32(Qm) ( __neon_QdQm( 0xf3bb05c0, (Qm)) )
2007 #define vrsqrteq_u32(Qm) ( __neon_QdQm( 0xf3bb04c0, (Qm)) )
2008 
2009 // VRECPS
2010 #define vrecps_f32(Dn, Dm) ( __neon_DdDnDm( 0xf2000f10, (Dn), (Dm)) )
2011 #define vrecpsq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2000f50, (Qn), (Qm)) )
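// Illustrative usage sketch (not part of the original header). VRECPE produces a
// low-precision reciprocal estimate and VRECPS computes the Newton-Raphson correction
// factor (2 - d*x), so one multiply per step refines the estimate. This sketch assumes
// vmul_f32, declared earlier in this header; the helper name is hypothetical.
__inline __n64 __example_reciprocal_f32(__n64 d)
{
    __n64 x = vrecpe_f32(d);            // initial rough estimate of 1/d
    x = vmul_f32(x, vrecps_f32(d, x));  // first refinement step
    x = vmul_f32(x, vrecps_f32(d, x));  // second step, near full single precision
    return x;
}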
2012 
2013 // VREV
2014 #define vrev16_p8(Dm) ( __neon_DdDm( 0xf3b00100, (Dm)) )
2015 #define vrev16_s8(Dm) ( __neon_DdDm( 0xf3b00100, (Dm)) )
2016 #define vrev16_u8(Dm) ( __neon_DdDm( 0xf3b00100, (Dm)) )
2017 #define vrev32_p16(Dm) ( __neon_DdDm( 0xf3b40080, (Dm)) )
2018 #define vrev32_p8(Dm) ( __neon_DdDm( 0xf3b00080, (Dm)) )
2019 #define vrev32_s16(Dm) ( __neon_DdDm( 0xf3b40080, (Dm)) )
2020 #define vrev32_s8(Dm) ( __neon_DdDm( 0xf3b00080, (Dm)) )
2021 #define vrev32_u16(Dm) ( __neon_DdDm( 0xf3b40080, (Dm)) )
2022 #define vrev32_u8(Dm) ( __neon_DdDm( 0xf3b00080, (Dm)) )
2023 #define vrev64_f32(Dm) ( __neon_DdDm( 0xf3b80000, (Dm)) )
2024 #define vrev64_p16(Dm) ( __neon_DdDm( 0xf3b40000, (Dm)) )
2025 #define vrev64_p8(Dm) ( __neon_DdDm( 0xf3b00000, (Dm)) )
2026 #define vrev64_s16(Dm) ( __neon_DdDm( 0xf3b40000, (Dm)) )
2027 #define vrev64_s32(Dm) ( __neon_DdDm( 0xf3b80000, (Dm)) )
2028 #define vrev64_s8(Dm) ( __neon_DdDm( 0xf3b00000, (Dm)) )
2029 #define vrev64_u16(Dm) ( __neon_DdDm( 0xf3b40000, (Dm)) )
2030 #define vrev64_u32(Dm) ( __neon_DdDm( 0xf3b80000, (Dm)) )
2031 #define vrev64_u8(Dm) ( __neon_DdDm( 0xf3b00000, (Dm)) )
2032 #define vrev16q_p8(Qm) ( __neon_QdQm( 0xf3b00140, (Qm)) )
2033 #define vrev16q_s8(Qm) ( __neon_QdQm( 0xf3b00140, (Qm)) )
2034 #define vrev16q_u8(Qm) ( __neon_QdQm( 0xf3b00140, (Qm)) )
2035 #define vrev32q_p16(Qm) ( __neon_QdQm( 0xf3b400c0, (Qm)) )
2036 #define vrev32q_p8(Qm) ( __neon_QdQm( 0xf3b000c0, (Qm)) )
2037 #define vrev32q_s16(Qm) ( __neon_QdQm( 0xf3b400c0, (Qm)) )
2038 #define vrev32q_s8(Qm) ( __neon_QdQm( 0xf3b000c0, (Qm)) )
2039 #define vrev32q_u16(Qm) ( __neon_QdQm( 0xf3b400c0, (Qm)) )
2040 #define vrev32q_u8(Qm) ( __neon_QdQm( 0xf3b000c0, (Qm)) )
2041 #define vrev64q_f32(Qm) ( __neon_QdQm( 0xf3b80040, (Qm)) )
2042 #define vrev64q_p16(Qm) ( __neon_QdQm( 0xf3b40040, (Qm)) )
2043 #define vrev64q_p8(Qm) ( __neon_QdQm( 0xf3b00040, (Qm)) )
2044 #define vrev64q_s16(Qm) ( __neon_QdQm( 0xf3b40040, (Qm)) )
2045 #define vrev64q_s32(Qm) ( __neon_QdQm( 0xf3b80040, (Qm)) )
2046 #define vrev64q_s8(Qm) ( __neon_QdQm( 0xf3b00040, (Qm)) )
2047 #define vrev64q_u16(Qm) ( __neon_QdQm( 0xf3b40040, (Qm)) )
2048 #define vrev64q_u32(Qm) ( __neon_QdQm( 0xf3b80040, (Qm)) )
2049 #define vrev64q_u8(Qm) ( __neon_QdQm( 0xf3b00040, (Qm)) )
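// Illustrative usage sketch (not part of the original header). VREV reverses the order
// of sub-elements inside each 16-, 32- or 64-bit group; vrev32_u8, for instance,
// byte-swaps every 32-bit word, which is the usual endianness swap. The helper name
// below is hypothetical.
__inline __n64 __example_bswap32_lanes(__n64 words)
{
    // Reverse the four bytes within each of the two 32-bit elements.
    return vrev32_u8(words);
}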
2050 
2051 // VRINT
2052 #define vrnd_f32(Dm) ( __neon_DdDm( 0xf3ba0580, (Dm)) )
2053 #define vrnda_f32(Dm) ( __neon_DdDm( 0xf3ba0500, (Dm)) )
2054 #define vrndm_f32(Dm) ( __neon_DdDm( 0xf3ba0680, (Dm)) )
2055 #define vrndn_f32(Dm) ( __neon_DdDm( 0xf3ba0400, (Dm)) )
2056 #define vrndp_f32(Dm) ( __neon_DdDm( 0xf3ba0780, (Dm)) )
2057 #define vrndx_f32(Dm) ( __neon_DdDm( 0xf3ba0480, (Dm)) )
2058 #define vrndq_f32(Qm) ( __neon_QdQm( 0xf3ba05c0, (Qm)) )
2059 #define vrndaq_f32(Qm) ( __neon_QdQm( 0xf3ba0540, (Qm)) )
2060 #define vrndmq_f32(Qm) ( __neon_QdQm( 0xf3ba06c0, (Qm)) )
2061 #define vrndnq_f32(Qm) ( __neon_QdQm( 0xf3ba0440, (Qm)) )
2062 #define vrndpq_f32(Qm) ( __neon_QdQm( 0xf3ba07c0, (Qm)) )
2063 #define vrndxq_f32(Qm) ( __neon_QdQm( 0xf3ba04c0, (Qm)) )
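// Illustrative usage sketch (not part of the original header). The VRINT family rounds
// each float lane to an integral value while keeping it in floating-point format:
// vrndn rounds to nearest (ties to even), vrndm toward minus infinity, vrndp toward
// plus infinity, vrnd toward zero, vrnda to nearest (ties away from zero), and vrndx
// follows the current rounding mode. The helper name below is hypothetical.
__inline __n128 __example_round_to_nearest_even(__n128 q)
{
    return vrndnq_f32(q);   // e.g. 2.5f -> 2.0f, 3.5f -> 4.0f in every lane
}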
2064 
2065 // VRSQRTS
2066 #define vrsqrts_f32(Dn, Dm) ( __neon_DdDnDm( 0xf2200f10, (Dn), (Dm)) )
2067 #define vrsqrtsq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2200f50, (Qn), (Qm)) )
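// Illustrative usage sketch (not part of the original header). VRSQRTE and VRSQRTS
// pair the same way VRECPE/VRECPS do: the estimate x of 1/sqrt(d) is refined with
// x = x * (3 - d*x*x) / 2, where VRSQRTS computes the correction from (d*x) and x.
// This sketch assumes vmul_f32, declared earlier in this header; the helper name is
// hypothetical.
__inline __n64 __example_rsqrt_f32(__n64 d)
{
    __n64 x = vrsqrte_f32(d);                          // initial estimate of 1/sqrt(d)
    x = vmul_f32(x, vrsqrts_f32(vmul_f32(d, x), x));   // one Newton-Raphson step
    x = vmul_f32(x, vrsqrts_f32(vmul_f32(d, x), x));   // second step
    return x;
}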
2068 
2069 // VSHL (immediate)
2070 #define vshl_n_s16(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_DdDm( 0xf2900510 | _NENC_19_16(shift_amount), (Dm)) )
2071 #define vshl_n_s32(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_DdDm( 0xf2a00510 | _NENC_20_16(shift_amount), (Dm)) )
2072 #define vshl_n_s64(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_DdDm( 0xf2800590 | _NENC_21_16(shift_amount), (Dm)) )
2073 #define vshl_n_s8(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_DdDm( 0xf2880510 | _NENC_18_16(shift_amount), (Dm)) )
2074 #define vshl_n_u16(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_DdDm( 0xf2900510 | _NENC_19_16(shift_amount), (Dm)) )
2075 #define vshl_n_u32(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_DdDm( 0xf2a00510 | _NENC_20_16(shift_amount), (Dm)) )
2076 #define vshl_n_u64(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_DdDm( 0xf2800590 | _NENC_21_16(shift_amount), (Dm)) )
2077 #define vshl_n_u8(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_DdDm( 0xf2880510 | _NENC_18_16(shift_amount), (Dm)) )
2078 #define vshlq_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_QdQm( 0xf2900550 | _NENC_19_16(shift_amount), (Qm)) )
2079 #define vshlq_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_QdQm( 0xf2a00550 | _NENC_20_16(shift_amount), (Qm)) )
2080 #define vshlq_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_QdQm( 0xf28005d0 | _NENC_21_16(shift_amount), (Qm)) )
2081 #define vshlq_n_s8(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_QdQm( 0xf2880550 | _NENC_18_16(shift_amount), (Qm)) )
2082 #define vshlq_n_u16(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_QdQm( 0xf2900550 | _NENC_19_16(shift_amount), (Qm)) )
2083 #define vshlq_n_u32(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_QdQm( 0xf2a00550 | _NENC_20_16(shift_amount), (Qm)) )
2084 #define vshlq_n_u64(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_QdQm( 0xf28005d0 | _NENC_21_16(shift_amount), (Qm)) )
2085 #define vshlq_n_u8(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_QdQm( 0xf2880550 | _NENC_18_16(shift_amount), (Qm)) )
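// Illustrative usage sketch (not part of the original header). The immediate forms of
// VSHL shift every lane left by a constant with ordinary wrapping (no saturation),
// which is the usual way to multiply by a power of two. The helper name below is
// hypothetical.
__inline __n128 __example_times_16_s32(__n128 q)
{
    return vshlq_n_s32(q, 4);   // each 32-bit lane * 16, modulo 2^32
}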
2086 
2087 // VSHL, VQSHL, VRSHL, VQRSHL (register)
2088 #define vqrshl_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100510, (Dm), (Dn)) )
2089 #define vqrshl_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200510, (Dm), (Dn)) )
2090 #define vqrshl_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2300510, (Dm), (Dn)) )
2091 #define vqrshl_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000510, (Dm), (Dn)) )
2092 #define vqrshl_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100510, (Dm), (Dn)) )
2093 #define vqrshl_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200510, (Dm), (Dn)) )
2094 #define vqrshl_u64(Dn, Dm) ( __neon_DdDnDm( 0xf3300510, (Dm), (Dn)) )
2095 #define vqrshl_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000510, (Dm), (Dn)) )
2096 #define vqshl_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100410, (Dm), (Dn)) )
2097 #define vqshl_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200410, (Dm), (Dn)) )
2098 #define vqshl_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2300410, (Dm), (Dn)) )
2099 #define vqshl_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000410, (Dm), (Dn)) )
2100 #define vqshl_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100410, (Dm), (Dn)) )
2101 #define vqshl_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200410, (Dm), (Dn)) )
2102 #define vqshl_u64(Dn, Dm) ( __neon_DdDnDm( 0xf3300410, (Dm), (Dn)) )
2103 #define vqshl_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000410, (Dm), (Dn)) )
2104 #define vrshl_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100500, (Dm), (Dn)) )
2105 #define vrshl_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200500, (Dm), (Dn)) )
2106 #define vrshl_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2300500, (Dm), (Dn)) )
2107 #define vrshl_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000500, (Dm), (Dn)) )
2108 #define vrshl_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100500, (Dm), (Dn)) )
2109 #define vrshl_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200500, (Dm), (Dn)) )
2110 #define vrshl_u64(Dn, Dm) ( __neon_DdDnDm( 0xf3300500, (Dm), (Dn)) )
2111 #define vrshl_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000500, (Dm), (Dn)) )
2112 #define vshl_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100400, (Dm), (Dn)) )
2113 #define vshl_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200400, (Dm), (Dn)) )
2114 #define vshl_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2300400, (Dm), (Dn)) )
2115 #define vshl_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000400, (Dm), (Dn)) )
2116 #define vshl_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100400, (Dm), (Dn)) )
2117 #define vshl_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200400, (Dm), (Dn)) )
2118 #define vshl_u64(Dn, Dm) ( __neon_DdDnDm( 0xf3300400, (Dm), (Dn)) )
2119 #define vshl_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000400, (Dm), (Dn)) )
2120 #define vqrshlq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100550, (Qm), (Qn)) )
2121 #define vqrshlq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200550, (Qm), (Qn)) )
2122 #define vqrshlq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2300550, (Qm), (Qn)) )
2123 #define vqrshlq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000550, (Qm), (Qn)) )
2124 #define vqrshlq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100550, (Qm), (Qn)) )
2125 #define vqrshlq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200550, (Qm), (Qn)) )
2126 #define vqrshlq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf3300550, (Qm), (Qn)) )
2127 #define vqrshlq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000550, (Qm), (Qn)) )
2128 #define vqshlq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100450, (Qm), (Qn)) )
2129 #define vqshlq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200450, (Qm), (Qn)) )
2130 #define vqshlq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2300450, (Qm), (Qn)) )
2131 #define vqshlq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000450, (Qm), (Qn)) )
2132 #define vqshlq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100450, (Qm), (Qn)) )
2133 #define vqshlq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200450, (Qm), (Qn)) )
2134 #define vqshlq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf3300450, (Qm), (Qn)) )
2135 #define vqshlq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000450, (Qm), (Qn)) )
2136 #define vrshlq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100540, (Qm), (Qn)) )
2137 #define vrshlq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200540, (Qm), (Qn)) )
2138 #define vrshlq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2300540, (Qm), (Qn)) )
2139 #define vrshlq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000540, (Qm), (Qn)) )
2140 #define vrshlq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100540, (Qm), (Qn)) )
2141 #define vrshlq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200540, (Qm), (Qn)) )
2142 #define vrshlq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf3300540, (Qm), (Qn)) )
2143 #define vrshlq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000540, (Qm), (Qn)) )
2144 #define vshlq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100440, (Qm), (Qn)) )
2145 #define vshlq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200440, (Qm), (Qn)) )
2146 #define vshlq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2300440, (Qm), (Qn)) )
2147 #define vshlq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000440, (Qm), (Qn)) )
2148 #define vshlq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100440, (Qm), (Qn)) )
2149 #define vshlq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200440, (Qm), (Qn)) )
2150 #define vshlq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf3300440, (Qm), (Qn)) )
2151 #define vshlq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000440, (Qm), (Qn)) )
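// Illustrative usage sketch (not part of the original header). In the register forms
// the second argument holds a per-lane signed shift count: positive counts shift left,
// negative counts shift right, and the Q/R variants add saturation and/or rounding.
// The macros above pass their arguments to the encoding helper in reversed order
// because the shift-count register occupies the Vn field of the instruction encoding.
// The helper name below is hypothetical.
__inline __n64 __example_variable_shift_s16(__n64 values, __n64 counts)
{
    // Each 16-bit lane of 'values' is shifted by the signed count in the
    // corresponding lane of 'counts' (left if positive, right if negative).
    return vshl_s16(values, counts);
}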
2152 
2153 // VSHLL (shift_amount != size)
2154 #define __internal_vshll_n_t1_s16(Dm, shift_amount) ( __neon_QdDm( 0xf2900a10 | _NENC_19_16(shift_amount), (Dm)) )
2155 #define __internal_vshll_n_t1_s32(Dm, shift_amount) ( __neon_QdDm( 0xf2a00a10 | _NENC_20_16(shift_amount), (Dm)) )
2156 #define __internal_vshll_n_t1_s8(Dm, shift_amount) ( __neon_QdDm( 0xf2880a10 | _NENC_18_16(shift_amount), (Dm)) )
2157 #define __internal_vshll_n_t1_u16(Dm, shift_amount) ( __neon_QdDm( 0xf3900a10 | _NENC_19_16(shift_amount), (Dm)) )
2158 #define __internal_vshll_n_t1_u32(Dm, shift_amount) ( __neon_QdDm( 0xf3a00a10 | _NENC_20_16(shift_amount), (Dm)) )
2159 #define __internal_vshll_n_t1_u8(Dm, shift_amount) ( __neon_QdDm( 0xf3880a10 | _NENC_18_16(shift_amount), (Dm)) )
2160 
2161 // VSHLL (shift_amount == size)
2162 #define __internal_vshll_n_t2_s16(Dm) ( __neon_QdDm( 0xf3b60300, (Dm)) )
2163 #define __internal_vshll_n_t2_s32(Dm) ( __neon_QdDm( 0xf3ba0300, (Dm)) )
2164 #define __internal_vshll_n_t2_s8(Dm) ( __neon_QdDm( 0xf3b20300, (Dm)) )
2165 #define __internal_vshll_n_t2_u16(Dm) ( __neon_QdDm( 0xf3b60300, (Dm)) )
2166 #define __internal_vshll_n_t2_u32(Dm) ( __neon_QdDm( 0xf3ba0300, (Dm)) )
2167 #define __internal_vshll_n_t2_u8(Dm) ( __neon_QdDm( 0xf3b20300, (Dm)) )
2168 
2169 // VSHR, VRSHR (immediate)
2170 #define vrshr_n_s16(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm( 0xf2900210 | _NENC_19_16(16 - (shift_amount)), (Dm)) )
2171 #define vrshr_n_s32(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm( 0xf2a00210 | _NENC_20_16(32 - (shift_amount)), (Dm)) )
2172 #define vrshr_n_s64(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm( 0xf2800290 | _NENC_21_16(64 - (shift_amount)), (Dm)) )
2173 #define vrshr_n_s8(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm( 0xf2880210 | _NENC_18_16(8 - (shift_amount)), (Dm)) )
2174 #define vrshr_n_u16(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm( 0xf3900210 | _NENC_19_16(16 - (shift_amount)), (Dm)) )
2175 #define vrshr_n_u32(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm( 0xf3a00210 | _NENC_20_16(32 - (shift_amount)), (Dm)) )
2176 #define vrshr_n_u64(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm( 0xf3800290 | _NENC_21_16(64 - (shift_amount)), (Dm)) )
2177 #define vrshr_n_u8(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm( 0xf3880210 | _NENC_18_16(8 - (shift_amount)), (Dm)) )
2178 #define vshr_n_s16(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm( 0xf2900010 | _NENC_19_16(16 - (shift_amount)), (Dm)) )
2179 #define vshr_n_s32(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm( 0xf2a00010 | _NENC_20_16(32 - (shift_amount)), (Dm)) )
2180 #define vshr_n_s64(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm( 0xf2800090 | _NENC_21_16(64 - (shift_amount)), (Dm)) )
2181 #define vshr_n_s8(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm( 0xf2880010 | _NENC_18_16(8 - (shift_amount)), (Dm)) )
2182 #define vshr_n_u16(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm( 0xf3900010 | _NENC_19_16(16 - (shift_amount)), (Dm)) )
2183 #define vshr_n_u32(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm( 0xf3a00010 | _NENC_20_16(32 - (shift_amount)), (Dm)) )
2184 #define vshr_n_u64(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm( 0xf3800090 | _NENC_21_16(64 - (shift_amount)), (Dm)) )
2185 #define vshr_n_u8(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm( 0xf3880010 | _NENC_18_16(8 - (shift_amount)), (Dm)) )
2186 #define vrshrq_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm( 0xf2900250 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
2187 #define vrshrq_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm( 0xf2a00250 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
2188 #define vrshrq_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm( 0xf28002d0 | _NENC_21_16(64 - (shift_amount)), (Qm)) )
2189 #define vrshrq_n_s8(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm( 0xf2880250 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
2190 #define vrshrq_n_u16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm( 0xf3900250 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
2191 #define vrshrq_n_u32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm( 0xf3a00250 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
2192 #define vrshrq_n_u64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm( 0xf38002d0 | _NENC_21_16(64 - (shift_amount)), (Qm)) )
2193 #define vrshrq_n_u8(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm( 0xf3880250 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
2194 #define vshrq_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm( 0xf2900050 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
2195 #define vshrq_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm( 0xf2a00050 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
2196 #define vshrq_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm( 0xf28000d0 | _NENC_21_16(64 - (shift_amount)), (Qm)) )
2197 #define vshrq_n_s8(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm( 0xf2880050 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
2198 #define vshrq_n_u16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm( 0xf3900050 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
2199 #define vshrq_n_u32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm( 0xf3a00050 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
2200 #define vshrq_n_u64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm( 0xf38000d0 | _NENC_21_16(64 - (shift_amount)), (Qm)) )
2201 #define vshrq_n_u8(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm( 0xf3880050 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
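// Illustrative usage sketch (not part of the original header). VSHR shifts every lane
// right by a constant (arithmetic for signed types, logical for unsigned); VRSHR adds
// the rounding constant 1 << (shift_amount - 1) before shifting. The helper name below
// is hypothetical.
__inline __n128 __example_div256_rounded_u16(__n128 q)
{
    // Divide each unsigned 16-bit lane by 256 with round-to-nearest.
    return vrshrq_n_u16(q, 8);
}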
2202 
2203 // VSHRN, VRSHRN (immediate)
2204 #define vrshrn_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf2880850 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
2205 #define vrshrn_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf2900850 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
2206 #define vrshrn_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf2a00850 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
2207 #define vrshrn_n_u16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf2880850 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
2208 #define vrshrn_n_u32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf2900850 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
2209 #define vrshrn_n_u64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf2a00850 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
2210 #define vshrn_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf2880810 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
2211 #define vshrn_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf2900810 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
2212 #define vshrn_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf2a00810 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
2213 #define vshrn_n_u16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf2880810 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
2214 #define vshrn_n_u32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf2900810 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
2215 #define vshrn_n_u64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf2a00810 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
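// Illustrative usage sketch (not part of the original header). VSHRN/VRSHRN shift each
// Q-register lane right by a constant and keep only the low half of each result
// (truncating, not saturating), producing a D register with lanes of half the width.
// The helper name below is hypothetical.
__inline __n64 __example_narrow_high_bytes_u16(__n128 q)
{
    // Keep the upper byte of every unsigned 16-bit lane.
    return vshrn_n_u16(q, 8);
}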
2216 
2217 // VSLI (immediate)
2218 #define vsli_n_p16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_DdDm_acc( 0xf3900510 | _NENC_19_16(shift_amount), (Dd), (Dm)) )
2219 #define vsli_n_p8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_DdDm_acc( 0xf3880510 | _NENC_18_16(shift_amount), (Dd), (Dm)) )
2220 #define vsli_n_s16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_DdDm_acc( 0xf3900510 | _NENC_19_16(shift_amount), (Dd), (Dm)) )
2221 #define vsli_n_s32(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_DdDm_acc( 0xf3a00510 | _NENC_20_16(shift_amount), (Dd), (Dm)) )
2222 #define vsli_n_s64(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_DdDm_acc( 0xf3800590 | _NENC_21_16(shift_amount), (Dd), (Dm)) )
2223 #define vsli_n_s8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_DdDm_acc( 0xf3880510 | _NENC_18_16(shift_amount), (Dd), (Dm)) )
2224 #define vsli_n_u16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_DdDm_acc( 0xf3900510 | _NENC_19_16(shift_amount), (Dd), (Dm)) )
2225 #define vsli_n_u32(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_DdDm_acc( 0xf3a00510 | _NENC_20_16(shift_amount), (Dd), (Dm)) )
2226 #define vsli_n_u64(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_DdDm_acc( 0xf3800590 | _NENC_21_16(shift_amount), (Dd), (Dm)) )
2227 #define vsli_n_u8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_DdDm_acc( 0xf3880510 | _NENC_18_16(shift_amount), (Dd), (Dm)) )
2228 #define vsliq_n_p16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_QdQm_acc( 0xf3900550 | _NENC_19_16(shift_amount), (Qd), (Qm)) )
2229 #define vsliq_n_p8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_QdQm_acc( 0xf3880550 | _NENC_18_16(shift_amount), (Qd), (Qm)) )
2230 #define vsliq_n_s16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_QdQm_acc( 0xf3900550 | _NENC_19_16(shift_amount), (Qd), (Qm)) )
2231 #define vsliq_n_s32(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_QdQm_acc( 0xf3a00550 | _NENC_20_16(shift_amount), (Qd), (Qm)) )
2232 #define vsliq_n_s64(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_QdQm_acc( 0xf38005d0 | _NENC_21_16(shift_amount), (Qd), (Qm)) )
2233 #define vsliq_n_s8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_QdQm_acc( 0xf3880550 | _NENC_18_16(shift_amount), (Qd), (Qm)) )
2234 #define vsliq_n_u16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_QdQm_acc( 0xf3900550 | _NENC_19_16(shift_amount), (Qd), (Qm)) )
2235 #define vsliq_n_u32(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_QdQm_acc( 0xf3a00550 | _NENC_20_16(shift_amount), (Qd), (Qm)) )
2236 #define vsliq_n_u64(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_QdQm_acc( 0xf38005d0 | _NENC_21_16(shift_amount), (Qd), (Qm)) )
2237 #define vsliq_n_u8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_QdQm_acc( 0xf3880550 | _NENC_18_16(shift_amount), (Qd), (Qm)) )
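// Illustrative usage sketch (not part of the original header). VSLI shifts the source
// left and inserts it into the destination, preserving the destination's low
// 'shift_amount' bits in each lane, so bit fields can be packed without a separate
// mask and OR. The helper name below is hypothetical.
__inline __n64 __example_pack_nibbles_u8(__n64 low_nibbles, __n64 high_nibbles)
{
    // Result lane = (high_nibbles[i] << 4) | (low_nibbles[i] & 0x0f).
    return vsli_n_u8(low_nibbles, high_nibbles, 4);
}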
2238 
2239 // VSRA, VRSRA (immediate)
2240 #define vrsra_n_s16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm_acc( 0xf2900310 | _NENC_19_16(16 - (shift_amount)), (Dd), (Dm)) )
2241 #define vrsra_n_s32(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm_acc( 0xf2a00310 | _NENC_20_16(32 - (shift_amount)), (Dd), (Dm)) )
2242 #define vrsra_n_s64(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm_acc( 0xf2800390 | _NENC_21_16(64 - (shift_amount)), (Dd), (Dm)) )
2243 #define vrsra_n_s8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm_acc( 0xf2880310 | _NENC_18_16(8 - (shift_amount)), (Dd), (Dm)) )
2244 #define vrsra_n_u16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm_acc( 0xf3900310 | _NENC_19_16(16 - (shift_amount)), (Dd), (Dm)) )
2245 #define vrsra_n_u32(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm_acc( 0xf3a00310 | _NENC_20_16(32 - (shift_amount)), (Dd), (Dm)) )
2246 #define vrsra_n_u64(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm_acc( 0xf3800390 | _NENC_21_16(64 - (shift_amount)), (Dd), (Dm)) )
2247 #define vrsra_n_u8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm_acc( 0xf3880310 | _NENC_18_16(8 - (shift_amount)), (Dd), (Dm)) )
2248 #define vsra_n_s16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm_acc( 0xf2900110 | _NENC_19_16(16 - (shift_amount)), (Dd), (Dm)) )
2249 #define vsra_n_s32(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm_acc( 0xf2a00110 | _NENC_20_16(32 - (shift_amount)), (Dd), (Dm)) )
2250 #define vsra_n_s64(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm_acc( 0xf2800190 | _NENC_21_16(64 - (shift_amount)), (Dd), (Dm)) )
2251 #define vsra_n_s8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm_acc( 0xf2880110 | _NENC_18_16(8 - (shift_amount)), (Dd), (Dm)) )
2252 #define vsra_n_u16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm_acc( 0xf3900110 | _NENC_19_16(16 - (shift_amount)), (Dd), (Dm)) )
2253 #define vsra_n_u32(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm_acc( 0xf3a00110 | _NENC_20_16(32 - (shift_amount)), (Dd), (Dm)) )
2254 #define vsra_n_u64(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm_acc( 0xf3800190 | _NENC_21_16(64 - (shift_amount)), (Dd), (Dm)) )
2255 #define vsra_n_u8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm_acc( 0xf3880110 | _NENC_18_16(8 - (shift_amount)), (Dd), (Dm)) )
2256 #define vrsraq_n_s16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm_acc( 0xf2900350 | _NENC_19_16(16 - (shift_amount)), (Qd), (Qm)) )
2257 #define vrsraq_n_s32(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm_acc( 0xf2a00350 | _NENC_20_16(32 - (shift_amount)), (Qd), (Qm)) )
2258 #define vrsraq_n_s64(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm_acc( 0xf28003d0 | _NENC_21_16(64 - (shift_amount)), (Qd), (Qm)) )
2259 #define vrsraq_n_s8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm_acc( 0xf2880350 | _NENC_18_16(8 - (shift_amount)), (Qd), (Qm)) )
2260 #define vrsraq_n_u16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm_acc( 0xf3900350 | _NENC_19_16(16 - (shift_amount)), (Qd), (Qm)) )
2261 #define vrsraq_n_u32(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm_acc( 0xf3a00350 | _NENC_20_16(32 - (shift_amount)), (Qd), (Qm)) )
2262 #define vrsraq_n_u64(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm_acc( 0xf38003d0 | _NENC_21_16(64 - (shift_amount)), (Qd), (Qm)) )
2263 #define vrsraq_n_u8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm_acc( 0xf3880350 | _NENC_18_16(8 - (shift_amount)), (Qd), (Qm)) )
2264 #define vsraq_n_s16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm_acc( 0xf2900150 | _NENC_19_16(16 - (shift_amount)), (Qd), (Qm)) )
2265 #define vsraq_n_s32(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm_acc( 0xf2a00150 | _NENC_20_16(32 - (shift_amount)), (Qd), (Qm)) )
2266 #define vsraq_n_s64(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm_acc( 0xf28001d0 | _NENC_21_16(64 - (shift_amount)), (Qd), (Qm)) )
2267 #define vsraq_n_s8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm_acc( 0xf2880150 | _NENC_18_16(8 - (shift_amount)), (Qd), (Qm)) )
2268 #define vsraq_n_u16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm_acc( 0xf3900150 | _NENC_19_16(16 - (shift_amount)), (Qd), (Qm)) )
2269 #define vsraq_n_u32(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm_acc( 0xf3a00150 | _NENC_20_16(32 - (shift_amount)), (Qd), (Qm)) )
2270 #define vsraq_n_u64(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm_acc( 0xf38001d0 | _NENC_21_16(64 - (shift_amount)), (Qd), (Qm)) )
2271 #define vsraq_n_u8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm_acc( 0xf3880150 | _NENC_18_16(8 - (shift_amount)), (Qd), (Qm)) )
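// Illustrative usage sketch (not part of the original header). VSRA shifts the source
// right by a constant and accumulates the result into the destination (VRSRA rounds
// before accumulating), folding a scale-and-add into one instruction. The helper name
// below is hypothetical.
__inline __n128 __example_add_quarter_s32(__n128 acc, __n128 delta)
{
    // acc[i] += delta[i] >> 2 (arithmetic shift), per 32-bit lane.
    return vsraq_n_s32(acc, delta, 2);
}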
2272 
2273 // VSRI (immediate)
2274 #define vsri_n_p16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm_acc( 0xf3900410 | _NENC_19_16(16 - (shift_amount)), (Dd), (Dm)) )
2275 #define vsri_n_p8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm_acc( 0xf3880410 | _NENC_18_16(8 - (shift_amount)), (Dd), (Dm)) )
2276 #define vsri_n_s16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm_acc( 0xf3900410 | _NENC_19_16(16 - (shift_amount)), (Dd), (Dm)) )
2277 #define vsri_n_s32(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm_acc( 0xf3a00410 | _NENC_20_16(32 - (shift_amount)), (Dd), (Dm)) )
2278 #define vsri_n_s64(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm_acc( 0xf3800490 | _NENC_21_16(64 - (shift_amount)), (Dd), (Dm)) )
2279 #define vsri_n_s8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm_acc( 0xf3880410 | _NENC_18_16(8 - (shift_amount)), (Dd), (Dm)) )
2280 #define vsri_n_u16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm_acc( 0xf3900410 | _NENC_19_16(16 - (shift_amount)), (Dd), (Dm)) )
2281 #define vsri_n_u32(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm_acc( 0xf3a00410 | _NENC_20_16(32 - (shift_amount)), (Dd), (Dm)) )
2282 #define vsri_n_u64(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm_acc( 0xf3800490 | _NENC_21_16(64 - (shift_amount)), (Dd), (Dm)) )
2283 #define vsri_n_u8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm_acc( 0xf3880410 | _NENC_18_16(8 - (shift_amount)), (Dd), (Dm)) )
2284 #define vsriq_n_p16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm_acc( 0xf3900450 | _NENC_19_16(16 - (shift_amount)), (Qd), (Qm)) )
2285 #define vsriq_n_p8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm_acc( 0xf3880450 | _NENC_18_16(8 - (shift_amount)), (Qd), (Qm)) )
2286 #define vsriq_n_s16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm_acc( 0xf3900450 | _NENC_19_16(16 - (shift_amount)), (Qd), (Qm)) )
2287 #define vsriq_n_s32(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm_acc( 0xf3a00450 | _NENC_20_16(32 - (shift_amount)), (Qd), (Qm)) )
2288 #define vsriq_n_s64(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm_acc( 0xf38004d0 | _NENC_21_16(64 - (shift_amount)), (Qd), (Qm)) )
2289 #define vsriq_n_s8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm_acc( 0xf3880450 | _NENC_18_16(8 - (shift_amount)), (Qd), (Qm)) )
2290 #define vsriq_n_u16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm_acc( 0xf3900450 | _NENC_19_16(16 - (shift_amount)), (Qd), (Qm)) )
2291 #define vsriq_n_u32(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm_acc( 0xf3a00450 | _NENC_20_16(32 - (shift_amount)), (Qd), (Qm)) )
2292 #define vsriq_n_u64(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm_acc( 0xf38004d0 | _NENC_21_16(64 - (shift_amount)), (Qd), (Qm)) )
2293 #define vsriq_n_u8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm_acc( 0xf3880450 | _NENC_18_16(8 - (shift_amount)), (Qd), (Qm)) )
2294 
2295 // VST1 (multiple single elements)
2296 #define vst1_f32(pD, D) ( __neon_AdrD1( 0xf400078f, __float32ToN64(pD), (D)) )
2297 #define vst1_p16(pD, D) ( __neon_AdrD1( 0xf400074f, __poly16ToN64(pD), (D)) )
2298 #define vst1_p8(pD, D) ( __neon_AdrD1( 0xf400070f, __poly8ToN64(pD), (D)) )
2299 #define vst1_s16(pD, D) ( __neon_AdrD1( 0xf400074f, __int16ToN64(pD), (D)) )
2300 #define vst1_s32(pD, D) ( __neon_AdrD1( 0xf400078f, __int32ToN64(pD), (D)) )
2301 #define vst1_s64(pD, D) ( __neon_AdrD1( 0xf40007cf, __int64ToN64(pD), (D)) )
2302 #define vst1_s8(pD, D) ( __neon_AdrD1( 0xf400070f, __int8ToN64(pD), (D)) )
2303 #define vst1_u16(pD, D) ( __neon_AdrD1( 0xf400074f, __uint16ToN64(pD), (D)) )
2304 #define vst1_u32(pD, D) ( __neon_AdrD1( 0xf400078f, __uint32ToN64(pD), (D)) )
2305 #define vst1_u64(pD, D) ( __neon_AdrD1( 0xf40007cf, __uint64ToN64(pD), (D)) )
2306 #define vst1_u8(pD, D) ( __neon_AdrD1( 0xf400070f, __uint8ToN64(pD), (D)) )
2307 #define vst1_f32_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf400078f | _NENC_5_4(_NEON_ALIGN64(align)), __float32ToN64(pD), (D)) )
2308 #define vst1_p16_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf400074f | _NENC_5_4(_NEON_ALIGN64(align)), __poly16ToN64(pD), (D)) )
2309 #define vst1_p8_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf400070f | _NENC_5_4(_NEON_ALIGN64(align)), __poly8ToN64(pD), (D)) )
2310 #define vst1_s16_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf400074f | _NENC_5_4(_NEON_ALIGN64(align)), __int16ToN64(pD), (D)) )
2311 #define vst1_s32_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf400078f | _NENC_5_4(_NEON_ALIGN64(align)), __int32ToN64(pD), (D)) )
2312 #define vst1_s64_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf40007cf | _NENC_5_4(_NEON_ALIGN64(align)), __int64ToN64(pD), (D)) )
2313 #define vst1_s8_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf400070f | _NENC_5_4(_NEON_ALIGN64(align)), __int8ToN64(pD), (D)) )
2314 #define vst1_u16_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf400074f | _NENC_5_4(_NEON_ALIGN64(align)), __uint16ToN64(pD), (D)) )
2315 #define vst1_u32_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf400078f | _NENC_5_4(_NEON_ALIGN64(align)), __uint32ToN64(pD), (D)) )
2316 #define vst1_u64_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf40007cf | _NENC_5_4(_NEON_ALIGN64(align)), __uint64ToN64(pD), (D)) )
2317 #define vst1_u8_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf400070f | _NENC_5_4(_NEON_ALIGN64(align)), __uint8ToN64(pD), (D)) )
2318 #define vst1q_f32(pD, Q) ( __neon_AdrQ1( 0xf4000a8f, __float32ToN64(pD), (Q)) )
2319 #define vst1q_p16(pD, Q) ( __neon_AdrQ1( 0xf4000a4f, __poly16ToN64(pD), (Q)) )
2320 #define vst1q_p8(pD, Q) ( __neon_AdrQ1( 0xf4000a0f, __poly8ToN64(pD), (Q)) )
2321 #define vst1q_s16(pD, Q) ( __neon_AdrQ1( 0xf4000a4f, __int16ToN64(pD), (Q)) )
2322 #define vst1q_s32(pD, Q) ( __neon_AdrQ1( 0xf4000a8f, __int32ToN64(pD), (Q)) )
2323 #define vst1q_s64(pD, Q) ( __neon_AdrQ1( 0xf4000acf, __int64ToN64(pD), (Q)) )
2324 #define vst1q_s8(pD, Q) ( __neon_AdrQ1( 0xf4000a0f, __int8ToN64(pD), (Q)) )
2325 #define vst1q_u16(pD, Q) ( __neon_AdrQ1( 0xf4000a4f, __uint16ToN64(pD), (Q)) )
2326 #define vst1q_u32(pD, Q) ( __neon_AdrQ1( 0xf4000a8f, __uint32ToN64(pD), (Q)) )
2327 #define vst1q_u64(pD, Q) ( __neon_AdrQ1( 0xf4000acf, __uint64ToN64(pD), (Q)) )
2328 #define vst1q_u8(pD, Q) ( __neon_AdrQ1( 0xf4000a0f, __uint8ToN64(pD), (Q)) )
2329 #define vst1q_f32_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000a8f | _NENC_5_4(_NEON_ALIGN64_128(align)), __float32ToN64(pD), (Q)) )
2330 #define vst1q_p16_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000a4f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly16ToN64(pD), (Q)) )
2331 #define vst1q_p8_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000a0f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly8ToN64(pD), (Q)) )
2332 #define vst1q_s16_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000a4f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int16ToN64(pD), (Q)) )
2333 #define vst1q_s32_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000a8f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int32ToN64(pD), (Q)) )
2334 #define vst1q_s64_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __int64ToN64(pD), (Q)) )
2335 #define vst1q_s8_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000a0f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int8ToN64(pD), (Q)) )
2336 #define vst1q_u16_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000a4f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint16ToN64(pD), (Q)) )
2337 #define vst1q_u32_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000a8f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint32ToN64(pD), (Q)) )
2338 #define vst1q_u64_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint64ToN64(pD), (Q)) )
2339 #define vst1q_u8_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000a0f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint8ToN64(pD), (Q)) )
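// Illustrative usage sketch (not part of the original header). vst1 stores a whole D
// or Q register to memory as consecutive elements; the _ex forms also encode an
// alignment hint that must match the actual pointer alignment. The helper name below
// is hypothetical, and vld1q_f32 is assumed to be declared earlier in this header.
__inline void __example_copy4_f32(float32_t *dst, float32_t *src)
{
    __n128 q = vld1q_f32(src);   // load four floats
    vst1q_f32(dst, q);           // store them back out
}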
2340 
2341 // VST1 (single element from one lane)
2342 #define vst1_lane_f32(pD, D, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrD1( 0xf480080f | _NENC_7(lane), __float32ToN64(pD), (D)) )
2343 #define vst1_lane_p16(pD, D, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrD1( 0xf480040f | _NENC_7_6(lane), __poly16ToN64(pD), (D)) )
2344 #define vst1_lane_p8(pD, D, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrD1( 0xf480000f | _NENC_7_5(lane), __poly8ToN64(pD), (D)) )
2345 #define vst1_lane_s16(pD, D, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrD1( 0xf480040f | _NENC_7_6(lane), __int16ToN64(pD), (D)) )
2346 #define vst1_lane_s32(pD, D, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrD1( 0xf480080f | _NENC_7(lane), __int32ToN64(pD), (D)) )
2347 #define vst1_lane_s8(pD, D, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrD1( 0xf480000f | _NENC_7_5(lane), __int8ToN64(pD), (D)) )
2348 #define vst1_lane_u16(pD, D, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrD1( 0xf480040f | _NENC_7_6(lane), __uint16ToN64(pD), (D)) )
2349 #define vst1_lane_u32(pD, D, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrD1( 0xf480080f | _NENC_7(lane), __uint32ToN64(pD), (D)) )
2350 #define vst1_lane_u8(pD, D, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrD1( 0xf480000f | _NENC_7_5(lane), __uint8ToN64(pD), (D)) )
2351 #define vst1q_lane_f32(pD, Q, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQ1( 0xf480080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __float32ToN64(pD), (Q)) )
2352 #define vst1q_lane_p16(pD, Q, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQ1( 0xf480040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __poly16ToN64(pD), (Q)) )
2353 #define vst1q_lane_p8(pD, Q, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), __neon_AdrQ1( 0xf480000f | _NENC_7_5((lane) % 8) | _NENC_12((lane) >= 8 ? 1 : 0), __poly8ToN64(pD), (Q)) )
2354 #define vst1q_lane_s16(pD, Q, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQ1( 0xf480040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __int16ToN64(pD), (Q)) )
2355 #define vst1q_lane_s32(pD, Q, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQ1( 0xf480080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __int32ToN64(pD), (Q)) )
2356 #define vst1q_lane_s8(pD, Q, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), __neon_AdrQ1( 0xf480000f | _NENC_7_5((lane) % 8) | _NENC_12((lane) >= 8 ? 1 : 0), __int8ToN64(pD), (Q)) )
2357 #define vst1q_lane_u16(pD, Q, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQ1( 0xf480040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __uint16ToN64(pD), (Q)) )
2358 #define vst1q_lane_u32(pD, Q, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQ1( 0xf480080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __uint32ToN64(pD), (Q)) )
2359 #define vst1q_lane_u8(pD, Q, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), __neon_AdrQ1( 0xf480000f | _NENC_7_5((lane) % 8) | _NENC_12((lane) >= 8 ? 1 : 0), __uint8ToN64(pD), (Q)) )
2360 
2361 // VST1 (single element from one lane, aligned)
2362 #define vst1_lane_f32_ex(pD, D, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrD1( 0xf480080f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), __float32ToN64(pD), (D)) )
2363 #define vst1_lane_p16_ex(pD, D, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_AdrD1( 0xf480040f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN16(align)), __poly16ToN64(pD), (D)) )
2364 #define vst1_lane_s16_ex(pD, D, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_AdrD1( 0xf480040f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN16(align)), __int16ToN64(pD), (D)) )
2365 #define vst1_lane_s32_ex(pD, D, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrD1( 0xf480080f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), __int32ToN64(pD), (D)) )
2366 #define vst1_lane_u16_ex(pD, D, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_AdrD1( 0xf480040f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN16(align)), __uint16ToN64(pD), (D)) )
2367 #define vst1_lane_u32_ex(pD, D, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrD1( 0xf480080f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), __uint32ToN64(pD), (D)) )
2368 #define vst1q_lane_f32_ex(pD, Q, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf480080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), __float32ToN64(pD), (Q)) )
2369 #define vst1q_lane_p16_ex(pD, Q, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf480040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN16(align)), __poly16ToN64(pD), (Q)) )
2370 #define vst1q_lane_s16_ex(pD, Q, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf480040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN16(align)), __int16ToN64(pD), (Q)) )
2371 #define vst1q_lane_s32_ex(pD, Q, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf480080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), __int32ToN64(pD), (Q)) )
2372 #define vst1q_lane_u16_ex(pD, Q, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf480040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN16(align)), __uint16ToN64(pD), (Q)) )
2373 #define vst1q_lane_u32_ex(pD, Q, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf480080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), __uint32ToN64(pD), (Q)) )
2374 
2375 // VST2 (multiple 2-element structures)
2376 #define vst2_f32(pD, D2) ( __neon_AdrDx2( 0xf400088f, __float32ToN64(pD), (D2)) )
2377 #define vst2_p16(pD, D2) ( __neon_AdrDx2( 0xf400084f, __poly16ToN64(pD), (D2)) )
2378 #define vst2_p8(pD, D2) ( __neon_AdrDx2( 0xf400080f, __poly8ToN64(pD), (D2)) )
2379 #define vst2_s16(pD, D2) ( __neon_AdrDx2( 0xf400084f, __int16ToN64(pD), (D2)) )
2380 #define vst2_s32(pD, D2) ( __neon_AdrDx2( 0xf400088f, __int32ToN64(pD), (D2)) )
2381 #define vst2_s8(pD, D2) ( __neon_AdrDx2( 0xf400080f, __int8ToN64(pD), (D2)) )
2382 #define vst2_u16(pD, D2) ( __neon_AdrDx2( 0xf400084f, __uint16ToN64(pD), (D2)) )
2383 #define vst2_u32(pD, D2) ( __neon_AdrDx2( 0xf400088f, __uint32ToN64(pD), (D2)) )
2384 #define vst2_u8(pD, D2) ( __neon_AdrDx2( 0xf400080f, __uint8ToN64(pD), (D2)) )
2385 #define vst2_s64(pD, D2) ( __neon_AdrDx2( 0xf4000acf, __int64ToN64(pD), (D2)) )
2386 #define vst2_u64(pD, D2) ( __neon_AdrDx2( 0xf4000acf, __uint64ToN64(pD), (D2)) )
2387 #define vst2_s64_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf4000acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __int64ToN64(pD), (D2)) )
2388 #define vst2_u64_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf4000acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint64ToN64(pD), (D2)) )
2389 #define vst2_f32_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf400088f | _NENC_5_4(_NEON_ALIGN64_128(align)), __float32ToN64(pD), (D2)) )
2390 #define vst2_p16_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf400084f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly16ToN64(pD), (D2)) )
2391 #define vst2_p8_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf400080f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly8ToN64(pD), (D2)) )
2392 #define vst2_s16_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf400084f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int16ToN64(pD), (D2)) )
2393 #define vst2_s32_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf400088f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int32ToN64(pD), (D2)) )
2394 #define vst2_s8_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf400080f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int8ToN64(pD), (D2)) )
2395 #define vst2_u16_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf400084f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint16ToN64(pD), (D2)) )
2396 #define vst2_u32_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf400088f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint32ToN64(pD), (D2)) )
2397 #define vst2_u8_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf400080f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint8ToN64(pD), (D2)) )
2398 #define vst2q_f32(pD, Q2) ( __neon_AdrQx2( 0xf400098f, __float32ToN64(pD), (Q2)) )
2399 #define vst2q_p16(pD, Q2) ( __neon_AdrQx2( 0xf400094f, __poly16ToN64(pD), (Q2)) )
2400 #define vst2q_p8(pD, Q2) ( __neon_AdrQx2( 0xf400090f, __poly8ToN64(pD), (Q2)) )
2401 #define vst2q_s16(pD, Q2) ( __neon_AdrQx2( 0xf400094f, __int16ToN64(pD), (Q2)) )
2402 #define vst2q_s32(pD, Q2) ( __neon_AdrQx2( 0xf400098f, __int32ToN64(pD), (Q2)) )
2403 #define vst2q_s8(pD, Q2) ( __neon_AdrQx2( 0xf400090f, __int8ToN64(pD), (Q2)) )
2404 #define vst2q_u16(pD, Q2) ( __neon_AdrQx2( 0xf400094f, __uint16ToN64(pD), (Q2)) )
2405 #define vst2q_u32(pD, Q2) ( __neon_AdrQx2( 0xf400098f, __uint32ToN64(pD), (Q2)) )
2406 #define vst2q_u8(pD, Q2) ( __neon_AdrQx2( 0xf400090f, __uint8ToN64(pD), (Q2)) )
2407 #define vst2q_f32_ex(pD, Q2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx2( 0xf400098f | _NENC_5_4(_NEON_ALIGN64_128(align)), __float32ToN64(pD), (Q2)) )
2408 #define vst2q_p16_ex(pD, Q2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx2( 0xf400094f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly16ToN64(pD), (Q2)) )
2409 #define vst2q_p8_ex(pD, Q2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx2( 0xf400090f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly8ToN64(pD), (Q2)) )
2410 #define vst2q_s16_ex(pD, Q2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx2( 0xf400094f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int16ToN64(pD), (Q2)) )
2411 #define vst2q_s32_ex(pD, Q2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx2( 0xf400098f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int32ToN64(pD), (Q2)) )
2412 #define vst2q_s8_ex(pD, Q2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx2( 0xf400090f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int8ToN64(pD), (Q2)) )
2413 #define vst2q_u16_ex(pD, Q2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx2( 0xf400094f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint16ToN64(pD), (Q2)) )
2414 #define vst2q_u32_ex(pD, Q2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx2( 0xf400098f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint32ToN64(pD), (Q2)) )
2415 #define vst2q_u8_ex(pD, Q2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx2( 0xf400090f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint8ToN64(pD), (Q2)) )
2416 
2417 // VST2 (single 2-element structure from one lane)
2418 #define vst2_lane_f32(pD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrDx2x( 0xf480090f | _NENC_7(lane), __float32ToN64(pD), (D2)) )
2419 #define vst2_lane_p16(pD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrDx2x( 0xf480050f | _NENC_7_6(lane), __poly16ToN64(pD), (D2)) )
2420 #define vst2_lane_p8(pD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrDx2x( 0xf480010f | _NENC_7_5(lane), __poly8ToN64(pD), (D2)) )
2421 #define vst2_lane_s16(pD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrDx2x( 0xf480050f | _NENC_7_6(lane), __int16ToN64(pD), (D2)) )
2422 #define vst2_lane_s32(pD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrDx2x( 0xf480090f | _NENC_7(lane), __int32ToN64(pD), (D2)) )
2423 #define vst2_lane_s8(pD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrDx2x( 0xf480010f | _NENC_7_5(lane), __int8ToN64(pD), (D2)) )
2424 #define vst2_lane_u16(pD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrDx2x( 0xf480050f | _NENC_7_6(lane), __uint16ToN64(pD), (D2)) )
2425 #define vst2_lane_u32(pD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrDx2x( 0xf480090f | _NENC_7(lane), __uint32ToN64(pD), (D2)) )
2426 #define vst2_lane_u8(pD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrDx2x( 0xf480010f | _NENC_7_5(lane), __uint8ToN64(pD), (D2)) )
2427 #define vst2q_lane_f32(pD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQx2x( 0xf480094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __float32ToN64(pD), (Q2)) )
2428 #define vst2q_lane_p16(pD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQx2x( 0xf480052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __poly16ToN64(pD), (Q2)) )
2429 #define vst2q_lane_s16(pD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQx2x( 0xf480052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __int16ToN64(pD), (Q2)) )
2430 #define vst2q_lane_s32(pD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQx2x( 0xf480094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __int32ToN64(pD), (Q2)) )
2431 #define vst2q_lane_u16(pD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQx2x( 0xf480052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __uint16ToN64(pD), (Q2)) )
2432 #define vst2q_lane_u32(pD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQx2x( 0xf480094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __uint32ToN64(pD), (Q2)) )
2433 
2434 // VST2 (single 2-element structure from one lane, aligned)
2435 #define vst2_lane_f32_ex(pD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx2x( 0xf480090f | _NENC_7(lane) | _NENC_4(_NEON_ALIGN64(align)), __float32ToN64(pD), (D2)) )
2436 #define vst2_lane_p16_ex(pD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrDx2x( 0xf480050f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN32(align)), __poly16ToN64(pD), (D2)) )
2437 #define vst2_lane_p8_ex(pD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_AdrDx2x( 0xf480010f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN16(align)), __poly8ToN64(pD), (D2)) )
2438 #define vst2_lane_s16_ex(pD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrDx2x( 0xf480050f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN32(align)), __int16ToN64(pD), (D2)) )
2439 #define vst2_lane_s32_ex(pD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx2x( 0xf480090f | _NENC_7(lane) | _NENC_4(_NEON_ALIGN64(align)), __int32ToN64(pD), (D2)) )
2440 #define vst2_lane_s8_ex(pD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_AdrDx2x( 0xf480010f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN16(align)), __int8ToN64(pD), (D2)) )
2441 #define vst2_lane_u16_ex(pD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrDx2x( 0xf480050f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN32(align)), __uint16ToN64(pD), (D2)) )
2442 #define vst2_lane_u32_ex(pD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx2x( 0xf480090f | _NENC_7(lane) | _NENC_4(_NEON_ALIGN64(align)), __uint32ToN64(pD), (D2)) )
2443 #define vst2_lane_u8_ex(pD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_AdrDx2x( 0xf480010f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN16(align)), __uint8ToN64(pD), (D2)) )
2444 #define vst2q_lane_f32_ex(pD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx2x( 0xf480094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), __float32ToN64(pD), (Q2)) )
2445 #define vst2q_lane_p16_ex(pD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrQx2x( 0xf480052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN32(align)), __poly16ToN64(pD), (Q2)) )
2446 #define vst2q_lane_s16_ex(pD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrQx2x( 0xf480052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN32(align)), __int16ToN64(pD), (Q2)) )
2447 #define vst2q_lane_s32_ex(pD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx2x( 0xf480094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), __int32ToN64(pD), (Q2)) )
2448 #define vst2q_lane_u16_ex(pD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrQx2x( 0xf480052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN32(align)), __uint16ToN64(pD), (Q2)) )
2449 #define vst2q_lane_u32_ex(pD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx2x( 0xf480094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), __uint32ToN64(pD), (Q2)) )
2450 
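// Usage sketch for the VST2 stores above: write separate "real" and "imaginary"
// planes back out as an interleaved re,im,re,im,... stream with vst2q_f32. A
// minimal sketch, assuming vdupq_n_f32 is declared earlier in this header; the
// function and parameter names are illustrative only.
__inline void neon_example_store_complex_interleaved(float32_t *pDst)
{
    __n128x2 planes;
    planes.val[0] = vdupq_n_f32(1.0f);   // four real parts
    planes.val[1] = vdupq_n_f32(0.0f);   // four imaginary parts
    vst2q_f32(pDst, planes);             // stores 8 floats: 1,0,1,0,1,0,1,0
}
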
2451 // VST3 (multiple 3-element structures)
2452 #define vst3_f32(pD, D3) ( __neon_AdrDx3( 0xf400048f, __float32ToN64(pD), (D3)) )
2453 #define vst3_p16(pD, D3) ( __neon_AdrDx3( 0xf400044f, __poly16ToN64(pD), (D3)) )
2454 #define vst3_p8(pD, D3) ( __neon_AdrDx3( 0xf400040f, __poly8ToN64(pD), (D3)) )
2455 #define vst3_s16(pD, D3) ( __neon_AdrDx3( 0xf400044f, __int16ToN64(pD), (D3)) )
2456 #define vst3_s32(pD, D3) ( __neon_AdrDx3( 0xf400048f, __int32ToN64(pD), (D3)) )
2457 #define vst3_s8(pD, D3) ( __neon_AdrDx3( 0xf400040f, __int8ToN64(pD), (D3)) )
2458 #define vst3_u16(pD, D3) ( __neon_AdrDx3( 0xf400044f, __uint16ToN64(pD), (D3)) )
2459 #define vst3_u32(pD, D3) ( __neon_AdrDx3( 0xf400048f, __uint32ToN64(pD), (D3)) )
2460 #define vst3_u8(pD, D3) ( __neon_AdrDx3( 0xf400040f, __uint8ToN64(pD), (D3)) )
2461 #define vst3_s64(pD, D3) ( __neon_AdrDx3( 0xf40006cf, __int64ToN64(pD), (D3)) )
2462 #define vst3_u64(pD, D3) ( __neon_AdrDx3( 0xf40006cf, __uint64ToN64(pD), (D3)) )
2463 #define vst3_s64_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf40006cf | _NENC_4(_NEON_ALIGN64(align)), __int64ToN64(pD), (D3)) )
2464 #define vst3_u64_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf40006cf | _NENC_4(_NEON_ALIGN64(align)), __uint64ToN64(pD), (D3)) )
2465 #define vst3_f32_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf400048f | _NENC_4(_NEON_ALIGN64(align)), __float32ToN64(pD), (D3)) )
2466 #define vst3_p16_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf400044f | _NENC_4(_NEON_ALIGN64(align)), __poly16ToN64(pD), (D3)) )
2467 #define vst3_p8_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf400040f | _NENC_4(_NEON_ALIGN64(align)), __poly8ToN64(pD), (D3)) )
2468 #define vst3_s16_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf400044f | _NENC_4(_NEON_ALIGN64(align)), __int16ToN64(pD), (D3)) )
2469 #define vst3_s32_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf400048f | _NENC_4(_NEON_ALIGN64(align)), __int32ToN64(pD), (D3)) )
2470 #define vst3_s8_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf400040f | _NENC_4(_NEON_ALIGN64(align)), __int8ToN64(pD), (D3)) )
2471 #define vst3_u16_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf400044f | _NENC_4(_NEON_ALIGN64(align)), __uint16ToN64(pD), (D3)) )
2472 #define vst3_u32_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf400048f | _NENC_4(_NEON_ALIGN64(align)), __uint32ToN64(pD), (D3)) )
2473 #define vst3_u8_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf400040f | _NENC_4(_NEON_ALIGN64(align)), __uint8ToN64(pD), (D3)) )
2474 #define vst3q_f32(pD, Q3) ( __neon_AdrQx3( 0xf400058f, __float32ToN64(pD), (Q3)) )
2475 #define vst3q_p16(pD, Q3) ( __neon_AdrQx3( 0xf400054f, __poly16ToN64(pD), (Q3)) )
2476 #define vst3q_p8(pD, Q3) ( __neon_AdrQx3( 0xf400050f, __poly8ToN64(pD), (Q3)) )
2477 #define vst3q_s16(pD, Q3) ( __neon_AdrQx3( 0xf400054f, __int16ToN64(pD), (Q3)) )
2478 #define vst3q_s32(pD, Q3) ( __neon_AdrQx3( 0xf400058f, __int32ToN64(pD), (Q3)) )
2479 #define vst3q_s8(pD, Q3) ( __neon_AdrQx3( 0xf400050f, __int8ToN64(pD), (Q3)) )
2480 #define vst3q_u16(pD, Q3) ( __neon_AdrQx3( 0xf400054f, __uint16ToN64(pD), (Q3)) )
2481 #define vst3q_u32(pD, Q3) ( __neon_AdrQx3( 0xf400058f, __uint32ToN64(pD), (Q3)) )
2482 #define vst3q_u8(pD, Q3) ( __neon_AdrQx3( 0xf400050f, __uint8ToN64(pD), (Q3)) )
2483 #define vst3q_f32_ex(pD, Q3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx3( 0xf400058f | _NENC_4(_NEON_ALIGN64(align)), __float32ToN64(pD), (Q3)) )
2484 #define vst3q_p16_ex(pD, Q3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx3( 0xf400054f | _NENC_4(_NEON_ALIGN64(align)), __poly16ToN64(pD), (Q3)) )
2485 #define vst3q_p8_ex(pD, Q3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx3( 0xf400050f | _NENC_4(_NEON_ALIGN64(align)), __poly8ToN64(pD), (Q3)) )
2486 #define vst3q_s16_ex(pD, Q3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx3( 0xf400054f | _NENC_4(_NEON_ALIGN64(align)), __int16ToN64(pD), (Q3)) )
2487 #define vst3q_s32_ex(pD, Q3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx3( 0xf400058f | _NENC_4(_NEON_ALIGN64(align)), __int32ToN64(pD), (Q3)) )
2488 #define vst3q_s8_ex(pD, Q3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx3( 0xf400050f | _NENC_4(_NEON_ALIGN64(align)), __int8ToN64(pD), (Q3)) )
2489 #define vst3q_u16_ex(pD, Q3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx3( 0xf400054f | _NENC_4(_NEON_ALIGN64(align)), __uint16ToN64(pD), (Q3)) )
2490 #define vst3q_u32_ex(pD, Q3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx3( 0xf400058f | _NENC_4(_NEON_ALIGN64(align)), __uint32ToN64(pD), (Q3)) )
2491 #define vst3q_u8_ex(pD, Q3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx3( 0xf400050f | _NENC_4(_NEON_ALIGN64(align)), __uint8ToN64(pD), (Q3)) )
2492 
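// Usage sketch for the VST3 stores above: three de-interleaved colour planes are
// written back out as packed R,G,B triples by vst3_u8. A minimal sketch, assuming
// vdup_n_u8 is declared earlier in this header; names are illustrative only.
__inline void neon_example_store_rgb(uint8_t *pDst)
{
    __n64x3 planes;
    planes.val[0] = vdup_n_u8(0x10);   // eight red samples
    planes.val[1] = vdup_n_u8(0x20);   // eight green samples
    planes.val[2] = vdup_n_u8(0x30);   // eight blue samples
    vst3_u8(pDst, planes);             // stores 24 bytes: R,G,B,R,G,B,...
}
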
2493 // VST3 (single 3-element structure from one lane)
2494 #define vst3_lane_f32(pD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrDx3x( 0xf4800a0f | _NENC_7(lane), __float32ToN64(pD), (D3)) )
2495 #define vst3_lane_p16(pD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrDx3x( 0xf480060f | _NENC_7_6(lane), __poly16ToN64(pD), (D3)) )
2496 #define vst3_lane_p8(pD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrDx3x( 0xf480020f | _NENC_7_5(lane), __poly8ToN64(pD), (D3)) )
2497 #define vst3_lane_s16(pD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrDx3x( 0xf480060f | _NENC_7_6(lane), __int16ToN64(pD), (D3)) )
2498 #define vst3_lane_s32(pD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrDx3x( 0xf4800a0f | _NENC_7(lane), __int32ToN64(pD), (D3)) )
2499 #define vst3_lane_s8(pD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrDx3x( 0xf480020f | _NENC_7_5(lane), __int8ToN64(pD), (D3)) )
2500 #define vst3_lane_u16(pD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrDx3x( 0xf480060f | _NENC_7_6(lane), __uint16ToN64(pD), (D3)) )
2501 #define vst3_lane_u32(pD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrDx3x( 0xf4800a0f | _NENC_7(lane), __uint32ToN64(pD), (D3)) )
2502 #define vst3_lane_u8(pD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrDx3x( 0xf480020f | _NENC_7_5(lane), __uint8ToN64(pD), (D3)) )
2503 #define vst3q_lane_f32(pD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQx3x( 0xf4800a4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __float32ToN64(pD), (Q3)) )
2504 #define vst3q_lane_p16(pD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQx3x( 0xf480062f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __poly16ToN64(pD), (Q3)) )
2505 #define vst3q_lane_s16(pD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQx3x( 0xf480062f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __int16ToN64(pD), (Q3)) )
2506 #define vst3q_lane_s32(pD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQx3x( 0xf4800a4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __int32ToN64(pD), (Q3)) )
2507 #define vst3q_lane_u16(pD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQx3x( 0xf480062f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __uint16ToN64(pD), (Q3)) )
2508 #define vst3q_lane_u32(pD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQx3x( 0xf4800a4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __uint32ToN64(pD), (Q3)) )
2509 
2510 // VST4 (multiple 4-element structures)
2511 #define vst4_f32(pD, D4) ( __neon_AdrDx4( 0xf400008f, __float32ToN64(pD), (D4)) )
2512 #define vst4_p16(pD, D4) ( __neon_AdrDx4( 0xf400004f, __poly16ToN64(pD), (D4)) )
2513 #define vst4_p8(pD, D4) ( __neon_AdrDx4( 0xf400000f, __poly8ToN64(pD), (D4)) )
2514 #define vst4_s16(pD, D4) ( __neon_AdrDx4( 0xf400004f, __int16ToN64(pD), (D4)) )
2515 #define vst4_s32(pD, D4) ( __neon_AdrDx4( 0xf400008f, __int32ToN64(pD), (D4)) )
2516 #define vst4_s8(pD, D4) ( __neon_AdrDx4( 0xf400000f, __int8ToN64(pD), (D4)) )
2517 #define vst4_u16(pD, D4) ( __neon_AdrDx4( 0xf400004f, __uint16ToN64(pD), (D4)) )
2518 #define vst4_u32(pD, D4) ( __neon_AdrDx4( 0xf400008f, __uint32ToN64(pD), (D4)) )
2519 #define vst4_u8(pD, D4) ( __neon_AdrDx4( 0xf400000f, __uint8ToN64(pD), (D4)) )
2520 #define vst4_s64(pD, D4) ( __neon_AdrDx4( 0xf40002cf, __int64ToN64(pD), (D4)) )
2521 #define vst4_u64(pD, D4) ( __neon_AdrDx4( 0xf40002cf, __uint64ToN64(pD), (D4)) )
2522 #define vst4_s64_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf40002cf | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int64ToN64(pD), (D4)) )
2523 #define vst4_u64_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf40002cf | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint64ToN64(pD), (D4)) )
2524 #define vst4_f32_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf400008f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __float32ToN64(pD), (D4)) )
2525 #define vst4_p16_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf400004f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __poly16ToN64(pD), (D4)) )
2526 #define vst4_p8_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf400000f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __poly8ToN64(pD), (D4)) )
2527 #define vst4_s16_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf400004f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int16ToN64(pD), (D4)) )
2528 #define vst4_s32_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf400008f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int32ToN64(pD), (D4)) )
2529 #define vst4_s8_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf400000f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int8ToN64(pD), (D4)) )
2530 #define vst4_u16_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf400004f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint16ToN64(pD), (D4)) )
2531 #define vst4_u32_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf400008f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint32ToN64(pD), (D4)) )
2532 #define vst4_u8_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf400000f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint8ToN64(pD), (D4)) )
2533 #define vst4q_f32(pD, Q4) ( __neon_AdrQx4( 0xf400018f, __float32ToN64(pD), (Q4)) )
2534 #define vst4q_p16(pD, Q4) ( __neon_AdrQx4( 0xf400014f, __poly16ToN64(pD), (Q4)) )
2535 #define vst4q_p8(pD, Q4) ( __neon_AdrQx4( 0xf400010f, __poly8ToN64(pD), (Q4)) )
2536 #define vst4q_s16(pD, Q4) ( __neon_AdrQx4( 0xf400014f, __int16ToN64(pD), (Q4)) )
2537 #define vst4q_s32(pD, Q4) ( __neon_AdrQx4( 0xf400018f, __int32ToN64(pD), (Q4)) )
2538 #define vst4q_s8(pD, Q4) ( __neon_AdrQx4( 0xf400010f, __int8ToN64(pD), (Q4)) )
2539 #define vst4q_u16(pD, Q4) ( __neon_AdrQx4( 0xf400014f, __uint16ToN64(pD), (Q4)) )
2540 #define vst4q_u32(pD, Q4) ( __neon_AdrQx4( 0xf400018f, __uint32ToN64(pD), (Q4)) )
2541 #define vst4q_u8(pD, Q4) ( __neon_AdrQx4( 0xf400010f, __uint8ToN64(pD), (Q4)) )
2542 #define vst4q_f32_ex(pD, Q4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrQx4( 0xf400018f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __float32ToN64(pD), (Q4)) )
2543 #define vst4q_p16_ex(pD, Q4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrQx4( 0xf400014f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __poly16ToN64(pD), (Q4)) )
2544 #define vst4q_p8_ex(pD, Q4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrQx4( 0xf400010f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __poly8ToN64(pD), (Q4)) )
2545 #define vst4q_s16_ex(pD, Q4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrQx4( 0xf400014f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int16ToN64(pD), (Q4)) )
2546 #define vst4q_s32_ex(pD, Q4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrQx4( 0xf400018f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int32ToN64(pD), (Q4)) )
2547 #define vst4q_s8_ex(pD, Q4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrQx4( 0xf400010f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int8ToN64(pD), (Q4)) )
2548 #define vst4q_u16_ex(pD, Q4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrQx4( 0xf400014f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint16ToN64(pD), (Q4)) )
2549 #define vst4q_u32_ex(pD, Q4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrQx4( 0xf400018f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint32ToN64(pD), (Q4)) )
2550 #define vst4q_u8_ex(pD, Q4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrQx4( 0xf400010f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint8ToN64(pD), (Q4)) )
2551 
2552 // VST4 (single 4-element structure from one lane)
2553 #define vst4_lane_f32(pD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrDx4x( 0xf4800b0f | _NENC_7(lane), __float32ToN64(pD), (D4)) )
2554 #define vst4_lane_p16(pD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrDx4x( 0xf480070f | _NENC_7_6(lane), __poly16ToN64(pD), (D4)) )
2555 #define vst4_lane_p8(pD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrDx4x( 0xf480030f | _NENC_7_5(lane), __poly8ToN64(pD), (D4)) )
2556 #define vst4_lane_s16(pD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrDx4x( 0xf480070f | _NENC_7_6(lane), __int16ToN64(pD), (D4)) )
2557 #define vst4_lane_s32(pD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrDx4x( 0xf4800b0f | _NENC_7(lane), __int32ToN64(pD), (D4)) )
2558 #define vst4_lane_s8(pD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrDx4x( 0xf480030f | _NENC_7_5(lane), __int8ToN64(pD), (D4)) )
2559 #define vst4_lane_u16(pD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrDx4x( 0xf480070f | _NENC_7_6(lane), __uint16ToN64(pD), (D4)) )
2560 #define vst4_lane_u32(pD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrDx4x( 0xf4800b0f | _NENC_7(lane), __uint32ToN64(pD), (D4)) )
2561 #define vst4_lane_u8(pD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrDx4x( 0xf480030f | _NENC_7_5(lane), __uint8ToN64(pD), (D4)) )
2562 #define vst4q_lane_f32(pD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQx4x( 0xf4800b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __float32ToN64(pD), (Q4)) )
2563 #define vst4q_lane_p16(pD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQx4x( 0xf480072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __poly16ToN64(pD), (Q4)) )
2564 #define vst4q_lane_s16(pD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQx4x( 0xf480072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __int16ToN64(pD), (Q4)) )
2565 #define vst4q_lane_s32(pD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQx4x( 0xf4800b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __int32ToN64(pD), (Q4)) )
2566 #define vst4q_lane_u16(pD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQx4x( 0xf480072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __uint16ToN64(pD), (Q4)) )
2567 #define vst4q_lane_u32(pD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQx4x( 0xf4800b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __uint32ToN64(pD), (Q4)) )
2568 
2569 // VST4 (single 4-element structure from one lane, aligned)
2570 #define vst4_lane_f32_ex(pD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx4x( 0xf4800b0f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN64_128(align)), __float32ToN64(pD), (D4)) )
2571 #define vst4_lane_p16_ex(pD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx4x( 0xf480070f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN64(align)), __poly16ToN64(pD), (D4)) )
2572 #define vst4_lane_p8_ex(pD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrDx4x( 0xf480030f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN32(align)), __poly8ToN64(pD), (D4)) )
2573 #define vst4_lane_s16_ex(pD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx4x( 0xf480070f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN64(align)), __int16ToN64(pD), (D4)) )
2574 #define vst4_lane_s32_ex(pD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx4x( 0xf4800b0f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN64_128(align)), __int32ToN64(pD), (D4)) )
2575 #define vst4_lane_s8_ex(pD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrDx4x( 0xf480030f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN32(align)), __int8ToN64(pD), (D4)) )
2576 #define vst4_lane_u16_ex(pD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx4x( 0xf480070f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN64(align)), __uint16ToN64(pD), (D4)) )
2577 #define vst4_lane_u32_ex(pD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx4x( 0xf4800b0f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint32ToN64(pD), (D4)) )
2578 #define vst4_lane_u8_ex(pD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrDx4x( 0xf480030f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN32(align)), __uint8ToN64(pD), (D4)) )
2579 #define vst4q_lane_f32_ex(pD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx4x( 0xf4800b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN64_128(align)), __float32ToN64(pD), (Q4)) )
2580 #define vst4q_lane_p16_ex(pD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx4x( 0xf480072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), __poly16ToN64(pD), (Q4)) )
2581 #define vst4q_lane_s16_ex(pD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx4x( 0xf480072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), __int16ToN64(pD), (Q4)) )
2582 #define vst4q_lane_s32_ex(pD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx4x( 0xf4800b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN64_128(align)), __int32ToN64(pD), (Q4)) )
2583 #define vst4q_lane_u16_ex(pD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx4x( 0xf480072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), __uint16ToN64(pD), (Q4)) )
2584 #define vst4q_lane_u32_ex(pD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx4x( 0xf4800b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint32ToN64(pD), (Q4)) )
2585 
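// Usage sketch for the lane stores above: vst4_lane_u8 writes a single interleaved
// R,G,B,A pixel taken from lane 3 of four per-channel D registers. A minimal
// sketch, assuming vdup_n_u8 is declared earlier in this header; names are
// illustrative only.
__inline void neon_example_store_rgba_pixel(uint8_t *pDst)
{
    __n64x4 channels;
    channels.val[0] = vdup_n_u8(0xff);   // R plane
    channels.val[1] = vdup_n_u8(0x80);   // G plane
    channels.val[2] = vdup_n_u8(0x40);   // B plane
    channels.val[3] = vdup_n_u8(0xff);   // A plane
    vst4_lane_u8(pDst, channels, 3);     // stores 4 bytes: R,G,B,A from lane 3
}
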
2586 // VSUB
2587 #define vsub_f32(Dn, Dm) ( __neon_DdDnDm( 0xf2200d00, (Dn), (Dm)) )
2588 #define vsub_s16(Dn, Dm) ( __neon_DdDnDm( 0xf3100800, (Dn), (Dm)) )
2589 #define vsub_s32(Dn, Dm) ( __neon_DdDnDm( 0xf3200800, (Dn), (Dm)) )
2590 #define vsub_s64(Dn, Dm) ( __neon_DdDnDm( 0xf3300800, (Dn), (Dm)) )
2591 #define vsub_s8(Dn, Dm) ( __neon_DdDnDm( 0xf3000800, (Dn), (Dm)) )
2592 #define vsub_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100800, (Dn), (Dm)) )
2593 #define vsub_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200800, (Dn), (Dm)) )
2594 #define vsub_u64(Dn, Dm) ( __neon_DdDnDm( 0xf3300800, (Dn), (Dm)) )
2595 #define vsub_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000800, (Dn), (Dm)) )
2596 #define vsubq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2200d40, (Qn), (Qm)) )
2597 #define vsubq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf3100840, (Qn), (Qm)) )
2598 #define vsubq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf3200840, (Qn), (Qm)) )
2599 #define vsubq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf3300840, (Qn), (Qm)) )
2600 #define vsubq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf3000840, (Qn), (Qm)) )
2601 #define vsubq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100840, (Qn), (Qm)) )
2602 #define vsubq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200840, (Qn), (Qm)) )
2603 #define vsubq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf3300840, (Qn), (Qm)) )
2604 #define vsubq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000840, (Qn), (Qm)) )
2605 
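// Usage sketch: vsubq_f32 subtracts lane by lane across a full Q register. A
// minimal sketch, assuming vdupq_n_f32 is declared earlier in this header; names
// are illustrative only.
__inline __n128 neon_example_vector_difference(void)
{
    __n128 a = vdupq_n_f32(3.0f);
    __n128 b = vdupq_n_f32(1.5f);
    return vsubq_f32(a, b);              // every lane holds 1.5f
}
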
2606 // VSUBHN, VRSUBHN
2607 #define vrsubhn_s16(Qn, Qm) ( __neon_DdQnQm( 0xf3800600, (Qn), (Qm)) )
2608 #define vrsubhn_s32(Qn, Qm) ( __neon_DdQnQm( 0xf3900600, (Qn), (Qm)) )
2609 #define vrsubhn_s64(Qn, Qm) ( __neon_DdQnQm( 0xf3a00600, (Qn), (Qm)) )
2610 #define vrsubhn_u16(Qn, Qm) ( __neon_DdQnQm( 0xf3800600, (Qn), (Qm)) )
2611 #define vrsubhn_u32(Qn, Qm) ( __neon_DdQnQm( 0xf3900600, (Qn), (Qm)) )
2612 #define vrsubhn_u64(Qn, Qm) ( __neon_DdQnQm( 0xf3a00600, (Qn), (Qm)) )
2613 #define vsubhn_s16(Qn, Qm) ( __neon_DdQnQm( 0xf2800600, (Qn), (Qm)) )
2614 #define vsubhn_s32(Qn, Qm) ( __neon_DdQnQm( 0xf2900600, (Qn), (Qm)) )
2615 #define vsubhn_s64(Qn, Qm) ( __neon_DdQnQm( 0xf2a00600, (Qn), (Qm)) )
2616 #define vsubhn_u16(Qn, Qm) ( __neon_DdQnQm( 0xf2800600, (Qn), (Qm)) )
2617 #define vsubhn_u32(Qn, Qm) ( __neon_DdQnQm( 0xf2900600, (Qn), (Qm)) )
2618 #define vsubhn_u64(Qn, Qm) ( __neon_DdQnQm( 0xf2a00600, (Qn), (Qm)) )
2619 
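// Usage sketch: vsubhn_s16 subtracts two Q registers of 16-bit lanes and keeps only
// the high byte of each difference, narrowing the result back to a D register. A
// minimal sketch, assuming vdupq_n_s16 is declared earlier in this header; names
// are illustrative only.
__inline __n64 neon_example_narrowing_difference(void)
{
    __n128 a = vdupq_n_s16(0x1200);
    __n128 b = vdupq_n_s16(0x0100);
    return vsubhn_s16(a, b);             // every s8 lane holds (0x1200 - 0x0100) >> 8 = 0x11
}
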
2620 // VSUBL, VSUBW
2621 #define vsubl_s16(Dn, Dm) ( __neon_QdDnDm( 0xf2900200, (Dn), (Dm)) )
2622 #define vsubl_s32(Dn, Dm) ( __neon_QdDnDm( 0xf2a00200, (Dn), (Dm)) )
2623 #define vsubl_s8(Dn, Dm) ( __neon_QdDnDm( 0xf2800200, (Dn), (Dm)) )
2624 #define vsubl_u16(Dn, Dm) ( __neon_QdDnDm( 0xf3900200, (Dn), (Dm)) )
2625 #define vsubl_u32(Dn, Dm) ( __neon_QdDnDm( 0xf3a00200, (Dn), (Dm)) )
2626 #define vsubl_u8(Dn, Dm) ( __neon_QdDnDm( 0xf3800200, (Dn), (Dm)) )
2627 #define vsubw_s16(Qn, Dm) ( __neon_QdQnDm( 0xf2900300, (Qn), (Dm)) )
2628 #define vsubw_s32(Qn, Dm) ( __neon_QdQnDm( 0xf2a00300, (Qn), (Dm)) )
2629 #define vsubw_s8(Qn, Dm) ( __neon_QdQnDm( 0xf2800300, (Qn), (Dm)) )
2630 #define vsubw_u16(Qn, Dm) ( __neon_QdQnDm( 0xf3900300, (Qn), (Dm)) )
2631 #define vsubw_u32(Qn, Dm) ( __neon_QdQnDm( 0xf3a00300, (Qn), (Dm)) )
2632 #define vsubw_u8(Qn, Dm) ( __neon_QdQnDm( 0xf3800300, (Qn), (Dm)) )
2633 
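// Usage sketch: vsubl_u8 widens both operands to 16 bits before subtracting, so the
// result has headroom for further accumulation without wrapping at 8 bits. A
// minimal sketch, assuming vdup_n_u8 is declared earlier in this header; names are
// illustrative only.
__inline __n128 neon_example_widened_difference(void)
{
    __n64 a = vdup_n_u8(200);
    __n64 b = vdup_n_u8(10);
    return vsubl_u8(a, b);               // every u16 lane holds 200 - 10 = 190
}
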
2634 // VTBL, VTBX
2635 #define vtbl2_p8(D2, Dm) ( __neon_DdDx2Dm( 0xf3b00900, (D2), (Dm)) )
2636 #define vtbl2_s8(D2, Dm) ( __neon_DdDx2Dm( 0xf3b00900, (D2), (Dm)) )
2637 #define vtbl2_u8(D2, Dm) ( __neon_DdDx2Dm( 0xf3b00900, (D2), (Dm)) )
2638 #define vtbx2_p8(Dd, D2, Dm) ( __neon_DdDx2Dm_acc( 0xf3b00940, (Dd), (D2), (Dm)) )
2639 #define vtbx2_s8(Dd, D2, Dm) ( __neon_DdDx2Dm_acc( 0xf3b00940, (Dd), (D2), (Dm)) )
2640 #define vtbx2_u8(Dd, D2, Dm) ( __neon_DdDx2Dm_acc( 0xf3b00940, (Dd), (D2), (Dm)) )
2641 #define vtbl3_p8(D3, Dm) ( __neon_DdDx3Dm( 0xf3b00a00, (D3), (Dm)) )
2642 #define vtbl3_s8(D3, Dm) ( __neon_DdDx3Dm( 0xf3b00a00, (D3), (Dm)) )
2643 #define vtbl3_u8(D3, Dm) ( __neon_DdDx3Dm( 0xf3b00a00, (D3), (Dm)) )
2644 #define vtbx3_p8(Dd, D3, Dm) ( __neon_DdDx3Dm_acc( 0xf3b00a40, (Dd), (D3), (Dm)) )
2645 #define vtbx3_s8(Dd, D3, Dm) ( __neon_DdDx3Dm_acc( 0xf3b00a40, (Dd), (D3), (Dm)) )
2646 #define vtbx3_u8(Dd, D3, Dm) ( __neon_DdDx3Dm_acc( 0xf3b00a40, (Dd), (D3), (Dm)) )
2647 #define vtbl4_p8(D4, Dm) ( __neon_DdDx4Dm( 0xf3b00b00, (D4), (Dm)) )
2648 #define vtbl4_s8(D4, Dm) ( __neon_DdDx4Dm( 0xf3b00b00, (D4), (Dm)) )
2649 #define vtbl4_u8(D4, Dm) ( __neon_DdDx4Dm( 0xf3b00b00, (D4), (Dm)) )
2650 #define vtbx4_p8(Dd, D4, Dm) ( __neon_DdDx4Dm_acc( 0xf3b00b40, (Dd), (D4), (Dm)) )
2651 #define vtbx4_s8(Dd, D4, Dm) ( __neon_DdDx4Dm_acc( 0xf3b00b40, (Dd), (D4), (Dm)) )
2652 #define vtbx4_u8(Dd, D4, Dm) ( __neon_DdDx4Dm_acc( 0xf3b00b40, (Dd), (D4), (Dm)) )
2653 #define vtbl1_p8(Dn, Dm) ( __neon_DdDnDm( 0xf3b00800, (Dn), (Dm)) )
2654 #define vtbl1_s8(Dn, Dm) ( __neon_DdDnDm( 0xf3b00800, (Dn), (Dm)) )
2655 #define vtbl1_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3b00800, (Dn), (Dm)) )
2656 #define vtbx1_p8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3b00840, (Dd), (Dn), (Dm)) )
2657 #define vtbx1_s8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3b00840, (Dd), (Dn), (Dm)) )
2658 #define vtbx1_u8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3b00840, (Dd), (Dn), (Dm)) )
2659 
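// Usage sketch: vtbl1_u8 is a byte-wise table lookup, so an index vector of 7..0
// reverses the bytes of the table register (out-of-range indices would produce 0).
// The union members used below are the __n64 fields defined at the top of this
// header; names are illustrative only.
__inline __n64 neon_example_reverse_bytes(__n64 table)
{
    __n64 idx;
    int i;
    for (i = 0; i < 8; ++i)
    {
        idx.n64_u8[i] = (unsigned __int8)(7 - i);   // indices 7,6,...,0
    }
    return vtbl1_u8(table, idx);
}
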
2660 // VTRN
2661 #define vtrn_f32(Dd, Dm) ( __neon_DdDm_acc2( 0xf3ba0080, (Dd), (Dm)) )
2662 #define vtrn_p16(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b60080, (Dd), (Dm)) )
2663 #define vtrn_p8(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b20080, (Dd), (Dm)) )
2664 #define vtrn_s16(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b60080, (Dd), (Dm)) )
2665 #define vtrn_s32(Dd, Dm) ( __neon_DdDm_acc2( 0xf3ba0080, (Dd), (Dm)) )
2666 #define vtrn_s8(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b20080, (Dd), (Dm)) )
2667 #define vtrn_u16(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b60080, (Dd), (Dm)) )
2668 #define vtrn_u32(Dd, Dm) ( __neon_DdDm_acc2( 0xf3ba0080, (Dd), (Dm)) )
2669 #define vtrn_u8(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b20080, (Dd), (Dm)) )
2670 #define vtrnq_f32(Qd, Qm) ( __neon_QdQm_acc2( 0xf3ba00c0, (Qd), (Qm)) )
2671 #define vtrnq_p16(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b600c0, (Qd), (Qm)) )
2672 #define vtrnq_p8(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b200c0, (Qd), (Qm)) )
2673 #define vtrnq_s16(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b600c0, (Qd), (Qm)) )
2674 #define vtrnq_s32(Qd, Qm) ( __neon_QdQm_acc2( 0xf3ba00c0, (Qd), (Qm)) )
2675 #define vtrnq_s8(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b200c0, (Qd), (Qm)) )
2676 #define vtrnq_u16(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b600c0, (Qd), (Qm)) )
2677 #define vtrnq_u32(Qd, Qm) ( __neon_QdQm_acc2( 0xf3ba00c0, (Qd), (Qm)) )
2678 #define vtrnq_u8(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b200c0, (Qd), (Qm)) )
2679 
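// Usage sketch: vtrn_s16 interleaves the corresponding even and odd lanes of its
// two operands and returns both transposed registers. A minimal sketch, assuming
// the underlying helper returns the pair as an __n64x2; names are illustrative only.
__inline __n64 neon_example_transpose_even(__n64 a, __n64 b)
{
    __n64x2 t = vtrn_s16(a, b);
    return t.val[0];                     // lanes a0,b0,a2,b2 (val[1] holds a1,b1,a3,b3)
}
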
2680 // VTRNQ64
2681 #define vtrnq_s64(Qd, Qm) ( __neon_QdQm_acc3( 0x00000000, (Qd), (Qm)) )
2682 #define vtrnq_u64(Qd, Qm) ( __neon_QdQm_acc3( 0x00000000, (Qd), (Qm)) )
2683 
2684 // VTST
2685 #define vtst_p8(Dn, Dm) ( __neon_DdDnDm( 0xf2000810, (Dn), (Dm)) )
2686 #define vtst_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100810, (Dn), (Dm)) )
2687 #define vtst_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200810, (Dn), (Dm)) )
2688 #define vtst_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000810, (Dn), (Dm)) )
2689 #define vtst_u16(Dn, Dm) ( __neon_DdDnDm( 0xf2100810, (Dn), (Dm)) )
2690 #define vtst_u32(Dn, Dm) ( __neon_DdDnDm( 0xf2200810, (Dn), (Dm)) )
2691 #define vtst_u8(Dn, Dm) ( __neon_DdDnDm( 0xf2000810, (Dn), (Dm)) )
2692 #define vtstq_p8(Qn, Qm) ( __neon_QdQnQm( 0xf2000850, (Qn), (Qm)) )
2693 #define vtstq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100850, (Qn), (Qm)) )
2694 #define vtstq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200850, (Qn), (Qm)) )
2695 #define vtstq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000850, (Qn), (Qm)) )
2696 #define vtstq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf2100850, (Qn), (Qm)) )
2697 #define vtstq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2200850, (Qn), (Qm)) )
2698 #define vtstq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf2000850, (Qn), (Qm)) )
2699 
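// Usage sketch: vtst_u8 sets a lane to all ones when the AND of the two operands is
// non-zero in that lane, which makes it a per-lane bit-test mask. A minimal sketch,
// assuming vdup_n_u8 is declared earlier in this header; names are illustrative only.
__inline __n64 neon_example_bit_test_mask(__n64 flags)
{
    __n64 bit0 = vdup_n_u8(0x01);
    return vtst_u8(flags, bit0);         // 0xff where bit 0 of the lane is set, 0x00 elsewhere
}
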
2700 // VUZP
2701 #define vuzp_p16(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b60100, (Dd), (Dm)) )
2702 #define vuzp_p8(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b20100, (Dd), (Dm)) )
2703 #define vuzp_s16(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b60100, (Dd), (Dm)) )
2704 #define vuzp_s8(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b20100, (Dd), (Dm)) )
2705 #define vuzp_u16(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b60100, (Dd), (Dm)) )
2706 #define vuzp_u8(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b20100, (Dd), (Dm)) )
2707 #define vuzp_f32(Dd, Dm) ( __neon_DdDm_acc2( 0xf3ba0080, (Dd), (Dm)) )
2708 #define vuzp_s32(Dd, Dm) ( __neon_DdDm_acc2( 0xf3ba0080, (Dd), (Dm)) )
2709 #define vuzp_u32(Dd, Dm) ( __neon_DdDm_acc2( 0xf3ba0080, (Dd), (Dm)) )
2710 #define vuzpq_f32(Qd, Qm) ( __neon_QdQm_acc2( 0xf3ba0140, (Qd), (Qm)) )
2711 #define vuzpq_p16(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b60140, (Qd), (Qm)) )
2712 #define vuzpq_p8(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b20140, (Qd), (Qm)) )
2713 #define vuzpq_s16(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b60140, (Qd), (Qm)) )
2714 #define vuzpq_s32(Qd, Qm) ( __neon_QdQm_acc2( 0xf3ba0140, (Qd), (Qm)) )
2715 #define vuzpq_s8(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b20140, (Qd), (Qm)) )
2716 #define vuzpq_u16(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b60140, (Qd), (Qm)) )
2717 #define vuzpq_u32(Qd, Qm) ( __neon_QdQm_acc2( 0xf3ba0140, (Qd), (Qm)) )
2718 #define vuzpq_u8(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b20140, (Qd), (Qm)) )
2719 
2720 // VZIP
2721 #define vzip_p16(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b60180, (Dd), (Dm)) )
2722 #define vzip_p8(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b20180, (Dd), (Dm)) )
2723 #define vzip_s16(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b60180, (Dd), (Dm)) )
2724 #define vzip_s8(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b20180, (Dd), (Dm)) )
2725 #define vzip_u16(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b60180, (Dd), (Dm)) )
2726 #define vzip_u8(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b20180, (Dd), (Dm)) )
2727 #define vzip_f32(Dd, Dm) ( __neon_DdDm_acc2( 0xf3ba0080, (Dd), (Dm)) )
2728 #define vzip_s32(Dd, Dm) ( __neon_DdDm_acc2( 0xf3ba0080, (Dd), (Dm)) )
2729 #define vzip_u32(Dd, Dm) ( __neon_DdDm_acc2( 0xf3ba0080, (Dd), (Dm)) )
2730 #define vzipq_f32(Qd, Qm) ( __neon_QdQm_acc2( 0xf3ba01c0, (Qd), (Qm)) )
2731 #define vzipq_p16(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b601c0, (Qd), (Qm)) )
2732 #define vzipq_p8(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b201c0, (Qd), (Qm)) )
2733 #define vzipq_s16(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b601c0, (Qd), (Qm)) )
2734 #define vzipq_s32(Qd, Qm) ( __neon_QdQm_acc2( 0xf3ba01c0, (Qd), (Qm)) )
2735 #define vzipq_s8(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b201c0, (Qd), (Qm)) )
2736 #define vzipq_u16(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b601c0, (Qd), (Qm)) )
2737 #define vzipq_u32(Qd, Qm) ( __neon_QdQm_acc2( 0xf3ba01c0, (Qd), (Qm)) )
2738 #define vzipq_u8(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b201c0, (Qd), (Qm)) )
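
// Usage sketch: vzip_u8 interleaves the lanes of two D registers, the in-register
// counterpart of the VST2 interleaving store. A minimal sketch, assuming the
// underlying helper returns the pair as an __n64x2; names are illustrative only.
__inline __n64 neon_example_interleave_low(__n64 even, __n64 odd)
{
    __n64x2 z = vzip_u8(even, odd);
    return z.val[0];                     // e0,o0,e1,o1,e2,o2,e3,o3
}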
2739 
2740 // } +++ auto-generated code ends (Neon macros)
2741 
2742 
2743 
2745 //
2746 // { +++ auto-generated code begins (vreinterpret macros)
2747 
2748 #define vreinterpret_f32_s8(a) (a)
2749 #define vreinterpret_f32_s16(a) (a)
2750 #define vreinterpret_f32_s32(a) (a)
2751 #define vreinterpret_f32_s64(a) (a)
2752 #define vreinterpret_f32_p8(a) (a)
2753 #define vreinterpret_f32_p16(a) (a)
2754 #define vreinterpret_f32_u8(a) (a)
2755 #define vreinterpret_f32_u16(a) (a)
2756 #define vreinterpret_f32_u32(a) (a)
2757 #define vreinterpret_f32_u64(a) (a)
2758 #define vreinterpret_s8_f32(a) (a)
2759 #define vreinterpret_s8_s16(a) (a)
2760 #define vreinterpret_s8_s32(a) (a)
2761 #define vreinterpret_s8_s64(a) (a)
2762 #define vreinterpret_s8_p8(a) (a)
2763 #define vreinterpret_s8_p16(a) (a)
2764 #define vreinterpret_s8_u8(a) (a)
2765 #define vreinterpret_s8_u16(a) (a)
2766 #define vreinterpret_s8_u32(a) (a)
2767 #define vreinterpret_s8_u64(a) (a)
2768 #define vreinterpret_s16_f32(a) (a)
2769 #define vreinterpret_s16_s8(a) (a)
2770 #define vreinterpret_s16_s32(a) (a)
2771 #define vreinterpret_s16_s64(a) (a)
2772 #define vreinterpret_s16_p8(a) (a)
2773 #define vreinterpret_s16_p16(a) (a)
2774 #define vreinterpret_s16_u8(a) (a)
2775 #define vreinterpret_s16_u16(a) (a)
2776 #define vreinterpret_s16_u32(a) (a)
2777 #define vreinterpret_s16_u64(a) (a)
2778 #define vreinterpret_s32_f32(a) (a)
2779 #define vreinterpret_s32_s8(a) (a)
2780 #define vreinterpret_s32_s16(a) (a)
2781 #define vreinterpret_s32_s64(a) (a)
2782 #define vreinterpret_s32_p8(a) (a)
2783 #define vreinterpret_s32_p16(a) (a)
2784 #define vreinterpret_s32_u8(a) (a)
2785 #define vreinterpret_s32_u16(a) (a)
2786 #define vreinterpret_s32_u32(a) (a)
2787 #define vreinterpret_s32_u64(a) (a)
2788 #define vreinterpret_s64_f32(a) (a)
2789 #define vreinterpret_s64_s8(a) (a)
2790 #define vreinterpret_s64_s16(a) (a)
2791 #define vreinterpret_s64_s32(a) (a)
2792 #define vreinterpret_s64_p8(a) (a)
2793 #define vreinterpret_s64_p16(a) (a)
2794 #define vreinterpret_s64_u8(a) (a)
2795 #define vreinterpret_s64_u16(a) (a)
2796 #define vreinterpret_s64_u32(a) (a)
2797 #define vreinterpret_s64_u64(a) (a)
2798 #define vreinterpret_p8_f32(a) (a)
2799 #define vreinterpret_p8_s8(a) (a)
2800 #define vreinterpret_p8_s16(a) (a)
2801 #define vreinterpret_p8_s32(a) (a)
2802 #define vreinterpret_p8_s64(a) (a)
2803 #define vreinterpret_p8_p16(a) (a)
2804 #define vreinterpret_p8_u8(a) (a)
2805 #define vreinterpret_p8_u16(a) (a)
2806 #define vreinterpret_p8_u32(a) (a)
2807 #define vreinterpret_p8_u64(a) (a)
2808 #define vreinterpret_p16_f32(a) (a)
2809 #define vreinterpret_p16_s8(a) (a)
2810 #define vreinterpret_p16_s16(a) (a)
2811 #define vreinterpret_p16_s32(a) (a)
2812 #define vreinterpret_p16_s64(a) (a)
2813 #define vreinterpret_p16_p8(a) (a)
2814 #define vreinterpret_p16_u8(a) (a)
2815 #define vreinterpret_p16_u16(a) (a)
2816 #define vreinterpret_p16_u32(a) (a)
2817 #define vreinterpret_p16_u64(a) (a)
2818 #define vreinterpret_u8_f32(a) (a)
2819 #define vreinterpret_u8_s8(a) (a)
2820 #define vreinterpret_u8_s16(a) (a)
2821 #define vreinterpret_u8_s32(a) (a)
2822 #define vreinterpret_u8_s64(a) (a)
2823 #define vreinterpret_u8_p8(a) (a)
2824 #define vreinterpret_u8_p16(a) (a)
2825 #define vreinterpret_u8_u16(a) (a)
2826 #define vreinterpret_u8_u32(a) (a)
2827 #define vreinterpret_u8_u64(a) (a)
2828 #define vreinterpret_u16_f32(a) (a)
2829 #define vreinterpret_u16_s8(a) (a)
2830 #define vreinterpret_u16_s16(a) (a)
2831 #define vreinterpret_u16_s32(a) (a)
2832 #define vreinterpret_u16_s64(a) (a)
2833 #define vreinterpret_u16_p8(a) (a)
2834 #define vreinterpret_u16_p16(a) (a)
2835 #define vreinterpret_u16_u8(a) (a)
2836 #define vreinterpret_u16_u32(a) (a)
2837 #define vreinterpret_u16_u64(a) (a)
2838 #define vreinterpret_u32_f32(a) (a)
2839 #define vreinterpret_u32_s8(a) (a)
2840 #define vreinterpret_u32_s16(a) (a)
2841 #define vreinterpret_u32_s32(a) (a)
2842 #define vreinterpret_u32_s64(a) (a)
2843 #define vreinterpret_u32_p8(a) (a)
2844 #define vreinterpret_u32_p16(a) (a)
2845 #define vreinterpret_u32_u8(a) (a)
2846 #define vreinterpret_u32_u16(a) (a)
2847 #define vreinterpret_u32_u64(a) (a)
2848 #define vreinterpret_u64_f32(a) (a)
2849 #define vreinterpret_u64_s8(a) (a)
2850 #define vreinterpret_u64_s16(a) (a)
2851 #define vreinterpret_u64_s32(a) (a)
2852 #define vreinterpret_u64_s64(a) (a)
2853 #define vreinterpret_u64_p8(a) (a)
2854 #define vreinterpret_u64_p16(a) (a)
2855 #define vreinterpret_u64_u8(a) (a)
2856 #define vreinterpret_u64_u16(a) (a)
2857 #define vreinterpret_u64_u32(a) (a)
2858 #define vreinterpretq_f32_s8(a) (a)
2859 #define vreinterpretq_f32_s16(a) (a)
2860 #define vreinterpretq_f32_s32(a) (a)
2861 #define vreinterpretq_f32_s64(a) (a)
2862 #define vreinterpretq_f32_p8(a) (a)
2863 #define vreinterpretq_f32_p16(a) (a)
2864 #define vreinterpretq_f32_u8(a) (a)
2865 #define vreinterpretq_f32_u16(a) (a)
2866 #define vreinterpretq_f32_u32(a) (a)
2867 #define vreinterpretq_f32_u64(a) (a)
2868 #define vreinterpretq_s8_f32(a) (a)
2869 #define vreinterpretq_s8_s16(a) (a)
2870 #define vreinterpretq_s8_s32(a) (a)
2871 #define vreinterpretq_s8_s64(a) (a)
2872 #define vreinterpretq_s8_p8(a) (a)
2873 #define vreinterpretq_s8_p16(a) (a)
2874 #define vreinterpretq_s8_u8(a) (a)
2875 #define vreinterpretq_s8_u16(a) (a)
2876 #define vreinterpretq_s8_u32(a) (a)
2877 #define vreinterpretq_s8_u64(a) (a)
2878 #define vreinterpretq_s16_f32(a) (a)
2879 #define vreinterpretq_s16_s8(a) (a)
2880 #define vreinterpretq_s16_s32(a) (a)
2881 #define vreinterpretq_s16_s64(a) (a)
2882 #define vreinterpretq_s16_p8(a) (a)
2883 #define vreinterpretq_s16_p16(a) (a)
2884 #define vreinterpretq_s16_u8(a) (a)
2885 #define vreinterpretq_s16_u16(a) (a)
2886 #define vreinterpretq_s16_u32(a) (a)
2887 #define vreinterpretq_s16_u64(a) (a)
2888 #define vreinterpretq_s32_f32(a) (a)
2889 #define vreinterpretq_s32_s8(a) (a)
2890 #define vreinterpretq_s32_s16(a) (a)
2891 #define vreinterpretq_s32_s64(a) (a)
2892 #define vreinterpretq_s32_p8(a) (a)
2893 #define vreinterpretq_s32_p16(a) (a)
2894 #define vreinterpretq_s32_u8(a) (a)
2895 #define vreinterpretq_s32_u16(a) (a)
2896 #define vreinterpretq_s32_u32(a) (a)
2897 #define vreinterpretq_s32_u64(a) (a)
2898 #define vreinterpretq_s64_f32(a) (a)
2899 #define vreinterpretq_s64_s8(a) (a)
2900 #define vreinterpretq_s64_s16(a) (a)
2901 #define vreinterpretq_s64_s32(a) (a)
2902 #define vreinterpretq_s64_p8(a) (a)
2903 #define vreinterpretq_s64_p16(a) (a)
2904 #define vreinterpretq_s64_u8(a) (a)
2905 #define vreinterpretq_s64_u16(a) (a)
2906 #define vreinterpretq_s64_u32(a) (a)
2907 #define vreinterpretq_s64_u64(a) (a)
2908 #define vreinterpretq_p8_f32(a) (a)
2909 #define vreinterpretq_p8_s8(a) (a)
2910 #define vreinterpretq_p8_s16(a) (a)
2911 #define vreinterpretq_p8_s32(a) (a)
2912 #define vreinterpretq_p8_s64(a) (a)
2913 #define vreinterpretq_p8_p16(a) (a)
2914 #define vreinterpretq_p8_u8(a) (a)
2915 #define vreinterpretq_p8_u16(a) (a)
2916 #define vreinterpretq_p8_u32(a) (a)
2917 #define vreinterpretq_p8_u64(a) (a)
2918 #define vreinterpretq_p16_f32(a) (a)
2919 #define vreinterpretq_p16_s8(a) (a)
2920 #define vreinterpretq_p16_s16(a) (a)
2921 #define vreinterpretq_p16_s32(a) (a)
2922 #define vreinterpretq_p16_s64(a) (a)
2923 #define vreinterpretq_p16_p8(a) (a)
2924 #define vreinterpretq_p16_u8(a) (a)
2925 #define vreinterpretq_p16_u16(a) (a)
2926 #define vreinterpretq_p16_u32(a) (a)
2927 #define vreinterpretq_p16_u64(a) (a)
2928 #define vreinterpretq_u8_f32(a) (a)
2929 #define vreinterpretq_u8_s8(a) (a)
2930 #define vreinterpretq_u8_s16(a) (a)
2931 #define vreinterpretq_u8_s32(a) (a)
2932 #define vreinterpretq_u8_s64(a) (a)
2933 #define vreinterpretq_u8_p8(a) (a)
2934 #define vreinterpretq_u8_p16(a) (a)
2935 #define vreinterpretq_u8_u16(a) (a)
2936 #define vreinterpretq_u8_u32(a) (a)
2937 #define vreinterpretq_u8_u64(a) (a)
2938 #define vreinterpretq_u16_f32(a) (a)
2939 #define vreinterpretq_u16_s8(a) (a)
2940 #define vreinterpretq_u16_s16(a) (a)
2941 #define vreinterpretq_u16_s32(a) (a)
2942 #define vreinterpretq_u16_s64(a) (a)
2943 #define vreinterpretq_u16_p8(a) (a)
2944 #define vreinterpretq_u16_p16(a) (a)
2945 #define vreinterpretq_u16_u8(a) (a)
2946 #define vreinterpretq_u16_u32(a) (a)
2947 #define vreinterpretq_u16_u64(a) (a)
2948 #define vreinterpretq_u32_f32(a) (a)
2949 #define vreinterpretq_u32_s8(a) (a)
2950 #define vreinterpretq_u32_s16(a) (a)
2951 #define vreinterpretq_u32_s32(a) (a)
2952 #define vreinterpretq_u32_s64(a) (a)
2953 #define vreinterpretq_u32_p8(a) (a)
2954 #define vreinterpretq_u32_p16(a) (a)
2955 #define vreinterpretq_u32_u8(a) (a)
2956 #define vreinterpretq_u32_u16(a) (a)
2957 #define vreinterpretq_u32_u64(a) (a)
2958 #define vreinterpretq_u64_f32(a) (a)
2959 #define vreinterpretq_u64_s8(a) (a)
2960 #define vreinterpretq_u64_s16(a) (a)
2961 #define vreinterpretq_u64_s32(a) (a)
2962 #define vreinterpretq_u64_s64(a) (a)
2963 #define vreinterpretq_u64_p8(a) (a)
2964 #define vreinterpretq_u64_p16(a) (a)
2965 #define vreinterpretq_u64_u8(a) (a)
2966 #define vreinterpretq_u64_u16(a) (a)
2967 #define vreinterpretq_u64_u32(a) (a)
2968 
2969 // } +++ auto-generated code ends (vreinterpret macros)
2970 
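// Usage sketch: because every 64-bit vector type in this header is the single __n64
// union (and every 128-bit type is __n128), the vreinterpret macros above are
// identity casts kept for source compatibility with compilers that use distinct
// per-element vector types; names below are illustrative only.
__inline __n64 neon_example_reinterpret(__n64 bytes_u8)
{
    return vreinterpret_s16_u8(bytes_u8);   // same bits, now viewed as four s16 lanes
}
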
2971 // { +++ auto-generated code begins (Pseudo intrinsics)
2972 
2973 // Multiply by scalar
2974 #define vmul_n_s16(Vd, Rt) vmul_lane_s16((Vd), vmov_n_s16(Rt), 0)
2975 #define vmul_n_s32(Vd, Rt) vmul_lane_s32((Vd), vmov_n_s32(Rt), 0)
2976 #define vmul_n_u16(Vd, Rt) vmul_lane_u16((Vd), vmov_n_u16(Rt), 0)
2977 #define vmul_n_u32(Vd, Rt) vmul_lane_u32((Vd), vmov_n_u32(Rt), 0)
2978 #define vmulq_n_s16(Vd, Rt) vmulq_lane_s16((Vd), vmov_n_s16(Rt), 0)
2979 #define vmulq_n_s32(Vd, Rt) vmulq_lane_s32((Vd), vmov_n_s32(Rt), 0)
2980 #define vmulq_n_u16(Vd, Rt) vmulq_lane_u16((Vd), vmov_n_u16(Rt), 0)
2981 #define vmulq_n_u32(Vd, Rt) vmulq_lane_u32((Vd), vmov_n_u32(Rt), 0)
2982 #define vmull_n_s16(Vd, Rt) vmull_lane_s16((Vd), vmov_n_s16(Rt), 0)
2983 #define vmull_n_s32(Vd, Rt) vmull_lane_s32((Vd), vmov_n_s32(Rt), 0)
2984 #define vmull_n_u16(Vd, Rt) vmull_lane_u16((Vd), vmov_n_u16(Rt), 0)
2985 #define vmull_n_u32(Vd, Rt) vmull_lane_u32((Vd), vmov_n_u32(Rt), 0)
2986 #define vqdmulh_n_s16(Vd, Rt) vqdmulh_lane_s16((Vd), vmov_n_s16(Rt), 0)
2987 #define vqdmulh_n_s32(Vd, Rt) vqdmulh_lane_s32((Vd), vmov_n_s32(Rt), 0)
2988 #define vqdmulhq_n_s16(Vd, Rt) vqdmulhq_lane_s16((Vd), vmov_n_s16(Rt), 0)
2989 #define vqdmulhq_n_s32(Vd, Rt) vqdmulhq_lane_s32((Vd), vmov_n_s32(Rt), 0)
2990 #define vqdmull_n_s16(Vd, Rt) vqdmull_lane_s16((Vd), vmov_n_s16(Rt), 0)
2991 #define vqdmull_n_s32(Vd, Rt) vqdmull_lane_s32((Vd), vmov_n_s32(Rt), 0)
2992 #define vqrdmulh_n_s16(Vd, Rt) vqrdmulh_lane_s16((Vd), vmov_n_s16(Rt), 0)
2993 #define vqrdmulh_n_s32(Vd, Rt) vqrdmulh_lane_s32((Vd), vmov_n_s32(Rt), 0)
2994 #define vqrdmulhq_n_s16(Vd, Rt) vqrdmulhq_lane_s16((Vd), vmov_n_s16(Rt), 0)
2995 #define vqrdmulhq_n_s32(Vd, Rt) vqrdmulhq_lane_s32((Vd), vmov_n_s32(Rt), 0)
2996 
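// Usage sketch: the *_n_* forms broadcast the scalar with vmov_n_* and reuse the
// corresponding *_lane_* intrinsic, so scaling by a constant needs no explicit
// duplicate step in user code; names below are illustrative only.
__inline __n128 neon_example_scale_by_three(__n128 v_s16)
{
    return vmulq_n_s16(v_s16, 3);        // expands to vmulq_lane_s16(v_s16, vmov_n_s16(3), 0)
}
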
2997 // Multiply by scalar with accumulate
2998 #define vmla_n_s16(Vd, Vn, Rt) vmla_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
2999 #define vmla_n_s32(Vd, Vn, Rt) vmla_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
3000 #define vmla_n_u16(Vd, Vn, Rt) vmla_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
3001 #define vmla_n_u32(Vd, Vn, Rt) vmla_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
3002 #define vmlaq_n_s16(Vd, Vn, Rt) vmlaq_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
3003 #define vmlaq_n_s32(Vd, Vn, Rt) vmlaq_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
3004 #define vmlaq_n_u16(Vd, Vn, Rt) vmlaq_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
3005 #define vmlaq_n_u32(Vd, Vn, Rt) vmlaq_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
3006 #define vmlal_n_s16(Vd, Vn, Rt) vmlal_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
3007 #define vmlal_n_s32(Vd, Vn, Rt) vmlal_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
3008 #define vmlal_n_u16(Vd, Vn, Rt) vmlal_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
3009 #define vmlal_n_u32(Vd, Vn, Rt) vmlal_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
3010 #define vmls_n_s16(Vd, Vn, Rt) vmls_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
3011 #define vmls_n_s32(Vd, Vn, Rt) vmls_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
3012 #define vmls_n_u16(Vd, Vn, Rt) vmls_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
3013 #define vmls_n_u32(Vd, Vn, Rt) vmls_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
3014 #define vmlsq_n_s16(Vd, Vn, Rt) vmlsq_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
3015 #define vmlsq_n_s32(Vd, Vn, Rt) vmlsq_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
3016 #define vmlsq_n_u16(Vd, Vn, Rt) vmlsq_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
3017 #define vmlsq_n_u32(Vd, Vn, Rt) vmlsq_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
3018 #define vmlsl_n_s16(Vd, Vn, Rt) vmlsl_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
3019 #define vmlsl_n_s32(Vd, Vn, Rt) vmlsl_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
3020 #define vmlsl_n_u16(Vd, Vn, Rt) vmlsl_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
3021 #define vmlsl_n_u32(Vd, Vn, Rt) vmlsl_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
3022 #define vqdmlal_n_s16(Vd, Vn, Rt) vqdmlal_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
3023 #define vqdmlal_n_s32(Vd, Vn, Rt) vqdmlal_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
3024 #define vqdmlsl_n_s16(Vd, Vn, Rt) vqdmlsl_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
3025 #define vqdmlsl_n_s32(Vd, Vn, Rt) vqdmlsl_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
3026 
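// Usage sketch: the accumulating *_n_* forms follow the same broadcast-and-reuse
// pattern, folding the scalar multiply into a multiply-accumulate; names below are
// illustrative only.
__inline __n128 neon_example_axpy_s16(__n128 acc, __n128 v)
{
    return vmlaq_n_s16(acc, v, 3);       // acc + v * 3, lane by lane
}
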
3027 // VDUP.64 (scalar)
3028 #define vdup_lane_s64(Dn, lane) (__static_assert((lane) == 0, "invalid lane index"), (Dn))
3029 #define vdup_lane_u64(Dn, lane) (__static_assert((lane) == 0, "invalid lane index"), (Dn))
3030 
3031 // VDUP.W.64 (scalar)
3032 #define vdupq_lane_s64(Dn, lane) (__static_assert((lane) == 0, "invalid lane index"), vcombine_s64((Dn), (Dn)))
3033 #define vdupq_lane_u64(Dn, lane) (__static_assert((lane) == 0, "invalid lane index"), vcombine_u64((Dn), (Dn)))
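
// Usage sketch: for 64-bit lanes the duplicate is just a register copy (D form) or a
// combine of the same D register into both halves of a Q register (Q form), so the
// lane index must be 0; names below are illustrative only.
__inline __n128 neon_example_broadcast_s64(__n64 x)
{
    return vdupq_lane_s64(x, 0);         // low and high halves both hold x
}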
3034 
3035 // } +++ auto-generated code ends (Pseudo intrinsics)
3036 
3037 #else // defined(_ARM_USE_DEPRECATED_NEON_INTRINSICS)
3038 
3039 #if defined (__cplusplus)
3040 extern "C" {
3041 #endif /* defined (__cplusplus) */
3042 
3043 __n128 __aesd_p8(__n128 _Qm);
3044 __n128 __aesd_s8(__n128 _Qm);
3045 __n128 __aesd_u8(__n128 _Qm);
3046 __n128 __aese_p8(__n128 _Qm);
3047 __n128 __aese_s8(__n128 _Qm);
3048 __n128 __aese_u8(__n128 _Qm);
3049 __n128 __aesimc_p8(__n128 _Qm);
3050 __n128 __aesimc_s8(__n128 _Qm);
3051 __n128 __aesimc_u8(__n128 _Qm);
3052 __n128 __aesmc_p8(__n128 _Qm);
3053 __n128 __aesmc_s8(__n128 _Qm);
3054 __n128 __aesmc_u8(__n128 _Qm);
3055 __n128 __sha1h_f32(__n128 _Qm);
3056 __n128 __sha1h_s32(__n128 _Qm);
3057 __n128 __sha1h_u32(__n128 _Qm);
3058 __n128 __sha1su1_f32(__n128 _Qm);
3059 __n128 __sha1su1_s32(__n128 _Qm);
3060 __n128 __sha1su1_u32(__n128 _Qm);
3061 __n128 __sha256su0_f32(__n128 _Qm);
3062 __n128 __sha256su0_s32(__n128 _Qm);
3063 __n128 __sha256su0_u32(__n128 _Qm);
3064 __n128 __sha1c_f32(__n128 _Qn, __n128 _Qm);
3065 __n128 __sha1c_s32(__n128 _Qn, __n128 _Qm);
3066 __n128 __sha1c_u32(__n128 _Qn, __n128 _Qm);
3067 __n128 __sha1m_f32(__n128 _Qn, __n128 _Qm);
3068 __n128 __sha1m_s32(__n128 _Qn, __n128 _Qm);
3069 __n128 __sha1m_u32(__n128 _Qn, __n128 _Qm);
3070 __n128 __sha1p_f32(__n128 _Qn, __n128 _Qm);
3071 __n128 __sha1p_s32(__n128 _Qn, __n128 _Qm);
3072 __n128 __sha1p_u32(__n128 _Qn, __n128 _Qm);
3073 __n128 __sha1su0_f32(__n128 _Qn, __n128 _Qm);
3074 __n128 __sha1su0_s32(__n128 _Qn, __n128 _Qm);
3075 __n128 __sha1su0_u32(__n128 _Qn, __n128 _Qm);
3076 __n128 __sha256h_f32(__n128 _Qn, __n128 _Qm);
3077 __n128 __sha256h_s32(__n128 _Qn, __n128 _Qm);
3078 __n128 __sha256h_u32(__n128 _Qn, __n128 _Qm);
3079 __n128 __sha256h2_f32(__n128 _Qn, __n128 _Qm);
3080 __n128 __sha256h2_s32(__n128 _Qn, __n128 _Qm);
3081 __n128 __sha256h2_u32(__n128 _Qn, __n128 _Qm);
3082 __n128 __sha256su1_f32(__n128 _Qn, __n128 _Qm);
3083 __n128 __sha256su1_s32(__n128 _Qn, __n128 _Qm);
3084 __n128 __sha256su1_u32(__n128 _Qn, __n128 _Qm);
3085 __n64 __vaba_s16(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3086 __n64 __vaba_s32(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3087 __n64 __vaba_s8(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3088 __n64 __vaba_u16(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3089 __n64 __vaba_u32(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3090 __n64 __vaba_u8(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3091 __n128 __vabal_s16(__n128 _Qd, __n64 _Dn, __n64 _Dm);
3092 __n128 __vabal_s32(__n128 _Qd, __n64 _Dn, __n64 _Dm);
3093 __n128 __vabal_s8(__n128 _Qd, __n64 _Dn, __n64 _Dm);
3094 __n128 __vabal_u16(__n128 _Qd, __n64 _Dn, __n64 _Dm);
3095 __n128 __vabal_u32(__n128 _Qd, __n64 _Dn, __n64 _Dm);
3096 __n128 __vabal_u8(__n128 _Qd, __n64 _Dn, __n64 _Dm);
3097 __n128 __vabaq_s16(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3098 __n128 __vabaq_s32(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3099 __n128 __vabaq_s8(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3100 __n128 __vabaq_u16(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3101 __n128 __vabaq_u32(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3102 __n128 __vabaq_u8(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3103 __n64 __vabd_f32(__n64 _Dn, __n64 _Dm);
3104 __n128 __vabdq_f32(__n128 _Qn, __n128 _Qm);
3105 __n64 __vabd_s16(__n64 _Dn, __n64 _Dm);
3106 __n64 __vabd_s32(__n64 _Dn, __n64 _Dm);
3107 __n64 __vabd_s8(__n64 _Dn, __n64 _Dm);
3108 __n64 __vabd_u16(__n64 _Dn, __n64 _Dm);
3109 __n64 __vabd_u32(__n64 _Dn, __n64 _Dm);
3110 __n64 __vabd_u8(__n64 _Dn, __n64 _Dm);
3111 __n128 __vabdl_s16(__n64 _Dn, __n64 _Dm);
3112 __n128 __vabdl_s32(__n64 _Dn, __n64 _Dm);
3113 __n128 __vabdl_s8(__n64 _Dn, __n64 _Dm);
3114 __n128 __vabdl_u16(__n64 _Dn, __n64 _Dm);
3115 __n128 __vabdl_u32(__n64 _Dn, __n64 _Dm);
3116 __n128 __vabdl_u8(__n64 _Dn, __n64 _Dm);
3117 __n128 __vabdq_s16(__n128 _Qn, __n128 _Qm);
3118 __n128 __vabdq_s32(__n128 _Qn, __n128 _Qm);
3119 __n128 __vabdq_s8(__n128 _Qn, __n128 _Qm);
3120 __n128 __vabdq_u16(__n128 _Qn, __n128 _Qm);
3121 __n128 __vabdq_u32(__n128 _Qn, __n128 _Qm);
3122 __n128 __vabdq_u8(__n128 _Qn, __n128 _Qm);
3123 __n64 __vabs_f32(__n64 _Dm);
3124 __n64 __vabs_s16(__n64 _Dm);
3125 __n64 __vabs_s32(__n64 _Dm);
3126 __n64 __vabs_s8(__n64 _Dm);
3127 __n64 __vneg_f32(__n64 _Dm);
3128 __n64 __vneg_s16(__n64 _Dm);
3129 __n64 __vneg_s32(__n64 _Dm);
3130 __n64 __vneg_s8(__n64 _Dm);
3131 __n128 __vabsq_f32(__n128 _Qm);
3132 __n128 __vabsq_s16(__n128 _Qm);
3133 __n128 __vabsq_s32(__n128 _Qm);
3134 __n128 __vabsq_s8(__n128 _Qm);
3135 __n128 __vnegq_f32(__n128 _Qm);
3136 __n128 __vnegq_s16(__n128 _Qm);
3137 __n128 __vnegq_s32(__n128 _Qm);
3138 __n128 __vnegq_s8(__n128 _Qm);
3139 __n64 __vacge_f32(__n64 _Dn, __n64 _Dm);
3140 __n64 __vacgt_f32(__n64 _Dn, __n64 _Dm);
3141 __n64 __vacle_f32(__n64 _Dn, __n64 _Dm);
3142 __n64 __vaclt_f32(__n64 _Dn, __n64 _Dm);
3143 __n128 __vacgeq_f32(__n128 _Qn, __n128 _Qm);
3144 __n128 __vacgtq_f32(__n128 _Qn, __n128 _Qm);
3145 __n128 __vacleq_f32(__n128 _Qn, __n128 _Qm);
3146 __n128 __vacltq_f32(__n128 _Qn, __n128 _Qm);
3147 __n64 __vadd_f32(__n64 _Dn, __n64 _Dm);
3148 __n64 __vadd_s16(__n64 _Dn, __n64 _Dm);
3149 __n64 __vadd_s32(__n64 _Dn, __n64 _Dm);
3150 __n64 __vadd_s64(__n64 _Dn, __n64 _Dm);
3151 __n64 __vadd_s8(__n64 _Dn, __n64 _Dm);
3152 __n64 __vadd_u16(__n64 _Dn, __n64 _Dm);
3153 __n64 __vadd_u32(__n64 _Dn, __n64 _Dm);
3154 __n64 __vadd_u64(__n64 _Dn, __n64 _Dm);
3155 __n64 __vadd_u8(__n64 _Dn, __n64 _Dm);
3156 __n128 __vaddq_f32(__n128 _Qn, __n128 _Qm);
3157 __n128 __vaddq_s16(__n128 _Qn, __n128 _Qm);
3158 __n128 __vaddq_s32(__n128 _Qn, __n128 _Qm);
3159 __n128 __vaddq_s64(__n128 _Qn, __n128 _Qm);
3160 __n128 __vaddq_s8(__n128 _Qn, __n128 _Qm);
3161 __n128 __vaddq_u16(__n128 _Qn, __n128 _Qm);
3162 __n128 __vaddq_u32(__n128 _Qn, __n128 _Qm);
3163 __n128 __vaddq_u64(__n128 _Qn, __n128 _Qm);
3164 __n128 __vaddq_u8(__n128 _Qn, __n128 _Qm);
3165 __n64 __vaddhn_s16(__n128 _Qn, __n128 _Qm);
3166 __n64 __vaddhn_s32(__n128 _Qn, __n128 _Qm);
3167 __n64 __vaddhn_s64(__n128 _Qn, __n128 _Qm);
3168 __n64 __vaddhn_u16(__n128 _Qn, __n128 _Qm);
3169 __n64 __vaddhn_u32(__n128 _Qn, __n128 _Qm);
3170 __n64 __vaddhn_u64(__n128 _Qn, __n128 _Qm);
3171 __n64 __vraddhn_s16(__n128 _Qn, __n128 _Qm);
3172 __n64 __vraddhn_s32(__n128 _Qn, __n128 _Qm);
3173 __n64 __vraddhn_s64(__n128 _Qn, __n128 _Qm);
3174 __n64 __vraddhn_u16(__n128 _Qn, __n128 _Qm);
3175 __n64 __vraddhn_u32(__n128 _Qn, __n128 _Qm);
3176 __n64 __vraddhn_u64(__n128 _Qn, __n128 _Qm);
3177 __n128 __vaddl_s16(__n64 _Dn, __n64 _Dm);
3178 __n128 __vaddl_s32(__n64 _Dn, __n64 _Dm);
3179 __n128 __vaddl_s8(__n64 _Dn, __n64 _Dm);
3180 __n128 __vaddl_u16(__n64 _Dn, __n64 _Dm);
3181 __n128 __vaddl_u32(__n64 _Dn, __n64 _Dm);
3182 __n128 __vaddl_u8(__n64 _Dn, __n64 _Dm);
3183 __n128 __vaddw_s16(__n128 _Qn, __n64 _Dm);
3184 __n128 __vaddw_s32(__n128 _Qn, __n64 _Dm);
3185 __n128 __vaddw_s8(__n128 _Qn, __n64 _Dm);
3186 __n128 __vaddw_u16(__n128 _Qn, __n64 _Dm);
3187 __n128 __vaddw_u32(__n128 _Qn, __n64 _Dm);
3188 __n128 __vaddw_u8(__n128 _Qn, __n64 _Dm);
3189 __n64 __vand_s16(__n64 _Dn, __n64 _Dm);
3190 __n64 __vand_s32(__n64 _Dn, __n64 _Dm);
3191 __n64 __vand_s64(__n64 _Dn, __n64 _Dm);
3192 __n64 __vand_s8(__n64 _Dn, __n64 _Dm);
3193 __n64 __vand_u16(__n64 _Dn, __n64 _Dm);
3194 __n64 __vand_u32(__n64 _Dn, __n64 _Dm);
3195 __n64 __vand_u64(__n64 _Dn, __n64 _Dm);
3196 __n64 __vand_u8(__n64 _Dn, __n64 _Dm);
3197 __n64 __vorr_s16(__n64 _Dn, __n64 _Dm);
3198 __n64 __vorr_s32(__n64 _Dn, __n64 _Dm);
3199 __n64 __vorr_s64(__n64 _Dn, __n64 _Dm);
3200 __n64 __vorr_s8(__n64 _Dn, __n64 _Dm);
3201 __n64 __vorr_u16(__n64 _Dn, __n64 _Dm);
3202 __n64 __vorr_u32(__n64 _Dn, __n64 _Dm);
3203 __n64 __vorr_u64(__n64 _Dn, __n64 _Dm);
3204 __n64 __vorr_u8(__n64 _Dn, __n64 _Dm);
3205 __n128 __vandq_s16(__n128 _Qn, __n128 _Qm);
3206 __n128 __vandq_s32(__n128 _Qn, __n128 _Qm);
3207 __n128 __vandq_s64(__n128 _Qn, __n128 _Qm);
3208 __n128 __vandq_s8(__n128 _Qn, __n128 _Qm);
3209 __n128 __vandq_u16(__n128 _Qn, __n128 _Qm);
3210 __n128 __vandq_u32(__n128 _Qn, __n128 _Qm);
3211 __n128 __vandq_u64(__n128 _Qn, __n128 _Qm);
3212 __n128 __vandq_u8(__n128 _Qn, __n128 _Qm);
3213 __n128 __vorrq_s16(__n128 _Qn, __n128 _Qm);
3214 __n128 __vorrq_s32(__n128 _Qn, __n128 _Qm);
3215 __n128 __vorrq_s64(__n128 _Qn, __n128 _Qm);
3216 __n128 __vorrq_s8(__n128 _Qn, __n128 _Qm);
3217 __n128 __vorrq_u16(__n128 _Qn, __n128 _Qm);
3218 __n128 __vorrq_u32(__n128 _Qn, __n128 _Qm);
3219 __n128 __vorrq_u64(__n128 _Qn, __n128 _Qm);
3220 __n128 __vorrq_u8(__n128 _Qn, __n128 _Qm);
3221 __n64 __vbif_f32(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3222 __n64 __vbif_p16(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3223 __n64 __vbif_p8(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3224 __n64 __vbif_s16(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3225 __n64 __vbif_s32(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3226 __n64 __vbif_s64(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3227 __n64 __vbif_s8(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3228 __n64 __vbif_u16(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3229 __n64 __vbif_u32(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3230 __n64 __vbif_u64(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3231 __n64 __vbif_u8(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3232 __n64 __vbit_f32(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3233 __n64 __vbit_p16(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3234 __n64 __vbit_p8(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3235 __n64 __vbit_s16(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3236 __n64 __vbit_s32(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3237 __n64 __vbit_s64(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3238 __n64 __vbit_s8(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3239 __n64 __vbit_u16(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3240 __n64 __vbit_u32(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3241 __n64 __vbit_u64(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3242 __n64 __vbit_u8(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3243 __n64 __vbsl_f32(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3244 __n64 __vbsl_p16(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3245 __n64 __vbsl_p8(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3246 __n64 __vbsl_s16(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3247 __n64 __vbsl_s32(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3248 __n64 __vbsl_s64(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3249 __n64 __vbsl_s8(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3250 __n64 __vbsl_u16(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3251 __n64 __vbsl_u32(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3252 __n64 __vbsl_u64(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3253 __n64 __vbsl_u8(__n64 _Dd, __n64 _Dn, __n64 _Dm);
3254 __n128 __vbifq_f32(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3255 __n128 __vbifq_p16(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3256 __n128 __vbifq_p8(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3257 __n128 __vbifq_s16(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3258 __n128 __vbifq_s32(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3259 __n128 __vbifq_s64(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3260 __n128 __vbifq_s8(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3261 __n128 __vbifq_u16(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3262 __n128 __vbifq_u32(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3263 __n128 __vbifq_u64(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3264 __n128 __vbifq_u8(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3265 __n128 __vbitq_f32(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3266 __n128 __vbitq_p16(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3267 __n128 __vbitq_p8(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3268 __n128 __vbitq_s16(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3269 __n128 __vbitq_s32(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3270 __n128 __vbitq_s64(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3271 __n128 __vbitq_s8(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3272 __n128 __vbitq_u16(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3273 __n128 __vbitq_u32(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3274 __n128 __vbitq_u64(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3275 __n128 __vbitq_u8(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3276 __n128 __vbslq_f32(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3277 __n128 __vbslq_p16(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3278 __n128 __vbslq_p8(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3279 __n128 __vbslq_s16(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3280 __n128 __vbslq_s32(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3281 __n128 __vbslq_s64(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3282 __n128 __vbslq_s8(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3283 __n128 __vbslq_u16(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3284 __n128 __vbslq_u32(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3285 __n128 __vbslq_u64(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3286 __n128 __vbslq_u8(__n128 _Qd, __n128 _Qn, __n128 _Qm);
3287 __n64 __vceq_z_f32_ex(__n64 _Dm);
3288 __n64 __vceq_z_s16_ex(__n64 _Dm);
3289 __n64 __vceq_z_s32_ex(__n64 _Dm);
3290 __n64 __vceq_z_s8_ex(__n64 _Dm);
3291 __n64 __vceq_z_u16_ex(__n64 _Dm);
3292 __n64 __vceq_z_u32_ex(__n64 _Dm);
3293 __n64 __vceq_z_u8_ex(__n64 _Dm);
3294 __n128 __vceqq_z_f32_ex(__n128 _Qm);
3295 __n128 __vceqq_z_s16_ex(__n128 _Qm);
3296 __n128 __vceqq_z_s32_ex(__n128 _Qm);
3297 __n128 __vceqq_z_s8_ex(__n128 _Qm);
3298 __n128 __vceqq_z_u16_ex(__n128 _Qm);
3299 __n128 __vceqq_z_u32_ex(__n128 _Qm);
3300 __n128 __vceqq_z_u8_ex(__n128 _Qm);
3301 __n64 __vceq_f32(__n64 _Dn, __n64 _Dm);
3302 __n64 __vceq_p8(__n64 _Dn, __n64 _Dm);
3303 __n64 __vceq_s16(__n64 _Dn, __n64 _Dm);
3304 __n64 __vceq_s32(__n64 _Dn, __n64 _Dm);
3305 __n64 __vceq_s8(__n64 _Dn, __n64 _Dm);
3306 __n64 __vceq_u16(__n64 _Dn, __n64 _Dm);
3307 __n64 __vceq_u32(__n64 _Dn, __n64 _Dm);
3308 __n64 __vceq_u8(__n64 _Dn, __n64 _Dm);
3309 __n128 __vceqq_f32(__n128 _Qn, __n128 _Qm);
3310 __n128 __vceqq_p8(__n128 _Qn, __n128 _Qm);
3311 __n128 __vceqq_s16(__n128 _Qn, __n128 _Qm);
3312 __n128 __vceqq_s32(__n128 _Qn, __n128 _Qm);
3313 __n128 __vceqq_s8(__n128 _Qn, __n128 _Qm);
3314 __n128 __vceqq_u16(__n128 _Qn, __n128 _Qm);
3315 __n128 __vceqq_u32(__n128 _Qn, __n128 _Qm);
3316 __n128 __vceqq_u8(__n128 _Qn, __n128 _Qm);
3317 __n64 __vcge_z_f32_ex(__n64 _Dm);
3318 __n64 __vcge_z_s16_ex(__n64 _Dm);
3319 __n64 __vcge_z_s32_ex(__n64 _Dm);
3320 __n64 __vcge_z_s8_ex(__n64 _Dm);
3321 __n128 __vcgeq_z_f32_ex(__n128 _Qm);
3322 __n128 __vcgeq_z_s16_ex(__n128 _Qm);
3323 __n128 __vcgeq_z_s32_ex(__n128 _Qm);
3324 __n128 __vcgeq_z_s8_ex(__n128 _Qm);
3325 __n64 __vcge_f32(__n64 _Dn, __n64 _Dm);
3326 __n64 __vcge_s16(__n64 _Dn, __n64 _Dm);
3327 __n64 __vcge_s32(__n64 _Dn, __n64 _Dm);
3328 __n64 __vcge_s8(__n64 _Dn, __n64 _Dm);
3329 __n64 __vcge_u16(__n64 _Dn, __n64 _Dm);
3330 __n64 __vcge_u32(__n64 _Dn, __n64 _Dm);
3331 __n64 __vcge_u8(__n64 _Dn, __n64 _Dm);
3332 __n64 __vcle_f32(__n64 _Dn, __n64 _Dm);
3333 __n64 __vcle_s16(__n64 _Dn, __n64 _Dm);
3334 __n64 __vcle_s32(__n64 _Dn, __n64 _Dm);
3335 __n64 __vcle_s8(__n64 _Dn, __n64 _Dm);
3336 __n64 __vcle_u16(__n64 _Dn, __n64 _Dm);
3337 __n64 __vcle_u32(__n64 _Dn, __n64 _Dm);
3338 __n64 __vcle_u8(__n64 _Dn, __n64 _Dm);
3339 __n128 __vcgeq_f32(__n128 _Qn, __n128 _Qm);
3340 __n128 __vcgeq_s16(__n128 _Qn, __n128 _Qm);
3341 __n128 __vcgeq_s32(__n128 _Qn, __n128 _Qm);
3342 __n128 __vcgeq_s8(__n128 _Qn, __n128 _Qm);
3343 __n128 __vcgeq_u16(__n128 _Qn, __n128 _Qm);
3344 __n128 __vcgeq_u32(__n128 _Qn, __n128 _Qm);
3345 __n128 __vcgeq_u8(__n128 _Qn, __n128 _Qm);
3346 __n128 __vcleq_f32(__n128 _Qn, __n128 _Qm);
3347 __n128 __vcleq_s16(__n128 _Qn, __n128 _Qm);
3348 __n128 __vcleq_s32(__n128 _Qn, __n128 _Qm);
3349 __n128 __vcleq_s8(__n128 _Qn, __n128 _Qm);
3350 __n128 __vcleq_u16(__n128 _Qn, __n128 _Qm);
3351 __n128 __vcleq_u32(__n128 _Qn, __n128 _Qm);
3352 __n128 __vcleq_u8(__n128 _Qn, __n128 _Qm);
3353 __n64 __vcgt_z_f32_ex(__n64 _Dm);
3354 __n64 __vcgt_z_s16_ex(__n64 _Dm);
3355 __n64 __vcgt_z_s32_ex(__n64 _Dm);
3356 __n64 __vcgt_z_s8_ex(__n64 _Dm);
3357 __n128 __vcgtq_z_f32_ex(__n128 _Qm);
3358 __n128 __vcgtq_z_s16_ex(__n128 _Qm);
3359 __n128 __vcgtq_z_s32_ex(__n128 _Qm);
3360 __n128 __vcgtq_z_s8_ex(__n128 _Qm);
3361 __n64 __vcgt_f32(__n64 _Dn, __n64 _Dm);
3362 __n64 __vcgt_s16(__n64 _Dn, __n64 _Dm);
3363 __n64 __vcgt_s32(__n64 _Dn, __n64 _Dm);
3364 __n64 __vcgt_s8(__n64 _Dn, __n64 _Dm);
3365 __n64 __vcgt_u16(__n64 _Dn, __n64 _Dm);
3366 __n64 __vcgt_u32(__n64 _Dn, __n64 _Dm);
3367 __n64 __vcgt_u8(__n64 _Dn, __n64 _Dm);
3368 __n64 __vclt_f32(__n64 _Dn, __n64 _Dm);
3369 __n64 __vclt_s16(__n64 _Dn, __n64 _Dm);
3370 __n64 __vclt_s32(__n64 _Dn, __n64 _Dm);
3371 __n64 __vclt_s8(__n64 _Dn, __n64 _Dm);
3372 __n64 __vclt_u16(__n64 _Dn, __n64 _Dm);
3373 __n64 __vclt_u32(__n64 _Dn, __n64 _Dm);
3374 __n64 __vclt_u8(__n64 _Dn, __n64 _Dm);
3375 __n128 __vcgtq_f32(__n128 _Qn, __n128 _Qm);
3376 __n128 __vcgtq_s16(__n128 _Qn, __n128 _Qm);
3377 __n128 __vcgtq_s32(__n128 _Qn, __n128 _Qm);
3378 __n128 __vcgtq_s8(__n128 _Qn, __n128 _Qm);
3379 __n128 __vcgtq_u16(__n128 _Qn, __n128 _Qm);
3380 __n128 __vcgtq_u32(__n128 _Qn, __n128 _Qm);
3381 __n128 __vcgtq_u8(__n128 _Qn, __n128 _Qm);
3382 __n128 __vcltq_f32(__n128 _Qn, __n128 _Qm);
3383 __n128 __vcltq_s16(__n128 _Qn, __n128 _Qm);
3384 __n128 __vcltq_s32(__n128 _Qn, __n128 _Qm);
3385 __n128 __vcltq_s8(__n128 _Qn, __n128 _Qm);
3386 __n128 __vcltq_u16(__n128 _Qn, __n128 _Qm);
3387 __n128 __vcltq_u32(__n128 _Qn, __n128 _Qm);
3388 __n128 __vcltq_u8(__n128 _Qn, __n128 _Qm);
3389 __n64 __vcle_z_f32_ex(__n64 _Dm);
3390 __n64 __vcle_z_s16_ex(__n64 _Dm);
3391 __n64 __vcle_z_s32_ex(__n64 _Dm);
3392 __n64 __vcle_z_s8_ex(__n64 _Dm);
3393 __n128 __vcleq_z_f32_ex(__n128 _Qm);
3394 __n128 __vcleq_z_s16_ex(__n128 _Qm);
3395 __n128 __vcleq_z_s32_ex(__n128 _Qm);
3396 __n128 __vcleq_z_s8_ex(__n128 _Qm);
3397 __n64 __vcls_s16(__n64 _Dm);
3398 __n64 __vcls_s32(__n64 _Dm);
3399 __n64 __vcls_s8(__n64 _Dm);
3400 __n64 __vclz_s16(__n64 _Dm);
3401 __n64 __vclz_s32(__n64 _Dm);
3402 __n64 __vclz_s8(__n64 _Dm);
3403 __n64 __vclz_u16(__n64 _Dm);
3404 __n64 __vclz_u32(__n64 _Dm);
3405 __n64 __vclz_u8(__n64 _Dm);
3406 __n128 __vclsq_s16(__n128 _Qm);
3407 __n128 __vclsq_s32(__n128 _Qm);
3408 __n128 __vclsq_s8(__n128 _Qm);
3409 __n128 __vclzq_s16(__n128 _Qm);
3410 __n128 __vclzq_s32(__n128 _Qm);
3411 __n128 __vclzq_s8(__n128 _Qm);
3412 __n128 __vclzq_u16(__n128 _Qm);
3413 __n128 __vclzq_u32(__n128 _Qm);
3414 __n128 __vclzq_u8(__n128 _Qm);
3415 __n64 __vclt_z_f32_ex(__n64 _Dm);
3416 __n64 __vclt_z_s16_ex(__n64 _Dm);
3417 __n64 __vclt_z_s32_ex(__n64 _Dm);
3418 __n64 __vclt_z_s8_ex(__n64 _Dm);
3419 __n128 __vcltq_z_f32_ex(__n128 _Qm);
3420 __n128 __vcltq_z_s16_ex(__n128 _Qm);
3421 __n128 __vcltq_z_s32_ex(__n128 _Qm);
3422 __n128 __vcltq_z_s8_ex(__n128 _Qm);
3423 __n64 __vcnt_p8(__n64 _Dm);
3424 __n64 __vcnt_s8(__n64 _Dm);
3425 __n64 __vcnt_u8(__n64 _Dm);
3426 __n128 __vcntq_p8(__n128 _Qm);
3427 __n128 __vcntq_s8(__n128 _Qm);
3428 __n128 __vcntq_u8(__n128 _Qm);
3429 __n128 __vcombine_f32(__n64 _Dn, __n64 _Dm);
3430 __n128 __vcombine_p16(__n64 _Dn, __n64 _Dm);
3431 __n128 __vcombine_p8(__n64 _Dn, __n64 _Dm);
3432 __n128 __vcombine_s16(__n64 _Dn, __n64 _Dm);
3433 __n128 __vcombine_s32(__n64 _Dn, __n64 _Dm);
3434 __n128 __vcombine_s64(__n64 _Dn, __n64 _Dm);
3435 __n128 __vcombine_s8(__n64 _Dn, __n64 _Dm);
3436 __n128 __vcombine_u16(__n64 _Dn, __n64 _Dm);
3437 __n128 __vcombine_u32(__n64 _Dn, __n64 _Dm);
3438 __n128 __vcombine_u64(__n64 _Dn, __n64 _Dm);
3439 __n128 __vcombine_u8(__n64 _Dn, __n64 _Dm);
3440 __n64 __vcreate_f32(uint64_t _R64t);
3441 __n64 __vcreate_p16(uint64_t _R64t);
3442 __n64 __vcreate_p8(uint64_t _R64t);
3443 __n64 __vcreate_s16(uint64_t _R64t);
3444 __n64 __vcreate_s32(uint64_t _R64t);
3445 __n64 __vcreate_s64(uint64_t _R64t);
3446 __n64 __vcreate_s8(uint64_t _R64t);
3447 __n64 __vcreate_u16(uint64_t _R64t);
3448 __n64 __vcreate_u32(uint64_t _R64t);
3449 __n64 __vcreate_u64(uint64_t _R64t);
3450 __n64 __vcreate_u8(uint64_t _R64t);
3451 __n64 __vcvt_n_f32_s32(__n64 _Dm, const int _Fbits);
3452 __n64 __vcvt_n_f32_u32(__n64 _Dm, const int _Fbits);
3453 __n64 __vcvt_n_s32_f32(__n64 _Dm, const int _Fbits);
3454 __n64 __vcvt_n_u32_f32(__n64 _Dm, const int _Fbits);
3455 __n128 __vcvtq_n_f32_s32(__n128 _Qm, const int _Fbits);
3456 __n128 __vcvtq_n_f32_u32(__n128 _Qm, const int _Fbits);
3457 __n128 __vcvtq_n_s32_f32(__n128 _Qm, const int _Fbits);
3458 __n128 __vcvtq_n_u32_f32(__n128 _Qm, const int _Fbits);
3459 __n64 __vcvta_s32_f32(__n64 _Dm);
3460 __n64 __vcvta_u32_f32(__n64 _Dm);
3461 __n64 __vcvtm_s32_f32(__n64 _Dm);
3462 __n64 __vcvtm_u32_f32(__n64 _Dm);
3463 __n64 __vcvtn_s32_f32(__n64 _Dm);
3464 __n64 __vcvtn_u32_f32(__n64 _Dm);
3465 __n64 __vcvtp_s32_f32(__n64 _Dm);
3466 __n64 __vcvtp_u32_f32(__n64 _Dm);
3467 __n128 __vcvtaq_s32_f32(__n128 _Qm);
3468 __n128 __vcvtaq_u32_f32(__n128 _Qm);
3469 __n128 __vcvtmq_s32_f32(__n128 _Qm);
3470 __n128 __vcvtmq_u32_f32(__n128 _Qm);
3471 __n128 __vcvtnq_s32_f32(__n128 _Qm);
3472 __n128 __vcvtnq_u32_f32(__n128 _Qm);
3473 __n128 __vcvtpq_s32_f32(__n128 _Qm);
3474 __n128 __vcvtpq_u32_f32(__n128 _Qm);
3475 __n64 __vcvt_f32_s32(__n64 _Dm);
3476 __n64 __vcvt_f32_u32(__n64 _Dm);
3477 __n64 __vcvt_s32_f32(__n64 _Dm);
3478 __n64 __vcvt_u32_f32(__n64 _Dm);
3479 __n128 __vcvtq_f32_s32(__n128 _Qm);
3480 __n128 __vcvtq_f32_u32(__n128 _Qm);
3481 __n128 __vcvtq_s32_f32(__n128 _Qm);
3482 __n128 __vcvtq_u32_f32(__n128 _Qm);
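// A minimal usage sketch; __example_float_to_fixed_q16 is a hypothetical helper.
// The _n_ conversion intrinsics above take a fractional-bit count (_Fbits), so
// converting with 16 fractional bits turns each float32 lane into a 16.16
// fixed-point int32.
__inline __n128 __example_float_to_fixed_q16(__n128 _Qf)
{
    return __vcvtq_n_s32_f32(_Qf, 16);
}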
3483 __n64 __vdup_lane_f32(__n64 _Dm, const int _Lane);
3484 __n64 __vdup_lane_p16(__n64 _Dm, const int _Lane);
3485 __n64 __vdup_lane_p8(__n64 _Dm, const int _Lane);
3486 __n64 __vdup_lane_s16(__n64 _Dm, const int _Lane);
3487 __n64 __vdup_lane_s32(__n64 _Dm, const int _Lane);
3488 __n64 __vdup_lane_s8(__n64 _Dm, const int _Lane);
3489 __n64 __vdup_lane_u16(__n64 _Dm, const int _Lane);
3490 __n64 __vdup_lane_u32(__n64 _Dm, const int _Lane);
3491 __n64 __vdup_lane_u8(__n64 _Dm, const int _Lane);
3492 __n128 __vdupq_lane_f32(__n64 _Dm, const int _Lane);
3493 __n128 __vdupq_lane_p16(__n64 _Dm, const int _Lane);
3494 __n128 __vdupq_lane_p8(__n64 _Dm, const int _Lane);
3495 __n128 __vdupq_lane_s16(__n64 _Dm, const int _Lane);
3496 __n128 __vdupq_lane_s32(__n64 _Dm, const int _Lane);
3497 __n128 __vdupq_lane_s8(__n64 _Dm, const int _Lane);
3498 __n128 __vdupq_lane_u16(__n64 _Dm, const int _Lane);
3499 __n128 __vdupq_lane_u32(__n64 _Dm, const int _Lane);
3500 __n128 __vdupq_lane_u8(__n64 _Dm, const int _Lane);
3501 __n64 __vdup_n_f32(float32_t _Ft);
3502 __n64 __vmov_n_f32(float32_t _Ft);
3503 __n64 __vdup_n_p16(poly16_t _Rt);
3504 __n64 __vdup_n_p8(poly8_t _Rt);
3505 __n64 __vdup_n_s16(int16_t _Rt);
3506 __n64 __vdup_n_s32(int32_t _Rt);
3507 __n64 __vdup_n_s8(int8_t _Rt);
3508 __n64 __vdup_n_u16(uint16_t _Rt);
3509 __n64 __vdup_n_u32(uint32_t _Rt);
3510 __n64 __vdup_n_u8(uint8_t _Rt);
3511 __n64 __vmov_n_p16(poly16_t _Rt);
3512 __n64 __vmov_n_p8(poly8_t _Rt);
3513 __n64 __vmov_n_s16(int16_t _Rt);
3514 __n64 __vmov_n_s32(int32_t _Rt);
3515 __n64 __vmov_n_s8(int8_t _Rt);
3516 __n64 __vmov_n_u16(uint16_t _Rt);
3517 __n64 __vmov_n_u32(uint32_t _Rt);
3518 __n64 __vmov_n_u8(uint8_t _Rt);
3519 __n128 __vdupq_n_f32(float32_t _Ft);
3520 __n128 __vmovq_n_f32(float32_t _Ft);
3521 __n128 __vdupq_n_p16(poly16_t _Rt);
3522 __n128 __vdupq_n_p8(poly8_t _Rt);
3523 __n128 __vdupq_n_s16(int16_t _Rt);
3524 __n128 __vdupq_n_s32(int32_t _Rt);
3525 __n128 __vdupq_n_s8(int8_t _Rt);
3526 __n128 __vdupq_n_u16(uint16_t _Rt);
3527 __n128 __vdupq_n_u32(uint32_t _Rt);
3528 __n128 __vdupq_n_u8(uint8_t _Rt);
3529 __n128 __vmovq_n_p16(poly16_t _Rt);
3530 __n128 __vmovq_n_p8(poly8_t _Rt);
3531 __n128 __vmovq_n_s16(int16_t _Rt);
3532 __n128 __vmovq_n_s32(int32_t _Rt);
3533 __n128 __vmovq_n_s8(int8_t _Rt);
3534 __n128 __vmovq_n_u16(uint16_t _Rt);
3535 __n128 __vmovq_n_u32(uint32_t _Rt);
3536 __n128 __vmovq_n_u8(uint8_t _Rt);
3537 __n64 __vdup_n_s64(int64_t _R64t);
3538 __n64 __vdup_n_u64(uint64_t _R64t);
3539 __n64 __vmov_n_s64(int64_t _R64t);
3540 __n64 __vmov_n_u64(uint64_t _R64t);
3541 __n128 __vdupq_n_s64(int64_t _R64t);
3542 __n128 __vdupq_n_u64(uint64_t _R64t);
3543 __n128 __vmovq_n_s64(int64_t _R64t);
3544 __n128 __vmovq_n_u64(uint64_t _R64t);
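// A minimal usage sketch; __example_splat_f32 is a hypothetical helper.
// __vdupq_n_f32 replicates one scalar into all four float32 lanes of a Q register;
// the __vmovq_n_* names are the same operations under the VMOV spelling.
__inline __n128 __example_splat_f32(float32_t _Value)
{
    return __vdupq_n_f32(_Value);   // result lanes: { _Value, _Value, _Value, _Value }
}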
3545 __n64 __vbic_s16(__n64 _Dn, __n64 _Dm);
3546 __n64 __vbic_s32(__n64 _Dn, __n64 _Dm);
3547 __n64 __vbic_s64(__n64 _Dn, __n64 _Dm);
3548 __n64 __vbic_s8(__n64 _Dn, __n64 _Dm);
3549 __n64 __vbic_u16(__n64 _Dn, __n64 _Dm);
3550 __n64 __vbic_u32(__n64 _Dn, __n64 _Dm);
3551 __n64 __vbic_u64(__n64 _Dn, __n64 _Dm);
3552 __n64 __vbic_u8(__n64 _Dn, __n64 _Dm);
3553 __n64 __veor_s16(__n64 _Dn, __n64 _Dm);
3554 __n64 __veor_s32(__n64 _Dn, __n64 _Dm);
3555 __n64 __veor_s64(__n64 _Dn, __n64 _Dm);
3556 __n64 __veor_s8(__n64 _Dn, __n64 _Dm);
3557 __n64 __veor_u16(__n64 _Dn, __n64 _Dm);
3558 __n64 __veor_u32(__n64 _Dn, __n64 _Dm);
3559 __n64 __veor_u64(__n64 _Dn, __n64 _Dm);
3560 __n64 __veor_u8(__n64 _Dn, __n64 _Dm);
3561 __n64 __vorn_s16(__n64 _Dn, __n64 _Dm);
3562 __n64 __vorn_s32(__n64 _Dn, __n64 _Dm);
3563 __n64 __vorn_s64(__n64 _Dn, __n64 _Dm);
3564 __n64 __vorn_s8(__n64 _Dn, __n64 _Dm);
3565 __n64 __vorn_u16(__n64 _Dn, __n64 _Dm);
3566 __n64 __vorn_u32(__n64 _Dn, __n64 _Dm);
3567 __n64 __vorn_u64(__n64 _Dn, __n64 _Dm);
3568 __n64 __vorn_u8(__n64 _Dn, __n64 _Dm);
3569 __n128 __vbicq_s16(__n128 _Qn, __n128 _Qm);
3570 __n128 __vbicq_s32(__n128 _Qn, __n128 _Qm);
3571 __n128 __vbicq_s64(__n128 _Qn, __n128 _Qm);
3572 __n128 __vbicq_s8(__n128 _Qn, __n128 _Qm);
3573 __n128 __vbicq_u16(__n128 _Qn, __n128 _Qm);
3574 __n128 __vbicq_u32(__n128 _Qn, __n128 _Qm);
3575 __n128 __vbicq_u64(__n128 _Qn, __n128 _Qm);
3576 __n128 __vbicq_u8(__n128 _Qn, __n128 _Qm);
3577 __n128 __veorq_s16(__n128 _Qn, __n128 _Qm);
3578 __n128 __veorq_s32(__n128 _Qn, __n128 _Qm);
3579 __n128 __veorq_s64(__n128 _Qn, __n128 _Qm);
3580 __n128 __veorq_s8(__n128 _Qn, __n128 _Qm);
3581 __n128 __veorq_u16(__n128 _Qn, __n128 _Qm);
3582 __n128 __veorq_u32(__n128 _Qn, __n128 _Qm);
3583 __n128 __veorq_u64(__n128 _Qn, __n128 _Qm);
3584 __n128 __veorq_u8(__n128 _Qn, __n128 _Qm);
3585 __n128 __vornq_s16(__n128 _Qn, __n128 _Qm);
3586 __n128 __vornq_s32(__n128 _Qn, __n128 _Qm);
3587 __n128 __vornq_s64(__n128 _Qn, __n128 _Qm);
3588 __n128 __vornq_s8(__n128 _Qn, __n128 _Qm);
3589 __n128 __vornq_u16(__n128 _Qn, __n128 _Qm);
3590 __n128 __vornq_u32(__n128 _Qn, __n128 _Qm);
3591 __n128 __vornq_u64(__n128 _Qn, __n128 _Qm);
3592 __n128 __vornq_u8(__n128 _Qn, __n128 _Qm);
3593 __n64 __vext_f32(__n64 _Dn, __n64 _Dm, const int _Pos);
3594 __n64 __vext_p16(__n64 _Dn, __n64 _Dm, const int _Pos);
3595 __n64 __vext_p8(__n64 _Dn, __n64 _Dm, const int _Pos);
3596 __n64 __vext_s16(__n64 _Dn, __n64 _Dm, const int _Pos);
3597 __n64 __vext_s32(__n64 _Dn, __n64 _Dm, const int _Pos);
3598 __n64 __vext_s64(__n64 _Dn, __n64 _Dm, const int _Pos);
3599 __n64 __vext_s8(__n64 _Dn, __n64 _Dm, const int _Pos);
3600 __n64 __vext_u16(__n64 _Dn, __n64 _Dm, const int _Pos);
3601 __n64 __vext_u32(__n64 _Dn, __n64 _Dm, const int _Pos);
3602 __n64 __vext_u64(__n64 _Dn, __n64 _Dm, const int _Pos);
3603 __n64 __vext_u8(__n64 _Dn, __n64 _Dm, const int _Pos);
3604 __n128 __vextq_f32(__n128 _Qn, __n128 _Qm, const int _Pos);
3605 __n128 __vextq_p16(__n128 _Qn, __n128 _Qm, const int _Pos);
3606 __n128 __vextq_p8(__n128 _Qn, __n128 _Qm, const int _Pos);
3607 __n128 __vextq_s16(__n128 _Qn, __n128 _Qm, const int _Pos);
3608 __n128 __vextq_s32(__n128 _Qn, __n128 _Qm, const int _Pos);
3609 __n128 __vextq_s64(__n128 _Qn, __n128 _Qm, const int _Pos);
3610 __n128 __vextq_s8(__n128 _Qn, __n128 _Qm, const int _Pos);
3611 __n128 __vextq_u16(__n128 _Qn, __n128 _Qm, const int _Pos);
3612 __n128 __vextq_u32(__n128 _Qn, __n128 _Qm, const int _Pos);
3613 __n128 __vextq_u64(__n128 _Qn, __n128 _Qm, const int _Pos);
3614 __n128 __vextq_u8(__n128 _Qn, __n128 _Qm, const int _Pos);
3615 __n64 __vget_high_f32(__n128 _Qm);
3616 __n64 __vget_high_p16(__n128 _Qm);
3617 __n64 __vget_high_p8(__n128 _Qm);
3618 __n64 __vget_high_s16(__n128 _Qm);
3619 __n64 __vget_high_s32(__n128 _Qm);
3620 __n64 __vget_high_s64(__n128 _Qm);
3621 __n64 __vget_high_s8(__n128 _Qm);
3622 __n64 __vget_high_u16(__n128 _Qm);
3623 __n64 __vget_high_u32(__n128 _Qm);
3624 __n64 __vget_high_u64(__n128 _Qm);
3625 __n64 __vget_high_u8(__n128 _Qm);
3626 __n64 __vget_low_f32(__n128 _Qm);
3627 __n64 __vget_low_p16(__n128 _Qm);
3628 __n64 __vget_low_p8(__n128 _Qm);
3629 __n64 __vget_low_s16(__n128 _Qm);
3630 __n64 __vget_low_s32(__n128 _Qm);
3631 __n64 __vget_low_s64(__n128 _Qm);
3632 __n64 __vget_low_s8(__n128 _Qm);
3633 __n64 __vget_low_u16(__n128 _Qm);
3634 __n64 __vget_low_u32(__n128 _Qm);
3635 __n64 __vget_low_u64(__n128 _Qm);
3636 __n64 __vget_low_u8(__n128 _Qm);
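// A minimal usage sketch; __example_split_and_rejoin_u8 is a hypothetical helper.
// __vget_low_*/__vget_high_* extract the two 64-bit halves of a Q register and
// __vcombine_* joins two D registers back into a Q register, so this round trip
// returns its input unchanged.
__inline __n128 __example_split_and_rejoin_u8(__n128 _Qv)
{
    __n64 _Lo = __vget_low_u8(_Qv);     // lanes 0..7
    __n64 _Hi = __vget_high_u8(_Qv);    // lanes 8..15
    return __vcombine_u8(_Lo, _Hi);
}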
3637 __n64 __vhadd_s16(__n64 _Dn, __n64 _Dm);
3638 __n64 __vhadd_s32(__n64 _Dn, __n64 _Dm);
3639 __n64 __vhadd_s8(__n64 _Dn, __n64 _Dm);
3640 __n64 __vhadd_u16(__n64 _Dn, __n64 _Dm);
3641 __n64 __vhadd_u32(__n64 _Dn, __n64 _Dm);
3642 __n64 __vhadd_u8(__n64 _Dn, __n64 _Dm);
3643 __n64 __vhsub_s16(__n64 _Dn, __n64 _Dm);
3644 __n64 __vhsub_s32(__n64 _Dn, __n64 _Dm);
3645 __n64 __vhsub_s8(__n64 _Dn, __n64 _Dm);
3646 __n64 __vhsub_u16(__n64 _Dn, __n64 _Dm);
3647 __n64 __vhsub_u32(__n64 _Dn, __n64 _Dm);
3648 __n64 __vhsub_u8(__n64 _Dn, __n64 _Dm);
3649 __n64 __vrhadd_s16(__n64 _Dn, __n64 _Dm);
3650 __n64 __vrhadd_s32(__n64 _Dn, __n64 _Dm);
3651 __n64 __vrhadd_s8(__n64 _Dn, __n64 _Dm);
3652 __n64 __vrhadd_u16(__n64 _Dn, __n64 _Dm);
3653 __n64 __vrhadd_u32(__n64 _Dn, __n64 _Dm);
3654 __n64 __vrhadd_u8(__n64 _Dn, __n64 _Dm);
3655 __n128 __vhaddq_s16(__n128 _Qn, __n128 _Qm);
3656 __n128 __vhaddq_s32(__n128 _Qn, __n128 _Qm);
3657 __n128 __vhaddq_s8(__n128 _Qn, __n128 _Qm);
3658 __n128 __vhaddq_u16(__n128 _Qn, __n128 _Qm);
3659 __n128 __vhaddq_u32(__n128 _Qn, __n128 _Qm);
3660 __n128 __vhaddq_u8(__n128 _Qn, __n128 _Qm);
3661 __n128 __vhsubq_s16(__n128 _Qn, __n128 _Qm);
3662 __n128 __vhsubq_s32(__n128 _Qn, __n128 _Qm);
3663 __n128 __vhsubq_s8(__n128 _Qn, __n128 _Qm);
3664 __n128 __vhsubq_u16(__n128 _Qn, __n128 _Qm);
3665 __n128 __vhsubq_u32(__n128 _Qn, __n128 _Qm);
3666 __n128 __vhsubq_u8(__n128 _Qn, __n128 _Qm);
3667 __n128 __vrhaddq_s16(__n128 _Qn, __n128 _Qm);
3668 __n128 __vrhaddq_s32(__n128 _Qn, __n128 _Qm);
3669 __n128 __vrhaddq_s8(__n128 _Qn, __n128 _Qm);
3670 __n128 __vrhaddq_u16(__n128 _Qn, __n128 _Qm);
3671 __n128 __vrhaddq_u32(__n128 _Qn, __n128 _Qm);
3672 __n128 __vrhaddq_u8(__n128 _Qn, __n128 _Qm);
3673 __n64 __vld1_f32(_In_reads_(2) const float32_t * _PcD);
3674 __n64 __vld1_p16(_In_reads_(4) const poly16_t * _PcD);
3675 __n64 __vld1_p8(_In_reads_(8) const poly8_t * _PcD);
3676 __n64 __vld1_s16(_In_reads_(4) const int16_t * _PcD);
3677 __n64 __vld1_s32(_In_reads_(2) const int32_t * _PcD);
3678 __n64 __vld1_s64(_In_reads_(1) const int64_t * _PcD);
3679 __n64 __vld1_s8(_In_reads_(8) const int8_t * _PcD);
3680 __n64 __vld1_u16(_In_reads_(4) const uint16_t * _PcD);
3681 __n64 __vld1_u32(_In_reads_(2) const uint32_t * _PcD);
3682 __n64 __vld1_u64(_In_reads_(1) const uint64_t * _PcD);
3683 __n64 __vld1_u8(_In_reads_(8) const uint8_t * _PcD);
3684 __n64 __vld1_f32_ex(_In_reads_(2) const float32_t * _PcD, const int _Align);
3685 __n64 __vld1_p16_ex(_In_reads_(4) const poly16_t * _PcD, const int _Align);
3686 __n64 __vld1_p8_ex(_In_reads_(8) const poly8_t * _PcD, const int _Align);
3687 __n64 __vld1_s16_ex(_In_reads_(4) const int16_t * _PcD, const int _Align);
3688 __n64 __vld1_s32_ex(_In_reads_(2) const int32_t * _PcD, const int _Align);
3689 __n64 __vld1_s64_ex(_In_reads_(1) const int64_t * _PcD, const int _Align);
3690 __n64 __vld1_s8_ex(_In_reads_(8) const int8_t * _PcD, const int _Align);
3691 __n64 __vld1_u16_ex(_In_reads_(4) const uint16_t * _PcD, const int _Align);
3692 __n64 __vld1_u32_ex(_In_reads_(2) const uint32_t * _PcD, const int _Align);
3693 __n64 __vld1_u64_ex(_In_reads_(1) const uint64_t * _PcD, const int _Align);
3694 __n64 __vld1_u8_ex(_In_reads_(8) const uint8_t * _PcD, const int _Align);
3695 __n128 __vld1q_f32(_In_reads_(4) const float32_t * _PcD);
3696 __n128 __vld1q_p16(_In_reads_(8) const poly16_t * _PcD);
3697 __n128 __vld1q_p8(_In_reads_(16) const poly8_t * _PcD);
3698 __n128 __vld1q_s16(_In_reads_(8) const int16_t * _PcD);
3699 __n128 __vld1q_s32(_In_reads_(4) const int32_t * _PcD);
3700 __n128 __vld1q_s64(_In_reads_(2) const int64_t * _PcD);
3701 __n128 __vld1q_s8(_In_reads_(16) const int8_t * _PcD);
3702 __n128 __vld1q_u16(_In_reads_(8) const uint16_t * _PcD);
3703 __n128 __vld1q_u32(_In_reads_(4) const uint32_t * _PcD);
3704 __n128 __vld1q_u64(_In_reads_(2) const uint64_t * _PcD);
3705 __n128 __vld1q_u8(_In_reads_(16) const uint8_t * _PcD);
3706 __n128 __vld1q_f32_ex(_In_reads_(4) const float32_t * _PcD, const int _Align);
3707 __n128 __vld1q_p16_ex(_In_reads_(8) const poly16_t * _PcD, const int _Align);
3708 __n128 __vld1q_p8_ex(_In_reads_(16) const poly8_t * _PcD, const int _Align);
3709 __n128 __vld1q_s16_ex(_In_reads_(8) const int16_t * _PcD, const int _Align);
3710 __n128 __vld1q_s32_ex(_In_reads_(4) const int32_t * _PcD, const int _Align);
3711 __n128 __vld1q_s64_ex(_In_reads_(2) const int64_t * _PcD, const int _Align);
3712 __n128 __vld1q_s8_ex(_In_reads_(16) const int8_t * _PcD, const int _Align);
3713 __n128 __vld1q_u16_ex(_In_reads_(8) const uint16_t * _PcD, const int _Align);
3714 __n128 __vld1q_u32_ex(_In_reads_(4) const uint32_t * _PcD, const int _Align);
3715 __n128 __vld1q_u64_ex(_In_reads_(2) const uint64_t * _PcD, const int _Align);
3716 __n128 __vld1q_u8_ex(_In_reads_(16) const uint8_t * _PcD, const int _Align);
3717 __n64 __vld1_dup_f32(_In_reads_(1) const float32_t * _PcD);
3718 __n64 __vld1_dup_p16(_In_reads_(1) const poly16_t * _PcD);
3719 __n64 __vld1_dup_p8(_In_reads_(1) const poly8_t * _PcD);
3720 __n64 __vld1_dup_s16(_In_reads_(1) const int16_t * _PcD);
3721 __n64 __vld1_dup_s32(_In_reads_(1) const int32_t * _PcD);
3722 __n64 __vld1_dup_s8(_In_reads_(1) const int8_t * _PcD);
3723 __n64 __vld1_dup_u16(_In_reads_(1) const uint16_t * _PcD);
3724 __n64 __vld1_dup_u32(_In_reads_(1) const uint32_t * _PcD);
3725 __n64 __vld1_dup_u8(_In_reads_(1) const uint8_t * _PcD);
3726 __n128 __vld1q_dup_f32(_In_reads_(1) const float32_t * _PcD);
3727 __n128 __vld1q_dup_p16(_In_reads_(1) const poly16_t * _PcD);
3728 __n128 __vld1q_dup_p8(_In_reads_(1) const poly8_t * _PcD);
3729 __n128 __vld1q_dup_s16(_In_reads_(1) const int16_t * _PcD);
3730 __n128 __vld1q_dup_s32(_In_reads_(1) const int32_t * _PcD);
3731 __n128 __vld1q_dup_s8(_In_reads_(1) const int8_t * _PcD);
3732 __n128 __vld1q_dup_u16(_In_reads_(1) const uint16_t * _PcD);
3733 __n128 __vld1q_dup_u32(_In_reads_(1) const uint32_t * _PcD);
3734 __n128 __vld1q_dup_u8(_In_reads_(1) const uint8_t * _PcD);
3735 __n64 __vld1_dup_f32_ex(_In_reads_(1) const float32_t * _PcD, const int _Align);
3736 __n64 __vld1_dup_p16_ex(_In_reads_(1) const poly16_t * _PcD, const int _Align);
3737 __n64 __vld1_dup_s16_ex(_In_reads_(1) const int16_t * _PcD, const int _Align);
3738 __n64 __vld1_dup_s32_ex(_In_reads_(1) const int32_t * _PcD, const int _Align);
3739 __n64 __vld1_dup_u16_ex(_In_reads_(1) const uint16_t * _PcD, const int _Align);
3740 __n64 __vld1_dup_u32_ex(_In_reads_(1) const uint32_t * _PcD, const int _Align);
3741 __n128 __vld1q_dup_f32_ex(_In_reads_(1) const float32_t * _PcD, const int _Align);
3742 __n128 __vld1q_dup_p16_ex(_In_reads_(1) const poly16_t * _PcD, const int _Align);
3743 __n128 __vld1q_dup_s16_ex(_In_reads_(1) const int16_t * _PcD, const int _Align);
3744 __n128 __vld1q_dup_s32_ex(_In_reads_(1) const int32_t * _PcD, const int _Align);
3745 __n128 __vld1q_dup_u16_ex(_In_reads_(1) const uint16_t * _PcD, const int _Align);
3746 __n128 __vld1q_dup_u32_ex(_In_reads_(1) const uint32_t * _PcD, const int _Align);
3747 __n64 __vld1_lane_f32(_In_reads_(1) const float32_t * _PcD, __n64 _Dd, const int _Lane);
3748 __n64 __vld1_lane_p16(_In_reads_(1) const poly16_t * _PcD, __n64 _Dd, const int _Lane);
3749 __n64 __vld1_lane_p8(_In_reads_(1) const poly8_t * _PcD, __n64 _Dd, const int _Lane);
3750 __n64 __vld1_lane_s16(_In_reads_(1) const int16_t * _PcD, __n64 _Dd, const int _Lane);
3751 __n64 __vld1_lane_s32(_In_reads_(1) const int32_t * _PcD, __n64 _Dd, const int _Lane);
3752 __n64 __vld1_lane_s8(_In_reads_(1) const int8_t * _PcD, __n64 _Dd, const int _Lane);
3753 __n64 __vld1_lane_u16(_In_reads_(1) const uint16_t * _PcD, __n64 _Dd, const int _Lane);
3754 __n64 __vld1_lane_u32(_In_reads_(1) const uint32_t * _PcD, __n64 _Dd, const int _Lane);
3755 __n64 __vld1_lane_u8(_In_reads_(1) const uint8_t * _PcD, __n64 _Dd, const int _Lane);
3756 __n128 __vld1q_lane_f32(_In_reads_(1) const float32_t * _PcD, __n128 _Qd, const int _Lane);
3757 __n128 __vld1q_lane_p16(_In_reads_(1) const poly16_t * _PcD, __n128 _Qd, const int _Lane);
3758 __n128 __vld1q_lane_p8(_In_reads_(1) const poly8_t * _PcD, __n128 _Qd, const int _Lane);
3759 __n128 __vld1q_lane_s16(_In_reads_(1) const int16_t * _PcD, __n128 _Qd, const int _Lane);
3760 __n128 __vld1q_lane_s32(_In_reads_(1) const int32_t * _PcD, __n128 _Qd, const int _Lane);
3761 __n128 __vld1q_lane_s8(_In_reads_(1) const int8_t * _PcD, __n128 _Qd, const int _Lane);
3762 __n128 __vld1q_lane_u16(_In_reads_(1) const uint16_t * _PcD, __n128 _Qd, const int _Lane);
3763 __n128 __vld1q_lane_u32(_In_reads_(1) const uint32_t * _PcD, __n128 _Qd, const int _Lane);
3764 __n128 __vld1q_lane_u8(_In_reads_(1) const uint8_t * _PcD, __n128 _Qd, const int _Lane);
3765 __n64 __vld1_lane_f32_ex(_In_reads_(1) const float32_t * _PcD, __n64 _Dd, const int _Lane, const int _Align);
3766 __n64 __vld1_lane_p16_ex(_In_reads_(1) const poly16_t * _PcD, __n64 _Dd, const int _Lane, const int _Align);
3767 __n64 __vld1_lane_s16_ex(_In_reads_(1) const int16_t * _PcD, __n64 _Dd, const int _Lane, const int _Align);
3768 __n64 __vld1_lane_s32_ex(_In_reads_(1) const int32_t * _PcD, __n64 _Dd, const int _Lane, const int _Align);
3769 __n64 __vld1_lane_u16_ex(_In_reads_(1) const uint16_t * _PcD, __n64 _Dd, const int _Lane, const int _Align);
3770 __n64 __vld1_lane_u32_ex(_In_reads_(1) const uint32_t * _PcD, __n64 _Dd, const int _Lane, const int _Align);
3771 __n128 __vld1q_lane_f32_ex(_In_reads_(1) const float32_t * _PcD, __n128 _Qd, const int _Lane, const int _Align);
3772 __n128 __vld1q_lane_p16_ex(_In_reads_(1) const poly16_t * _PcD, __n128 _Qd, const int _Lane, const int _Align);
3773 __n128 __vld1q_lane_s16_ex(_In_reads_(1) const int16_t * _PcD, __n128 _Qd, const int _Lane, const int _Align);
3774 __n128 __vld1q_lane_s32_ex(_In_reads_(1) const int32_t * _PcD, __n128 _Qd, const int _Lane, const int _Align);
3775 __n128 __vld1q_lane_u16_ex(_In_reads_(1) const uint16_t * _PcD, __n128 _Qd, const int _Lane, const int _Align);
3776 __n128 __vld1q_lane_u32_ex(_In_reads_(1) const uint32_t * _PcD, __n128 _Qd, const int _Lane, const int _Align);
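// A minimal usage sketch; __example_load_four_floats is a hypothetical helper.
// __vld1q_f32 loads the four consecutive float32 values promised by the
// _In_reads_(4) annotation into one Q register; the _ex variants accept an
// additional alignment hint in _Align.
__inline __n128 __example_load_four_floats(_In_reads_(4) const float32_t * _PcSrc)
{
    return __vld1q_f32(_PcSrc);
}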
3777 __n64x2 __vld2_f32(_In_reads_(4) const float32_t * _PcD);
3778 __n64x2 __vld2_p16(_In_reads_(8) const poly16_t * _PcD);
3779 __n64x2 __vld2_p8(_In_reads_(16) const poly8_t * _PcD);
3780 __n64x2 __vld2_s16(_In_reads_(8) const int16_t * _PcD);
3781 __n64x2 __vld2_s32(_In_reads_(4) const int32_t * _PcD);
3782 __n64x2 __vld2_s8(_In_reads_(16) const int8_t * _PcD);
3783 __n64x2 __vld2_u16(_In_reads_(8) const uint16_t * _PcD);
3784 __n64x2 __vld2_u32(_In_reads_(4) const uint32_t * _PcD);
3785 __n64x2 __vld2_u8(_In_reads_(16) const uint8_t * _PcD);
3786 __n64x2 __vld2_s64(_In_reads_(2) const int64_t * _PcD);
3787 __n64x2 __vld2_u64(_In_reads_(2) const uint64_t * _PcD);
3788 __n64x2 __vld2_s64_ex(_In_reads_(2) const int64_t * _PcD, const int _Align);
3789 __n64x2 __vld2_u64_ex(_In_reads_(2) const uint64_t * _PcD, const int _Align);
3790 __n64x2 __vld2_f32_ex(_In_reads_(4) const float32_t * _PcD, const int _Align);
3791 __n64x2 __vld2_p16_ex(_In_reads_(8) const poly16_t * _PcD, const int _Align);
3792 __n64x2 __vld2_p8_ex(_In_reads_(16) const poly8_t * _PcD, const int _Align);
3793 __n64x2 __vld2_s16_ex(_In_reads_(8) const int16_t * _PcD, const int _Align);
3794 __n64x2 __vld2_s32_ex(_In_reads_(4) const int32_t * _PcD, const int _Align);
3795 __n64x2 __vld2_s8_ex(_In_reads_(16) const int8_t * _PcD, const int _Align);
3796 __n64x2 __vld2_u16_ex(_In_reads_(8) const uint16_t * _PcD, const int _Align);
3797 __n64x2 __vld2_u32_ex(_In_reads_(4) const uint32_t * _PcD, const int _Align);
3798 __n64x2 __vld2_u8_ex(_In_reads_(16) const uint8_t * _PcD, const int _Align);
3799 __n128x2 __vld2q_f32(_In_reads_(8) const float32_t * _PcD);
3800 __n128x2 __vld2q_p16(_In_reads_(16) const poly16_t * _PcD);
3801 __n128x2 __vld2q_p8(_In_reads_(32) const poly8_t * _PcD);
3802 __n128x2 __vld2q_s16(_In_reads_(16) const int16_t * _PcD);
3803 __n128x2 __vld2q_s32(_In_reads_(8) const int32_t * _PcD);
3804 __n128x2 __vld2q_s8(_In_reads_(32) const int8_t * _PcD);
3805 __n128x2 __vld2q_u16(_In_reads_(16) const uint16_t * _PcD);
3806 __n128x2 __vld2q_u32(_In_reads_(8) const uint32_t * _PcD);
3807 __n128x2 __vld2q_u8(_In_reads_(32) const uint8_t * _PcD);
3808 __n128x2 __vld2q_f32_ex(_In_reads_(8) const float32_t * _PcD, const int _Align);
3809 __n128x2 __vld2q_p16_ex(_In_reads_(16) const poly16_t * _PcD, const int _Align);
3810 __n128x2 __vld2q_p8_ex(_In_reads_(32) const poly8_t * _PcD, const int _Align);
3811 __n128x2 __vld2q_s16_ex(_In_reads_(16) const int16_t * _PcD, const int _Align);
3812 __n128x2 __vld2q_s32_ex(_In_reads_(8) const int32_t * _PcD, const int _Align);
3813 __n128x2 __vld2q_s8_ex(_In_reads_(32) const int8_t * _PcD, const int _Align);
3814 __n128x2 __vld2q_u16_ex(_In_reads_(16) const uint16_t * _PcD, const int _Align);
3815 __n128x2 __vld2q_u32_ex(_In_reads_(8) const uint32_t * _PcD, const int _Align);
3816 __n128x2 __vld2q_u8_ex(_In_reads_(32) const uint8_t * _PcD, const int _Align);
3817 __n64x2 __vld2_dup_f32(_In_reads_(2) const float32_t * _PcD);
3818 __n64x2 __vld2_dup_p16(_In_reads_(2) const poly16_t * _PcD);
3819 __n64x2 __vld2_dup_p8(_In_reads_(2) const poly8_t * _PcD);
3820 __n64x2 __vld2_dup_s16(_In_reads_(2) const int16_t * _PcD);
3821 __n64x2 __vld2_dup_s32(_In_reads_(2) const int32_t * _PcD);
3822 __n64x2 __vld2_dup_s8(_In_reads_(2) const int8_t * _PcD);
3823 __n64x2 __vld2_dup_u16(_In_reads_(2) const uint16_t * _PcD);
3824 __n64x2 __vld2_dup_u32(_In_reads_(2) const uint32_t * _PcD);
3825 __n64x2 __vld2_dup_u8(_In_reads_(2) const uint8_t * _PcD);
3826 __n64x2 __vld2_dup_s64(_In_reads_(2) const int64_t * _PcD);
3827 __n64x2 __vld2_dup_u64(_In_reads_(2) const uint64_t * _PcD);
3828 __n64x2 __vld2_dup_s64_ex(_In_reads_(2) const int64_t * _PcD, const int _Align);
3829 __n64x2 __vld2_dup_u64_ex(_In_reads_(2) const uint64_t * _PcD, const int _Align);
3830 __n64x2 __vld2_dup_f32_ex(_In_reads_(2) const float32_t * _PcD, const int _Align);
3831 __n64x2 __vld2_dup_p16_ex(_In_reads_(2) const poly16_t * _PcD, const int _Align);
3832 __n64x2 __vld2_dup_p8_ex(_In_reads_(2) const poly8_t * _PcD, const int _Align);
3833 __n64x2 __vld2_dup_s16_ex(_In_reads_(2) const int16_t * _PcD, const int _Align);
3834 __n64x2 __vld2_dup_s32_ex(_In_reads_(2) const int32_t * _PcD, const int _Align);
3835 __n64x2 __vld2_dup_s8_ex(_In_reads_(2) const int8_t * _PcD, const int _Align);
3836 __n64x2 __vld2_dup_u16_ex(_In_reads_(2) const uint16_t * _PcD, const int _Align);
3837 __n64x2 __vld2_dup_u32_ex(_In_reads_(2) const uint32_t * _PcD, const int _Align);
3838 __n64x2 __vld2_dup_u8_ex(_In_reads_(2) const uint8_t * _PcD, const int _Align);
3839 __n64x2 __vld2_lane_f32(_In_reads_(2) const float32_t * _PcD, __n64x2 _D2, const int _Lane);
3840 __n64x2 __vld2_lane_p16(_In_reads_(2) const poly16_t * _PcD, __n64x2 _D2, const int _Lane);
3841 __n64x2 __vld2_lane_p8(_In_reads_(2) const poly8_t * _PcD, __n64x2 _D2, const int _Lane);
3842 __n64x2 __vld2_lane_s16(_In_reads_(2) const int16_t * _PcD, __n64x2 _D2, const int _Lane);
3843 __n64x2 __vld2_lane_s32(_In_reads_(2) const int32_t * _PcD, __n64x2 _D2, const int _Lane);
3844 __n64x2 __vld2_lane_s8(_In_reads_(2) const int8_t * _PcD, __n64x2 _D2, const int _Lane);
3845 __n64x2 __vld2_lane_u16(_In_reads_(2) const uint16_t * _PcD, __n64x2 _D2, const int _Lane);
3846 __n64x2 __vld2_lane_u32(_In_reads_(2) const uint32_t * _PcD, __n64x2 _D2, const int _Lane);
3847 __n64x2 __vld2_lane_u8(_In_reads_(2) const uint8_t * _PcD, __n64x2 _D2, const int _Lane);
3848 __n128x2 __vld2q_lane_f32(_In_reads_(2) const float32_t * _PcD, __n128x2 _Q2, const int _Lane);
3849 __n128x2 __vld2q_lane_p16(_In_reads_(2) const poly16_t * _PcD, __n128x2 _Q2, const int _Lane);
3850 __n128x2 __vld2q_lane_s16(_In_reads_(2) const int16_t * _PcD, __n128x2 _Q2, const int _Lane);
3851 __n128x2 __vld2q_lane_s32(_In_reads_(2) const int32_t * _PcD, __n128x2 _Q2, const int _Lane);
3852 __n128x2 __vld2q_lane_u16(_In_reads_(2) const uint16_t * _PcD, __n128x2 _Q2, const int _Lane);
3853 __n128x2 __vld2q_lane_u32(_In_reads_(2) const uint32_t * _PcD, __n128x2 _Q2, const int _Lane);
3854 __n64x2 __vld2_lane_f32_ex(_In_reads_(2) const float32_t * _PcD, __n64x2 _D2, const int _Lane, const int _Align);
3855 __n64x2 __vld2_lane_p16_ex(_In_reads_(2) const poly16_t * _PcD, __n64x2 _D2, const int _Lane, const int _Align);
3856 __n64x2 __vld2_lane_p8_ex(_In_reads_(2) const poly8_t * _PcD, __n64x2 _D2, const int _Lane, const int _Align);
3857 __n64x2 __vld2_lane_s16_ex(_In_reads_(2) const int16_t * _PcD, __n64x2 _D2, const int _Lane, const int _Align);
3858 __n64x2 __vld2_lane_s32_ex(_In_reads_(2) const int32_t * _PcD, __n64x2 _D2, const int _Lane, const int _Align);
3859 __n64x2 __vld2_lane_s8_ex(_In_reads_(2) const int8_t * _PcD, __n64x2 _D2, const int _Lane, const int _Align);
3860 __n64x2 __vld2_lane_u16_ex(_In_reads_(2) const uint16_t * _PcD, __n64x2 _D2, const int _Lane, const int _Align);
3861 __n64x2 __vld2_lane_u32_ex(_In_reads_(2) const uint32_t * _PcD, __n64x2 _D2, const int _Lane, const int _Align);
3862 __n64x2 __vld2_lane_u8_ex(_In_reads_(2) const uint8_t * _PcD, __n64x2 _D2, const int _Lane, const int _Align);
3863 __n128x2 __vld2q_lane_f32_ex(_In_reads_(2) const float32_t * _PcD, __n128x2 _Q2, const int _Lane, const int _Align);
3864 __n128x2 __vld2q_lane_p16_ex(_In_reads_(2) const poly16_t * _PcD, __n128x2 _Q2, const int _Lane, const int _Align);
3865 __n128x2 __vld2q_lane_s16_ex(_In_reads_(2) const int16_t * _PcD, __n128x2 _Q2, const int _Lane, const int _Align);
3866 __n128x2 __vld2q_lane_s32_ex(_In_reads_(2) const int32_t * _PcD, __n128x2 _Q2, const int _Lane, const int _Align);
3867 __n128x2 __vld2q_lane_u16_ex(_In_reads_(2) const uint16_t * _PcD, __n128x2 _Q2, const int _Lane, const int _Align);
3868 __n128x2 __vld2q_lane_u32_ex(_In_reads_(2) const uint32_t * _PcD, __n128x2 _Q2, const int _Lane, const int _Align);
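// A minimal usage sketch; __example_deinterleave_stereo_s16 is a hypothetical helper.
// The structured load __vld2_s16 reads eight consecutive int16_t values and
// de-interleaves them two ways, so interleaved stereo samples L0 R0 L1 R1 ...
// come back with the left channel in val[0] and the right channel in val[1].
__inline __n64x2 __example_deinterleave_stereo_s16(_In_reads_(8) const int16_t * _PcSamples)
{
    return __vld2_s16(_PcSamples);
}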
3869 __n64x3 __vld3_f32(_In_reads_(6) const float32_t * _PcD);
3870 __n64x3 __vld3_p16(_In_reads_(12) const poly16_t * _PcD);
3871 __n64x3 __vld3_p8(_In_reads_(24) const poly8_t * _PcD);
3872 __n64x3 __vld3_s16(_In_reads_(12) const int16_t * _PcD);
3873 __n64x3 __vld3_s32(_In_reads_(6) const int32_t * _PcD);
3874 __n64x3 __vld3_s8(_In_reads_(24) const int8_t * _PcD);
3875 __n64x3 __vld3_u16(_In_reads_(12) const uint16_t * _PcD);
3876 __n64x3 __vld3_u32(_In_reads_(6) const uint32_t * _PcD);
3877 __n64x3 __vld3_u8(_In_reads_(24) const uint8_t * _PcD);
3878 __n64x3 __vld3_s64(_In_reads_(3) const int64_t * _PcD);
3879 __n64x3 __vld3_u64(_In_reads_(3) const uint64_t * _PcD);
3880 __n64x3 __vld3_s64_ex(_In_reads_(3) const int64_t * _PcD, const int _Align);
3881 __n64x3 __vld3_u64_ex(_In_reads_(3) const uint64_t * _PcD, const int _Align);
3882 __n64x3 __vld3_f32_ex(_In_reads_(6) const float32_t * _PcD, const int _Align);
3883 __n64x3 __vld3_p16_ex(_In_reads_(12) const poly16_t * _PcD, const int _Align);
3884 __n64x3 __vld3_p8_ex(_In_reads_(24) const poly8_t * _PcD, const int _Align);
3885 __n64x3 __vld3_s16_ex(_In_reads_(12) const int16_t * _PcD, const int _Align);
3886 __n64x3 __vld3_s32_ex(_In_reads_(6) const int32_t * _PcD, const int _Align);
3887 __n64x3 __vld3_s8_ex(_In_reads_(24) const int8_t * _PcD, const int _Align);
3888 __n64x3 __vld3_u16_ex(_In_reads_(12) const uint16_t * _PcD, const int _Align);
3889 __n64x3 __vld3_u32_ex(_In_reads_(6) const uint32_t * _PcD, const int _Align);
3890 __n64x3 __vld3_u8_ex(_In_reads_(24) const uint8_t * _PcD, const int _Align);
3891 __n128x3 __vld3q_f32(_In_reads_(12) const float32_t * _PcD);
3892 __n128x3 __vld3q_p16(_In_reads_(24) const poly16_t * _PcD);
3893 __n128x3 __vld3q_p8(_In_reads_(48) const poly8_t * _PcD);
3894 __n128x3 __vld3q_s16(_In_reads_(24) const int16_t * _PcD);
3895 __n128x3 __vld3q_s32(_In_reads_(12) const int32_t * _PcD);
3896 __n128x3 __vld3q_s8(_In_reads_(48) const int8_t * _PcD);
3897 __n128x3 __vld3q_u16(_In_reads_(24) const uint16_t * _PcD);
3898 __n128x3 __vld3q_u32(_In_reads_(12) const uint32_t * _PcD);
3899 __n128x3 __vld3q_u8(_In_reads_(48) const uint8_t * _PcD);
3900 __n128x3 __vld3q_f32_ex(_In_reads_(12) const float32_t * _PcD, const int _Align);
3901 __n128x3 __vld3q_p16_ex(_In_reads_(24) const poly16_t * _PcD, const int _Align);
3902 __n128x3 __vld3q_p8_ex(_In_reads_(48) const poly8_t * _PcD, const int _Align);
3903 __n128x3 __vld3q_s16_ex(_In_reads_(24) const int16_t * _PcD, const int _Align);
3904 __n128x3 __vld3q_s32_ex(_In_reads_(12) const int32_t * _PcD, const int _Align);
3905 __n128x3 __vld3q_s8_ex(_In_reads_(48) const int8_t * _PcD, const int _Align);
3906 __n128x3 __vld3q_u16_ex(_In_reads_(24) const uint16_t * _PcD, const int _Align);
3907 __n128x3 __vld3q_u32_ex(_In_reads_(12) const uint32_t * _PcD, const int _Align);
3908 __n128x3 __vld3q_u8_ex(_In_reads_(48) const uint8_t * _PcD, const int _Align);
3909 __n64x3 __vld3_dup_f32(_In_reads_(3) const float32_t * _PcD);
3910 __n64x3 __vld3_dup_p16(_In_reads_(3) const poly16_t * _PcD);
3911 __n64x3 __vld3_dup_p8(_In_reads_(3) const poly8_t * _PcD);
3912 __n64x3 __vld3_dup_s16(_In_reads_(3) const int16_t * _PcD);
3913 __n64x3 __vld3_dup_s32(_In_reads_(3) const int32_t * _PcD);
3914 __n64x3 __vld3_dup_s8(_In_reads_(3) const int8_t * _PcD);
3915 __n64x3 __vld3_dup_u16(_In_reads_(3) const uint16_t * _PcD);
3916 __n64x3 __vld3_dup_u32(_In_reads_(3) const uint32_t * _PcD);
3917 __n64x3 __vld3_dup_u8(_In_reads_(3) const uint8_t * _PcD);
3918 __n64x3 __vld3_dup_s64(_In_reads_(3) const int64_t * _PcD);
3919 __n64x3 __vld3_dup_u64(_In_reads_(3) const uint64_t * _PcD);
3920 __n64x3 __vld3_lane_f32(_In_reads_(3) const float32_t * _PcD, __n64x3 _D3, const int _Lane);
3921 __n64x3 __vld3_lane_p16(_In_reads_(3) const poly16_t * _PcD, __n64x3 _D3, const int _Lane);
3922 __n64x3 __vld3_lane_p8(_In_reads_(3) const poly8_t * _PcD, __n64x3 _D3, const int _Lane);
3923 __n64x3 __vld3_lane_s16(_In_reads_(3) const int16_t * _PcD, __n64x3 _D3, const int _Lane);
3924 __n64x3 __vld3_lane_s32(_In_reads_(3) const int32_t * _PcD, __n64x3 _D3, const int _Lane);
3925 __n64x3 __vld3_lane_s8(_In_reads_(3) const int8_t * _PcD, __n64x3 _D3, const int _Lane);
3926 __n64x3 __vld3_lane_u16(_In_reads_(3) const uint16_t * _PcD, __n64x3 _D3, const int _Lane);
3927 __n64x3 __vld3_lane_u32(_In_reads_(3) const uint32_t * _PcD, __n64x3 _D3, const int _Lane);
3928 __n64x3 __vld3_lane_u8(_In_reads_(3) const uint8_t * _PcD, __n64x3 _D3, const int _Lane);
3929 __n128x3 __vld3q_lane_f32(_In_reads_(3) const float32_t * _PcD, __n128x3 _Q3, const int _Lane);
3930 __n128x3 __vld3q_lane_p16(_In_reads_(3) const poly16_t * _PcD, __n128x3 _Q3, const int _Lane);
3931 __n128x3 __vld3q_lane_s16(_In_reads_(3) const int16_t * _PcD, __n128x3 _Q3, const int _Lane);
3932 __n128x3 __vld3q_lane_s32(_In_reads_(3) const int32_t * _PcD, __n128x3 _Q3, const int _Lane);
3933 __n128x3 __vld3q_lane_u16(_In_reads_(3) const uint16_t * _PcD, __n128x3 _Q3, const int _Lane);
3934 __n128x3 __vld3q_lane_u32(_In_reads_(3) const uint32_t * _PcD, __n128x3 _Q3, const int _Lane);
3935 __n64x4 __vld4_f32(_In_reads_(8) const float32_t * _PcD);
3936 __n64x4 __vld4_p16(_In_reads_(16) const poly16_t * _PcD);
3937 __n64x4 __vld4_p8(_In_reads_(32) const poly8_t * _PcD);
3938 __n64x4 __vld4_s16(_In_reads_(16) const int16_t * _PcD);
3939 __n64x4 __vld4_s32(_In_reads_(8) const int32_t * _PcD);
3940 __n64x4 __vld4_s8(_In_reads_(32) const int8_t * _PcD);
3941 __n64x4 __vld4_u16(_In_reads_(16) const uint16_t * _PcD);
3942 __n64x4 __vld4_u32(_In_reads_(8) const uint32_t * _PcD);
3943 __n64x4 __vld4_u8(_In_reads_(32) const uint8_t * _PcD);
3944 __n64x4 __vld4_s64(_In_reads_(4) const int64_t * _PcD);
3945 __n64x4 __vld4_u64(_In_reads_(4) const uint64_t * _PcD);
3946 __n64x4 __vld4_s64_ex(_In_reads_(4) const int64_t * _PcD, const int _Align);
3947 __n64x4 __vld4_u64_ex(_In_reads_(4) const uint64_t * _PcD, const int _Align);
3948 __n64x4 __vld4_f32_ex(_In_reads_(8) const float32_t * _PcD, const int _Align);
3949 __n64x4 __vld4_p16_ex(_In_reads_(16) const poly16_t * _PcD, const int _Align);
3950 __n64x4 __vld4_p8_ex(_In_reads_(32) const poly8_t * _PcD, const int _Align);
3951 __n64x4 __vld4_s16_ex(_In_reads_(16) const int16_t * _PcD, const int _Align);
3952 __n64x4 __vld4_s32_ex(_In_reads_(8) const int32_t * _PcD, const int _Align);
3953 __n64x4 __vld4_s8_ex(_In_reads_(32) const int8_t * _PcD, const int _Align);
3954 __n64x4 __vld4_u16_ex(_In_reads_(16) const uint16_t * _PcD, const int _Align);
3955 __n64x4 __vld4_u32_ex(_In_reads_(8) const uint32_t * _PcD, const int _Align);
3956 __n64x4 __vld4_u8_ex(_In_reads_(32) const uint8_t * _PcD, const int _Align);
3957 __n128x4 __vld4q_f32(_In_reads_(16) const float32_t * _PcD);
3958 __n128x4 __vld4q_p16(_In_reads_(32) const poly16_t * _PcD);
3959 __n128x4 __vld4q_p8(_In_reads_(64) const poly8_t * _PcD);
3960 __n128x4 __vld4q_s16(_In_reads_(32) const int16_t * _PcD);
3961 __n128x4 __vld4q_s32(_In_reads_(16) const int32_t * _PcD);
3962 __n128x4 __vld4q_s8(_In_reads_(64) const int8_t * _PcD);
3963 __n128x4 __vld4q_u16(_In_reads_(32) const uint16_t * _PcD);
3964 __n128x4 __vld4q_u32(_In_reads_(16) const uint32_t * _PcD);
3965 __n128x4 __vld4q_u8(_In_reads_(64) const uint8_t * _PcD);
3966 __n128x4 __vld4q_f32_ex(_In_reads_(16) const float32_t * _PcD, const int _Align);
3967 __n128x4 __vld4q_p16_ex(_In_reads_(32) const poly16_t * _PcD, const int _Align);
3968 __n128x4 __vld4q_p8_ex(_In_reads_(64) const poly8_t * _PcD, const int _Align);
3969 __n128x4 __vld4q_s16_ex(_In_reads_(32) const int16_t * _PcD, const int _Align);
3970 __n128x4 __vld4q_s32_ex(_In_reads_(16) const int32_t * _PcD, const int _Align);
3971 __n128x4 __vld4q_s8_ex(_In_reads_(64) const int8_t * _PcD, const int _Align);
3972 __n128x4 __vld4q_u16_ex(_In_reads_(32) const uint16_t * _PcD, const int _Align);
3973 __n128x4 __vld4q_u32_ex(_In_reads_(16) const uint32_t * _PcD, const int _Align);
3974 __n128x4 __vld4q_u8_ex(_In_reads_(64) const uint8_t * _PcD, const int _Align);
3975 __n64x4 __vld4_dup_f32(_In_reads_(4) const float32_t * _PcD);
3976 __n64x4 __vld4_dup_p16(_In_reads_(4) const poly16_t * _PcD);
3977 __n64x4 __vld4_dup_p8(_In_reads_(4) const poly8_t * _PcD);
3978 __n64x4 __vld4_dup_s16(_In_reads_(4) const int16_t * _PcD);
3979 __n64x4 __vld4_dup_s32(_In_reads_(4) const int32_t * _PcD);
3980 __n64x4 __vld4_dup_s8(_In_reads_(4) const int8_t * _PcD);
3981 __n64x4 __vld4_dup_u16(_In_reads_(4) const uint16_t * _PcD);
3982 __n64x4 __vld4_dup_u32(_In_reads_(4) const uint32_t * _PcD);
3983 __n64x4 __vld4_dup_u8(_In_reads_(4) const uint8_t * _PcD);
3984 __n64x4 __vld4_dup_s64(_In_reads_(4) const int64_t * _PcD);
3985 __n64x4 __vld4_dup_u64(_In_reads_(4) const uint64_t * _PcD);
3986 __n64x4 __vld4_dup_f32_ex(_In_reads_(4) const float32_t * _PcD, const int _Align);
3987 __n64x4 __vld4_dup_p16_ex(_In_reads_(4) const poly16_t * _PcD, const int _Align);
3988 __n64x4 __vld4_dup_p8_ex(_In_reads_(4) const poly8_t * _PcD, const int _Align);
3989 __n64x4 __vld4_dup_s16_ex(_In_reads_(4) const int16_t * _PcD, const int _Align);
3990 __n64x4 __vld4_dup_s32_ex(_In_reads_(4) const int32_t * _PcD, const int _Align);
3991 __n64x4 __vld4_dup_s8_ex(_In_reads_(4) const int8_t * _PcD, const int _Align);
3992 __n64x4 __vld4_dup_u16_ex(_In_reads_(4) const uint16_t * _PcD, const int _Align);
3993 __n64x4 __vld4_dup_u32_ex(_In_reads_(4) const uint32_t * _PcD, const int _Align);
3994 __n64x4 __vld4_dup_u8_ex(_In_reads_(4) const uint8_t * _PcD, const int _Align);
3995 __n64x4 __vld4_lane_f32(_In_reads_(4) const float32_t * _PcD, __n64x4 _D4, const int _Lane);
3996 __n64x4 __vld4_lane_p16(_In_reads_(4) const poly16_t * _PcD, __n64x4 _D4, const int _Lane);
3997 __n64x4 __vld4_lane_p8(_In_reads_(4) const poly8_t * _PcD, __n64x4 _D4, const int _Lane);
3998 __n64x4 __vld4_lane_s16(_In_reads_(4) const int16_t * _PcD, __n64x4 _D4, const int _Lane);
3999 __n64x4 __vld4_lane_s32(_In_reads_(4) const int32_t * _PcD, __n64x4 _D4, const int _Lane);
4000 __n64x4 __vld4_lane_s8(_In_reads_(4) const int8_t * _PcD, __n64x4 _D4, const int _Lane);
4001 __n64x4 __vld4_lane_u16(_In_reads_(4) const uint16_t * _PcD, __n64x4 _D4, const int _Lane);
4002 __n64x4 __vld4_lane_u32(_In_reads_(4) const uint32_t * _PcD, __n64x4 _D4, const int _Lane);
4003 __n64x4 __vld4_lane_u8(_In_reads_(4) const uint8_t * _PcD, __n64x4 _D4, const int _Lane);
4004 __n128x4 __vld4q_lane_f32(_In_reads_(4) const float32_t * _PcD, __n128x4 _Q4, const int _Lane);
4005 __n128x4 __vld4q_lane_p16(_In_reads_(4) const poly16_t * _PcD, __n128x4 _Q4, const int _Lane);
4006 __n128x4 __vld4q_lane_s16(_In_reads_(4) const int16_t * _PcD, __n128x4 _Q4, const int _Lane);
4007 __n128x4 __vld4q_lane_s32(_In_reads_(4) const int32_t * _PcD, __n128x4 _Q4, const int _Lane);
4008 __n128x4 __vld4q_lane_u16(_In_reads_(4) const uint16_t * _PcD, __n128x4 _Q4, const int _Lane);
4009 __n128x4 __vld4q_lane_u32(_In_reads_(4) const uint32_t * _PcD, __n128x4 _Q4, const int _Lane);
4010 __n64x4 __vld4_lane_f32_ex(_In_reads_(4) const float32_t * _PcD, __n64x4 _D4, const int _Lane, const int _Align);
4011 __n64x4 __vld4_lane_p16_ex(_In_reads_(4) const poly16_t * _PcD, __n64x4 _D4, const int _Lane, const int _Align);
4012 __n64x4 __vld4_lane_p8_ex(_In_reads_(4) const poly8_t * _PcD, __n64x4 _D4, const int _Lane, const int _Align);
4013 __n64x4 __vld4_lane_s16_ex(_In_reads_(4) const int16_t * _PcD, __n64x4 _D4, const int _Lane, const int _Align);
4014 __n64x4 __vld4_lane_s32_ex(_In_reads_(4) const int32_t * _PcD, __n64x4 _D4, const int _Lane, const int _Align);
4015 __n64x4 __vld4_lane_s8_ex(_In_reads_(4) const int8_t * _PcD, __n64x4 _D4, const int _Lane, const int _Align);
4016 __n64x4 __vld4_lane_u16_ex(_In_reads_(4) const uint16_t * _PcD, __n64x4 _D4, const int _Lane, const int _Align);
4017 __n64x4 __vld4_lane_u32_ex(_In_reads_(4) const uint32_t * _PcD, __n64x4 _D4, const int _Lane, const int _Align);
4018 __n64x4 __vld4_lane_u8_ex(_In_reads_(4) const uint8_t * _PcD, __n64x4 _D4, const int _Lane, const int _Align);
4019 __n128x4 __vld4q_lane_f32_ex(_In_reads_(4) const float32_t * _PcD, __n128x4 _Q4, const int _Lane, const int _Align);
4020 __n128x4 __vld4q_lane_p16_ex(_In_reads_(4) const poly16_t * _PcD, __n128x4 _Q4, const int _Lane, const int _Align);
4021 __n128x4 __vld4q_lane_s16_ex(_In_reads_(4) const int16_t * _PcD, __n128x4 _Q4, const int _Lane, const int _Align);
4022 __n128x4 __vld4q_lane_s32_ex(_In_reads_(4) const int32_t * _PcD, __n128x4 _Q4, const int _Lane, const int _Align);
4023 __n128x4 __vld4q_lane_u16_ex(_In_reads_(4) const uint16_t * _PcD, __n128x4 _Q4, const int _Lane, const int _Align);
4024 __n128x4 __vld4q_lane_u32_ex(_In_reads_(4) const uint32_t * _PcD, __n128x4 _Q4, const int _Lane, const int _Align);
4025 __n64 __vmax_f32(__n64 _Dn, __n64 _Dm);
4026 __n64 __vmaxnm_f32(__n64 _Dn, __n64 _Dm);
4027 __n64 __vmin_f32(__n64 _Dn, __n64 _Dm);
4028 __n64 __vminnm_f32(__n64 _Dn, __n64 _Dm);
4029 __n128 __vmaxq_f32(__n128 _Qn, __n128 _Qm);
4030 __n128 __vmaxnmq_f32(__n128 _Qn, __n128 _Qm);
4031 __n128 __vminq_f32(__n128 _Qn, __n128 _Qm);
4032 __n128 __vminnmq_f32(__n128 _Qn, __n128 _Qm);
4033 __n64 __vmax_s16(__n64 _Dn, __n64 _Dm);
4034 __n64 __vmax_s32(__n64 _Dn, __n64 _Dm);
4035 __n64 __vmax_s8(__n64 _Dn, __n64 _Dm);
4036 __n64 __vmax_u16(__n64 _Dn, __n64 _Dm);
4037 __n64 __vmax_u32(__n64 _Dn, __n64 _Dm);
4038 __n64 __vmax_u8(__n64 _Dn, __n64 _Dm);
4039 __n64 __vmin_s16(__n64 _Dn, __n64 _Dm);
4040 __n64 __vmin_s32(__n64 _Dn, __n64 _Dm);
4041 __n64 __vmin_s8(__n64 _Dn, __n64 _Dm);
4042 __n64 __vmin_u16(__n64 _Dn, __n64 _Dm);
4043 __n64 __vmin_u32(__n64 _Dn, __n64 _Dm);
4044 __n64 __vmin_u8(__n64 _Dn, __n64 _Dm);
4045 __n128 __vmaxq_s16(__n128 _Qn, __n128 _Qm);
4046 __n128 __vmaxq_s32(__n128 _Qn, __n128 _Qm);
4047 __n128 __vmaxq_s8(__n128 _Qn, __n128 _Qm);
4048 __n128 __vmaxq_u16(__n128 _Qn, __n128 _Qm);
4049 __n128 __vmaxq_u32(__n128 _Qn, __n128 _Qm);
4050 __n128 __vmaxq_u8(__n128 _Qn, __n128 _Qm);
4051 __n128 __vminq_s16(__n128 _Qn, __n128 _Qm);
4052 __n128 __vminq_s32(__n128 _Qn, __n128 _Qm);
4053 __n128 __vminq_s8(__n128 _Qn, __n128 _Qm);
4054 __n128 __vminq_u16(__n128 _Qn, __n128 _Qm);
4055 __n128 __vminq_u32(__n128 _Qn, __n128 _Qm);
4056 __n128 __vminq_u8(__n128 _Qn, __n128 _Qm);
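/*
 * Illustrative sketch (not part of the header): clamping every float lane of a Q
 * register to [lo, hi] with the element-wise max/min intrinsics declared above. The
 * bound vectors and the helper name are assumptions for the example only.
 */
static __inline __n128 clamp_f32(__n128 x, __n128 lo, __n128 hi)
{
    return __vminq_f32(__vmaxq_f32(x, lo), hi);   /* raise to lo, then cap at hi, per lane */
}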
4057 __n64 __vmla_lane_f32(__n64 _Dd, __n64 _Dn, __n64 _Dm, const int _Lane);
4058 __n64 __vmla_lane_s16(__n64 _Dd, __n64 _Dn, __n64 _Dm, const int _Lane);
4059 __n64 __vmla_lane_s32(__n64 _Dd, __n64 _Dn, __n64 _Dm, const int _Lane);
4060 __n64 __vmla_lane_u16(__n64 _Dd, __n64 _Dn, __n64 _Dm, const int _Lane);
4061 __n64 __vmla_lane_u32(__n64 _Dd, __n64 _Dn, __n64 _Dm, const int _Lane);
4062 __n64 __vmls_lane_f32(__n64 _Dd, __n64 _Dn, __n64 _Dm, const int _Lane);
4063 __n64 __vmls_lane_s16(__n64 _Dd, __n64 _Dn, __n64 _Dm, const int _Lane);
4064 __n64 __vmls_lane_s32(__n64 _Dd, __n64 _Dn, __n64 _Dm, const int _Lane);
4065 __n64 __vmls_lane_u16(__n64 _Dd, __n64 _Dn, __n64 _Dm, const int _Lane);
4066 __n64 __vmls_lane_u32(__n64 _Dd, __n64 _Dn, __n64 _Dm, const int _Lane);
4067 __n128 __vmlaq_lane_f32(__n128 _Qd, __n128 _Qn, __n64 _Dm, const int _Lane);
4068 __n128 __vmlaq_lane_s16(__n128 _Qd, __n128 _Qn, __n64 _Dm, const int _Lane);
4069 __n128 __vmlaq_lane_s32(__n128 _Qd, __n128 _Qn, __n64 _Dm, const int _Lane);
4070 __n128 __vmlaq_lane_u16(__n128 _Qd, __n128 _Qn, __n64 _Dm, const int _Lane);
4071 __n128 __vmlaq_lane_u32(__n128 _Qd, __n128 _Qn, __n64 _Dm, const int _Lane);
4072 __n128 __vmlsq_lane_f32(__n128 _Qd, __n128 _Qn, __n64 _Dm, const int _Lane);
4073 __n128 __vmlsq_lane_s16(__n128 _Qd, __n128 _Qn, __n64 _Dm, const int _Lane);
4074 __n128 __vmlsq_lane_s32(__n128 _Qd, __n128 _Qn, __n64 _Dm, const int _Lane);
4075 __n128 __vmlsq_lane_u16(__n128 _Qd, __n128 _Qn, __n64 _Dm, const int _Lane);
4076 __n128 __vmlsq_lane_u32(__n128 _Qd, __n128 _Qn, __n64 _Dm, const int _Lane);
4077 __n64 __vmla_n_f32(__n64 _Dd, __n64 _Dn, float32_t _Ft);
4078 __n64 __vmls_n_f32(__n64 _Dd, __n64 _Dn, float32_t _Ft);
4079 __n128 __vmlaq_n_f32(__n128 _Qd, __n128 _Qn, float32_t _Ft);
4080 __n128 __vmlsq_n_f32(__n128 _Qd, __n128 _Qn, float32_t _Ft);
4081 __n64 __vmla_f32(__n64 _Dd, __n64 _Dn, __n64 _Dm);
4082 __n64 __vmls_f32(__n64 _Dd, __n64 _Dn, __n64 _Dm);
4083 __n128 __vmlaq_f32(__n128 _Qd, __n128 _Qn, __n128 _Qm);
4084 __n128 __vmlsq_f32(__n128 _Qd, __n128 _Qn, __n128 _Qm);
4085 __n64 __vmla_s16(__n64 _Dd, __n64 _Dn, __n64 _Dm);
4086 __n64 __vmla_s32(__n64 _Dd, __n64 _Dn, __n64 _Dm);
4087 __n64 __vmla_s8(__n64 _Dd, __n64 _Dn, __n64 _Dm);
4088 __n64 __vmla_u16(__n64 _Dd, __n64 _Dn, __n64 _Dm);
4089 __n64 __vmla_u32(__n64 _Dd, __n64 _Dn, __n64 _Dm);
4090 __n64 __vmla_u8(__n64 _Dd, __n64 _Dn, __n64 _Dm);
4091 __n64 __vmls_s16(__n64 _Dd, __n64 _Dn, __n64 _Dm);
4092 __n64 __vmls_s32(__n64 _Dd, __n64 _Dn, __n64 _Dm);
4093 __n64 __vmls_s8(__n64 _Dd, __n64 _Dn, __n64 _Dm);
4094 __n64 __vmls_u16(__n64 _Dd, __n64 _Dn, __n64 _Dm);
4095 __n64 __vmls_u32(__n64 _Dd, __n64 _Dn, __n64 _Dm);
4096 __n64 __vmls_u8(__n64 _Dd, __n64 _Dn, __n64 _Dm);
4097 __n128 __vmlaq_s16(__n128 _Qd, __n128 _Qn, __n128 _Qm);
4098 __n128 __vmlaq_s32(__n128 _Qd, __n128 _Qn, __n128 _Qm);
4099 __n128 __vmlaq_s8(__n128 _Qd, __n128 _Qn, __n128 _Qm);
4100 __n128 __vmlaq_u16(__n128 _Qd, __n128 _Qn, __n128 _Qm);
4101 __n128 __vmlaq_u32(__n128 _Qd, __n128 _Qn, __n128 _Qm);
4102 __n128 __vmlaq_u8(__n128 _Qd, __n128 _Qn, __n128 _Qm);
4103 __n128 __vmlsq_s16(__n128 _Qd, __n128 _Qn, __n128 _Qm);
4104 __n128 __vmlsq_s32(__n128 _Qd, __n128 _Qn, __n128 _Qm);
4105 __n128 __vmlsq_s8(__n128 _Qd, __n128 _Qn, __n128 _Qm);
4106 __n128 __vmlsq_u16(__n128 _Qd, __n128 _Qn, __n128 _Qm);
4107 __n128 __vmlsq_u32(__n128 _Qd, __n128 _Qn, __n128 _Qm);
4108 __n128 __vmlsq_u8(__n128 _Qd, __n128 _Qn, __n128 _Qm);
4109 __n128 __vmlal_s16(__n128 _Qd, __n64 _Dn, __n64 _Dm);
4110 __n128 __vmlal_s32(__n128 _Qd, __n64 _Dn, __n64 _Dm);
4111 __n128 __vmlal_s8(__n128 _Qd, __n64 _Dn, __n64 _Dm);
4112 __n128 __vmlal_u16(__n128 _Qd, __n64 _Dn, __n64 _Dm);
4113 __n128 __vmlal_u32(__n128 _Qd, __n64 _Dn, __n64 _Dm);
4114 __n128 __vmlal_u8(__n128 _Qd, __n64 _Dn, __n64 _Dm);
4115 __n128 __vmlsl_s16(__n128 _Qd, __n64 _Dn, __n64 _Dm);
4116 __n128 __vmlsl_s32(__n128 _Qd, __n64 _Dn, __n64 _Dm);
4117 __n128 __vmlsl_s8(__n128 _Qd, __n64 _Dn, __n64 _Dm);
4118 __n128 __vmlsl_u16(__n128 _Qd, __n64 _Dn, __n64 _Dm);
4119 __n128 __vmlsl_u32(__n128 _Qd, __n64 _Dn, __n64 _Dm);
4120 __n128 __vmlsl_u8(__n128 _Qd, __n64 _Dn, __n64 _Dm);
4121 __n128 __vmlal_lane_s16(__n128 _Qd, __n64 _Dn, __n64 _Dm, const int _Lane);
4122 __n128 __vmlal_lane_s32(__n128 _Qd, __n64 _Dn, __n64 _Dm, const int _Lane);
4123 __n128 __vmlal_lane_u16(__n128 _Qd, __n64 _Dn, __n64 _Dm, const int _Lane);
4124 __n128 __vmlal_lane_u32(__n128 _Qd, __n64 _Dn, __n64 _Dm, const int _Lane);
4125 __n128 __vmlsl_lane_s16(__n128 _Qd, __n64 _Dn, __n64 _Dm, const int _Lane);
4126 __n128 __vmlsl_lane_s32(__n128 _Qd, __n64 _Dn, __n64 _Dm, const int _Lane);
4127 __n128 __vmlsl_lane_u16(__n128 _Qd, __n64 _Dn, __n64 _Dm, const int _Lane);
4128 __n128 __vmlsl_lane_u32(__n128 _Qd, __n64 _Dn, __n64 _Dm, const int _Lane);
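/*
 * Illustrative sketch (not part of the header): __vmlal_s16 widens each 16-bit product
 * to 32 bits before adding it to the accumulator, so dot-product style partial sums do
 * not overflow at 16 bits. The accumulator and inputs are hypothetical.
 */
static __inline __n128 accumulate_products_s16(__n128 acc, __n64 a, __n64 b)
{
    return __vmlal_s16(acc, a, b);   /* acc lane i (s32) += (s32)a lane i * b lane i (s16) */
}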
4129 __n64 __vset_lane_f32(float32_t _Ft, __n64 _Dd, const int _Lane);
4130 __n64 __vset_lane_p16(poly16_t _Rt, __n64 _Dd, const int _Lane);
4131 __n64 __vset_lane_p8(poly8_t _Rt, __n64 _Dd, const int _Lane);
4132 __n64 __vset_lane_s16(int16_t _Rt, __n64 _Dd, const int _Lane);
4133 __n64 __vset_lane_s32(int32_t _Rt, __n64 _Dd, const int _Lane);
4134 __n64 __vset_lane_s8(int8_t _Rt, __n64 _Dd, const int _Lane);
4135 __n64 __vset_lane_u16(uint16_t _Rt, __n64 _Dd, const int _Lane);
4136 __n64 __vset_lane_u32(uint32_t _Rt, __n64 _Dd, const int _Lane);
4137 __n64 __vset_lane_u8(uint8_t _Rt, __n64 _Dd, const int _Lane);
4138 float32_t __vget_lane_f32(__n64 _Dm, const int _Lane);
4139 poly16_t __vget_lane_p16(__n64 _Dm, const int _Lane);
4140 poly8_t __vget_lane_p8(__n64 _Dm, const int _Lane);
4141 int16_t __vget_lane_s16(__n64 _Dm, const int _Lane);
4142 int8_t __vget_lane_s8(__n64 _Dm, const int _Lane);
4143 int32_t __vget_lane_s32(__n64 _Dm, const int _Lane);
4144 uint16_t __vget_lane_u16(__n64 _Dm, const int _Lane);
4145 uint8_t __vget_lane_u8(__n64 _Dm, const int _Lane);
4146 uint32_t __vget_lane_u32(__n64 _Dm, const int _Lane);
4147 __n64 __vset_lane_s64(int64_t _R64t, __n64 _Dd, const int _Lane);
4148 __n64 __vset_lane_u64(uint64_t _R64t, __n64 _Dd, const int _Lane);
4149 __n128 __vsetq_lane_s64(int64_t _R64t, __n128 _Qd, const int _Lane);
4150 __n128 __vsetq_lane_u64(uint64_t _R64t, __n128 _Qd, const int _Lane);
4151 int64_t __vget_lane_s64(__n64 _Dm, const int _Lane);
4152 uint64_t __vget_lane_u64(__n64 _Dm, const int _Lane);
4153 int64_t __vgetq_lane_s64(__n128 _Qm, const int _Lane);
4154 uint64_t __vgetq_lane_u64(__n128 _Qm, const int _Lane);
4155 __n128 __vsetq_lane_f32(float32_t _Ft, __n128 _Qd, const int _Lane);
4156 __n128 __vsetq_lane_p16(poly16_t _Rt, __n128 _Qd, const int _Lane);
4157 __n128 __vsetq_lane_p8(poly8_t _Rt, __n128 _Qd, const int _Lane);
4158 __n128 __vsetq_lane_s16(int16_t _Rt, __n128 _Qd, const int _Lane);
4159 __n128 __vsetq_lane_s32(int32_t _Rt, __n128 _Qd, const int _Lane);
4160 __n128 __vsetq_lane_s8(int8_t _Rt, __n128 _Qd, const int _Lane);
4161 __n128 __vsetq_lane_u16(uint16_t _Rt, __n128 _Qd, const int _Lane);
4162 __n128 __vsetq_lane_u32(uint32_t _Rt, __n128 _Qd, const int _Lane);
4163 __n128 __vsetq_lane_u8(uint8_t _Rt, __n128 _Qd, const int _Lane);
4164 float32_t __vgetq_lane_f32(__n128 _Qm, const int _Lane);
4165 poly16_t __vgetq_lane_p16(__n128 _Qm, const int _Lane);
4166 poly8_t __vgetq_lane_p8(__n128 _Qm, const int _Lane);
4167 int16_t __vgetq_lane_s16(__n128 _Qm, const int _Lane);
4168 int8_t __vgetq_lane_s8(__n128 _Qm, const int _Lane);
4169 int32_t __vgetq_lane_s32(__n128 _Qm, const int _Lane);
4170 uint16_t __vgetq_lane_u16(__n128 _Qm, const int _Lane);
4171 uint8_t __vgetq_lane_u8(__n128 _Qm, const int _Lane);
4172 uint32_t __vgetq_lane_u32(__n128 _Qm, const int _Lane);
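/*
 * Illustrative sketch (not part of the header): building a D register one lane at a
 * time with __vset_lane_u16 and reading a lane back with __vget_lane_u16. The
 * zero-initialized starting value and the constants are hypothetical.
 */
static __inline uint16_t set_then_get_example(void)
{
    __n64 v = { 0 };                       /* all lanes start at zero */
    v = __vset_lane_u16(0x1234, v, 0);     /* write lane 0            */
    v = __vset_lane_u16(0xABCD, v, 3);     /* write lane 3            */
    return __vget_lane_u16(v, 3);          /* returns 0xABCD          */
}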
4173 __n128 __vmovl_s16(__n64 _Dm);
4174 __n128 __vmovl_s32(__n64 _Dm);
4175 __n128 __vmovl_s8(__n64 _Dm);
4176 __n128 __vmovl_u16(__n64 _Dm);
4177 __n128 __vmovl_u32(__n64 _Dm);
4178 __n128 __vmovl_u8(__n64 _Dm);
4179 __n64 __vmovn_s16(__n128 _Qm);
4180 __n64 __vmovn_s32(__n128 _Qm);
4181 __n64 __vmovn_s64(__n128 _Qm);
4182 __n64 __vmovn_u16(__n128 _Qm);
4183 __n64 __vmovn_u32(__n128 _Qm);
4184 __n64 __vmovn_u64(__n128 _Qm);
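/*
 * Illustrative sketch (not part of the header): __vmovl_u8 zero-extends eight 8-bit
 * lanes to eight 16-bit lanes, and __vmovn_u16 truncates them back. A widen / process /
 * narrow round-trip is the usual pattern; the helper name is an assumption.
 */
static __inline __n64 widen_then_narrow_u8(__n64 bytes)
{
    __n128 wide = __vmovl_u8(bytes);   /* u8x8  -> u16x8, no data loss        */
    return __vmovn_u16(wide);          /* u16x8 -> u8x8, keeps the low bytes  */
}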
4185 __n64 __vmul_f32(__n64 _Dn, __n64 _Dm);
4186 __n64 __vmul_p8(__n64 _Dn, __n64 _Dm);
4187 __n64 __vmul_s16(__n64 _Dn, __n64 _Dm);
4188 __n64 __vmul_s32(__n64 _Dn, __n64 _Dm);
4189 __n64 __vmul_s8(__n64 _Dn, __n64 _Dm);
4190 __n64 __vmul_u16(__n64 _Dn, __n64 _Dm);
4191 __n64 __vmul_u32(__n64 _Dn, __n64 _Dm);
4192 __n64 __vmul_u8(__n64 _Dn, __n64 _Dm);
4193 __n128 __vmulq_f32(__n128 _Qn, __n128 _Qm);
4194 __n128 __vmulq_p8(__n128 _Qn, __n128 _Qm);
4195 __n128 __vmulq_s16(__n128 _Qn, __n128 _Qm);
4196 __n128 __vmulq_s32(__n128 _Qn, __n128 _Qm);
4197 __n128 __vmulq_s8(__n128 _Qn, __n128 _Qm);
4198 __n128 __vmulq_u16(__n128 _Qn, __n128 _Qm);
4199 __n128 __vmulq_u32(__n128 _Qn, __n128 _Qm);
4200 __n128 __vmulq_u8(__n128 _Qn, __n128 _Qm);
4201 __n64 __vmul_n_f32(__n64 _Dn, float32_t _Ft);
4202 __n128 __vmulq_n_f32(__n128 _Qn, float32_t _Ft);
4203 __n64 __vmul_lane_f32(__n64 _Dn, __n64 _Dm, const int _Lane);
4204 __n64 __vmul_lane_s16(__n64 _Dn, __n64 _Dm, const int _Lane);
4205 __n64 __vmul_lane_s32(__n64 _Dn, __n64 _Dm, const int _Lane);
4206 __n64 __vmul_lane_u16(__n64 _Dn, __n64 _Dm, const int _Lane);
4207 __n64 __vmul_lane_u32(__n64 _Dn, __n64 _Dm, const int _Lane);
4208 __n128 __vmulq_lane_f32(__n128 _Qn, __n64 _Dm, const int _Lane);
4209 __n128 __vmulq_lane_s16(__n128 _Qn, __n64 _Dm, const int _Lane);
4210 __n128 __vmulq_lane_s32(__n128 _Qn, __n64 _Dm, const int _Lane);
4211 __n128 __vmulq_lane_u16(__n128 _Qn, __n64 _Dm, const int _Lane);
4212 __n128 __vmulq_lane_u32(__n128 _Qn, __n64 _Dm, const int _Lane);
4213 __n128 __vmull_p64(__n64 _Dn, __n64 _Dm);
4214 __n128 __vmull_p8(__n64 _Dn, __n64 _Dm);
4215 __n128 __vmull_s16(__n64 _Dn, __n64 _Dm);
4216 __n128 __vmull_s32(__n64 _Dn, __n64 _Dm);
4217 __n128 __vmull_s8(__n64 _Dn, __n64 _Dm);
4218 __n128 __vmull_u16(__n64 _Dn, __n64 _Dm);
4219 __n128 __vmull_u32(__n64 _Dn, __n64 _Dm);
4220 __n128 __vmull_u8(__n64 _Dn, __n64 _Dm);
4221 __n128 __vmull_lane_s16(__n64 _Dn, __n64 _Dm, const int _Lane);
4222 __n128 __vmull_lane_s32(__n64 _Dn, __n64 _Dm, const int _Lane);
4223 __n128 __vmull_lane_u16(__n64 _Dn, __n64 _Dm, const int _Lane);
4224 __n128 __vmull_lane_u32(__n64 _Dn, __n64 _Dm, const int _Lane);
4225 __n64 __vmvn_p16(__n64 _Dm);
4226 __n64 __vmvn_p8(__n64 _Dm);
4227 __n64 __vmvn_s16(__n64 _Dm);
4228 __n64 __vmvn_s32(__n64 _Dm);
4229 __n64 __vmvn_s8(__n64 _Dm);
4230 __n64 __vmvn_u16(__n64 _Dm);
4231 __n64 __vmvn_u32(__n64 _Dm);
4232 __n64 __vmvn_u8(__n64 _Dm);
4233 __n128 __vmvnq_p16(__n128 _Qm);
4234 __n128 __vmvnq_p8(__n128 _Qm);
4235 __n128 __vmvnq_s16(__n128 _Qm);
4236 __n128 __vmvnq_s32(__n128 _Qm);
4237 __n128 __vmvnq_s8(__n128 _Qm);
4238 __n128 __vmvnq_u16(__n128 _Qm);
4239 __n128 __vmvnq_u32(__n128 _Qm);
4240 __n128 __vmvnq_u8(__n128 _Qm);
4241 __n64 __vpadal_s16(__n64 _Dd, __n64 _Dm);
4242 __n64 __vpadal_s32(__n64 _Dd, __n64 _Dm);
4243 __n64 __vpadal_s8(__n64 _Dd, __n64 _Dm);
4244 __n64 __vpadal_u16(__n64 _Dd, __n64 _Dm);
4245 __n64 __vpadal_u32(__n64 _Dd, __n64 _Dm);
4246 __n64 __vpadal_u8(__n64 _Dd, __n64 _Dm);
4247 __n128 __vpadalq_s16(__n128 _Qd, __n128 _Qm);
4248 __n128 __vpadalq_s32(__n128 _Qd, __n128 _Qm);
4249 __n128 __vpadalq_s8(__n128 _Qd, __n128 _Qm);
4250 __n128 __vpadalq_u16(__n128 _Qd, __n128 _Qm);
4251 __n128 __vpadalq_u32(__n128 _Qd, __n128 _Qm);
4252 __n128 __vpadalq_u8(__n128 _Qd, __n128 _Qm);
4253 __n64 __vpadd_f32(__n64 _Dn, __n64 _Dm);
4254 __n64 __vpadd_s16(__n64 _Dn, __n64 _Dm);
4255 __n64 __vpadd_s32(__n64 _Dn, __n64 _Dm);
4256 __n64 __vpadd_s8(__n64 _Dn, __n64 _Dm);
4257 __n64 __vpadd_u16(__n64 _Dn, __n64 _Dm);
4258 __n64 __vpadd_u32(__n64 _Dn, __n64 _Dm);
4259 __n64 __vpadd_u8(__n64 _Dn, __n64 _Dm);
4260 __n64 __vpaddl_s16(__n64 _Dm);
4261 __n64 __vpaddl_s32(__n64 _Dm);
4262 __n64 __vpaddl_s8(__n64 _Dm);
4263 __n64 __vpaddl_u16(__n64 _Dm);
4264 __n64 __vpaddl_u32(__n64 _Dm);
4265 __n64 __vpaddl_u8(__n64 _Dm);
4266 __n128 __vpaddlq_s16(__n128 _Qm);
4267 __n128 __vpaddlq_s32(__n128 _Qm);
4268 __n128 __vpaddlq_s8(__n128 _Qm);
4269 __n128 __vpaddlq_u16(__n128 _Qm);
4270 __n128 __vpaddlq_u32(__n128 _Qm);
4271 __n128 __vpaddlq_u8(__n128 _Qm);
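/*
 * Illustrative sketch (not part of the header): reducing eight unsigned bytes to a
 * single sum with the pairwise-add intrinsics above. Each step halves the lane count
 * while doubling the lane width, so no intermediate sum can overflow.
 */
static __inline uint32_t horizontal_sum_u8(__n64 bytes)
{
    __n64 sums16 = __vpaddl_u8(bytes);     /* u8x8  -> u16x4 pairwise sums */
    __n64 sums32 = __vpaddl_u16(sums16);   /* u16x4 -> u32x2 pairwise sums */
    return __vget_lane_u32(sums32, 0) + __vget_lane_u32(sums32, 1);
}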
4272 __n64 __vpmax_f32(__n64 _Dn, __n64 _Dm);
4273 __n64 __vpmin_f32(__n64 _Dn, __n64 _Dm);
4274 __n64 __vpmax_s16(__n64 _Dn, __n64 _Dm);
4275 __n64 __vpmax_s32(__n64 _Dn, __n64 _Dm);
4276 __n64 __vpmax_s8(__n64 _Dn, __n64 _Dm);
4277 __n64 __vpmax_u16(__n64 _Dn, __n64 _Dm);
4278 __n64 __vpmax_u32(__n64 _Dn, __n64 _Dm);
4279 __n64 __vpmax_u8(__n64 _Dn, __n64 _Dm);
4280 __n64 __vpmin_s16(__n64 _Dn, __n64 _Dm);
4281 __n64 __vpmin_s32(__n64 _Dn, __n64 _Dm);
4282 __n64 __vpmin_s8(__n64 _Dn, __n64 _Dm);
4283 __n64 __vpmin_u16(__n64 _Dn, __n64 _Dm);
4284 __n64 __vpmin_u32(__n64 _Dn, __n64 _Dm);
4285 __n64 __vpmin_u8(__n64 _Dn, __n64 _Dm);
4286 __n64 __vqabs_s16(__n64 _Dm);
4287 __n64 __vqabs_s32(__n64 _Dm);
4288 __n64 __vqabs_s8(__n64 _Dm);
4289 __n64 __vqneg_s16(__n64 _Dm);
4290 __n64 __vqneg_s32(__n64 _Dm);
4291 __n64 __vqneg_s8(__n64 _Dm);
4292 __n128 __vqabsq_s16(__n128 _Qm);
4293 __n128 __vqabsq_s32(__n128 _Qm);
4294 __n128 __vqabsq_s8(__n128 _Qm);
4295 __n128 __vqnegq_s16(__n128 _Qm);
4296 __n128 __vqnegq_s32(__n128 _Qm);
4297 __n128 __vqnegq_s8(__n128 _Qm);
4298 __n64 __vqadd_s16(__n64 _Dn, __n64 _Dm);
4299 __n64 __vqadd_s32(__n64 _Dn, __n64 _Dm);
4300 __n64 __vqadd_s64(__n64 _Dn, __n64 _Dm);
4301 __n64 __vqadd_s8(__n64 _Dn, __n64 _Dm);
4302 __n64 __vqadd_u16(__n64 _Dn, __n64 _Dm);
4303 __n64 __vqadd_u32(__n64 _Dn, __n64 _Dm);
4304 __n64 __vqadd_u64(__n64 _Dn, __n64 _Dm);
4305 __n64 __vqadd_u8(__n64 _Dn, __n64 _Dm);
4306 __n128 __vqaddq_s16(__n128 _Qn, __n128 _Qm);
4307 __n128 __vqaddq_s32(__n128 _Qn, __n128 _Qm);
4308 __n128 __vqaddq_s64(__n128 _Qn, __n128 _Qm);
4309 __n128 __vqaddq_s8(__n128 _Qn, __n128 _Qm);
4310 __n128 __vqaddq_u16(__n128 _Qn, __n128 _Qm);
4311 __n128 __vqaddq_u32(__n128 _Qn, __n128 _Qm);
4312 __n128 __vqaddq_u64(__n128 _Qn, __n128 _Qm);
4313 __n128 __vqaddq_u8(__n128 _Qn, __n128 _Qm);
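/*
 * Illustrative sketch (not part of the header): __vqaddq_u8 adds sixteen byte lanes
 * with unsigned saturation, so 200 + 100 yields 255 instead of wrapping to 44 - the
 * usual choice for brightness adjustment and similar pixel math. Names are examples.
 */
static __inline __n128 brighten_saturating(__n128 pixels, __n128 delta)
{
    return __vqaddq_u8(pixels, delta);   /* each lane clamped to 255 */
}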
4314 __n128 __vqdmlal_s16(__n128 _Qd, __n64 _Dn, __n64 _Dm);
4315 __n128 __vqdmlal_s32(__n128 _Qd, __n64 _Dn, __n64 _Dm);
4316 __n128 __vqdmlsl_s16(__n128 _Qd, __n64 _Dn, __n64 _Dm);
4317 __n128 __vqdmlsl_s32(__n128 _Qd, __n64 _Dn, __n64 _Dm);
4318 __n128 __vqdmlal_lane_s16(__n128 _Qd, __n64 _Dn, __n64 _Dm, const int _Lane);
4319 __n128 __vqdmlal_lane_s32(__n128 _Qd, __n64 _Dn, __n64 _Dm, const int _Lane);
4320 __n128 __vqdmlsl_lane_s16(__n128 _Qd, __n64 _Dn, __n64 _Dm, const int _Lane);
4321 __n128 __vqdmlsl_lane_s32(__n128 _Qd, __n64 _Dn, __n64 _Dm, const int _Lane);
4322 __n64 __vqdmulh_lane_s16(__n64 _Dn, __n64 _Dm, const int _Lane);
4323 __n64 __vqdmulh_lane_s32(__n64 _Dn, __n64 _Dm, const int _Lane);
4324 __n64 __vqrdmulh_lane_s16(__n64 _Dn, __n64 _Dm, const int _Lane);
4325 __n64 __vqrdmulh_lane_s32(__n64 _Dn, __n64 _Dm, const int _Lane);
4326 __n128 __vqdmulhq_lane_s16(__n128 _Qn, __n64 _Dm, const int _Lane);
4327 __n128 __vqdmulhq_lane_s32(__n128 _Qn, __n64 _Dm, const int _Lane);
4328 __n128 __vqrdmulhq_lane_s16(__n128 _Qn, __n64 _Dm, const int _Lane);
4329 __n128 __vqrdmulhq_lane_s32(__n128 _Qn, __n64 _Dm, const int _Lane);
4330 __n64 __vqdmulh_s16(__n64 _Dn, __n64 _Dm);
4331 __n64 __vqdmulh_s32(__n64 _Dn, __n64 _Dm);
4332 __n64 __vqrdmulh_s16(__n64 _Dn, __n64 _Dm);
4333 __n64 __vqrdmulh_s32(__n64 _Dn, __n64 _Dm);
4334 __n128 __vqdmulhq_s16(__n128 _Qn, __n128 _Qm);
4335 __n128 __vqdmulhq_s32(__n128 _Qn, __n128 _Qm);
4336 __n128 __vqrdmulhq_s16(__n128 _Qn, __n128 _Qm);
4337 __n128 __vqrdmulhq_s32(__n128 _Qn, __n128 _Qm);
4338 __n128 __vqdmull_s16(__n64 _Dn, __n64 _Dm);
4339 __n128 __vqdmull_s32(__n64 _Dn, __n64 _Dm);
4340 __n128 __vqdmull_lane_s16(__n64 _Dn, __n64 _Dm, const int _Lane);
4341 __n128 __vqdmull_lane_s32(__n64 _Dn, __n64 _Dm, const int _Lane);
4342 __n64 __vqmovn_s16(__n128 _Qm);
4343 __n64 __vqmovn_s32(__n128 _Qm);
4344 __n64 __vqmovn_s64(__n128 _Qm);
4345 __n64 __vqmovn_u16(__n128 _Qm);
4346 __n64 __vqmovn_u32(__n128 _Qm);
4347 __n64 __vqmovn_u64(__n128 _Qm);
4348 __n64 __vqmovun_s16(__n128 _Qm);
4349 __n64 __vqmovun_s32(__n128 _Qm);
4350 __n64 __vqmovun_s64(__n128 _Qm);
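/*
 * Illustrative sketch (not part of the header): __vqmovun_s16 narrows signed 16-bit
 * lanes to unsigned 8-bit lanes, clamping negatives to 0 and values above 255 to 255 -
 * typically the last step when converting intermediate results back to pixel data.
 */
static __inline __n64 to_u8_saturating(__n128 wide_s16)
{
    return __vqmovun_s16(wide_s16);   /* s16x8 -> u8x8 with saturation */
}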
4351 __n64 __vqshl_n_s16(__n64 _Dm, const int _Shift_amount);
4352 __n64 __vqshl_n_s32(__n64 _Dm, const int _Shift_amount);
4353 __n64 __vqshl_n_s64(__n64 _Dm, const int _Shift_amount);
4354 __n64 __vqshl_n_s8(__n64 _Dm, const int _Shift_amount);
4355 __n64 __vqshl_n_u16(__n64 _Dm, const int _Shift_amount);
4356 __n64 __vqshl_n_u32(__n64 _Dm, const int _Shift_amount);
4357 __n64 __vqshl_n_u64(__n64 _Dm, const int _Shift_amount);
4358 __n64 __vqshl_n_u8(__n64 _Dm, const int _Shift_amount);
4359 __n64 __vqshlu_n_s16(__n64 _Dm, const int _Shift_amount);
4360 __n64 __vqshlu_n_s32(__n64 _Dm, const int _Shift_amount);
4361 __n64 __vqshlu_n_s64(__n64 _Dm, const int _Shift_amount);
4362 __n64 __vqshlu_n_s8(__n64 _Dm, const int _Shift_amount);
4363 __n128 __vqshlq_n_s16(__n128 _Qm, const int _Shift_amount);
4364 __n128 __vqshlq_n_s32(__n128 _Qm, const int _Shift_amount);
4365 __n128 __vqshlq_n_s64(__n128 _Qm, const int _Shift_amount);
4366 __n128 __vqshlq_n_s8(__n128 _Qm, const int _Shift_amount);
4367 __n128 __vqshlq_n_u16(__n128 _Qm, const int _Shift_amount);
4368 __n128 __vqshlq_n_u32(__n128 _Qm, const int _Shift_amount);
4369 __n128 __vqshlq_n_u64(__n128 _Qm, const int _Shift_amount);
4370 __n128 __vqshlq_n_u8(__n128 _Qm, const int _Shift_amount);
4371 __n128 __vqshluq_n_s16(__n128 _Qm, const int _Shift_amount);
4372 __n128 __vqshluq_n_s32(__n128 _Qm, const int _Shift_amount);
4373 __n128 __vqshluq_n_s64(__n128 _Qm, const int _Shift_amount);
4374 __n128 __vqshluq_n_s8(__n128 _Qm, const int _Shift_amount);
4375 __n64 __vqrshrn_n_s16(__n128 _Qm, const int _Shift_amount);
4376 __n64 __vqrshrn_n_s32(__n128 _Qm, const int _Shift_amount);
4377 __n64 __vqrshrn_n_s64(__n128 _Qm, const int _Shift_amount);
4378 __n64 __vqrshrn_n_u16(__n128 _Qm, const int _Shift_amount);
4379 __n64 __vqrshrn_n_u32(__n128 _Qm, const int _Shift_amount);
4380 __n64 __vqrshrn_n_u64(__n128 _Qm, const int _Shift_amount);
4381 __n64 __vqrshrun_n_s16(__n128 _Qm, const int _Shift_amount);
4382 __n64 __vqrshrun_n_s32(__n128 _Qm, const int _Shift_amount);
4383 __n64 __vqrshrun_n_s64(__n128 _Qm, const int _Shift_amount);
4384 __n64 __vqshrn_n_s16(__n128 _Qm, const int _Shift_amount);
4385 __n64 __vqshrn_n_s32(__n128 _Qm, const int _Shift_amount);
4386 __n64 __vqshrn_n_s64(__n128 _Qm, const int _Shift_amount);
4387 __n64 __vqshrn_n_u16(__n128 _Qm, const int _Shift_amount);
4388 __n64 __vqshrn_n_u32(__n128 _Qm, const int _Shift_amount);
4389 __n64 __vqshrn_n_u64(__n128 _Qm, const int _Shift_amount);
4390 __n64 __vqshrun_n_s16(__n128 _Qm, const int _Shift_amount);
4391 __n64 __vqshrun_n_s32(__n128 _Qm, const int _Shift_amount);
4392 __n64 __vqshrun_n_s64(__n128 _Qm, const int _Shift_amount);
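/*
 * Illustrative sketch (not part of the header): __vqrshrn_n_s32 shifts each 32-bit lane
 * right by a constant with rounding, then narrows to 16 bits with saturation - a common
 * way to rescale a Q15-style fixed-point product. The shift amount of 15 is only an
 * example; the immediate must be a compile-time constant in the valid range.
 */
static __inline __n64 rescale_q15(__n128 products_s32)
{
    return __vqrshrn_n_s32(products_s32, 15);   /* round, shift right by 15, saturate to s16 */
}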
4393 __n64 __vqsub_s16(__n64 _Dn, __n64 _Dm);
4394 __n64 __vqsub_s32(__n64 _Dn, __n64 _Dm);
4395 __n64 __vqsub_s64(__n64 _Dn, __n64 _Dm);
4396 __n64 __vqsub_s8(__n64 _Dn, __n64 _Dm);
4397 __n64 __vqsub_u16(__n64 _Dn, __n64 _Dm);
4398 __n64 __vqsub_u32(__n64 _Dn, __n64 _Dm);
4399 __n64 __vqsub_u64(__n64 _Dn, __n64 _Dm);
4400 __n64 __vqsub_u8(__n64 _Dn, __n64 _Dm);
4401 __n128 __vqsubq_s16(__n128 _Qn, __n128 _Qm);
4402 __n128 __vqsubq_s32(__n128 _Qn, __n128 _Qm);
4403 __n128 __vqsubq_s64(__n128 _Qn, __n128 _Qm);
4404 __n128 __vqsubq_s8(__n128 _Qn, __n128 _Qm);
4405 __n128 __vqsubq_u16(__n128 _Qn, __n128 _Qm);
4406 __n128 __vqsubq_u32(__n128 _Qn, __n128 _Qm);
4407 __n128 __vqsubq_u64(__n128 _Qn, __n128 _Qm);
4408 __n128 __vqsubq_u8(__n128 _Qn, __n128 _Qm);
4409 __n64 __vrecpe_f32(__n64 _Dm);
4410 __n64 __vrecpe_u32(__n64 _Dm);
4411 __n64 __vrsqrte_f32(__n64 _Dm);
4412 __n64 __vrsqrte_u32(__n64 _Dm);
4413 __n128 __vrecpeq_f32(__n128 _Qm);
4414 __n128 __vrecpeq_u32(__n128 _Qm);
4415 __n128 __vrsqrteq_f32(__n128 _Qm);
4416 __n128 __vrsqrteq_u32(__n128 _Qm);
4417 __n64 __vrecps_f32(__n64 _Dn, __n64 _Dm);
4418 __n128 __vrecpsq_f32(__n128 _Qn, __n128 _Qm);
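/*
 * Illustrative sketch (not part of the header): NEON has no vector divide, so a
 * reciprocal is usually formed from the low-precision estimate __vrecpeq_f32 refined by
 * Newton-Raphson steps. __vrecpsq_f32(d, x) computes (2 - d*x), so x * (2 - d*x) is one
 * refinement step; two steps give roughly single-precision accuracy.
 */
static __inline __n128 reciprocal_f32(__n128 d)
{
    __n128 x = __vrecpeq_f32(d);                 /* initial estimate of 1/d   */
    x = __vmulq_f32(x, __vrecpsq_f32(d, x));     /* first Newton-Raphson step */
    x = __vmulq_f32(x, __vrecpsq_f32(d, x));     /* second refinement step    */
    return x;
}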
4419 __n64 __vrev16_p8(__n64 _Dm);
4420 __n64 __vrev16_s8(__n64 _Dm);
4421 __n64 __vrev16_u8(__n64 _Dm);
4422 __n64 __vrev32_p16(__n64 _Dm);
4423 __n64 __vrev32_p8(__n64 _Dm);
4424 __n64 __vrev32_s16(__n64 _Dm);
4425 __n64 __vrev32_s8(__n64 _Dm);
4426 __n64 __vrev32_u16(__n64 _Dm);
4427 __n64 __vrev32_u8(__n64 _Dm);
4428 __n64 __vrev64_f32(__n64 _Dm);
4429 __n64 __vrev64_p16(__n64 _Dm);
4430 __n64 __vrev64_p8(__n64 _Dm);
4431 __n64 __vrev64_s16(__n64 _Dm);
4432 __n64 __vrev64_s32(__n64 _Dm);
4433 __n64 __vrev64_s8(__n64 _Dm);
4434 __n64 __vrev64_u16(__n64 _Dm);
4435 __n64 __vrev64_u32(__n64 _Dm);
4436 __n64 __vrev64_u8(__n64 _Dm);
4437 __n128 __vrev16q_p8(__n128 _Qm);
4438 __n128 __vrev16q_s8(__n128 _Qm);
4439 __n128 __vrev16q_u8(__n128 _Qm);
4440 __n128 __vrev32q_p16(__n128 _Qm);
4441 __n128 __vrev32q_p8(__n128 _Qm);
4442 __n128 __vrev32q_s16(__n128 _Qm);
4443 __n128 __vrev32q_s8(__n128 _Qm);
4444 __n128 __vrev32q_u16(__n128 _Qm);
4445 __n128 __vrev32q_u8(__n128 _Qm);
4446 __n128 __vrev64q_f32(__n128 _Qm);
4447 __n128 __vrev64q_p16(__n128 _Qm);
4448 __n128 __vrev64q_p8(__n128 _Qm);
4449 __n128 __vrev64q_s16(__n128 _Qm);
4450 __n128 __vrev64q_s32(__n128 _Qm);
4451 __n128 __vrev64q_s8(__n128 _Qm);
4452 __n128 __vrev64q_u16(__n128 _Qm);
4453 __n128 __vrev64q_u32(__n128 _Qm);
4454 __n128 __vrev64q_u8(__n128 _Qm);
4455 __n64 __vrnd_f32(__n64 _Dm);
4456 __n64 __vrnda_f32(__n64 _Dm);
4457 __n64 __vrndm_f32(__n64 _Dm);
4458 __n64 __vrndn_f32(__n64 _Dm);
4459 __n64 __vrndp_f32(__n64 _Dm);
4460 __n64 __vrndx_f32(__n64 _Dm);
4461 __n128 __vrndq_f32(__n128 _Qm);
4462 __n128 __vrndaq_f32(__n128 _Qm);
4463 __n128 __vrndmq_f32(__n128 _Qm);
4464 __n128 __vrndnq_f32(__n128 _Qm);
4465 __n128 __vrndpq_f32(__n128 _Qm);
4466 __n128 __vrndxq_f32(__n128 _Qm);
4467 __n64 __vrsqrts_f32(__n64 _Dn, __n64 _Dm);
4468 __n128 __vrsqrtsq_f32(__n128 _Qn, __n128 _Qm);
4469 __n64 __vshl_n_s16(__n64 _Dm, const int _Shift_amount);
4470 __n64 __vshl_n_s32(__n64 _Dm, const int _Shift_amount);
4471 __n64 __vshl_n_s64(__n64 _Dm, const int _Shift_amount);
4472 __n64 __vshl_n_s8(__n64 _Dm, const int _Shift_amount);
4473 __n64 __vshl_n_u16(__n64 _Dm, const int _Shift_amount);
4474 __n64 __vshl_n_u32(__n64 _Dm, const int _Shift_amount);
4475 __n64 __vshl_n_u64(__n64 _Dm, const int _Shift_amount);
4476 __n64 __vshl_n_u8(__n64 _Dm, const int _Shift_amount);
4477 __n128 __vshlq_n_s16(__n128 _Qm, const int _Shift_amount);
4478 __n128 __vshlq_n_s32(__n128 _Qm, const int _Shift_amount);
4479 __n128 __vshlq_n_s64(__n128 _Qm, const int _Shift_amount);
4480 __n128 __vshlq_n_s8(__n128 _Qm, const int _Shift_amount);
4481 __n128 __vshlq_n_u16(__n128 _Qm, const int _Shift_amount);
4482 __n128 __vshlq_n_u32(__n128 _Qm, const int _Shift_amount);
4483 __n128 __vshlq_n_u64(__n128 _Qm, const int _Shift_amount);
4484 __n128 __vshlq_n_u8(__n128 _Qm, const int _Shift_amount);
4485 __n64 __vqrshl_s16(__n64 _Dn, __n64 _Dm);
4486 __n64 __vqrshl_s32(__n64 _Dn, __n64 _Dm);
4487 __n64 __vqrshl_s64(__n64 _Dn, __n64 _Dm);
4488 __n64 __vqrshl_s8(__n64 _Dn, __n64 _Dm);
4489 __n64 __vqrshl_u16(__n64 _Dn, __n64 _Dm);
4490 __n64 __vqrshl_u32(__n64 _Dn, __n64 _Dm);
4491 __n64 __vqrshl_u64(__n64 _Dn, __n64 _Dm);
4492 __n64 __vqrshl_u8(__n64 _Dn, __n64 _Dm);
4493 __n64 __vqshl_s16(__n64 _Dn, __n64 _Dm);
4494 __n64 __vqshl_s32(__n64 _Dn, __n64 _Dm);
4495 __n64 __vqshl_s64(__n64 _Dn, __n64 _Dm);
4496 __n64 __vqshl_s8(__n64 _Dn, __n64 _Dm);
4497 __n64 __vqshl_u16(__n64 _Dn, __n64 _Dm);
4498 __n64 __vqshl_u32(__n64 _Dn, __n64 _Dm);
4499 __n64 __vqshl_u64(__n64 _Dn, __n64 _Dm);
4500 __n64 __vqshl_u8(__n64 _Dn, __n64 _Dm);
4501 __n64 __vrshl_s16(__n64 _Dn, __n64 _Dm);
4502 __n64 __vrshl_s32(__n64 _Dn, __n64 _Dm);
4503 __n64 __vrshl_s64(__n64 _Dn, __n64 _Dm);
4504 __n64 __vrshl_s8(__n64 _Dn, __n64 _Dm);
4505 __n64 __vrshl_u16(__n64 _Dn, __n64 _Dm);
4506 __n64 __vrshl_u32(__n64 _Dn, __n64 _Dm);
4507 __n64 __vrshl_u64(__n64 _Dn, __n64 _Dm);
4508 __n64 __vrshl_u8(__n64 _Dn, __n64 _Dm);
4509 __n64 __vshl_s16(__n64 _Dn, __n64 _Dm);
4510 __n64 __vshl_s32(__n64 _Dn, __n64 _Dm);
4511 __n64 __vshl_s64(__n64 _Dn, __n64 _Dm);
4512 __n64 __vshl_s8(__n64 _Dn, __n64 _Dm);
4513 __n64 __vshl_u16(__n64 _Dn, __n64 _Dm);
4514 __n64 __vshl_u32(__n64 _Dn, __n64 _Dm);
4515 __n64 __vshl_u64(__n64 _Dn, __n64 _Dm);
4516 __n64 __vshl_u8(__n64 _Dn, __n64 _Dm);
4517 __n128 __vqrshlq_s16(__n128 _Qn, __n128 _Qm);
4518 __n128 __vqrshlq_s32(__n128 _Qn, __n128 _Qm);
4519 __n128 __vqrshlq_s64(__n128 _Qn, __n128 _Qm);
4520 __n128 __vqrshlq_s8(__n128 _Qn, __n128 _Qm);
4521 __n128 __vqrshlq_u16(__n128 _Qn, __n128 _Qm);
4522 __n128 __vqrshlq_u32(__n128 _Qn, __n128 _Qm);
4523 __n128 __vqrshlq_u64(__n128 _Qn, __n128 _Qm);
4524 __n128 __vqrshlq_u8(__n128 _Qn, __n128 _Qm);
4525 __n128 __vqshlq_s16(__n128 _Qn, __n128 _Qm);
4526 __n128 __vqshlq_s32(__n128 _Qn, __n128 _Qm);
4527 __n128 __vqshlq_s64(__n128 _Qn, __n128 _Qm);
4528 __n128 __vqshlq_s8(__n128 _Qn, __n128 _Qm);
4529 __n128 __vqshlq_u16(__n128 _Qn, __n128 _Qm);
4530 __n128 __vqshlq_u32(__n128 _Qn, __n128 _Qm);
4531 __n128 __vqshlq_u64(__n128 _Qn, __n128 _Qm);
4532 __n128 __vqshlq_u8(__n128 _Qn, __n128 _Qm);
4533 __n128 __vrshlq_s16(__n128 _Qn, __n128 _Qm);
4534 __n128 __vrshlq_s32(__n128 _Qn, __n128 _Qm);
4535 __n128 __vrshlq_s64(__n128 _Qn, __n128 _Qm);
4536 __n128 __vrshlq_s8(__n128 _Qn, __n128 _Qm);
4537 __n128 __vrshlq_u16(__n128 _Qn, __n128 _Qm);
4538 __n128 __vrshlq_u32(__n128 _Qn, __n128 _Qm);
4539 __n128 __vrshlq_u64(__n128 _Qn, __n128 _Qm);
4540 __n128 __vrshlq_u8(__n128 _Qn, __n128 _Qm);
4541 __n128 __vshlq_s16(__n128 _Qn, __n128 _Qm);
4542 __n128 __vshlq_s32(__n128 _Qn, __n128 _Qm);
4543 __n128 __vshlq_s64(__n128 _Qn, __n128 _Qm);
4544 __n128 __vshlq_s8(__n128 _Qn, __n128 _Qm);
4545 __n128 __vshlq_u16(__n128 _Qn, __n128 _Qm);
4546 __n128 __vshlq_u32(__n128 _Qn, __n128 _Qm);
4547 __n128 __vshlq_u64(__n128 _Qn, __n128 _Qm);
4548 __n128 __vshlq_u8(__n128 _Qn, __n128 _Qm);
4549 __n128 __vshll_n_s16(__n64 _Dm, const int _Shift_amount);
4550 __n128 __vshll_n_s32(__n64 _Dm, const int _Shift_amount);
4551 __n128 __vshll_n_s8(__n64 _Dm, const int _Shift_amount);
4552 __n128 __vshll_n_u16(__n64 _Dm, const int _Shift_amount);
4553 __n128 __vshll_n_u32(__n64 _Dm, const int _Shift_amount);
4554 __n128 __vshll_n_u8(__n64 _Dm, const int _Shift_amount);
4555 __n64 __vrshr_n_s16(__n64 _Dm, const int _Shift_amount);
4556 __n64 __vrshr_n_s32(__n64 _Dm, const int _Shift_amount);
4557 __n64 __vrshr_n_s64(__n64 _Dm, const int _Shift_amount);
4558 __n64 __vrshr_n_s8(__n64 _Dm, const int _Shift_amount);
4559 __n64 __vrshr_n_u16(__n64 _Dm, const int _Shift_amount);
4560 __n64 __vrshr_n_u32(__n64 _Dm, const int _Shift_amount);
4561 __n64 __vrshr_n_u64(__n64 _Dm, const int _Shift_amount);
4562 __n64 __vrshr_n_u8(__n64 _Dm, const int _Shift_amount);
4563 __n64 __vshr_n_s16(__n64 _Dm, const int _Shift_amount);
4564 __n64 __vshr_n_s32(__n64 _Dm, const int _Shift_amount);
4565 __n64 __vshr_n_s64(__n64 _Dm, const int _Shift_amount);
4566 __n64 __vshr_n_s8(__n64 _Dm, const int _Shift_amount);
4567 __n64 __vshr_n_u16(__n64 _Dm, const int _Shift_amount);
4568 __n64 __vshr_n_u32(__n64 _Dm, const int _Shift_amount);
4569 __n64 __vshr_n_u64(__n64 _Dm, const int _Shift_amount);
4570 __n64 __vshr_n_u8(__n64 _Dm, const int _Shift_amount);
4571 __n128 __vrshrq_n_s16(__n128 _Qm, const int _Shift_amount);
4572 __n128 __vrshrq_n_s32(__n128 _Qm, const int _Shift_amount);
4573 __n128 __vrshrq_n_s64(__n128 _Qm, const int _Shift_amount);
4574 __n128 __vrshrq_n_s8(__n128 _Qm, const int _Shift_amount);
4575 __n128 __vrshrq_n_u16(__n128 _Qm, const int _Shift_amount);
4576 __n128 __vrshrq_n_u32(__n128 _Qm, const int _Shift_amount);
4577 __n128 __vrshrq_n_u64(__n128 _Qm, const int _Shift_amount);
4578 __n128 __vrshrq_n_u8(__n128 _Qm, const int _Shift_amount);
4579 __n128 __vshrq_n_s16(__n128 _Qm, const int _Shift_amount);
4580 __n128 __vshrq_n_s32(__n128 _Qm, const int _Shift_amount);
4581 __n128 __vshrq_n_s64(__n128 _Qm, const int _Shift_amount);
4582 __n128 __vshrq_n_s8(__n128 _Qm, const int _Shift_amount);
4583 __n128 __vshrq_n_u16(__n128 _Qm, const int _Shift_amount);
4584 __n128 __vshrq_n_u32(__n128 _Qm, const int _Shift_amount);
4585 __n128 __vshrq_n_u64(__n128 _Qm, const int _Shift_amount);
4586 __n128 __vshrq_n_u8(__n128 _Qm, const int _Shift_amount);
4587 __n64 __vrshrn_n_s16(__n128 _Qm, const int _Shift_amount);
4588 __n64 __vrshrn_n_s32(__n128 _Qm, const int _Shift_amount);
4589 __n64 __vrshrn_n_s64(__n128 _Qm, const int _Shift_amount);
4590 __n64 __vrshrn_n_u16(__n128 _Qm, const int _Shift_amount);
4591 __n64 __vrshrn_n_u32(__n128 _Qm, const int _Shift_amount);
4592 __n64 __vrshrn_n_u64(__n128 _Qm, const int _Shift_amount);
4593 __n64 __vshrn_n_s16(__n128 _Qm, const int _Shift_amount);
4594 __n64 __vshrn_n_s32(__n128 _Qm, const int _Shift_amount);
4595 __n64 __vshrn_n_s64(__n128 _Qm, const int _Shift_amount);
4596 __n64 __vshrn_n_u16(__n128 _Qm, const int _Shift_amount);
4597 __n64 __vshrn_n_u32(__n128 _Qm, const int _Shift_amount);
4598 __n64 __vshrn_n_u64(__n128 _Qm, const int _Shift_amount);
4599 __n64 __vsli_n_p16(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4600 __n64 __vsli_n_p8(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4601 __n64 __vsli_n_s16(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4602 __n64 __vsli_n_s32(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4603 __n64 __vsli_n_s64(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4604 __n64 __vsli_n_s8(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4605 __n64 __vsli_n_u16(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4606 __n64 __vsli_n_u32(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4607 __n64 __vsli_n_u64(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4608 __n64 __vsli_n_u8(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4609 __n128 __vsliq_n_p16(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4610 __n128 __vsliq_n_p8(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4611 __n128 __vsliq_n_s16(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4612 __n128 __vsliq_n_s32(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4613 __n128 __vsliq_n_s64(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4614 __n128 __vsliq_n_s8(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4615 __n128 __vsliq_n_u16(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4616 __n128 __vsliq_n_u32(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4617 __n128 __vsliq_n_u64(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4618 __n128 __vsliq_n_u8(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4619 __n64 __vrsra_n_s16(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4620 __n64 __vrsra_n_s32(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4621 __n64 __vrsra_n_s64(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4622 __n64 __vrsra_n_s8(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4623 __n64 __vrsra_n_u16(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4624 __n64 __vrsra_n_u32(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4625 __n64 __vrsra_n_u64(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4626 __n64 __vrsra_n_u8(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4627 __n64 __vsra_n_s16(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4628 __n64 __vsra_n_s32(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4629 __n64 __vsra_n_s64(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4630 __n64 __vsra_n_s8(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4631 __n64 __vsra_n_u16(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4632 __n64 __vsra_n_u32(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4633 __n64 __vsra_n_u64(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4634 __n64 __vsra_n_u8(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4635 __n128 __vrsraq_n_s16(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4636 __n128 __vrsraq_n_s32(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4637 __n128 __vrsraq_n_s64(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4638 __n128 __vrsraq_n_s8(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4639 __n128 __vrsraq_n_u16(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4640 __n128 __vrsraq_n_u32(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4641 __n128 __vrsraq_n_u64(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4642 __n128 __vrsraq_n_u8(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4643 __n128 __vsraq_n_s16(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4644 __n128 __vsraq_n_s32(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4645 __n128 __vsraq_n_s64(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4646 __n128 __vsraq_n_s8(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4647 __n128 __vsraq_n_u16(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4648 __n128 __vsraq_n_u32(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4649 __n128 __vsraq_n_u64(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4650 __n128 __vsraq_n_u8(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4651 __n64 __vsri_n_p16(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4652 __n64 __vsri_n_p8(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4653 __n64 __vsri_n_s16(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4654 __n64 __vsri_n_s32(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4655 __n64 __vsri_n_s64(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4656 __n64 __vsri_n_s8(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4657 __n64 __vsri_n_u16(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4658 __n64 __vsri_n_u32(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4659 __n64 __vsri_n_u64(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4660 __n64 __vsri_n_u8(__n64 _Dd, __n64 _Dm, const int _Shift_amount);
4661 __n128 __vsriq_n_p16(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4662 __n128 __vsriq_n_p8(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4663 __n128 __vsriq_n_s16(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4664 __n128 __vsriq_n_s32(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4665 __n128 __vsriq_n_s64(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4666 __n128 __vsriq_n_s8(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4667 __n128 __vsriq_n_u16(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4668 __n128 __vsriq_n_u32(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4669 __n128 __vsriq_n_u64(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4670 __n128 __vsriq_n_u8(__n128 _Qd, __n128 _Qm, const int _Shift_amount);
4671 void __vst1_f32(_Out_writes_(2) float32_t * _PD, __n64 _D);
4672 void __vst1_p16(_Out_writes_(4) poly16_t * _PD, __n64 _D);
4673 void __vst1_p8(_Out_writes_(8) poly8_t * _PD, __n64 _D);
4674 void __vst1_s16(_Out_writes_(4) int16_t * _PD, __n64 _D);
4675 void __vst1_s32(_Out_writes_(2) int32_t * _PD, __n64 _D);
4676 void __vst1_s64(_Out_writes_(1) int64_t * _PD, __n64 _D);
4677 void __vst1_s8(_Out_writes_(8) int8_t * _PD, __n64 _D);
4678 void __vst1_u16(_Out_writes_(4) uint16_t * _PD, __n64 _D);
4679 void __vst1_u32(_Out_writes_(2) uint32_t * _PD, __n64 _D);
4680 void __vst1_u64(_Out_writes_(1) uint64_t * _PD, __n64 _D);
4681 void __vst1_u8(_Out_writes_(8) uint8_t * _PD, __n64 _D);
4682 void __vst1_f32_ex(_Out_writes_(2) float32_t * _PD, __n64 _D, const int _Align);
4683 void __vst1_p16_ex(_Out_writes_(4) poly16_t * _PD, __n64 _D, const int _Align);
4684 void __vst1_p8_ex(_Out_writes_(8) poly8_t * _PD, __n64 _D, const int _Align);
4685 void __vst1_s16_ex(_Out_writes_(4) int16_t * _PD, __n64 _D, const int _Align);
4686 void __vst1_s32_ex(_Out_writes_(2) int32_t * _PD, __n64 _D, const int _Align);
4687 void __vst1_s64_ex(_Out_writes_(1) int64_t * _PD, __n64 _D, const int _Align);
4688 void __vst1_s8_ex(_Out_writes_(8) int8_t * _PD, __n64 _D, const int _Align);
4689 void __vst1_u16_ex(_Out_writes_(4) uint16_t * _PD, __n64 _D, const int _Align);
4690 void __vst1_u32_ex(_Out_writes_(2) uint32_t * _PD, __n64 _D, const int _Align);
4691 void __vst1_u64_ex(_Out_writes_(1) uint64_t * _PD, __n64 _D, const int _Align);
4692 void __vst1_u8_ex(_Out_writes_(8) uint8_t * _PD, __n64 _D, const int _Align);
4693 void __vst1q_f32(_Out_writes_(4) float32_t * _PD, __n128 _Q);
4694 void __vst1q_p16(_Out_writes_(8) poly16_t * _PD, __n128 _Q);
4695 void __vst1q_p8(_Out_writes_(16) poly8_t * _PD, __n128 _Q);
4696 void __vst1q_s16(_Out_writes_(8) int16_t * _PD, __n128 _Q);
4697 void __vst1q_s32(_Out_writes_(4) int32_t * _PD, __n128 _Q);
4698 void __vst1q_s64(_Out_writes_(2) int64_t * _PD, __n128 _Q);
4699 void __vst1q_s8(_Out_writes_(16) int8_t * _PD, __n128 _Q);
4700 void __vst1q_u16(_Out_writes_(8) uint16_t * _PD, __n128 _Q);
4701 void __vst1q_u32(_Out_writes_(4) uint32_t * _PD, __n128 _Q);
4702 void __vst1q_u64(_Out_writes_(2) uint64_t * _PD, __n128 _Q);
4703 void __vst1q_u8(_Out_writes_(16) uint8_t * _PD, __n128 _Q);
4704 void __vst1q_f32_ex(_Out_writes_(4) float32_t * _PD, __n128 _Q, const int _Align);
4705 void __vst1q_p16_ex(_Out_writes_(8) poly16_t * _PD, __n128 _Q, const int _Align);
4706 void __vst1q_p8_ex(_Out_writes_(16) poly8_t * _PD, __n128 _Q, const int _Align);
4707 void __vst1q_s16_ex(_Out_writes_(8) int16_t * _PD, __n128 _Q, const int _Align);
4708 void __vst1q_s32_ex(_Out_writes_(4) int32_t * _PD, __n128 _Q, const int _Align);
4709 void __vst1q_s64_ex(_Out_writes_(2) int64_t * _PD, __n128 _Q, const int _Align);
4710 void __vst1q_s8_ex(_Out_writes_(16) int8_t * _PD, __n128 _Q, const int _Align);
4711 void __vst1q_u16_ex(_Out_writes_(8) uint16_t * _PD, __n128 _Q, const int _Align);
4712 void __vst1q_u32_ex(_Out_writes_(4) uint32_t * _PD, __n128 _Q, const int _Align);
4713 void __vst1q_u64_ex(_Out_writes_(2) uint64_t * _PD, __n128 _Q, const int _Align);
4714 void __vst1q_u8_ex(_Out_writes_(16) uint8_t * _PD, __n128 _Q, const int _Align);
4715 void __vst1_lane_f32(_Out_writes_(1) float32_t * _PD, __n64 _D, const int _Lane);
4716 void __vst1_lane_p16(_Out_writes_(1) poly16_t * _PD, __n64 _D, const int _Lane);
4717 void __vst1_lane_p8(_Out_writes_(1) poly8_t * _PD, __n64 _D, const int _Lane);
4718 void __vst1_lane_s16(_Out_writes_(1) int16_t * _PD, __n64 _D, const int _Lane);
4719 void __vst1_lane_s32(_Out_writes_(1) int32_t * _PD, __n64 _D, const int _Lane);
4720 void __vst1_lane_s8(_Out_writes_(1) int8_t * _PD, __n64 _D, const int _Lane);
4721 void __vst1_lane_u16(_Out_writes_(1) uint16_t * _PD, __n64 _D, const int _Lane);
4722 void __vst1_lane_u32(_Out_writes_(1) uint32_t * _PD, __n64 _D, const int _Lane);
4723 void __vst1_lane_u8(_Out_writes_(1) uint8_t * _PD, __n64 _D, const int _Lane);
4724 void __vst1q_lane_f32(_Out_writes_(1) float32_t * _PD, __n128 _Q, const int _Lane);
4725 void __vst1q_lane_p16(_Out_writes_(1) poly16_t * _PD, __n128 _Q, const int _Lane);
4726 void __vst1q_lane_p8(_Out_writes_(1) poly8_t * _PD, __n128 _Q, const int _Lane);
4727 void __vst1q_lane_s16(_Out_writes_(1) int16_t * _PD, __n128 _Q, const int _Lane);
4728 void __vst1q_lane_s32(_Out_writes_(1) int32_t * _PD, __n128 _Q, const int _Lane);
4729 void __vst1q_lane_s8(_Out_writes_(1) int8_t * _PD, __n128 _Q, const int _Lane);
4730 void __vst1q_lane_u16(_Out_writes_(1) uint16_t * _PD, __n128 _Q, const int _Lane);
4731 void __vst1q_lane_u32(_Out_writes_(1) uint32_t * _PD, __n128 _Q, const int _Lane);
4732 void __vst1q_lane_u8(_Out_writes_(1) uint8_t * _PD, __n128 _Q, const int _Lane);
4733 void __vst1_lane_f32_ex(_Out_writes_(1) float32_t * _PD, __n64 _D, const int _Lane, const int _Align);
4734 void __vst1_lane_p16_ex(_Out_writes_(1) poly16_t * _PD, __n64 _D, const int _Lane, const int _Align);
4735 void __vst1_lane_s16_ex(_Out_writes_(1) int16_t * _PD, __n64 _D, const int _Lane, const int _Align);
4736 void __vst1_lane_s32_ex(_Out_writes_(1) int32_t * _PD, __n64 _D, const int _Lane, const int _Align);
4737 void __vst1_lane_u16_ex(_Out_writes_(1) uint16_t * _PD, __n64 _D, const int _Lane, const int _Align);
4738 void __vst1_lane_u32_ex(_Out_writes_(1) uint32_t * _PD, __n64 _D, const int _Lane, const int _Align);
4739 void __vst1q_lane_f32_ex(_Out_writes_(1) float32_t * _PD, __n128 _Q, const int _Lane, const int _Align);
4740 void __vst1q_lane_p16_ex(_Out_writes_(1) poly16_t * _PD, __n128 _Q, const int _Lane, const int _Align);
4741 void __vst1q_lane_s16_ex(_Out_writes_(1) int16_t * _PD, __n128 _Q, const int _Lane, const int _Align);
4742 void __vst1q_lane_s32_ex(_Out_writes_(1) int32_t * _PD, __n128 _Q, const int _Lane, const int _Align);
4743 void __vst1q_lane_u16_ex(_Out_writes_(1) uint16_t * _PD, __n128 _Q, const int _Lane, const int _Align);
4744 void __vst1q_lane_u32_ex(_Out_writes_(1) uint32_t * _PD, __n128 _Q, const int _Lane, const int _Align);
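/*
 * Illustrative sketch (not part of the header): storing a whole Q register with
 * __vst1q_f32 versus storing a single lane with __vst1q_lane_f32. The destination
 * pointers are hypothetical and must hold at least the annotated element counts
 * (4 floats and 1 float respectively).
 */
static __inline void store_examples(float32_t *dst4, float32_t *dst1, __n128 q)
{
    __vst1q_f32(dst4, q);            /* writes all four float lanes     */
    __vst1q_lane_f32(dst1, q, 2);    /* writes only lane 2 as one float */
}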
4745 void __vst2_f32(_Out_writes_(4) float32_t * _PD, __n64x2 _D2);
4746 void __vst2_p16(_Out_writes_(8) poly16_t * _PD, __n64x2 _D2);
4747 void __vst2_p8(_Out_writes_(16) poly8_t * _PD, __n64x2 _D2);
4748 void __vst2_s16(_Out_writes_(8) int16_t * _PD, __n64x2 _D2);
4749 void __vst2_s32(_Out_writes_(4) int32_t * _PD, __n64x2 _D2);
4750 void __vst2_s8(_Out_writes_(16) int8_t * _PD, __n64x2 _D2);
4751 void __vst2_u16(_Out_writes_(8) uint16_t * _PD, __n64x2 _D2);
4752 void __vst2_u32(_Out_writes_(4) uint32_t * _PD, __n64x2 _D2);
4753 void __vst2_u8(_Out_writes_(16) uint8_t * _PD, __n64x2 _D2);
4754 void __vst2_s64(_Out_writes_(2) int64_t * _PD, __n64x2 _D2);
4755 void __vst2_u64(_Out_writes_(2) uint64_t * _PD, __n64x2 _D2);
4756 void __vst2_s64_ex(_Out_writes_(2) int64_t * _PD, __n64x2 _D2, const int _Align);
4757 void __vst2_u64_ex(_Out_writes_(2) uint64_t * _PD, __n64x2 _D2, const int _Align);
4758 void __vst2_f32_ex(_Out_writes_(4) float32_t * _PD, __n64x2 _D2, const int _Align);
4759 void __vst2_p16_ex(_Out_writes_(8) poly16_t * _PD, __n64x2 _D2, const int _Align);
4760 void __vst2_p8_ex(_Out_writes_(16) poly8_t * _PD, __n64x2 _D2, const int _Align);
4761 void __vst2_s16_ex(_Out_writes_(8) int16_t * _PD, __n64x2 _D2, const int _Align);
4762 void __vst2_s32_ex(_Out_writes_(4) int32_t * _PD, __n64x2 _D2, const int _Align);
4763 void __vst2_s8_ex(_Out_writes_(16) int8_t * _PD, __n64x2 _D2, const int _Align);
4764 void __vst2_u16_ex(_Out_writes_(8) uint16_t * _PD, __n64x2 _D2, const int _Align);
4765 void __vst2_u32_ex(_Out_writes_(4) uint32_t * _PD, __n64x2 _D2, const int _Align);
4766 void __vst2_u8_ex(_Out_writes_(16) uint8_t * _PD, __n64x2 _D2, const int _Align);
4767 void __vst2q_f32(_Out_writes_(8) float32_t * _PD, __n128x2 _Q2);
4768 void __vst2q_p16(_Out_writes_(16) poly16_t * _PD, __n128x2 _Q2);
4769 void __vst2q_p8(_Out_writes_(32) poly8_t * _PD, __n128x2 _Q2);
4770 void __vst2q_s16(_Out_writes_(16) int16_t * _PD, __n128x2 _Q2);
4771 void __vst2q_s32(_Out_writes_(8) int32_t * _PD, __n128x2 _Q2);
4772 void __vst2q_s8(_Out_writes_(32) int8_t * _PD, __n128x2 _Q2);
4773 void __vst2q_u16(_Out_writes_(16) uint16_t * _PD, __n128x2 _Q2);
4774 void __vst2q_u32(_Out_writes_(8) uint32_t * _PD, __n128x2 _Q2);
4775 void __vst2q_u8(_Out_writes_(32) uint8_t * _PD, __n128x2 _Q2);
4776 void __vst2q_f32_ex(_Out_writes_(8) float32_t * _PD, __n128x2 _Q2, const int _Align);
4777 void __vst2q_p16_ex(_Out_writes_(16) poly16_t * _PD, __n128x2 _Q2, const int _Align);
4778 void __vst2q_p8_ex(_Out_writes_(32) poly8_t * _PD, __n128x2 _Q2, const int _Align);
4779 void __vst2q_s16_ex(_Out_writes_(16) int16_t * _PD, __n128x2 _Q2, const int _Align);
4780 void __vst2q_s32_ex(_Out_writes_(8) int32_t * _PD, __n128x2 _Q2, const int _Align);
4781 void __vst2q_s8_ex(_Out_writes_(32) int8_t * _PD, __n128x2 _Q2, const int _Align);
4782 void __vst2q_u16_ex(_Out_writes_(16) uint16_t * _PD, __n128x2 _Q2, const int _Align);
4783 void __vst2q_u32_ex(_Out_writes_(8) uint32_t * _PD, __n128x2 _Q2, const int _Align);
4784 void __vst2q_u8_ex(_Out_writes_(32) uint8_t * _PD, __n128x2 _Q2, const int _Align);
4785 void __vst2_lane_f32(_Out_writes_(2) float32_t * _PD, __n64x2 _D2, const int _Lane);
4786 void __vst2_lane_p16(_Out_writes_(2) poly16_t * _PD, __n64x2 _D2, const int _Lane);
4787 void __vst2_lane_p8(_Out_writes_(2) poly8_t * _PD, __n64x2 _D2, const int _Lane);
4788 void __vst2_lane_s16(_Out_writes_(2) int16_t * _PD, __n64x2 _D2, const int _Lane);
4789 void __vst2_lane_s32(_Out_writes_(2) int32_t * _PD, __n64x2 _D2, const int _Lane);
4790 void __vst2_lane_s8(_Out_writes_(2) int8_t * _PD, __n64x2 _D2, const int _Lane);
4791 void __vst2_lane_u16(_Out_writes_(2) uint16_t * _PD, __n64x2 _D2, const int _Lane);
4792 void __vst2_lane_u32(_Out_writes_(2) uint32_t * _PD, __n64x2 _D2, const int _Lane);
4793 void __vst2_lane_u8(_Out_writes_(2) uint8_t * _PD, __n64x2 _D2, const int _Lane);
4794 void __vst2q_lane_f32(_Out_writes_(2) float32_t * _PD, __n128x2 _Q2, const int _Lane);
4795 void __vst2q_lane_p16(_Out_writes_(2) poly16_t * _PD, __n128x2 _Q2, const int _Lane);
4796 void __vst2q_lane_s16(_Out_writes_(2) int16_t * _PD, __n128x2 _Q2, const int _Lane);
4797 void __vst2q_lane_s32(_Out_writes_(2) int32_t * _PD, __n128x2 _Q2, const int _Lane);
4798 void __vst2q_lane_u16(_Out_writes_(2) uint16_t * _PD, __n128x2 _Q2, const int _Lane);
4799 void __vst2q_lane_u32(_Out_writes_(2) uint32_t * _PD, __n128x2 _Q2, const int _Lane);
4800 void __vst2_lane_f32_ex(_Out_writes_(2) float32_t * _PD, __n64x2 _D2, const int _Lane, const int _Align);
4801 void __vst2_lane_p16_ex(_Out_writes_(2) poly16_t * _PD, __n64x2 _D2, const int _Lane, const int _Align);
4802 void __vst2_lane_p8_ex(_Out_writes_(2) poly8_t * _PD, __n64x2 _D2, const int _Lane, const int _Align);
4803 void __vst2_lane_s16_ex(_Out_writes_(2) int16_t * _PD, __n64x2 _D2, const int _Lane, const int _Align);
4804 void __vst2_lane_s32_ex(_Out_writes_(2) int32_t * _PD, __n64x2 _D2, const int _Lane, const int _Align);
4805 void __vst2_lane_s8_ex(_Out_writes_(2) int8_t * _PD, __n64x2 _D2, const int _Lane, const int _Align);
4806 void __vst2_lane_u16_ex(_Out_writes_(2) uint16_t * _PD, __n64x2 _D2, const int _Lane, const int _Align);
4807 void __vst2_lane_u32_ex(_Out_writes_(2) uint32_t * _PD, __n64x2 _D2, const int _Lane, const int _Align);
4808 void __vst2_lane_u8_ex(_Out_writes_(2) uint8_t * _PD, __n64x2 _D2, const int _Lane, const int _Align);
4809 void __vst2q_lane_f32_ex(_Out_writes_(2) float32_t * _PD, __n128x2 _Q2, const int _Lane, const int _Align);
4810 void __vst2q_lane_p16_ex(_Out_writes_(2) poly16_t * _PD, __n128x2 _Q2, const int _Lane, const int _Align);
4811 void __vst2q_lane_s16_ex(_Out_writes_(2) int16_t * _PD, __n128x2 _Q2, const int _Lane, const int _Align);
4812 void __vst2q_lane_s32_ex(_Out_writes_(2) int32_t * _PD, __n128x2 _Q2, const int _Lane, const int _Align);
4813 void __vst2q_lane_u16_ex(_Out_writes_(2) uint16_t * _PD, __n128x2 _Q2, const int _Lane, const int _Align);
4814 void __vst2q_lane_u32_ex(_Out_writes_(2) uint32_t * _PD, __n128x2 _Q2, const int _Lane, const int _Align);
4815 void __vst3_f32(_Out_writes_(6) float32_t * _PD, __n64x3 _D3);
4816 void __vst3_p16(_Out_writes_(12) poly16_t * _PD, __n64x3 _D3);
4817 void __vst3_p8(_Out_writes_(24) poly8_t * _PD, __n64x3 _D3);
4818 void __vst3_s16(_Out_writes_(12) int16_t * _PD, __n64x3 _D3);
4819 void __vst3_s32(_Out_writes_(6) int32_t * _PD, __n64x3 _D3);
4820 void __vst3_s8(_Out_writes_(24) int8_t * _PD, __n64x3 _D3);
4821 void __vst3_u16(_Out_writes_(12) uint16_t * _PD, __n64x3 _D3);
4822 void __vst3_u32(_Out_writes_(6) uint32_t * _PD, __n64x3 _D3);
4823 void __vst3_u8(_Out_writes_(24) uint8_t * _PD, __n64x3 _D3);
4824 void __vst3_s64(_Out_writes_(3) int64_t * _PD, __n64x3 _D3);
4825 void __vst3_u64(_Out_writes_(3) uint64_t * _PD, __n64x3 _D3);
4826 void __vst3_s64_ex(_Out_writes_(3) int64_t * _PD, __n64x3 _D3, const int _Align);
4827 void __vst3_u64_ex(_Out_writes_(3) uint64_t * _PD, __n64x3 _D3, const int _Align);
4828 void __vst3_f32_ex(_Out_writes_(6) float32_t * _PD, __n64x3 _D3, const int _Align);
4829 void __vst3_p16_ex(_Out_writes_(12) poly16_t * _PD, __n64x3 _D3, const int _Align);
4830 void __vst3_p8_ex(_Out_writes_(24) poly8_t * _PD, __n64x3 _D3, const int _Align);
4831 void __vst3_s16_ex(_Out_writes_(12) int16_t * _PD, __n64x3 _D3, const int _Align);
4832 void __vst3_s32_ex(_Out_writes_(6) int32_t * _PD, __n64x3 _D3, const int _Align);
4833 void __vst3_s8_ex(_Out_writes_(24) int8_t * _PD, __n64x3 _D3, const int _Align);
4834 void __vst3_u16_ex(_Out_writes_(12) uint16_t * _PD, __n64x3 _D3, const int _Align);
4835 void __vst3_u32_ex(_Out_writes_(6) uint32_t * _PD, __n64x3 _D3, const int _Align);
4836 void __vst3_u8_ex(_Out_writes_(24) uint8_t * _PD, __n64x3 _D3, const int _Align);
4837 void __vst3q_f32(_Out_writes_(12) float32_t * _PD, __n128x3 _Q3);
4838 void __vst3q_p16(_Out_writes_(24) poly16_t * _PD, __n128x3 _Q3);
4839 void __vst3q_p8(_Out_writes_(48) poly8_t * _PD, __n128x3 _Q3);
4840 void __vst3q_s16(_Out_writes_(24) int16_t * _PD, __n128x3 _Q3);
4841 void __vst3q_s32(_Out_writes_(12) int32_t * _PD, __n128x3 _Q3);
4842 void __vst3q_s8(_Out_writes_(48) int8_t * _PD, __n128x3 _Q3);
4843 void __vst3q_u16(_Out_writes_(24) uint16_t * _PD, __n128x3 _Q3);
4844 void __vst3q_u32(_Out_writes_(12) uint32_t * _PD, __n128x3 _Q3);
4845 void __vst3q_u8(_Out_writes_(48) uint8_t * _PD, __n128x3 _Q3);
4846 void __vst3q_f32_ex(_Out_writes_(12) float32_t * _PD, __n128x3 _Q3, const int _Align);
4847 void __vst3q_p16_ex(_Out_writes_(24) poly16_t * _PD, __n128x3 _Q3, const int _Align);
4848 void __vst3q_p8_ex(_Out_writes_(48) poly8_t * _PD, __n128x3 _Q3, const int _Align);
4849 void __vst3q_s16_ex(_Out_writes_(24) int16_t * _PD, __n128x3 _Q3, const int _Align);
4850 void __vst3q_s32_ex(_Out_writes_(12) int32_t * _PD, __n128x3 _Q3, const int _Align);
4851 void __vst3q_s8_ex(_Out_writes_(48) int8_t * _PD, __n128x3 _Q3, const int _Align);
4852 void __vst3q_u16_ex(_Out_writes_(24) uint16_t * _PD, __n128x3 _Q3, const int _Align);
4853 void __vst3q_u32_ex(_Out_writes_(12) uint32_t * _PD, __n128x3 _Q3, const int _Align);
4854 void __vst3q_u8_ex(_Out_writes_(48) uint8_t * _PD, __n128x3 _Q3, const int _Align);
4855 void __vst3_lane_f32(_Out_writes_(3) float32_t * _PD, __n64x3 _D3, const int _Lane);
4856 void __vst3_lane_p16(_Out_writes_(3) poly16_t * _PD, __n64x3 _D3, const int _Lane);
4857 void __vst3_lane_p8(_Out_writes_(3) poly8_t * _PD, __n64x3 _D3, const int _Lane);
4858 void __vst3_lane_s16(_Out_writes_(3) int16_t * _PD, __n64x3 _D3, const int _Lane);
4859 void __vst3_lane_s32(_Out_writes_(3) int32_t * _PD, __n64x3 _D3, const int _Lane);
4860 void __vst3_lane_s8(_Out_writes_(3) int8_t * _PD, __n64x3 _D3, const int _Lane);
4861 void __vst3_lane_u16(_Out_writes_(3) uint16_t * _PD, __n64x3 _D3, const int _Lane);
4862 void __vst3_lane_u32(_Out_writes_(3) uint32_t * _PD, __n64x3 _D3, const int _Lane);
4863 void __vst3_lane_u8(_Out_writes_(3) uint8_t * _PD, __n64x3 _D3, const int _Lane);
4864 void __vst3q_lane_f32(_Out_writes_(3) float32_t * _PD, __n128x3 _Q3, const int _Lane);
4865 void __vst3q_lane_p16(_Out_writes_(3) poly16_t * _PD, __n128x3 _Q3, const int _Lane);
4866 void __vst3q_lane_s16(_Out_writes_(3) int16_t * _PD, __n128x3 _Q3, const int _Lane);
4867 void __vst3q_lane_s32(_Out_writes_(3) int32_t * _PD, __n128x3 _Q3, const int _Lane);
4868 void __vst3q_lane_u16(_Out_writes_(3) uint16_t * _PD, __n128x3 _Q3, const int _Lane);
4869 void __vst3q_lane_u32(_Out_writes_(3) uint32_t * _PD, __n128x3 _Q3, const int _Lane);
4870 void __vst4_f32(_Out_writes_(8) float32_t * _PD, __n64x4 _D4);
4871 void __vst4_p16(_Out_writes_(16) poly16_t * _PD, __n64x4 _D4);
4872 void __vst4_p8(_Out_writes_(32) poly8_t * _PD, __n64x4 _D4);
4873 void __vst4_s16(_Out_writes_(16) int16_t * _PD, __n64x4 _D4);
4874 void __vst4_s32(_Out_writes_(8) int32_t * _PD, __n64x4 _D4);
4875 void __vst4_s8(_Out_writes_(32) int8_t * _PD, __n64x4 _D4);
4876 void __vst4_u16(_Out_writes_(16) uint16_t * _PD, __n64x4 _D4);
4877 void __vst4_u32(_Out_writes_(8) uint32_t * _PD, __n64x4 _D4);
4878 void __vst4_u8(_Out_writes_(32) uint8_t * _PD, __n64x4 _D4);
4879 void __vst4_s64(_Out_writes_(4) int64_t * _PD, __n64x4 _D4);
4880 void __vst4_u64(_Out_writes_(4) uint64_t * _PD, __n64x4 _D4);
4881 void __vst4_s64_ex(_Out_writes_(4) int64_t * _PD, __n64x4 _D4, const int _Align);
4882 void __vst4_u64_ex(_Out_writes_(4) uint64_t * _PD, __n64x4 _D4, const int _Align);
4883 void __vst4_f32_ex(_Out_writes_(8) float32_t * _PD, __n64x4 _D4, const int _Align);
4884 void __vst4_p16_ex(_Out_writes_(16) poly16_t * _PD, __n64x4 _D4, const int _Align);
4885 void __vst4_p8_ex(_Out_writes_(32) poly8_t * _PD, __n64x4 _D4, const int _Align);
4886 void __vst4_s16_ex(_Out_writes_(16) int16_t * _PD, __n64x4 _D4, const int _Align);
4887 void __vst4_s32_ex(_Out_writes_(8) int32_t * _PD, __n64x4 _D4, const int _Align);
4888 void __vst4_s8_ex(_Out_writes_(32) int8_t * _PD, __n64x4 _D4, const int _Align);
4889 void __vst4_u16_ex(_Out_writes_(16) uint16_t * _PD, __n64x4 _D4, const int _Align);
4890 void __vst4_u32_ex(_Out_writes_(8) uint32_t * _PD, __n64x4 _D4, const int _Align);
4891 void __vst4_u8_ex(_Out_writes_(32) uint8_t * _PD, __n64x4 _D4, const int _Align);
4892 void __vst4q_f32(_Out_writes_(16) float32_t * _PD, __n128x4 _Q4);
4893 void __vst4q_p16(_Out_writes_(32) poly16_t * _PD, __n128x4 _Q4);
4894 void __vst4q_p8(_Out_writes_(64) poly8_t * _PD, __n128x4 _Q4);
4895 void __vst4q_s16(_Out_writes_(32) int16_t * _PD, __n128x4 _Q4);
4896 void __vst4q_s32(_Out_writes_(16) int32_t * _PD, __n128x4 _Q4);
4897 void __vst4q_s8(_Out_writes_(64) int8_t * _PD, __n128x4 _Q4);
4898 void __vst4q_u16(_Out_writes_(32) uint16_t * _PD, __n128x4 _Q4);
4899 void __vst4q_u32(_Out_writes_(16) uint32_t * _PD, __n128x4 _Q4);
4900 void __vst4q_u8(_Out_writes_(64) uint8_t * _PD, __n128x4 _Q4);
4901 void __vst4q_f32_ex(_Out_writes_(16) float32_t * _PD, __n128x4 _Q4, const int _Align);
4902 void __vst4q_p16_ex(_Out_writes_(32) poly16_t * _PD, __n128x4 _Q4, const int _Align);
4903 void __vst4q_p8_ex(_Out_writes_(64) poly8_t * _PD, __n128x4 _Q4, const int _Align);
4904 void __vst4q_s16_ex(_Out_writes_(32) int16_t * _PD, __n128x4 _Q4, const int _Align);
4905 void __vst4q_s32_ex(_Out_writes_(16) int32_t * _PD, __n128x4 _Q4, const int _Align);
4906 void __vst4q_s8_ex(_Out_writes_(64) int8_t * _PD, __n128x4 _Q4, const int _Align);
4907 void __vst4q_u16_ex(_Out_writes_(32) uint16_t * _PD, __n128x4 _Q4, const int _Align);
4908 void __vst4q_u32_ex(_Out_writes_(16) uint32_t * _PD, __n128x4 _Q4, const int _Align);
4909 void __vst4q_u8_ex(_Out_writes_(64) uint8_t * _PD, __n128x4 _Q4, const int _Align);
4910 void __vst4_lane_f32(_Out_writes_(4) float32_t * _PD, __n64x4 _D4, const int _Lane);
4911 void __vst4_lane_p16(_Out_writes_(4) poly16_t * _PD, __n64x4 _D4, const int _Lane);
4912 void __vst4_lane_p8(_Out_writes_(4) poly8_t * _PD, __n64x4 _D4, const int _Lane);
4913 void __vst4_lane_s16(_Out_writes_(4) int16_t * _PD, __n64x4 _D4, const int _Lane);
4914 void __vst4_lane_s32(_Out_writes_(4) int32_t * _PD, __n64x4 _D4, const int _Lane);
4915 void __vst4_lane_s8(_Out_writes_(4) int8_t * _PD, __n64x4 _D4, const int _Lane);
4916 void __vst4_lane_u16(_Out_writes_(4) uint16_t * _PD, __n64x4 _D4, const int _Lane);
4917 void __vst4_lane_u32(_Out_writes_(4) uint32_t * _PD, __n64x4 _D4, const int _Lane);
4918 void __vst4_lane_u8(_Out_writes_(4) uint8_t * _PD, __n64x4 _D4, const int _Lane);
4919 void __vst4q_lane_f32(_Out_writes_(4) float32_t * _PD, __n128x4 _Q4, const int _Lane);
4920 void __vst4q_lane_p16(_Out_writes_(4) poly16_t * _PD, __n128x4 _Q4, const int _Lane);
4921 void __vst4q_lane_s16(_Out_writes_(4) int16_t * _PD, __n128x4 _Q4, const int _Lane);
4922 void __vst4q_lane_s32(_Out_writes_(4) int32_t * _PD, __n128x4 _Q4, const int _Lane);
4923 void __vst4q_lane_u16(_Out_writes_(4) uint16_t * _PD, __n128x4 _Q4, const int _Lane);
4924 void __vst4q_lane_u32(_Out_writes_(4) uint32_t * _PD, __n128x4 _Q4, const int _Lane);
4925 void __vst4_lane_f32_ex(_Out_writes_(4) float32_t * _PD, __n64x4 _D4, const int _Lane, const int _Align);
4926 void __vst4_lane_p16_ex(_Out_writes_(4) poly16_t * _PD, __n64x4 _D4, const int _Lane, const int _Align);
4927 void __vst4_lane_p8_ex(_Out_writes_(4) poly8_t * _PD, __n64x4 _D4, const int _Lane, const int _Align);
4928 void __vst4_lane_s16_ex(_Out_writes_(4) int16_t * _PD, __n64x4 _D4, const int _Lane, const int _Align);
4929 void __vst4_lane_s32_ex(_Out_writes_(4) int32_t * _PD, __n64x4 _D4, const int _Lane, const int _Align);
4930 void __vst4_lane_s8_ex(_Out_writes_(4) int8_t * _PD, __n64x4 _D4, const int _Lane, const int _Align);
4931 void __vst4_lane_u16_ex(_Out_writes_(4) uint16_t * _PD, __n64x4 _D4, const int _Lane, const int _Align);
4932 void __vst4_lane_u32_ex(_Out_writes_(4) uint32_t * _PD, __n64x4 _D4, const int _Lane, const int _Align);
4933 void __vst4_lane_u8_ex(_Out_writes_(4) uint8_t * _PD, __n64x4 _D4, const int _Lane, const int _Align);
4934 void __vst4q_lane_f32_ex(_Out_writes_(4) float32_t * _PD, __n128x4 _Q4, const int _Lane, const int _Align);
4935 void __vst4q_lane_p16_ex(_Out_writes_(4) poly16_t * _PD, __n128x4 _Q4, const int _Lane, const int _Align);
4936 void __vst4q_lane_s16_ex(_Out_writes_(4) int16_t * _PD, __n128x4 _Q4, const int _Lane, const int _Align);
4937 void __vst4q_lane_s32_ex(_Out_writes_(4) int32_t * _PD, __n128x4 _Q4, const int _Lane, const int _Align);
4938 void __vst4q_lane_u16_ex(_Out_writes_(4) uint16_t * _PD, __n128x4 _Q4, const int _Lane, const int _Align);
4939 void __vst4q_lane_u32_ex(_Out_writes_(4) uint32_t * _PD, __n128x4 _Q4, const int _Lane, const int _Align);
4940 __n64 __vsub_f32(__n64 _Dn, __n64 _Dm);
4941 __n64 __vsub_s16(__n64 _Dn, __n64 _Dm);
4942 __n64 __vsub_s32(__n64 _Dn, __n64 _Dm);
4943 __n64 __vsub_s64(__n64 _Dn, __n64 _Dm);
4944 __n64 __vsub_s8(__n64 _Dn, __n64 _Dm);
4945 __n64 __vsub_u16(__n64 _Dn, __n64 _Dm);
4946 __n64 __vsub_u32(__n64 _Dn, __n64 _Dm);
4947 __n64 __vsub_u64(__n64 _Dn, __n64 _Dm);
4948 __n64 __vsub_u8(__n64 _Dn, __n64 _Dm);
4949 __n128 __vsubq_f32(__n128 _Qn, __n128 _Qm);
4950 __n128 __vsubq_s16(__n128 _Qn, __n128 _Qm);
4951 __n128 __vsubq_s32(__n128 _Qn, __n128 _Qm);
4952 __n128 __vsubq_s64(__n128 _Qn, __n128 _Qm);
4953 __n128 __vsubq_s8(__n128 _Qn, __n128 _Qm);
4954 __n128 __vsubq_u16(__n128 _Qn, __n128 _Qm);
4955 __n128 __vsubq_u32(__n128 _Qn, __n128 _Qm);
4956 __n128 __vsubq_u64(__n128 _Qn, __n128 _Qm);
4957 __n128 __vsubq_u8(__n128 _Qn, __n128 _Qm);
4958 __n64 __vrsubhn_s16(__n128 _Qn, __n128 _Qm);
4959 __n64 __vrsubhn_s32(__n128 _Qn, __n128 _Qm);
4960 __n64 __vrsubhn_s64(__n128 _Qn, __n128 _Qm);
4961 __n64 __vrsubhn_u16(__n128 _Qn, __n128 _Qm);
4962 __n64 __vrsubhn_u32(__n128 _Qn, __n128 _Qm);
4963 __n64 __vrsubhn_u64(__n128 _Qn, __n128 _Qm);
4964 __n64 __vsubhn_s16(__n128 _Qn, __n128 _Qm);
4965 __n64 __vsubhn_s32(__n128 _Qn, __n128 _Qm);
4966 __n64 __vsubhn_s64(__n128 _Qn, __n128 _Qm);
4967 __n64 __vsubhn_u16(__n128 _Qn, __n128 _Qm);
4968 __n64 __vsubhn_u32(__n128 _Qn, __n128 _Qm);
4969 __n64 __vsubhn_u64(__n128 _Qn, __n128 _Qm);
4970 __n128 __vsubl_s16(__n64 _Dn, __n64 _Dm);
4971 __n128 __vsubl_s32(__n64 _Dn, __n64 _Dm);
4972 __n128 __vsubl_s8(__n64 _Dn, __n64 _Dm);
4973 __n128 __vsubl_u16(__n64 _Dn, __n64 _Dm);
4974 __n128 __vsubl_u32(__n64 _Dn, __n64 _Dm);
4975 __n128 __vsubl_u8(__n64 _Dn, __n64 _Dm);
4976 __n128 __vsubw_s16(__n128 _Qn, __n64 _Dm);
4977 __n128 __vsubw_s32(__n128 _Qn, __n64 _Dm);
4978 __n128 __vsubw_s8(__n128 _Qn, __n64 _Dm);
4979 __n128 __vsubw_u16(__n128 _Qn, __n64 _Dm);
4980 __n128 __vsubw_u32(__n128 _Qn, __n64 _Dm);
4981 __n128 __vsubw_u8(__n128 _Qn, __n64 _Dm);
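// Editor's illustrative sketch (not part of the original header): the widening
// and narrowing subtract variants above differ only in operand widths.
// __vsubl_* widens both D operands into a Q result, __vsubw_* subtracts a D
// operand from a Q operand, and __vsubhn_* / __vrsubhn_* return the (rounded)
// high half of each element of a Q-register difference.
__inline __n128 __example_widening_diff_u8(__n64 _Dn, __n64 _Dm)
{
    return __vsubl_u8(_Dn, _Dm);   // 8 x u8 - 8 x u8 -> 8 x u16
}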
4982 __n64 __vtbl2_p8(__n64x2 _D2, __n64 _Dm);
4983 __n64 __vtbl2_s8(__n64x2 _D2, __n64 _Dm);
4984 __n64 __vtbl2_u8(__n64x2 _D2, __n64 _Dm);
4985 __n64 __vtbx2_p8(__n64 _Dd, __n64x2 _D2, __n64 _Dm);
4986 __n64 __vtbx2_s8(__n64 _Dd, __n64x2 _D2, __n64 _Dm);
4987 __n64 __vtbx2_u8(__n64 _Dd, __n64x2 _D2, __n64 _Dm);
4988 __n64 __vtbl3_p8(__n64x3 _D3, __n64 _Dm);
4989 __n64 __vtbl3_s8(__n64x3 _D3, __n64 _Dm);
4990 __n64 __vtbl3_u8(__n64x3 _D3, __n64 _Dm);
4991 __n64 __vtbx3_p8(__n64 _Dd, __n64x3 _D3, __n64 _Dm);
4992 __n64 __vtbx3_s8(__n64 _Dd, __n64x3 _D3, __n64 _Dm);
4993 __n64 __vtbx3_u8(__n64 _Dd, __n64x3 _D3, __n64 _Dm);
4994 __n64 __vtbl4_p8(__n64x4 _D4, __n64 _Dm);
4995 __n64 __vtbl4_s8(__n64x4 _D4, __n64 _Dm);
4996 __n64 __vtbl4_u8(__n64x4 _D4, __n64 _Dm);
4997 __n64 __vtbx4_p8(__n64 _Dd, __n64x4 _D4, __n64 _Dm);
4998 __n64 __vtbx4_s8(__n64 _Dd, __n64x4 _D4, __n64 _Dm);
4999 __n64 __vtbx4_u8(__n64 _Dd, __n64x4 _D4, __n64 _Dm);
5000 __n64 __vtbl1_p8(__n64 _Dn, __n64 _Dm);
5001 __n64 __vtbl1_s8(__n64 _Dn, __n64 _Dm);
5002 __n64 __vtbl1_u8(__n64 _Dn, __n64 _Dm);
5003 __n64 __vtbx1_p8(__n64 _Dd, __n64 _Dn, __n64 _Dm);
5004 __n64 __vtbx1_s8(__n64 _Dd, __n64 _Dn, __n64 _Dm);
5005 __n64 __vtbx1_u8(__n64 _Dd, __n64 _Dn, __n64 _Dm);
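// Editor's illustrative sketch (not part of the original header): the table
// lookups above select bytes of a 1/2/3/4-register table using the byte
// indices held in _Dm; an out-of-range index produces 0 for __vtbl* and leaves
// the corresponding destination byte of _Dd unchanged for __vtbx*. The helper
// name below is hypothetical.
__inline __n64 __example_vtbl2(__n64x2 _Table, __n64 _Indices)
{
    return __vtbl2_u8(_Table, _Indices);
}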
5006 __n64x2 __vtrn_f32(__n64 _Dd, __n64 _Dm);
5007 __n64x2 __vtrn_p16(__n64 _Dd, __n64 _Dm);
5008 __n64x2 __vtrn_p8(__n64 _Dd, __n64 _Dm);
5009 __n64x2 __vtrn_s16(__n64 _Dd, __n64 _Dm);
5010 __n64x2 __vtrn_s32(__n64 _Dd, __n64 _Dm);
5011 __n64x2 __vtrn_s8(__n64 _Dd, __n64 _Dm);
5012 __n64x2 __vtrn_u16(__n64 _Dd, __n64 _Dm);
5013 __n64x2 __vtrn_u32(__n64 _Dd, __n64 _Dm);
5014 __n64x2 __vtrn_u8(__n64 _Dd, __n64 _Dm);
5015 __n128x2 __vtrnq_f32(__n128 _Qd, __n128 _Qm);
5016 __n128x2 __vtrnq_p16(__n128 _Qd, __n128 _Qm);
5017 __n128x2 __vtrnq_p8(__n128 _Qd, __n128 _Qm);
5018 __n128x2 __vtrnq_s16(__n128 _Qd, __n128 _Qm);
5019 __n128x2 __vtrnq_s32(__n128 _Qd, __n128 _Qm);
5020 __n128x2 __vtrnq_s8(__n128 _Qd, __n128 _Qm);
5021 __n128x2 __vtrnq_u16(__n128 _Qd, __n128 _Qm);
5022 __n128x2 __vtrnq_u32(__n128 _Qd, __n128 _Qm);
5023 __n128x2 __vtrnq_u8(__n128 _Qd, __n128 _Qm);
5024 __n128x2 __vtrnq_s64(__n128 _Qd, __n128 _Qm);
5025 __n128x2 __vtrnq_u64(__n128 _Qd, __n128 _Qm);
5026 __n64 __vtst_p8(__n64 _Dn, __n64 _Dm);
5027 __n64 __vtst_s16(__n64 _Dn, __n64 _Dm);
5028 __n64 __vtst_s32(__n64 _Dn, __n64 _Dm);
5029 __n64 __vtst_s8(__n64 _Dn, __n64 _Dm);
5030 __n64 __vtst_u16(__n64 _Dn, __n64 _Dm);
5031 __n64 __vtst_u32(__n64 _Dn, __n64 _Dm);
5032 __n64 __vtst_u8(__n64 _Dn, __n64 _Dm);
5033 __n128 __vtstq_p8(__n128 _Qn, __n128 _Qm);
5034 __n128 __vtstq_s16(__n128 _Qn, __n128 _Qm);
5035 __n128 __vtstq_s32(__n128 _Qn, __n128 _Qm);
5036 __n128 __vtstq_s8(__n128 _Qn, __n128 _Qm);
5037 __n128 __vtstq_u16(__n128 _Qn, __n128 _Qm);
5038 __n128 __vtstq_u32(__n128 _Qn, __n128 _Qm);
5039 __n128 __vtstq_u8(__n128 _Qn, __n128 _Qm);
5040 __n64x2 __vuzp_p16(__n64 _Dd, __n64 _Dm);
5041 __n64x2 __vuzp_p8(__n64 _Dd, __n64 _Dm);
5042 __n64x2 __vuzp_s16(__n64 _Dd, __n64 _Dm);
5043 __n64x2 __vuzp_s8(__n64 _Dd, __n64 _Dm);
5044 __n64x2 __vuzp_u16(__n64 _Dd, __n64 _Dm);
5045 __n64x2 __vuzp_u8(__n64 _Dd, __n64 _Dm);
5046 __n64x2 __vuzp_f32(__n64 _Dd, __n64 _Dm);
5047 __n64x2 __vuzp_s32(__n64 _Dd, __n64 _Dm);
5048 __n64x2 __vuzp_u32(__n64 _Dd, __n64 _Dm);
5049 __n128x2 __vuzpq_f32(__n128 _Qd, __n128 _Qm);
5050 __n128x2 __vuzpq_p16(__n128 _Qd, __n128 _Qm);
5051 __n128x2 __vuzpq_p8(__n128 _Qd, __n128 _Qm);
5052 __n128x2 __vuzpq_s16(__n128 _Qd, __n128 _Qm);
5053 __n128x2 __vuzpq_s32(__n128 _Qd, __n128 _Qm);
5054 __n128x2 __vuzpq_s8(__n128 _Qd, __n128 _Qm);
5055 __n128x2 __vuzpq_u16(__n128 _Qd, __n128 _Qm);
5056 __n128x2 __vuzpq_u32(__n128 _Qd, __n128 _Qm);
5057 __n128x2 __vuzpq_u8(__n128 _Qd, __n128 _Qm);
5058 __n64x2 __vzip_p16(__n64 _Dd, __n64 _Dm);
5059 __n64x2 __vzip_p8(__n64 _Dd, __n64 _Dm);
5060 __n64x2 __vzip_s16(__n64 _Dd, __n64 _Dm);
5061 __n64x2 __vzip_s8(__n64 _Dd, __n64 _Dm);
5062 __n64x2 __vzip_u16(__n64 _Dd, __n64 _Dm);
5063 __n64x2 __vzip_u8(__n64 _Dd, __n64 _Dm);
5064 __n64x2 __vzip_f32(__n64 _Dd, __n64 _Dm);
5065 __n64x2 __vzip_s32(__n64 _Dd, __n64 _Dm);
5066 __n64x2 __vzip_u32(__n64 _Dd, __n64 _Dm);
5067 __n128x2 __vzipq_f32(__n128 _Qd, __n128 _Qm);
5068 __n128x2 __vzipq_p16(__n128 _Qd, __n128 _Qm);
5069 __n128x2 __vzipq_p8(__n128 _Qd, __n128 _Qm);
5070 __n128x2 __vzipq_s16(__n128 _Qd, __n128 _Qm);
5071 __n128x2 __vzipq_s32(__n128 _Qd, __n128 _Qm);
5072 __n128x2 __vzipq_s8(__n128 _Qd, __n128 _Qm);
5073 __n128x2 __vzipq_u16(__n128 _Qd, __n128 _Qm);
5074 __n128x2 __vzipq_u32(__n128 _Qd, __n128 _Qm);
5075 __n128x2 __vzipq_u8(__n128 _Qd, __n128 _Qm);
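// Editor's illustrative sketch (not part of the original header): the
// transpose/unzip/zip operations above return both result vectors at once in
// an __n64x2/__n128x2 pair. For example, deinterleaving two packed vectors:
__inline __n64x2 __example_deinterleave_s16(__n64 _Packed0, __n64 _Packed1)
{
    // val[0] receives the even-indexed elements, val[1] the odd-indexed ones
    return __vuzp_s16(_Packed0, _Packed1);
}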
5076 
5077 // Type reinterpretation no-ops
5078 #define __vreinterpret_f32_s8(a) (a)
5079 #define __vreinterpret_f32_s16(a) (a)
5080 #define __vreinterpret_f32_s32(a) (a)
5081 #define __vreinterpret_f32_s64(a) (a)
5082 #define __vreinterpret_f32_p8(a) (a)
5083 #define __vreinterpret_f32_p16(a) (a)
5084 #define __vreinterpret_f32_u8(a) (a)
5085 #define __vreinterpret_f32_u16(a) (a)
5086 #define __vreinterpret_f32_u32(a) (a)
5087 #define __vreinterpret_f32_u64(a) (a)
5088 #define __vreinterpret_s8_f32(a) (a)
5089 #define __vreinterpret_s8_s16(a) (a)
5090 #define __vreinterpret_s8_s32(a) (a)
5091 #define __vreinterpret_s8_s64(a) (a)
5092 #define __vreinterpret_s8_p8(a) (a)
5093 #define __vreinterpret_s8_p16(a) (a)
5094 #define __vreinterpret_s8_u8(a) (a)
5095 #define __vreinterpret_s8_u16(a) (a)
5096 #define __vreinterpret_s8_u32(a) (a)
5097 #define __vreinterpret_s8_u64(a) (a)
5098 #define __vreinterpret_s16_f32(a) (a)
5099 #define __vreinterpret_s16_s8(a) (a)
5100 #define __vreinterpret_s16_s32(a) (a)
5101 #define __vreinterpret_s16_s64(a) (a)
5102 #define __vreinterpret_s16_p8(a) (a)
5103 #define __vreinterpret_s16_p16(a) (a)
5104 #define __vreinterpret_s16_u8(a) (a)
5105 #define __vreinterpret_s16_u16(a) (a)
5106 #define __vreinterpret_s16_u32(a) (a)
5107 #define __vreinterpret_s16_u64(a) (a)
5108 #define __vreinterpret_s32_f32(a) (a)
5109 #define __vreinterpret_s32_s8(a) (a)
5110 #define __vreinterpret_s32_s16(a) (a)
5111 #define __vreinterpret_s32_s64(a) (a)
5112 #define __vreinterpret_s32_p8(a) (a)
5113 #define __vreinterpret_s32_p16(a) (a)
5114 #define __vreinterpret_s32_u8(a) (a)
5115 #define __vreinterpret_s32_u16(a) (a)
5116 #define __vreinterpret_s32_u32(a) (a)
5117 #define __vreinterpret_s32_u64(a) (a)
5118 #define __vreinterpret_s64_f32(a) (a)
5119 #define __vreinterpret_s64_s8(a) (a)
5120 #define __vreinterpret_s64_s16(a) (a)
5121 #define __vreinterpret_s64_s32(a) (a)
5122 #define __vreinterpret_s64_p8(a) (a)
5123 #define __vreinterpret_s64_p16(a) (a)
5124 #define __vreinterpret_s64_u8(a) (a)
5125 #define __vreinterpret_s64_u16(a) (a)
5126 #define __vreinterpret_s64_u32(a) (a)
5127 #define __vreinterpret_s64_u64(a) (a)
5128 #define __vreinterpret_p8_f32(a) (a)
5129 #define __vreinterpret_p8_s8(a) (a)
5130 #define __vreinterpret_p8_s16(a) (a)
5131 #define __vreinterpret_p8_s32(a) (a)
5132 #define __vreinterpret_p8_s64(a) (a)
5133 #define __vreinterpret_p8_p16(a) (a)
5134 #define __vreinterpret_p8_u8(a) (a)
5135 #define __vreinterpret_p8_u16(a) (a)
5136 #define __vreinterpret_p8_u32(a) (a)
5137 #define __vreinterpret_p8_u64(a) (a)
5138 #define __vreinterpret_p16_f32(a) (a)
5139 #define __vreinterpret_p16_s8(a) (a)
5140 #define __vreinterpret_p16_s16(a) (a)
5141 #define __vreinterpret_p16_s32(a) (a)
5142 #define __vreinterpret_p16_s64(a) (a)
5143 #define __vreinterpret_p16_p8(a) (a)
5144 #define __vreinterpret_p16_u8(a) (a)
5145 #define __vreinterpret_p16_u16(a) (a)
5146 #define __vreinterpret_p16_u32(a) (a)
5147 #define __vreinterpret_p16_u64(a) (a)
5148 #define __vreinterpret_u8_f32(a) (a)
5149 #define __vreinterpret_u8_s8(a) (a)
5150 #define __vreinterpret_u8_s16(a) (a)
5151 #define __vreinterpret_u8_s32(a) (a)
5152 #define __vreinterpret_u8_s64(a) (a)
5153 #define __vreinterpret_u8_p8(a) (a)
5154 #define __vreinterpret_u8_p16(a) (a)
5155 #define __vreinterpret_u8_u16(a) (a)
5156 #define __vreinterpret_u8_u32(a) (a)
5157 #define __vreinterpret_u8_u64(a) (a)
5158 #define __vreinterpret_u16_f32(a) (a)
5159 #define __vreinterpret_u16_s8(a) (a)
5160 #define __vreinterpret_u16_s16(a) (a)
5161 #define __vreinterpret_u16_s32(a) (a)
5162 #define __vreinterpret_u16_s64(a) (a)
5163 #define __vreinterpret_u16_p8(a) (a)
5164 #define __vreinterpret_u16_p16(a) (a)
5165 #define __vreinterpret_u16_u8(a) (a)
5166 #define __vreinterpret_u16_u32(a) (a)
5167 #define __vreinterpret_u16_u64(a) (a)
5168 #define __vreinterpret_u32_f32(a) (a)
5169 #define __vreinterpret_u32_s8(a) (a)
5170 #define __vreinterpret_u32_s16(a) (a)
5171 #define __vreinterpret_u32_s32(a) (a)
5172 #define __vreinterpret_u32_s64(a) (a)
5173 #define __vreinterpret_u32_p8(a) (a)
5174 #define __vreinterpret_u32_p16(a) (a)
5175 #define __vreinterpret_u32_u8(a) (a)
5176 #define __vreinterpret_u32_u16(a) (a)
5177 #define __vreinterpret_u32_u64(a) (a)
5178 #define __vreinterpret_u64_f32(a) (a)
5179 #define __vreinterpret_u64_s8(a) (a)
5180 #define __vreinterpret_u64_s16(a) (a)
5181 #define __vreinterpret_u64_s32(a) (a)
5182 #define __vreinterpret_u64_s64(a) (a)
5183 #define __vreinterpret_u64_p8(a) (a)
5184 #define __vreinterpret_u64_p16(a) (a)
5185 #define __vreinterpret_u64_u8(a) (a)
5186 #define __vreinterpret_u64_u16(a) (a)
5187 #define __vreinterpret_u64_u32(a) (a)
5188 #define __vreinterpretq_f32_s8(a) (a)
5189 #define __vreinterpretq_f32_s16(a) (a)
5190 #define __vreinterpretq_f32_s32(a) (a)
5191 #define __vreinterpretq_f32_s64(a) (a)
5192 #define __vreinterpretq_f32_p8(a) (a)
5193 #define __vreinterpretq_f32_p16(a) (a)
5194 #define __vreinterpretq_f32_u8(a) (a)
5195 #define __vreinterpretq_f32_u16(a) (a)
5196 #define __vreinterpretq_f32_u32(a) (a)
5197 #define __vreinterpretq_f32_u64(a) (a)
5198 #define __vreinterpretq_s8_f32(a) (a)
5199 #define __vreinterpretq_s8_s16(a) (a)
5200 #define __vreinterpretq_s8_s32(a) (a)
5201 #define __vreinterpretq_s8_s64(a) (a)
5202 #define __vreinterpretq_s8_p8(a) (a)
5203 #define __vreinterpretq_s8_p16(a) (a)
5204 #define __vreinterpretq_s8_u8(a) (a)
5205 #define __vreinterpretq_s8_u16(a) (a)
5206 #define __vreinterpretq_s8_u32(a) (a)
5207 #define __vreinterpretq_s8_u64(a) (a)
5208 #define __vreinterpretq_s16_f32(a) (a)
5209 #define __vreinterpretq_s16_s8(a) (a)
5210 #define __vreinterpretq_s16_s32(a) (a)
5211 #define __vreinterpretq_s16_s64(a) (a)
5212 #define __vreinterpretq_s16_p8(a) (a)
5213 #define __vreinterpretq_s16_p16(a) (a)
5214 #define __vreinterpretq_s16_u8(a) (a)
5215 #define __vreinterpretq_s16_u16(a) (a)
5216 #define __vreinterpretq_s16_u32(a) (a)
5217 #define __vreinterpretq_s16_u64(a) (a)
5218 #define __vreinterpretq_s32_f32(a) (a)
5219 #define __vreinterpretq_s32_s8(a) (a)
5220 #define __vreinterpretq_s32_s16(a) (a)
5221 #define __vreinterpretq_s32_s64(a) (a)
5222 #define __vreinterpretq_s32_p8(a) (a)
5223 #define __vreinterpretq_s32_p16(a) (a)
5224 #define __vreinterpretq_s32_u8(a) (a)
5225 #define __vreinterpretq_s32_u16(a) (a)
5226 #define __vreinterpretq_s32_u32(a) (a)
5227 #define __vreinterpretq_s32_u64(a) (a)
5228 #define __vreinterpretq_s64_f32(a) (a)
5229 #define __vreinterpretq_s64_s8(a) (a)
5230 #define __vreinterpretq_s64_s16(a) (a)
5231 #define __vreinterpretq_s64_s32(a) (a)
5232 #define __vreinterpretq_s64_p8(a) (a)
5233 #define __vreinterpretq_s64_p16(a) (a)
5234 #define __vreinterpretq_s64_u8(a) (a)
5235 #define __vreinterpretq_s64_u16(a) (a)
5236 #define __vreinterpretq_s64_u32(a) (a)
5237 #define __vreinterpretq_s64_u64(a) (a)
5238 #define __vreinterpretq_p8_f32(a) (a)
5239 #define __vreinterpretq_p8_s8(a) (a)
5240 #define __vreinterpretq_p8_s16(a) (a)
5241 #define __vreinterpretq_p8_s32(a) (a)
5242 #define __vreinterpretq_p8_s64(a) (a)
5243 #define __vreinterpretq_p8_p16(a) (a)
5244 #define __vreinterpretq_p8_u8(a) (a)
5245 #define __vreinterpretq_p8_u16(a) (a)
5246 #define __vreinterpretq_p8_u32(a) (a)
5247 #define __vreinterpretq_p8_u64(a) (a)
5248 #define __vreinterpretq_p16_f32(a) (a)
5249 #define __vreinterpretq_p16_s8(a) (a)
5250 #define __vreinterpretq_p16_s16(a) (a)
5251 #define __vreinterpretq_p16_s32(a) (a)
5252 #define __vreinterpretq_p16_s64(a) (a)
5253 #define __vreinterpretq_p16_p8(a) (a)
5254 #define __vreinterpretq_p16_u8(a) (a)
5255 #define __vreinterpretq_p16_u16(a) (a)
5256 #define __vreinterpretq_p16_u32(a) (a)
5257 #define __vreinterpretq_p16_u64(a) (a)
5258 #define __vreinterpretq_u8_f32(a) (a)
5259 #define __vreinterpretq_u8_s8(a) (a)
5260 #define __vreinterpretq_u8_s16(a) (a)
5261 #define __vreinterpretq_u8_s32(a) (a)
5262 #define __vreinterpretq_u8_s64(a) (a)
5263 #define __vreinterpretq_u8_p8(a) (a)
5264 #define __vreinterpretq_u8_p16(a) (a)
5265 #define __vreinterpretq_u8_u16(a) (a)
5266 #define __vreinterpretq_u8_u32(a) (a)
5267 #define __vreinterpretq_u8_u64(a) (a)
5268 #define __vreinterpretq_u16_f32(a) (a)
5269 #define __vreinterpretq_u16_s8(a) (a)
5270 #define __vreinterpretq_u16_s16(a) (a)
5271 #define __vreinterpretq_u16_s32(a) (a)
5272 #define __vreinterpretq_u16_s64(a) (a)
5273 #define __vreinterpretq_u16_p8(a) (a)
5274 #define __vreinterpretq_u16_p16(a) (a)
5275 #define __vreinterpretq_u16_u8(a) (a)
5276 #define __vreinterpretq_u16_u32(a) (a)
5277 #define __vreinterpretq_u16_u64(a) (a)
5278 #define __vreinterpretq_u32_f32(a) (a)
5279 #define __vreinterpretq_u32_s8(a) (a)
5280 #define __vreinterpretq_u32_s16(a) (a)
5281 #define __vreinterpretq_u32_s32(a) (a)
5282 #define __vreinterpretq_u32_s64(a) (a)
5283 #define __vreinterpretq_u32_p8(a) (a)
5284 #define __vreinterpretq_u32_p16(a) (a)
5285 #define __vreinterpretq_u32_u8(a) (a)
5286 #define __vreinterpretq_u32_u16(a) (a)
5287 #define __vreinterpretq_u32_u64(a) (a)
5288 #define __vreinterpretq_u64_f32(a) (a)
5289 #define __vreinterpretq_u64_s8(a) (a)
5290 #define __vreinterpretq_u64_s16(a) (a)
5291 #define __vreinterpretq_u64_s32(a) (a)
5292 #define __vreinterpretq_u64_s64(a) (a)
5293 #define __vreinterpretq_u64_p8(a) (a)
5294 #define __vreinterpretq_u64_p16(a) (a)
5295 #define __vreinterpretq_u64_u8(a) (a)
5296 #define __vreinterpretq_u64_u16(a) (a)
5297 #define __vreinterpretq_u64_u32(a) (a)
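// Editor's illustrative sketch (not part of the original header): because both
// operand widths are carried in the same __n64/__n128 unions, every
// __vreinterpret*_X_Y macro above expands to its argument unchanged and emits
// no code; only the element type the programmer has in mind changes. The
// hypothetical helper below assumes only declarations already made in this file.
__inline __n128 __example_blend_by_mask(__n128 _Qa, __n128 _Qb, __n128 _Qbits)
{
    __n128 _Mask = __vceqq_f32(_Qa, _Qb);                       // per-lane all-ones/all-zeros
    return __vandq_u32(__vreinterpretq_u32_f32(_Mask), _Qbits); // the cast is a pure no-op
}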
5298 
5299 // Multiply by scalar
5300 #define __vmul_n_s16(Vd, Rt) __vmul_lane_s16((Vd), __vmov_n_s16(Rt), 0)
5301 #define __vmul_n_s32(Vd, Rt) __vmul_lane_s32((Vd), __vmov_n_s32(Rt), 0)
5302 #define __vmul_n_u16(Vd, Rt) __vmul_lane_u16((Vd), __vmov_n_u16(Rt), 0)
5303 #define __vmul_n_u32(Vd, Rt) __vmul_lane_u32((Vd), __vmov_n_u32(Rt), 0)
5304 #define __vmulq_n_s16(Vd, Rt) __vmulq_lane_s16((Vd), __vmov_n_s16(Rt), 0)
5305 #define __vmulq_n_s32(Vd, Rt) __vmulq_lane_s32((Vd), __vmov_n_s32(Rt), 0)
5306 #define __vmulq_n_u16(Vd, Rt) __vmulq_lane_u16((Vd), __vmov_n_u16(Rt), 0)
5307 #define __vmulq_n_u32(Vd, Rt) __vmulq_lane_u32((Vd), __vmov_n_u32(Rt), 0)
5308 #define __vmull_n_s16(Vd, Rt) __vmull_lane_s16((Vd), __vmov_n_s16(Rt), 0)
5309 #define __vmull_n_s32(Vd, Rt) __vmull_lane_s32((Vd), __vmov_n_s32(Rt), 0)
5310 #define __vmull_n_u16(Vd, Rt) __vmull_lane_u16((Vd), __vmov_n_u16(Rt), 0)
5311 #define __vmull_n_u32(Vd, Rt) __vmull_lane_u32((Vd), __vmov_n_u32(Rt), 0)
5312 #define __vqdmulh_n_s16(Vd, Rt) __vqdmulh_lane_s16((Vd), __vmov_n_s16(Rt), 0)
5313 #define __vqdmulh_n_s32(Vd, Rt) __vqdmulh_lane_s32((Vd), __vmov_n_s32(Rt), 0)
5314 #define __vqdmulhq_n_s16(Vd, Rt) __vqdmulhq_lane_s16((Vd), __vmov_n_s16(Rt), 0)
5315 #define __vqdmulhq_n_s32(Vd, Rt) __vqdmulhq_lane_s32((Vd), __vmov_n_s32(Rt), 0)
5316 #define __vqdmull_n_s16(Vd, Rt) __vqdmull_lane_s16((Vd), __vmov_n_s16(Rt), 0)
5317 #define __vqdmull_n_s32(Vd, Rt) __vqdmull_lane_s32((Vd), __vmov_n_s32(Rt), 0)
5318 #define __vqrdmulh_n_s16(Vd, Rt) __vqrdmulh_lane_s16((Vd), __vmov_n_s16(Rt), 0)
5319 #define __vqrdmulh_n_s32(Vd, Rt) __vqrdmulh_lane_s32((Vd), __vmov_n_s32(Rt), 0)
5320 #define __vqrdmulhq_n_s16(Vd, Rt) __vqrdmulhq_lane_s16((Vd), __vmov_n_s16(Rt), 0)
5321 #define __vqrdmulhq_n_s32(Vd, Rt) __vqrdmulhq_lane_s32((Vd), __vmov_n_s32(Rt), 0)
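// Editor's illustrative sketch (not part of the original header): the *_n_*
// forms above have no dedicated intrinsic; each one broadcasts the scalar with
// __vmov_n_* and reuses the by-lane multiply on lane 0 of the broadcast vector.
__inline __n64 __example_vmul_n_s16(__n64 _Dn, int16_t _Rt)
{
    // Expands to __vmul_lane_s16(_Dn, __vmov_n_s16(_Rt), 0)
    return __vmul_n_s16(_Dn, _Rt);
}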
5322 
5323 // Multiply by scalar with accumulate
5324 #define __vmla_n_s16(Vd, Vn, Rt) __vmla_lane_s16((Vd), (Vn), __vmov_n_s16(Rt), 0)
5325 #define __vmla_n_s32(Vd, Vn, Rt) __vmla_lane_s32((Vd), (Vn), __vmov_n_s32(Rt), 0)
5326 #define __vmla_n_u16(Vd, Vn, Rt) __vmla_lane_u16((Vd), (Vn), __vmov_n_u16(Rt), 0)
5327 #define __vmla_n_u32(Vd, Vn, Rt) __vmla_lane_u32((Vd), (Vn), __vmov_n_u32(Rt), 0)
5328 #define __vmlaq_n_s16(Vd, Vn, Rt) __vmlaq_lane_s16((Vd), (Vn), __vmov_n_s16(Rt), 0)
5329 #define __vmlaq_n_s32(Vd, Vn, Rt) __vmlaq_lane_s32((Vd), (Vn), __vmov_n_s32(Rt), 0)
5330 #define __vmlaq_n_u16(Vd, Vn, Rt) __vmlaq_lane_u16((Vd), (Vn), __vmov_n_u16(Rt), 0)
5331 #define __vmlaq_n_u32(Vd, Vn, Rt) __vmlaq_lane_u32((Vd), (Vn), __vmov_n_u32(Rt), 0)
5332 #define __vmlal_n_s16(Vd, Vn, Rt) __vmlal_lane_s16((Vd), (Vn), __vmov_n_s16(Rt), 0)
5333 #define __vmlal_n_s32(Vd, Vn, Rt) __vmlal_lane_s32((Vd), (Vn), __vmov_n_s32(Rt), 0)
5334 #define __vmlal_n_u16(Vd, Vn, Rt) __vmlal_lane_u16((Vd), (Vn), __vmov_n_u16(Rt), 0)
5335 #define __vmlal_n_u32(Vd, Vn, Rt) __vmlal_lane_u32((Vd), (Vn), __vmov_n_u32(Rt), 0)
5336 #define __vmls_n_s16(Vd, Vn, Rt) __vmls_lane_s16((Vd), (Vn), __vmov_n_s16(Rt), 0)
5337 #define __vmls_n_s32(Vd, Vn, Rt) __vmls_lane_s32((Vd), (Vn), __vmov_n_s32(Rt), 0)
5338 #define __vmls_n_u16(Vd, Vn, Rt) __vmls_lane_u16((Vd), (Vn), __vmov_n_u16(Rt), 0)
5339 #define __vmls_n_u32(Vd, Vn, Rt) __vmls_lane_u32((Vd), (Vn), __vmov_n_u32(Rt), 0)
5340 #define __vmlsq_n_s16(Vd, Vn, Rt) __vmlsq_lane_s16((Vd), (Vn), __vmov_n_s16(Rt), 0)
5341 #define __vmlsq_n_s32(Vd, Vn, Rt) __vmlsq_lane_s32((Vd), (Vn), __vmov_n_s32(Rt), 0)
5342 #define __vmlsq_n_u16(Vd, Vn, Rt) __vmlsq_lane_u16((Vd), (Vn), __vmov_n_u16(Rt), 0)
5343 #define __vmlsq_n_u32(Vd, Vn, Rt) __vmlsq_lane_u32((Vd), (Vn), __vmov_n_u32(Rt), 0)
5344 #define __vmlsl_n_s16(Vd, Vn, Rt) __vmlsl_lane_s16((Vd), (Vn), __vmov_n_s16(Rt), 0)
5345 #define __vmlsl_n_s32(Vd, Vn, Rt) __vmlsl_lane_s32((Vd), (Vn), __vmov_n_s32(Rt), 0)
5346 #define __vmlsl_n_u16(Vd, Vn, Rt) __vmlsl_lane_u16((Vd), (Vn), __vmov_n_u16(Rt), 0)
5347 #define __vmlsl_n_u32(Vd, Vn, Rt) __vmlsl_lane_u32((Vd), (Vn), __vmov_n_u32(Rt), 0)
5348 #define __vqdmlal_n_s16(Vd, Vn, Rt) __vqdmlal_lane_s16((Vd), (Vn), __vmov_n_s16(Rt), 0)
5349 #define __vqdmlal_n_s32(Vd, Vn, Rt) __vqdmlal_lane_s32((Vd), (Vn), __vmov_n_s32(Rt), 0)
5350 #define __vqdmlsl_n_s16(Vd, Vn, Rt) __vqdmlsl_lane_s16((Vd), (Vn), __vmov_n_s16(Rt), 0)
5351 #define __vqdmlsl_n_s32(Vd, Vn, Rt) __vqdmlsl_lane_s32((Vd), (Vn), __vmov_n_s32(Rt), 0)
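// Editor's illustrative sketch (not part of the original header): the
// accumulate-by-scalar forms follow the same pattern, broadcasting the scalar
// and then deferring to the by-lane multiply-accumulate.
__inline __n64 __example_vmla_n_s16(__n64 _Dd, __n64 _Dn, int16_t _Rt)
{
    // Expands to __vmla_lane_s16(_Dd, _Dn, __vmov_n_s16(_Rt), 0), i.e. _Dd + _Dn * _Rt per lane
    return __vmla_n_s16(_Dd, _Dn, _Rt);
}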
5352 
5353 // VDUP.64 (scalar)
5354 #define __vdup_lane_s64(Dn, lane) (__static_assert((lane) == 0, "invalid lane index"), (Dn))
5355 #define __vdup_lane_u64(Dn, lane) (__static_assert((lane) == 0, "invalid lane index"), (Dn))
5356 
5357 // VDUPQ.64 (scalar)
5358 #define __vdupq_lane_s64(Dn, lane) (__static_assert((lane) == 0, "invalid lane index"), __vcombine_s64((Dn), (Dn)))
5359 #define __vdupq_lane_u64(Dn, lane) (__static_assert((lane) == 0, "invalid lane index"), __vcombine_u64((Dn), (Dn)))
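// Editor's illustrative sketch (not part of the original header): a D register
// holds exactly one 64-bit lane, so the lane index is compile-time checked to
// be 0; the Q-register form simply pairs the doubleword with itself.
__inline __n128 __example_vdupq_lane_s64(__n64 _Dn)
{
    return __vdupq_lane_s64(_Dn, 0);   // yields { low64 = _Dn, high64 = _Dn }
}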
5360 
5361 #if !defined(_ARM_ISO_COMPATIBLE_INTRINSIC_NAMES)
5362 
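// Editor's note (not part of the original header): unless
// _ARM_ISO_COMPATIBLE_INTRINSIC_NAMES is defined before this header is
// included, the conventional ACLE-style names below are plain aliases for the
// compiler's __-prefixed intrinsics, so the two calls in this sketch are
// interchangeable once the aliases are in scope:
//
//   __n64 _Sum  = vadd_f32(_Da, _Db);    // alias defined later in this block
//   __n64 _Sum2 = __vadd_f32(_Da, _Db);  // underlying intrinsic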
5363 #define aesd_p8 __aesd_p8
5364 #define aesd_s8 __aesd_s8
5365 #define aesd_u8 __aesd_u8
5366 #define aese_p8 __aese_p8
5367 #define aese_s8 __aese_s8
5368 #define aese_u8 __aese_u8
5369 #define aesimc_p8 __aesimc_p8
5370 #define aesimc_s8 __aesimc_s8
5371 #define aesimc_u8 __aesimc_u8
5372 #define aesmc_p8 __aesmc_p8
5373 #define aesmc_s8 __aesmc_s8
5374 #define aesmc_u8 __aesmc_u8
5375 #define sha1h_f32 __sha1h_f32
5376 #define sha1h_s32 __sha1h_s32
5377 #define sha1h_u32 __sha1h_u32
5378 #define sha1su1_f32 __sha1su1_f32
5379 #define sha1su1_s32 __sha1su1_s32
5380 #define sha1su1_u32 __sha1su1_u32
5381 #define sha256su0_f32 __sha256su0_f32
5382 #define sha256su0_s32 __sha256su0_s32
5383 #define sha256su0_u32 __sha256su0_u32
5384 #define sha1c_f32 __sha1c_f32
5385 #define sha1c_s32 __sha1c_s32
5386 #define sha1c_u32 __sha1c_u32
5387 #define sha1m_f32 __sha1m_f32
5388 #define sha1m_s32 __sha1m_s32
5389 #define sha1m_u32 __sha1m_u32
5390 #define sha1p_f32 __sha1p_f32
5391 #define sha1p_s32 __sha1p_s32
5392 #define sha1p_u32 __sha1p_u32
5393 #define sha1su0_f32 __sha1su0_f32
5394 #define sha1su0_s32 __sha1su0_s32
5395 #define sha1su0_u32 __sha1su0_u32
5396 #define sha256h_f32 __sha256h_f32
5397 #define sha256h_s32 __sha256h_s32
5398 #define sha256h_u32 __sha256h_u32
5399 #define sha256h2_f32 __sha256h2_f32
5400 #define sha256h2_s32 __sha256h2_s32
5401 #define sha256h2_u32 __sha256h2_u32
5402 #define sha256su1_f32 __sha256su1_f32
5403 #define sha256su1_s32 __sha256su1_s32
5404 #define sha256su1_u32 __sha256su1_u32
5405 #define vaba_s16 __vaba_s16
5406 #define vaba_s32 __vaba_s32
5407 #define vaba_s8 __vaba_s8
5408 #define vaba_u16 __vaba_u16
5409 #define vaba_u32 __vaba_u32
5410 #define vaba_u8 __vaba_u8
5411 #define vabal_s16 __vabal_s16
5412 #define vabal_s32 __vabal_s32
5413 #define vabal_s8 __vabal_s8
5414 #define vabal_u16 __vabal_u16
5415 #define vabal_u32 __vabal_u32
5416 #define vabal_u8 __vabal_u8
5417 #define vabaq_s16 __vabaq_s16
5418 #define vabaq_s32 __vabaq_s32
5419 #define vabaq_s8 __vabaq_s8
5420 #define vabaq_u16 __vabaq_u16
5421 #define vabaq_u32 __vabaq_u32
5422 #define vabaq_u8 __vabaq_u8
5423 #define vabd_f32 __vabd_f32
5424 #define vabdq_f32 __vabdq_f32
5425 #define vabd_s16 __vabd_s16
5426 #define vabd_s32 __vabd_s32
5427 #define vabd_s8 __vabd_s8
5428 #define vabd_u16 __vabd_u16
5429 #define vabd_u32 __vabd_u32
5430 #define vabd_u8 __vabd_u8
5431 #define vabdl_s16 __vabdl_s16
5432 #define vabdl_s32 __vabdl_s32
5433 #define vabdl_s8 __vabdl_s8
5434 #define vabdl_u16 __vabdl_u16
5435 #define vabdl_u32 __vabdl_u32
5436 #define vabdl_u8 __vabdl_u8
5437 #define vabdq_s16 __vabdq_s16
5438 #define vabdq_s32 __vabdq_s32
5439 #define vabdq_s8 __vabdq_s8
5440 #define vabdq_u16 __vabdq_u16
5441 #define vabdq_u32 __vabdq_u32
5442 #define vabdq_u8 __vabdq_u8
5443 #define vabs_f32 __vabs_f32
5444 #define vabs_s16 __vabs_s16
5445 #define vabs_s32 __vabs_s32
5446 #define vabs_s8 __vabs_s8
5447 #define vneg_f32 __vneg_f32
5448 #define vneg_s16 __vneg_s16
5449 #define vneg_s32 __vneg_s32
5450 #define vneg_s8 __vneg_s8
5451 #define vabsq_f32 __vabsq_f32
5452 #define vabsq_s16 __vabsq_s16
5453 #define vabsq_s32 __vabsq_s32
5454 #define vabsq_s8 __vabsq_s8
5455 #define vnegq_f32 __vnegq_f32
5456 #define vnegq_s16 __vnegq_s16
5457 #define vnegq_s32 __vnegq_s32
5458 #define vnegq_s8 __vnegq_s8
5459 #define vacge_f32 __vacge_f32
5460 #define vacgt_f32 __vacgt_f32
5461 #define vacle_f32 __vacle_f32
5462 #define vaclt_f32 __vaclt_f32
5463 #define vacgeq_f32 __vacgeq_f32
5464 #define vacgtq_f32 __vacgtq_f32
5465 #define vacleq_f32 __vacleq_f32
5466 #define vacltq_f32 __vacltq_f32
5467 #define vadd_f32 __vadd_f32
5468 #define vadd_s16 __vadd_s16
5469 #define vadd_s32 __vadd_s32
5470 #define vadd_s64 __vadd_s64
5471 #define vadd_s8 __vadd_s8
5472 #define vadd_u16 __vadd_u16
5473 #define vadd_u32 __vadd_u32
5474 #define vadd_u64 __vadd_u64
5475 #define vadd_u8 __vadd_u8
5476 #define vaddq_f32 __vaddq_f32
5477 #define vaddq_s16 __vaddq_s16
5478 #define vaddq_s32 __vaddq_s32
5479 #define vaddq_s64 __vaddq_s64
5480 #define vaddq_s8 __vaddq_s8
5481 #define vaddq_u16 __vaddq_u16
5482 #define vaddq_u32 __vaddq_u32
5483 #define vaddq_u64 __vaddq_u64
5484 #define vaddq_u8 __vaddq_u8
5485 #define vaddhn_s16 __vaddhn_s16
5486 #define vaddhn_s32 __vaddhn_s32
5487 #define vaddhn_s64 __vaddhn_s64
5488 #define vaddhn_u16 __vaddhn_u16
5489 #define vaddhn_u32 __vaddhn_u32
5490 #define vaddhn_u64 __vaddhn_u64
5491 #define vraddhn_s16 __vraddhn_s16
5492 #define vraddhn_s32 __vraddhn_s32
5493 #define vraddhn_s64 __vraddhn_s64
5494 #define vraddhn_u16 __vraddhn_u16
5495 #define vraddhn_u32 __vraddhn_u32
5496 #define vraddhn_u64 __vraddhn_u64
5497 #define vaddl_s16 __vaddl_s16
5498 #define vaddl_s32 __vaddl_s32
5499 #define vaddl_s8 __vaddl_s8
5500 #define vaddl_u16 __vaddl_u16
5501 #define vaddl_u32 __vaddl_u32
5502 #define vaddl_u8 __vaddl_u8
5503 #define vaddw_s16 __vaddw_s16
5504 #define vaddw_s32 __vaddw_s32
5505 #define vaddw_s8 __vaddw_s8
5506 #define vaddw_u16 __vaddw_u16
5507 #define vaddw_u32 __vaddw_u32
5508 #define vaddw_u8 __vaddw_u8
5509 #define vand_s16 __vand_s16
5510 #define vand_s32 __vand_s32
5511 #define vand_s64 __vand_s64
5512 #define vand_s8 __vand_s8
5513 #define vand_u16 __vand_u16
5514 #define vand_u32 __vand_u32
5515 #define vand_u64 __vand_u64
5516 #define vand_u8 __vand_u8
5517 #define vorr_s16 __vorr_s16
5518 #define vorr_s32 __vorr_s32
5519 #define vorr_s64 __vorr_s64
5520 #define vorr_s8 __vorr_s8
5521 #define vorr_u16 __vorr_u16
5522 #define vorr_u32 __vorr_u32
5523 #define vorr_u64 __vorr_u64
5524 #define vorr_u8 __vorr_u8
5525 #define vandq_s16 __vandq_s16
5526 #define vandq_s32 __vandq_s32
5527 #define vandq_s64 __vandq_s64
5528 #define vandq_s8 __vandq_s8
5529 #define vandq_u16 __vandq_u16
5530 #define vandq_u32 __vandq_u32
5531 #define vandq_u64 __vandq_u64
5532 #define vandq_u8 __vandq_u8
5533 #define vorrq_s16 __vorrq_s16
5534 #define vorrq_s32 __vorrq_s32
5535 #define vorrq_s64 __vorrq_s64
5536 #define vorrq_s8 __vorrq_s8
5537 #define vorrq_u16 __vorrq_u16
5538 #define vorrq_u32 __vorrq_u32
5539 #define vorrq_u64 __vorrq_u64
5540 #define vorrq_u8 __vorrq_u8
5541 #define vbif_f32 __vbif_f32
5542 #define vbif_p16 __vbif_p16
5543 #define vbif_p8 __vbif_p8
5544 #define vbif_s16 __vbif_s16
5545 #define vbif_s32 __vbif_s32
5546 #define vbif_s64 __vbif_s64
5547 #define vbif_s8 __vbif_s8
5548 #define vbif_u16 __vbif_u16
5549 #define vbif_u32 __vbif_u32
5550 #define vbif_u64 __vbif_u64
5551 #define vbif_u8 __vbif_u8
5552 #define vbit_f32 __vbit_f32
5553 #define vbit_p16 __vbit_p16
5554 #define vbit_p8 __vbit_p8
5555 #define vbit_s16 __vbit_s16
5556 #define vbit_s32 __vbit_s32
5557 #define vbit_s64 __vbit_s64
5558 #define vbit_s8 __vbit_s8
5559 #define vbit_u16 __vbit_u16
5560 #define vbit_u32 __vbit_u32
5561 #define vbit_u64 __vbit_u64
5562 #define vbit_u8 __vbit_u8
5563 #define vbsl_f32 __vbsl_f32
5564 #define vbsl_p16 __vbsl_p16
5565 #define vbsl_p8 __vbsl_p8
5566 #define vbsl_s16 __vbsl_s16
5567 #define vbsl_s32 __vbsl_s32
5568 #define vbsl_s64 __vbsl_s64
5569 #define vbsl_s8 __vbsl_s8
5570 #define vbsl_u16 __vbsl_u16
5571 #define vbsl_u32 __vbsl_u32
5572 #define vbsl_u64 __vbsl_u64
5573 #define vbsl_u8 __vbsl_u8
5574 #define vbifq_f32 __vbifq_f32
5575 #define vbifq_p16 __vbifq_p16
5576 #define vbifq_p8 __vbifq_p8
5577 #define vbifq_s16 __vbifq_s16
5578 #define vbifq_s32 __vbifq_s32
5579 #define vbifq_s64 __vbifq_s64
5580 #define vbifq_s8 __vbifq_s8
5581 #define vbifq_u16 __vbifq_u16
5582 #define vbifq_u32 __vbifq_u32
5583 #define vbifq_u64 __vbifq_u64
5584 #define vbifq_u8 __vbifq_u8
5585 #define vbitq_f32 __vbitq_f32
5586 #define vbitq_p16 __vbitq_p16
5587 #define vbitq_p8 __vbitq_p8
5588 #define vbitq_s16 __vbitq_s16
5589 #define vbitq_s32 __vbitq_s32
5590 #define vbitq_s64 __vbitq_s64
5591 #define vbitq_s8 __vbitq_s8
5592 #define vbitq_u16 __vbitq_u16
5593 #define vbitq_u32 __vbitq_u32
5594 #define vbitq_u64 __vbitq_u64
5595 #define vbitq_u8 __vbitq_u8
5596 #define vbslq_f32 __vbslq_f32
5597 #define vbslq_p16 __vbslq_p16
5598 #define vbslq_p8 __vbslq_p8
5599 #define vbslq_s16 __vbslq_s16
5600 #define vbslq_s32 __vbslq_s32
5601 #define vbslq_s64 __vbslq_s64
5602 #define vbslq_s8 __vbslq_s8
5603 #define vbslq_u16 __vbslq_u16
5604 #define vbslq_u32 __vbslq_u32
5605 #define vbslq_u64 __vbslq_u64
5606 #define vbslq_u8 __vbslq_u8
5607 #define vceq_z_f32_ex __vceq_z_f32_ex
5608 #define vceq_z_s16_ex __vceq_z_s16_ex
5609 #define vceq_z_s32_ex __vceq_z_s32_ex
5610 #define vceq_z_s8_ex __vceq_z_s8_ex
5611 #define vceq_z_u16_ex __vceq_z_u16_ex
5612 #define vceq_z_u32_ex __vceq_z_u32_ex
5613 #define vceq_z_u8_ex __vceq_z_u8_ex
5614 #define vceqq_z_f32_ex __vceqq_z_f32_ex
5615 #define vceqq_z_s16_ex __vceqq_z_s16_ex
5616 #define vceqq_z_s32_ex __vceqq_z_s32_ex
5617 #define vceqq_z_s8_ex __vceqq_z_s8_ex
5618 #define vceqq_z_u16_ex __vceqq_z_u16_ex
5619 #define vceqq_z_u32_ex __vceqq_z_u32_ex
5620 #define vceqq_z_u8_ex __vceqq_z_u8_ex
5621 #define vceq_f32 __vceq_f32
5622 #define vceq_p8 __vceq_p8
5623 #define vceq_s16 __vceq_s16
5624 #define vceq_s32 __vceq_s32
5625 #define vceq_s8 __vceq_s8
5626 #define vceq_u16 __vceq_u16
5627 #define vceq_u32 __vceq_u32
5628 #define vceq_u8 __vceq_u8
5629 #define vceqq_f32 __vceqq_f32
5630 #define vceqq_p8 __vceqq_p8
5631 #define vceqq_s16 __vceqq_s16
5632 #define vceqq_s32 __vceqq_s32
5633 #define vceqq_s8 __vceqq_s8
5634 #define vceqq_u16 __vceqq_u16
5635 #define vceqq_u32 __vceqq_u32
5636 #define vceqq_u8 __vceqq_u8
5637 #define vcge_z_f32_ex __vcge_z_f32_ex
5638 #define vcge_z_s16_ex __vcge_z_s16_ex
5639 #define vcge_z_s32_ex __vcge_z_s32_ex
5640 #define vcge_z_s8_ex __vcge_z_s8_ex
5641 #define vcgeq_z_f32_ex __vcgeq_z_f32_ex
5642 #define vcgeq_z_s16_ex __vcgeq_z_s16_ex
5643 #define vcgeq_z_s32_ex __vcgeq_z_s32_ex
5644 #define vcgeq_z_s8_ex __vcgeq_z_s8_ex
5645 #define vcge_f32 __vcge_f32
5646 #define vcge_s16 __vcge_s16
5647 #define vcge_s32 __vcge_s32
5648 #define vcge_s8 __vcge_s8
5649 #define vcge_u16 __vcge_u16
5650 #define vcge_u32 __vcge_u32
5651 #define vcge_u8 __vcge_u8
5652 #define vcle_f32 __vcle_f32
5653 #define vcle_s16 __vcle_s16
5654 #define vcle_s32 __vcle_s32
5655 #define vcle_s8 __vcle_s8
5656 #define vcle_u16 __vcle_u16
5657 #define vcle_u32 __vcle_u32
5658 #define vcle_u8 __vcle_u8
5659 #define vcgeq_f32 __vcgeq_f32
5660 #define vcgeq_s16 __vcgeq_s16
5661 #define vcgeq_s32 __vcgeq_s32
5662 #define vcgeq_s8 __vcgeq_s8
5663 #define vcgeq_u16 __vcgeq_u16
5664 #define vcgeq_u32 __vcgeq_u32
5665 #define vcgeq_u8 __vcgeq_u8
5666 #define vcleq_f32 __vcleq_f32
5667 #define vcleq_s16 __vcleq_s16
5668 #define vcleq_s32 __vcleq_s32
5669 #define vcleq_s8 __vcleq_s8
5670 #define vcleq_u16 __vcleq_u16
5671 #define vcleq_u32 __vcleq_u32
5672 #define vcleq_u8 __vcleq_u8
5673 #define vcgt_z_f32_ex __vcgt_z_f32_ex
5674 #define vcgt_z_s16_ex __vcgt_z_s16_ex
5675 #define vcgt_z_s32_ex __vcgt_z_s32_ex
5676 #define vcgt_z_s8_ex __vcgt_z_s8_ex
5677 #define vcgtq_z_f32_ex __vcgtq_z_f32_ex
5678 #define vcgtq_z_s16_ex __vcgtq_z_s16_ex
5679 #define vcgtq_z_s32_ex __vcgtq_z_s32_ex
5680 #define vcgtq_z_s8_ex __vcgtq_z_s8_ex
5681 #define vcgt_f32 __vcgt_f32
5682 #define vcgt_s16 __vcgt_s16
5683 #define vcgt_s32 __vcgt_s32
5684 #define vcgt_s8 __vcgt_s8
5685 #define vcgt_u16 __vcgt_u16
5686 #define vcgt_u32 __vcgt_u32
5687 #define vcgt_u8 __vcgt_u8
5688 #define vclt_f32 __vclt_f32
5689 #define vclt_s16 __vclt_s16
5690 #define vclt_s32 __vclt_s32
5691 #define vclt_s8 __vclt_s8
5692 #define vclt_u16 __vclt_u16
5693 #define vclt_u32 __vclt_u32
5694 #define vclt_u8 __vclt_u8
5695 #define vcgtq_f32 __vcgtq_f32
5696 #define vcgtq_s16 __vcgtq_s16
5697 #define vcgtq_s32 __vcgtq_s32
5698 #define vcgtq_s8 __vcgtq_s8
5699 #define vcgtq_u16 __vcgtq_u16
5700 #define vcgtq_u32 __vcgtq_u32
5701 #define vcgtq_u8 __vcgtq_u8
5702 #define vcltq_f32 __vcltq_f32
5703 #define vcltq_s16 __vcltq_s16
5704 #define vcltq_s32 __vcltq_s32
5705 #define vcltq_s8 __vcltq_s8
5706 #define vcltq_u16 __vcltq_u16
5707 #define vcltq_u32 __vcltq_u32
5708 #define vcltq_u8 __vcltq_u8
5709 #define vcle_z_f32_ex __vcle_z_f32_ex
5710 #define vcle_z_s16_ex __vcle_z_s16_ex
5711 #define vcle_z_s32_ex __vcle_z_s32_ex
5712 #define vcle_z_s8_ex __vcle_z_s8_ex
5713 #define vcleq_z_f32_ex __vcleq_z_f32_ex
5714 #define vcleq_z_s16_ex __vcleq_z_s16_ex
5715 #define vcleq_z_s32_ex __vcleq_z_s32_ex
5716 #define vcleq_z_s8_ex __vcleq_z_s8_ex
5717 #define vcls_s16 __vcls_s16
5718 #define vcls_s32 __vcls_s32
5719 #define vcls_s8 __vcls_s8
5720 #define vclz_s16 __vclz_s16
5721 #define vclz_s32 __vclz_s32
5722 #define vclz_s8 __vclz_s8
5723 #define vclz_u16 __vclz_u16
5724 #define vclz_u32 __vclz_u32
5725 #define vclz_u8 __vclz_u8
5726 #define vclsq_s16 __vclsq_s16
5727 #define vclsq_s32 __vclsq_s32
5728 #define vclsq_s8 __vclsq_s8
5729 #define vclzq_s16 __vclzq_s16
5730 #define vclzq_s32 __vclzq_s32
5731 #define vclzq_s8 __vclzq_s8
5732 #define vclzq_u16 __vclzq_u16
5733 #define vclzq_u32 __vclzq_u32
5734 #define vclzq_u8 __vclzq_u8
5735 #define vclt_z_f32_ex __vclt_z_f32_ex
5736 #define vclt_z_s16_ex __vclt_z_s16_ex
5737 #define vclt_z_s32_ex __vclt_z_s32_ex
5738 #define vclt_z_s8_ex __vclt_z_s8_ex
5739 #define vcltq_z_f32_ex __vcltq_z_f32_ex
5740 #define vcltq_z_s16_ex __vcltq_z_s16_ex
5741 #define vcltq_z_s32_ex __vcltq_z_s32_ex
5742 #define vcltq_z_s8_ex __vcltq_z_s8_ex
5743 #define vcnt_p8 __vcnt_p8
5744 #define vcnt_s8 __vcnt_s8
5745 #define vcnt_u8 __vcnt_u8
5746 #define vcntq_p8 __vcntq_p8
5747 #define vcntq_s8 __vcntq_s8
5748 #define vcntq_u8 __vcntq_u8
5749 #define vcombine_f32 __vcombine_f32
5750 #define vcombine_p16 __vcombine_p16
5751 #define vcombine_p8 __vcombine_p8
5752 #define vcombine_s16 __vcombine_s16
5753 #define vcombine_s32 __vcombine_s32
5754 #define vcombine_s64 __vcombine_s64
5755 #define vcombine_s8 __vcombine_s8
5756 #define vcombine_u16 __vcombine_u16
5757 #define vcombine_u32 __vcombine_u32
5758 #define vcombine_u64 __vcombine_u64
5759 #define vcombine_u8 __vcombine_u8
5760 #define vcreate_f32 __vcreate_f32
5761 #define vcreate_p16 __vcreate_p16
5762 #define vcreate_p8 __vcreate_p8
5763 #define vcreate_s16 __vcreate_s16
5764 #define vcreate_s32 __vcreate_s32
5765 #define vcreate_s64 __vcreate_s64
5766 #define vcreate_s8 __vcreate_s8
5767 #define vcreate_u16 __vcreate_u16
5768 #define vcreate_u32 __vcreate_u32
5769 #define vcreate_u64 __vcreate_u64
5770 #define vcreate_u8 __vcreate_u8
5771 #define vcvt_n_f32_s32 __vcvt_n_f32_s32
5772 #define vcvt_n_f32_u32 __vcvt_n_f32_u32
5773 #define vcvt_n_s32_f32 __vcvt_n_s32_f32
5774 #define vcvt_n_u32_f32 __vcvt_n_u32_f32
5775 #define vcvtq_n_f32_s32 __vcvtq_n_f32_s32
5776 #define vcvtq_n_f32_u32 __vcvtq_n_f32_u32
5777 #define vcvtq_n_s32_f32 __vcvtq_n_s32_f32
5778 #define vcvtq_n_u32_f32 __vcvtq_n_u32_f32
5779 #define vcvta_s32_f32 __vcvta_s32_f32
5780 #define vcvta_u32_f32 __vcvta_u32_f32
5781 #define vcvtm_s32_f32 __vcvtm_s32_f32
5782 #define vcvtm_u32_f32 __vcvtm_u32_f32
5783 #define vcvtn_s32_f32 __vcvtn_s32_f32
5784 #define vcvtn_u32_f32 __vcvtn_u32_f32
5785 #define vcvtp_s32_f32 __vcvtp_s32_f32
5786 #define vcvtp_u32_f32 __vcvtp_u32_f32
5787 #define vcvtaq_s32_f32 __vcvtaq_s32_f32
5788 #define vcvtaq_u32_f32 __vcvtaq_u32_f32
5789 #define vcvtmq_s32_f32 __vcvtmq_s32_f32
5790 #define vcvtmq_u32_f32 __vcvtmq_u32_f32
5791 #define vcvtnq_s32_f32 __vcvtnq_s32_f32
5792 #define vcvtnq_u32_f32 __vcvtnq_u32_f32
5793 #define vcvtpq_s32_f32 __vcvtpq_s32_f32
5794 #define vcvtpq_u32_f32 __vcvtpq_u32_f32
5795 #define vcvt_f32_s32 __vcvt_f32_s32
5796 #define vcvt_f32_u32 __vcvt_f32_u32
5797 #define vcvt_s32_f32 __vcvt_s32_f32
5798 #define vcvt_u32_f32 __vcvt_u32_f32
5799 #define vcvtq_f32_s32 __vcvtq_f32_s32
5800 #define vcvtq_f32_u32 __vcvtq_f32_u32
5801 #define vcvtq_s32_f32 __vcvtq_s32_f32
5802 #define vcvtq_u32_f32 __vcvtq_u32_f32
5803 #define vdup_lane_f32 __vdup_lane_f32
5804 #define vdup_lane_p16 __vdup_lane_p16
5805 #define vdup_lane_p8 __vdup_lane_p8
5806 #define vdup_lane_s16 __vdup_lane_s16
5807 #define vdup_lane_s32 __vdup_lane_s32
5808 #define vdup_lane_s8 __vdup_lane_s8
5809 #define vdup_lane_u16 __vdup_lane_u16
5810 #define vdup_lane_u32 __vdup_lane_u32
5811 #define vdup_lane_u8 __vdup_lane_u8
5812 #define vdupq_lane_f32 __vdupq_lane_f32
5813 #define vdupq_lane_p16 __vdupq_lane_p16
5814 #define vdupq_lane_p8 __vdupq_lane_p8
5815 #define vdupq_lane_s16 __vdupq_lane_s16
5816 #define vdupq_lane_s32 __vdupq_lane_s32
5817 #define vdupq_lane_s8 __vdupq_lane_s8
5818 #define vdupq_lane_u16 __vdupq_lane_u16
5819 #define vdupq_lane_u32 __vdupq_lane_u32
5820 #define vdupq_lane_u8 __vdupq_lane_u8
5821 #define vdup_n_f32 __vdup_n_f32
5822 #define vmov_n_f32 __vmov_n_f32
5823 #define vdup_n_p16 __vdup_n_p16
5824 #define vdup_n_p8 __vdup_n_p8
5825 #define vdup_n_s16 __vdup_n_s16
5826 #define vdup_n_s32 __vdup_n_s32
5827 #define vdup_n_s8 __vdup_n_s8
5828 #define vdup_n_u16 __vdup_n_u16
5829 #define vdup_n_u32 __vdup_n_u32
5830 #define vdup_n_u8 __vdup_n_u8
5831 #define vmov_n_p16 __vmov_n_p16
5832 #define vmov_n_p8 __vmov_n_p8
5833 #define vmov_n_s16 __vmov_n_s16
5834 #define vmov_n_s32 __vmov_n_s32
5835 #define vmov_n_s8 __vmov_n_s8
5836 #define vmov_n_u16 __vmov_n_u16
5837 #define vmov_n_u32 __vmov_n_u32
5838 #define vmov_n_u8 __vmov_n_u8
5839 #define vdupq_n_f32 __vdupq_n_f32
5840 #define vmovq_n_f32 __vmovq_n_f32
5841 #define vdupq_n_p16 __vdupq_n_p16
5842 #define vdupq_n_p8 __vdupq_n_p8
5843 #define vdupq_n_s16 __vdupq_n_s16
5844 #define vdupq_n_s32 __vdupq_n_s32
5845 #define vdupq_n_s8 __vdupq_n_s8
5846 #define vdupq_n_u16 __vdupq_n_u16
5847 #define vdupq_n_u32 __vdupq_n_u32
5848 #define vdupq_n_u8 __vdupq_n_u8
5849 #define vmovq_n_p16 __vmovq_n_p16
5850 #define vmovq_n_p8 __vmovq_n_p8
5851 #define vmovq_n_s16 __vmovq_n_s16
5852 #define vmovq_n_s32 __vmovq_n_s32
5853 #define vmovq_n_s8 __vmovq_n_s8
5854 #define vmovq_n_u16 __vmovq_n_u16
5855 #define vmovq_n_u32 __vmovq_n_u32
5856 #define vmovq_n_u8 __vmovq_n_u8
5857 #define vdup_n_s64 __vdup_n_s64
5858 #define vdup_n_u64 __vdup_n_u64
5859 #define vmov_n_s64 __vmov_n_s64
5860 #define vmov_n_u64 __vmov_n_u64
5861 #define vdupq_n_s64 __vdupq_n_s64
5862 #define vdupq_n_u64 __vdupq_n_u64
5863 #define vmovq_n_s64 __vmovq_n_s64
5864 #define vmovq_n_u64 __vmovq_n_u64
5865 #define vbic_s16 __vbic_s16
5866 #define vbic_s32 __vbic_s32
5867 #define vbic_s64 __vbic_s64
5868 #define vbic_s8 __vbic_s8
5869 #define vbic_u16 __vbic_u16
5870 #define vbic_u32 __vbic_u32
5871 #define vbic_u64 __vbic_u64
5872 #define vbic_u8 __vbic_u8
5873 #define veor_s16 __veor_s16
5874 #define veor_s32 __veor_s32
5875 #define veor_s64 __veor_s64
5876 #define veor_s8 __veor_s8
5877 #define veor_u16 __veor_u16
5878 #define veor_u32 __veor_u32
5879 #define veor_u64 __veor_u64
5880 #define veor_u8 __veor_u8
5881 #define vorn_s16 __vorn_s16
5882 #define vorn_s32 __vorn_s32
5883 #define vorn_s64 __vorn_s64
5884 #define vorn_s8 __vorn_s8
5885 #define vorn_u16 __vorn_u16
5886 #define vorn_u32 __vorn_u32
5887 #define vorn_u64 __vorn_u64
5888 #define vorn_u8 __vorn_u8
5889 #define vbicq_s16 __vbicq_s16
5890 #define vbicq_s32 __vbicq_s32
5891 #define vbicq_s64 __vbicq_s64
5892 #define vbicq_s8 __vbicq_s8
5893 #define vbicq_u16 __vbicq_u16
5894 #define vbicq_u32 __vbicq_u32
5895 #define vbicq_u64 __vbicq_u64
5896 #define vbicq_u8 __vbicq_u8
5897 #define veorq_s16 __veorq_s16
5898 #define veorq_s32 __veorq_s32
5899 #define veorq_s64 __veorq_s64
5900 #define veorq_s8 __veorq_s8
5901 #define veorq_u16 __veorq_u16
5902 #define veorq_u32 __veorq_u32
5903 #define veorq_u64 __veorq_u64
5904 #define veorq_u8 __veorq_u8
5905 #define vornq_s16 __vornq_s16
5906 #define vornq_s32 __vornq_s32
5907 #define vornq_s64 __vornq_s64
5908 #define vornq_s8 __vornq_s8
5909 #define vornq_u16 __vornq_u16
5910 #define vornq_u32 __vornq_u32
5911 #define vornq_u64 __vornq_u64
5912 #define vornq_u8 __vornq_u8
5913 #define vext_f32 __vext_f32
5914 #define vext_p16 __vext_p16
5915 #define vext_p8 __vext_p8
5916 #define vext_s16 __vext_s16
5917 #define vext_s32 __vext_s32
5918 #define vext_s64 __vext_s64
5919 #define vext_s8 __vext_s8
5920 #define vext_u16 __vext_u16
5921 #define vext_u32 __vext_u32
5922 #define vext_u64 __vext_u64
5923 #define vext_u8 __vext_u8
5924 #define vextq_f32 __vextq_f32
5925 #define vextq_p16 __vextq_p16
5926 #define vextq_p8 __vextq_p8
5927 #define vextq_s16 __vextq_s16
5928 #define vextq_s32 __vextq_s32
5929 #define vextq_s64 __vextq_s64
5930 #define vextq_s8 __vextq_s8
5931 #define vextq_u16 __vextq_u16
5932 #define vextq_u32 __vextq_u32
5933 #define vextq_u64 __vextq_u64
5934 #define vextq_u8 __vextq_u8
5935 #define vget_high_f32 __vget_high_f32
5936 #define vget_high_p16 __vget_high_p16
5937 #define vget_high_p8 __vget_high_p8
5938 #define vget_high_s16 __vget_high_s16
5939 #define vget_high_s32 __vget_high_s32
5940 #define vget_high_s64 __vget_high_s64
5941 #define vget_high_s8 __vget_high_s8
5942 #define vget_high_u16 __vget_high_u16
5943 #define vget_high_u32 __vget_high_u32
5944 #define vget_high_u64 __vget_high_u64
5945 #define vget_high_u8 __vget_high_u8
5946 #define vget_low_f32 __vget_low_f32
5947 #define vget_low_p16 __vget_low_p16
5948 #define vget_low_p8 __vget_low_p8
5949 #define vget_low_s16 __vget_low_s16
5950 #define vget_low_s32 __vget_low_s32
5951 #define vget_low_s64 __vget_low_s64
5952 #define vget_low_s8 __vget_low_s8
5953 #define vget_low_u16 __vget_low_u16
5954 #define vget_low_u32 __vget_low_u32
5955 #define vget_low_u64 __vget_low_u64
5956 #define vget_low_u8 __vget_low_u8
5957 #define vhadd_s16 __vhadd_s16
5958 #define vhadd_s32 __vhadd_s32
5959 #define vhadd_s8 __vhadd_s8
5960 #define vhadd_u16 __vhadd_u16
5961 #define vhadd_u32 __vhadd_u32
5962 #define vhadd_u8 __vhadd_u8
5963 #define vhsub_s16 __vhsub_s16
5964 #define vhsub_s32 __vhsub_s32
5965 #define vhsub_s8 __vhsub_s8
5966 #define vhsub_u16 __vhsub_u16
5967 #define vhsub_u32 __vhsub_u32
5968 #define vhsub_u8 __vhsub_u8
5969 #define vrhadd_s16 __vrhadd_s16
5970 #define vrhadd_s32 __vrhadd_s32
5971 #define vrhadd_s8 __vrhadd_s8
5972 #define vrhadd_u16 __vrhadd_u16
5973 #define vrhadd_u32 __vrhadd_u32
5974 #define vrhadd_u8 __vrhadd_u8
5975 #define vhaddq_s16 __vhaddq_s16
5976 #define vhaddq_s32 __vhaddq_s32
5977 #define vhaddq_s8 __vhaddq_s8
5978 #define vhaddq_u16 __vhaddq_u16
5979 #define vhaddq_u32 __vhaddq_u32
5980 #define vhaddq_u8 __vhaddq_u8
5981 #define vhsubq_s16 __vhsubq_s16
5982 #define vhsubq_s32 __vhsubq_s32
5983 #define vhsubq_s8 __vhsubq_s8
5984 #define vhsubq_u16 __vhsubq_u16
5985 #define vhsubq_u32 __vhsubq_u32
5986 #define vhsubq_u8 __vhsubq_u8
5987 #define vrhaddq_s16 __vrhaddq_s16
5988 #define vrhaddq_s32 __vrhaddq_s32
5989 #define vrhaddq_s8 __vrhaddq_s8
5990 #define vrhaddq_u16 __vrhaddq_u16
5991 #define vrhaddq_u32 __vrhaddq_u32
5992 #define vrhaddq_u8 __vrhaddq_u8
5993 #define vld1_f32 __vld1_f32
5994 #define vld1_p16 __vld1_p16
5995 #define vld1_p8 __vld1_p8
5996 #define vld1_s16 __vld1_s16
5997 #define vld1_s32 __vld1_s32
5998 #define vld1_s64 __vld1_s64
5999 #define vld1_s8 __vld1_s8
6000 #define vld1_u16 __vld1_u16
6001 #define vld1_u32 __vld1_u32
6002 #define vld1_u64 __vld1_u64
6003 #define vld1_u8 __vld1_u8
6004 #define vld1_f32_ex __vld1_f32_ex
6005 #define vld1_p16_ex __vld1_p16_ex
6006 #define vld1_p8_ex __vld1_p8_ex
6007 #define vld1_s16_ex __vld1_s16_ex
6008 #define vld1_s32_ex __vld1_s32_ex
6009 #define vld1_s64_ex __vld1_s64_ex
6010 #define vld1_s8_ex __vld1_s8_ex
6011 #define vld1_u16_ex __vld1_u16_ex
6012 #define vld1_u32_ex __vld1_u32_ex
6013 #define vld1_u64_ex __vld1_u64_ex
6014 #define vld1_u8_ex __vld1_u8_ex
6015 #define vld1q_f32 __vld1q_f32
6016 #define vld1q_p16 __vld1q_p16
6017 #define vld1q_p8 __vld1q_p8
6018 #define vld1q_s16 __vld1q_s16
6019 #define vld1q_s32 __vld1q_s32
6020 #define vld1q_s64 __vld1q_s64
6021 #define vld1q_s8 __vld1q_s8
6022 #define vld1q_u16 __vld1q_u16
6023 #define vld1q_u32 __vld1q_u32
6024 #define vld1q_u64 __vld1q_u64
6025 #define vld1q_u8 __vld1q_u8
6026 #define vld1q_f32_ex __vld1q_f32_ex
6027 #define vld1q_p16_ex __vld1q_p16_ex
6028 #define vld1q_p8_ex __vld1q_p8_ex
6029 #define vld1q_s16_ex __vld1q_s16_ex
6030 #define vld1q_s32_ex __vld1q_s32_ex
6031 #define vld1q_s64_ex __vld1q_s64_ex
6032 #define vld1q_s8_ex __vld1q_s8_ex
6033 #define vld1q_u16_ex __vld1q_u16_ex
6034 #define vld1q_u32_ex __vld1q_u32_ex
6035 #define vld1q_u64_ex __vld1q_u64_ex
6036 #define vld1q_u8_ex __vld1q_u8_ex
6037 #define vld1_dup_f32 __vld1_dup_f32
6038 #define vld1_dup_p16 __vld1_dup_p16
6039 #define vld1_dup_p8 __vld1_dup_p8
6040 #define vld1_dup_s16 __vld1_dup_s16
6041 #define vld1_dup_s32 __vld1_dup_s32
6042 #define vld1_dup_s8 __vld1_dup_s8
6043 #define vld1_dup_u16 __vld1_dup_u16
6044 #define vld1_dup_u32 __vld1_dup_u32
6045 #define vld1_dup_u8 __vld1_dup_u8
6046 #define vld1q_dup_f32 __vld1q_dup_f32
6047 #define vld1q_dup_p16 __vld1q_dup_p16
6048 #define vld1q_dup_p8 __vld1q_dup_p8
6049 #define vld1q_dup_s16 __vld1q_dup_s16
6050 #define vld1q_dup_s32 __vld1q_dup_s32
6051 #define vld1q_dup_s8 __vld1q_dup_s8
6052 #define vld1q_dup_u16 __vld1q_dup_u16
6053 #define vld1q_dup_u32 __vld1q_dup_u32
6054 #define vld1q_dup_u8 __vld1q_dup_u8
6055 #define vld1_dup_f32_ex __vld1_dup_f32_ex
6056 #define vld1_dup_p16_ex __vld1_dup_p16_ex
6057 #define vld1_dup_s16_ex __vld1_dup_s16_ex
6058 #define vld1_dup_s32_ex __vld1_dup_s32_ex
6059 #define vld1_dup_u16_ex __vld1_dup_u16_ex
6060 #define vld1_dup_u32_ex __vld1_dup_u32_ex
6061 #define vld1q_dup_f32_ex __vld1q_dup_f32_ex
6062 #define vld1q_dup_p16_ex __vld1q_dup_p16_ex
6063 #define vld1q_dup_s16_ex __vld1q_dup_s16_ex
6064 #define vld1q_dup_s32_ex __vld1q_dup_s32_ex
6065 #define vld1q_dup_u16_ex __vld1q_dup_u16_ex
6066 #define vld1q_dup_u32_ex __vld1q_dup_u32_ex
6067 #define vld1_lane_f32 __vld1_lane_f32
6068 #define vld1_lane_p16 __vld1_lane_p16
6069 #define vld1_lane_p8 __vld1_lane_p8
6070 #define vld1_lane_s16 __vld1_lane_s16
6071 #define vld1_lane_s32 __vld1_lane_s32
6072 #define vld1_lane_s8 __vld1_lane_s8
6073 #define vld1_lane_u16 __vld1_lane_u16
6074 #define vld1_lane_u32 __vld1_lane_u32
6075 #define vld1_lane_u8 __vld1_lane_u8
6076 #define vld1q_lane_f32 __vld1q_lane_f32
6077 #define vld1q_lane_p16 __vld1q_lane_p16
6078 #define vld1q_lane_p8 __vld1q_lane_p8
6079 #define vld1q_lane_s16 __vld1q_lane_s16
6080 #define vld1q_lane_s32 __vld1q_lane_s32
6081 #define vld1q_lane_s8 __vld1q_lane_s8
6082 #define vld1q_lane_u16 __vld1q_lane_u16
6083 #define vld1q_lane_u32 __vld1q_lane_u32
6084 #define vld1q_lane_u8 __vld1q_lane_u8
6085 #define vld1_lane_f32_ex __vld1_lane_f32_ex
6086 #define vld1_lane_p16_ex __vld1_lane_p16_ex
6087 #define vld1_lane_s16_ex __vld1_lane_s16_ex
6088 #define vld1_lane_s32_ex __vld1_lane_s32_ex
6089 #define vld1_lane_u16_ex __vld1_lane_u16_ex
6090 #define vld1_lane_u32_ex __vld1_lane_u32_ex
6091 #define vld1q_lane_f32_ex __vld1q_lane_f32_ex
6092 #define vld1q_lane_p16_ex __vld1q_lane_p16_ex
6093 #define vld1q_lane_s16_ex __vld1q_lane_s16_ex
6094 #define vld1q_lane_s32_ex __vld1q_lane_s32_ex
6095 #define vld1q_lane_u16_ex __vld1q_lane_u16_ex
6096 #define vld1q_lane_u32_ex __vld1q_lane_u32_ex
6097 #define vld2_f32 __vld2_f32
6098 #define vld2_p16 __vld2_p16
6099 #define vld2_p8 __vld2_p8
6100 #define vld2_s16 __vld2_s16
6101 #define vld2_s32 __vld2_s32
6102 #define vld2_s8 __vld2_s8
6103 #define vld2_u16 __vld2_u16
6104 #define vld2_u32 __vld2_u32
6105 #define vld2_u8 __vld2_u8
6106 #define vld2_s64 __vld2_s64
6107 #define vld2_u64 __vld2_u64
6108 #define vld2_s64_ex __vld2_s64_ex
6109 #define vld2_u64_ex __vld2_u64_ex
6110 #define vld2_f32_ex __vld2_f32_ex
6111 #define vld2_p16_ex __vld2_p16_ex
6112 #define vld2_p8_ex __vld2_p8_ex
6113 #define vld2_s16_ex __vld2_s16_ex
6114 #define vld2_s32_ex __vld2_s32_ex
6115 #define vld2_s8_ex __vld2_s8_ex
6116 #define vld2_u16_ex __vld2_u16_ex
6117 #define vld2_u32_ex __vld2_u32_ex
6118 #define vld2_u8_ex __vld2_u8_ex
6119 #define vld2q_f32 __vld2q_f32
6120 #define vld2q_p16 __vld2q_p16
6121 #define vld2q_p8 __vld2q_p8
6122 #define vld2q_s16 __vld2q_s16
6123 #define vld2q_s32 __vld2q_s32
6124 #define vld2q_s8 __vld2q_s8
6125 #define vld2q_u16 __vld2q_u16
6126 #define vld2q_u32 __vld2q_u32
6127 #define vld2q_u8 __vld2q_u8
6128 #define vld2q_f32_ex __vld2q_f32_ex
6129 #define vld2q_p16_ex __vld2q_p16_ex
6130 #define vld2q_p8_ex __vld2q_p8_ex
6131 #define vld2q_s16_ex __vld2q_s16_ex
6132 #define vld2q_s32_ex __vld2q_s32_ex
6133 #define vld2q_s8_ex __vld2q_s8_ex
6134 #define vld2q_u16_ex __vld2q_u16_ex
6135 #define vld2q_u32_ex __vld2q_u32_ex
6136 #define vld2q_u8_ex __vld2q_u8_ex
6137 #define vld2_dup_f32 __vld2_dup_f32
6138 #define vld2_dup_p16 __vld2_dup_p16
6139 #define vld2_dup_p8 __vld2_dup_p8
6140 #define vld2_dup_s16 __vld2_dup_s16
6141 #define vld2_dup_s32 __vld2_dup_s32
6142 #define vld2_dup_s8 __vld2_dup_s8
6143 #define vld2_dup_u16 __vld2_dup_u16
6144 #define vld2_dup_u32 __vld2_dup_u32
6145 #define vld2_dup_u8 __vld2_dup_u8
6146 #define vld2_dup_s64 __vld2_dup_s64
6147 #define vld2_dup_u64 __vld2_dup_u64
6148 #define vld2_dup_s64_ex __vld2_dup_s64_ex
6149 #define vld2_dup_u64_ex __vld2_dup_u64_ex
6150 #define vld2_dup_f32_ex __vld2_dup_f32_ex
6151 #define vld2_dup_p16_ex __vld2_dup_p16_ex
6152 #define vld2_dup_p8_ex __vld2_dup_p8_ex
6153 #define vld2_dup_s16_ex __vld2_dup_s16_ex
6154 #define vld2_dup_s32_ex __vld2_dup_s32_ex
6155 #define vld2_dup_s8_ex __vld2_dup_s8_ex
6156 #define vld2_dup_u16_ex __vld2_dup_u16_ex
6157 #define vld2_dup_u32_ex __vld2_dup_u32_ex
6158 #define vld2_dup_u8_ex __vld2_dup_u8_ex
6159 #define vld2_lane_f32 __vld2_lane_f32
6160 #define vld2_lane_p16 __vld2_lane_p16
6161 #define vld2_lane_p8 __vld2_lane_p8
6162 #define vld2_lane_s16 __vld2_lane_s16
6163 #define vld2_lane_s32 __vld2_lane_s32
6164 #define vld2_lane_s8 __vld2_lane_s8
6165 #define vld2_lane_u16 __vld2_lane_u16
6166 #define vld2_lane_u32 __vld2_lane_u32
6167 #define vld2_lane_u8 __vld2_lane_u8
6168 #define vld2q_lane_f32 __vld2q_lane_f32
6169 #define vld2q_lane_p16 __vld2q_lane_p16
6170 #define vld2q_lane_s16 __vld2q_lane_s16
6171 #define vld2q_lane_s32 __vld2q_lane_s32
6172 #define vld2q_lane_u16 __vld2q_lane_u16
6173 #define vld2q_lane_u32 __vld2q_lane_u32
6174 #define vld2_lane_f32_ex __vld2_lane_f32_ex
6175 #define vld2_lane_p16_ex __vld2_lane_p16_ex
6176 #define vld2_lane_p8_ex __vld2_lane_p8_ex
6177 #define vld2_lane_s16_ex __vld2_lane_s16_ex
6178 #define vld2_lane_s32_ex __vld2_lane_s32_ex
6179 #define vld2_lane_s8_ex __vld2_lane_s8_ex
6180 #define vld2_lane_u16_ex __vld2_lane_u16_ex
6181 #define vld2_lane_u32_ex __vld2_lane_u32_ex
6182 #define vld2_lane_u8_ex __vld2_lane_u8_ex
6183 #define vld2q_lane_f32_ex __vld2q_lane_f32_ex
6184 #define vld2q_lane_p16_ex __vld2q_lane_p16_ex
6185 #define vld2q_lane_s16_ex __vld2q_lane_s16_ex
6186 #define vld2q_lane_s32_ex __vld2q_lane_s32_ex
6187 #define vld2q_lane_u16_ex __vld2q_lane_u16_ex
6188 #define vld2q_lane_u32_ex __vld2q_lane_u32_ex
6189 #define vld3_f32 __vld3_f32
6190 #define vld3_p16 __vld3_p16
6191 #define vld3_p8 __vld3_p8
6192 #define vld3_s16 __vld3_s16
6193 #define vld3_s32 __vld3_s32
6194 #define vld3_s8 __vld3_s8
6195 #define vld3_u16 __vld3_u16
6196 #define vld3_u32 __vld3_u32
6197 #define vld3_u8 __vld3_u8
6198 #define vld3_s64 __vld3_s64
6199 #define vld3_u64 __vld3_u64
6200 #define vld3_s64_ex __vld3_s64_ex
6201 #define vld3_u64_ex __vld3_u64_ex
6202 #define vld3_f32_ex __vld3_f32_ex
6203 #define vld3_p16_ex __vld3_p16_ex
6204 #define vld3_p8_ex __vld3_p8_ex
6205 #define vld3_s16_ex __vld3_s16_ex
6206 #define vld3_s32_ex __vld3_s32_ex
6207 #define vld3_s8_ex __vld3_s8_ex
6208 #define vld3_u16_ex __vld3_u16_ex
6209 #define vld3_u32_ex __vld3_u32_ex
6210 #define vld3_u8_ex __vld3_u8_ex
6211 #define vld3q_f32 __vld3q_f32
6212 #define vld3q_p16 __vld3q_p16
6213 #define vld3q_p8 __vld3q_p8
6214 #define vld3q_s16 __vld3q_s16
6215 #define vld3q_s32 __vld3q_s32
6216 #define vld3q_s8 __vld3q_s8
6217 #define vld3q_u16 __vld3q_u16
6218 #define vld3q_u32 __vld3q_u32
6219 #define vld3q_u8 __vld3q_u8
6220 #define vld3q_f32_ex __vld3q_f32_ex
6221 #define vld3q_p16_ex __vld3q_p16_ex
6222 #define vld3q_p8_ex __vld3q_p8_ex
6223 #define vld3q_s16_ex __vld3q_s16_ex
6224 #define vld3q_s32_ex __vld3q_s32_ex
6225 #define vld3q_s8_ex __vld3q_s8_ex
6226 #define vld3q_u16_ex __vld3q_u16_ex
6227 #define vld3q_u32_ex __vld3q_u32_ex
6228 #define vld3q_u8_ex __vld3q_u8_ex
6229 #define vld3_dup_f32 __vld3_dup_f32
6230 #define vld3_dup_p16 __vld3_dup_p16
6231 #define vld3_dup_p8 __vld3_dup_p8
6232 #define vld3_dup_s16 __vld3_dup_s16
6233 #define vld3_dup_s32 __vld3_dup_s32
6234 #define vld3_dup_s8 __vld3_dup_s8
6235 #define vld3_dup_u16 __vld3_dup_u16
6236 #define vld3_dup_u32 __vld3_dup_u32
6237 #define vld3_dup_u8 __vld3_dup_u8
6238 #define vld3_dup_s64 __vld3_dup_s64
6239 #define vld3_dup_u64 __vld3_dup_u64
6240 #define vld3_lane_f32 __vld3_lane_f32
6241 #define vld3_lane_p16 __vld3_lane_p16
6242 #define vld3_lane_p8 __vld3_lane_p8
6243 #define vld3_lane_s16 __vld3_lane_s16
6244 #define vld3_lane_s32 __vld3_lane_s32
6245 #define vld3_lane_s8 __vld3_lane_s8
6246 #define vld3_lane_u16 __vld3_lane_u16
6247 #define vld3_lane_u32 __vld3_lane_u32
6248 #define vld3_lane_u8 __vld3_lane_u8
6249 #define vld3q_lane_f32 __vld3q_lane_f32
6250 #define vld3q_lane_p16 __vld3q_lane_p16
6251 #define vld3q_lane_s16 __vld3q_lane_s16
6252 #define vld3q_lane_s32 __vld3q_lane_s32
6253 #define vld3q_lane_u16 __vld3q_lane_u16
6254 #define vld3q_lane_u32 __vld3q_lane_u32
6255 #define vld4_f32 __vld4_f32
6256 #define vld4_p16 __vld4_p16
6257 #define vld4_p8 __vld4_p8
6258 #define vld4_s16 __vld4_s16
6259 #define vld4_s32 __vld4_s32
6260 #define vld4_s8 __vld4_s8
6261 #define vld4_u16 __vld4_u16
6262 #define vld4_u32 __vld4_u32
6263 #define vld4_u8 __vld4_u8
6264 #define vld4_s64 __vld4_s64
6265 #define vld4_u64 __vld4_u64
6266 #define vld4_s64_ex __vld4_s64_ex
6267 #define vld4_u64_ex __vld4_u64_ex
6268 #define vld4_f32_ex __vld4_f32_ex
6269 #define vld4_p16_ex __vld4_p16_ex
6270 #define vld4_p8_ex __vld4_p8_ex
6271 #define vld4_s16_ex __vld4_s16_ex
6272 #define vld4_s32_ex __vld4_s32_ex
6273 #define vld4_s8_ex __vld4_s8_ex
6274 #define vld4_u16_ex __vld4_u16_ex
6275 #define vld4_u32_ex __vld4_u32_ex
6276 #define vld4_u8_ex __vld4_u8_ex
6277 #define vld4q_f32 __vld4q_f32
6278 #define vld4q_p16 __vld4q_p16
6279 #define vld4q_p8 __vld4q_p8
6280 #define vld4q_s16 __vld4q_s16
6281 #define vld4q_s32 __vld4q_s32
6282 #define vld4q_s8 __vld4q_s8
6283 #define vld4q_u16 __vld4q_u16
6284 #define vld4q_u32 __vld4q_u32
6285 #define vld4q_u8 __vld4q_u8
6286 #define vld4q_f32_ex __vld4q_f32_ex
6287 #define vld4q_p16_ex __vld4q_p16_ex
6288 #define vld4q_p8_ex __vld4q_p8_ex
6289 #define vld4q_s16_ex __vld4q_s16_ex
6290 #define vld4q_s32_ex __vld4q_s32_ex
6291 #define vld4q_s8_ex __vld4q_s8_ex
6292 #define vld4q_u16_ex __vld4q_u16_ex
6293 #define vld4q_u32_ex __vld4q_u32_ex
6294 #define vld4q_u8_ex __vld4q_u8_ex
6295 #define vld4_dup_f32 __vld4_dup_f32
6296 #define vld4_dup_p16 __vld4_dup_p16
6297 #define vld4_dup_p8 __vld4_dup_p8
6298 #define vld4_dup_s16 __vld4_dup_s16
6299 #define vld4_dup_s32 __vld4_dup_s32
6300 #define vld4_dup_s8 __vld4_dup_s8
6301 #define vld4_dup_u16 __vld4_dup_u16
6302 #define vld4_dup_u32 __vld4_dup_u32
6303 #define vld4_dup_u8 __vld4_dup_u8
6304 #define vld4_dup_s64 __vld4_dup_s64
6305 #define vld4_dup_u64 __vld4_dup_u64
6306 #define vld4_dup_f32_ex __vld4_dup_f32_ex
6307 #define vld4_dup_p16_ex __vld4_dup_p16_ex
6308 #define vld4_dup_p8_ex __vld4_dup_p8_ex
6309 #define vld4_dup_s16_ex __vld4_dup_s16_ex
6310 #define vld4_dup_s32_ex __vld4_dup_s32_ex
6311 #define vld4_dup_s8_ex __vld4_dup_s8_ex
6312 #define vld4_dup_u16_ex __vld4_dup_u16_ex
6313 #define vld4_dup_u32_ex __vld4_dup_u32_ex
6314 #define vld4_dup_u8_ex __vld4_dup_u8_ex
6315 #define vld4_lane_f32 __vld4_lane_f32
6316 #define vld4_lane_p16 __vld4_lane_p16
6317 #define vld4_lane_p8 __vld4_lane_p8
6318 #define vld4_lane_s16 __vld4_lane_s16
6319 #define vld4_lane_s32 __vld4_lane_s32
6320 #define vld4_lane_s8 __vld4_lane_s8
6321 #define vld4_lane_u16 __vld4_lane_u16
6322 #define vld4_lane_u32 __vld4_lane_u32
6323 #define vld4_lane_u8 __vld4_lane_u8
6324 #define vld4q_lane_f32 __vld4q_lane_f32
6325 #define vld4q_lane_p16 __vld4q_lane_p16
6326 #define vld4q_lane_s16 __vld4q_lane_s16
6327 #define vld4q_lane_s32 __vld4q_lane_s32
6328 #define vld4q_lane_u16 __vld4q_lane_u16
6329 #define vld4q_lane_u32 __vld4q_lane_u32
6330 #define vld4_lane_f32_ex __vld4_lane_f32_ex
6331 #define vld4_lane_p16_ex __vld4_lane_p16_ex
6332 #define vld4_lane_p8_ex __vld4_lane_p8_ex
6333 #define vld4_lane_s16_ex __vld4_lane_s16_ex
6334 #define vld4_lane_s32_ex __vld4_lane_s32_ex
6335 #define vld4_lane_s8_ex __vld4_lane_s8_ex
6336 #define vld4_lane_u16_ex __vld4_lane_u16_ex
6337 #define vld4_lane_u32_ex __vld4_lane_u32_ex
6338 #define vld4_lane_u8_ex __vld4_lane_u8_ex
6339 #define vld4q_lane_f32_ex __vld4q_lane_f32_ex
6340 #define vld4q_lane_p16_ex __vld4q_lane_p16_ex
6341 #define vld4q_lane_s16_ex __vld4q_lane_s16_ex
6342 #define vld4q_lane_s32_ex __vld4q_lane_s32_ex
6343 #define vld4q_lane_u16_ex __vld4q_lane_u16_ex
6344 #define vld4q_lane_u32_ex __vld4q_lane_u32_ex
6345 #define vmax_f32 __vmax_f32
6346 #define vmaxnm_f32 __vmaxnm_f32
6347 #define vmin_f32 __vmin_f32
6348 #define vminnm_f32 __vminnm_f32
6349 #define vmaxq_f32 __vmaxq_f32
6350 #define vmaxnmq_f32 __vmaxnmq_f32
6351 #define vminq_f32 __vminq_f32
6352 #define vminnmq_f32 __vminnmq_f32
6353 #define vmax_s16 __vmax_s16
6354 #define vmax_s32 __vmax_s32
6355 #define vmax_s8 __vmax_s8
6356 #define vmax_u16 __vmax_u16
6357 #define vmax_u32 __vmax_u32
6358 #define vmax_u8 __vmax_u8
6359 #define vmin_s16 __vmin_s16
6360 #define vmin_s32 __vmin_s32
6361 #define vmin_s8 __vmin_s8
6362 #define vmin_u16 __vmin_u16
6363 #define vmin_u32 __vmin_u32
6364 #define vmin_u8 __vmin_u8
6365 #define vmaxq_s16 __vmaxq_s16
6366 #define vmaxq_s32 __vmaxq_s32
6367 #define vmaxq_s8 __vmaxq_s8
6368 #define vmaxq_u16 __vmaxq_u16
6369 #define vmaxq_u32 __vmaxq_u32
6370 #define vmaxq_u8 __vmaxq_u8
6371 #define vminq_s16 __vminq_s16
6372 #define vminq_s32 __vminq_s32
6373 #define vminq_s8 __vminq_s8
6374 #define vminq_u16 __vminq_u16
6375 #define vminq_u32 __vminq_u32
6376 #define vminq_u8 __vminq_u8
6377 #define vmla_lane_f32 __vmla_lane_f32
6378 #define vmla_lane_s16 __vmla_lane_s16
6379 #define vmla_lane_s32 __vmla_lane_s32
6380 #define vmla_lane_u16 __vmla_lane_u16
6381 #define vmla_lane_u32 __vmla_lane_u32
6382 #define vmls_lane_f32 __vmls_lane_f32
6383 #define vmls_lane_s16 __vmls_lane_s16
6384 #define vmls_lane_s32 __vmls_lane_s32
6385 #define vmls_lane_u16 __vmls_lane_u16
6386 #define vmls_lane_u32 __vmls_lane_u32
6387 #define vmlaq_lane_f32 __vmlaq_lane_f32
6388 #define vmlaq_lane_s16 __vmlaq_lane_s16
6389 #define vmlaq_lane_s32 __vmlaq_lane_s32
6390 #define vmlaq_lane_u16 __vmlaq_lane_u16
6391 #define vmlaq_lane_u32 __vmlaq_lane_u32
6392 #define vmlsq_lane_f32 __vmlsq_lane_f32
6393 #define vmlsq_lane_s16 __vmlsq_lane_s16
6394 #define vmlsq_lane_s32 __vmlsq_lane_s32
6395 #define vmlsq_lane_u16 __vmlsq_lane_u16
6396 #define vmlsq_lane_u32 __vmlsq_lane_u32
6397 #define vmla_n_f32 __vmla_n_f32
6398 #define vmls_n_f32 __vmls_n_f32
6399 #define vmlaq_n_f32 __vmlaq_n_f32
6400 #define vmlsq_n_f32 __vmlsq_n_f32
6401 #define vmla_f32 __vmla_f32
6402 #define vmls_f32 __vmls_f32
6403 #define vmlaq_f32 __vmlaq_f32
6404 #define vmlsq_f32 __vmlsq_f32
6405 #define vmla_s16 __vmla_s16
6406 #define vmla_s32 __vmla_s32
6407 #define vmla_s8 __vmla_s8
6408 #define vmla_u16 __vmla_u16
6409 #define vmla_u32 __vmla_u32
6410 #define vmla_u8 __vmla_u8
6411 #define vmls_s16 __vmls_s16
6412 #define vmls_s32 __vmls_s32
6413 #define vmls_s8 __vmls_s8
6414 #define vmls_u16 __vmls_u16
6415 #define vmls_u32 __vmls_u32
6416 #define vmls_u8 __vmls_u8
6417 #define vmlaq_s16 __vmlaq_s16
6418 #define vmlaq_s32 __vmlaq_s32
6419 #define vmlaq_s8 __vmlaq_s8
6420 #define vmlaq_u16 __vmlaq_u16
6421 #define vmlaq_u32 __vmlaq_u32
6422 #define vmlaq_u8 __vmlaq_u8
6423 #define vmlsq_s16 __vmlsq_s16
6424 #define vmlsq_s32 __vmlsq_s32
6425 #define vmlsq_s8 __vmlsq_s8
6426 #define vmlsq_u16 __vmlsq_u16
6427 #define vmlsq_u32 __vmlsq_u32
6428 #define vmlsq_u8 __vmlsq_u8
6429 #define vmlal_s16 __vmlal_s16
6430 #define vmlal_s32 __vmlal_s32
6431 #define vmlal_s8 __vmlal_s8
6432 #define vmlal_u16 __vmlal_u16
6433 #define vmlal_u32 __vmlal_u32
6434 #define vmlal_u8 __vmlal_u8
6435 #define vmlsl_s16 __vmlsl_s16
6436 #define vmlsl_s32 __vmlsl_s32
6437 #define vmlsl_s8 __vmlsl_s8
6438 #define vmlsl_u16 __vmlsl_u16
6439 #define vmlsl_u32 __vmlsl_u32
6440 #define vmlsl_u8 __vmlsl_u8
6441 #define vmlal_lane_s16 __vmlal_lane_s16
6442 #define vmlal_lane_s32 __vmlal_lane_s32
6443 #define vmlal_lane_u16 __vmlal_lane_u16
6444 #define vmlal_lane_u32 __vmlal_lane_u32
6445 #define vmlsl_lane_s16 __vmlsl_lane_s16
6446 #define vmlsl_lane_s32 __vmlsl_lane_s32
6447 #define vmlsl_lane_u16 __vmlsl_lane_u16
6448 #define vmlsl_lane_u32 __vmlsl_lane_u32
6449 #define vset_lane_f32 __vset_lane_f32
6450 #define vset_lane_p16 __vset_lane_p16
6451 #define vset_lane_p8 __vset_lane_p8
6452 #define vset_lane_s16 __vset_lane_s16
6453 #define vset_lane_s32 __vset_lane_s32
6454 #define vset_lane_s8 __vset_lane_s8
6455 #define vset_lane_u16 __vset_lane_u16
6456 #define vset_lane_u32 __vset_lane_u32
6457 #define vset_lane_u8 __vset_lane_u8
6458 #define vget_lane_f32 __vget_lane_f32
6459 #define vget_lane_p16 __vget_lane_p16
6460 #define vget_lane_p8 __vget_lane_p8
6461 #define vget_lane_s16 __vget_lane_s16
6462 #define vget_lane_s8 __vget_lane_s8
6463 #define vget_lane_s32 __vget_lane_s32
6464 #define vget_lane_u16 __vget_lane_u16
6465 #define vget_lane_u8 __vget_lane_u8
6466 #define vget_lane_u32 __vget_lane_u32
6467 #define vset_lane_s64 __vset_lane_s64
6468 #define vset_lane_u64 __vset_lane_u64
6469 #define vsetq_lane_s64 __vsetq_lane_s64
6470 #define vsetq_lane_u64 __vsetq_lane_u64
6471 #define vget_lane_s64 __vget_lane_s64
6472 #define vget_lane_u64 __vget_lane_u64
6473 #define vgetq_lane_s64 __vgetq_lane_s64
6474 #define vgetq_lane_u64 __vgetq_lane_u64
6475 #define vsetq_lane_f32 __vsetq_lane_f32
6476 #define vsetq_lane_p16 __vsetq_lane_p16
6477 #define vsetq_lane_p8 __vsetq_lane_p8
6478 #define vsetq_lane_s16 __vsetq_lane_s16
6479 #define vsetq_lane_s32 __vsetq_lane_s32
6480 #define vsetq_lane_s8 __vsetq_lane_s8
6481 #define vsetq_lane_u16 __vsetq_lane_u16
6482 #define vsetq_lane_u32 __vsetq_lane_u32
6483 #define vsetq_lane_u8 __vsetq_lane_u8
6484 #define vgetq_lane_f32 __vgetq_lane_f32
6485 #define vgetq_lane_p16 __vgetq_lane_p16
6486 #define vgetq_lane_p8 __vgetq_lane_p8
6487 #define vgetq_lane_s16 __vgetq_lane_s16
6488 #define vgetq_lane_s8 __vgetq_lane_s8
6489 #define vgetq_lane_s32 __vgetq_lane_s32
6490 #define vgetq_lane_u16 __vgetq_lane_u16
6491 #define vgetq_lane_u8 __vgetq_lane_u8
6492 #define vgetq_lane_u32 __vgetq_lane_u32
6493 #define vmovl_s16 __vmovl_s16
6494 #define vmovl_s32 __vmovl_s32
6495 #define vmovl_s8 __vmovl_s8
6496 #define vmovl_u16 __vmovl_u16
6497 #define vmovl_u32 __vmovl_u32
6498 #define vmovl_u8 __vmovl_u8
6499 #define vmovn_s16 __vmovn_s16
6500 #define vmovn_s32 __vmovn_s32
6501 #define vmovn_s64 __vmovn_s64
6502 #define vmovn_u16 __vmovn_u16
6503 #define vmovn_u32 __vmovn_u32
6504 #define vmovn_u64 __vmovn_u64
6505 #define vmul_f32 __vmul_f32
6506 #define vmul_p8 __vmul_p8
6507 #define vmul_s16 __vmul_s16
6508 #define vmul_s32 __vmul_s32
6509 #define vmul_s8 __vmul_s8
6510 #define vmul_u16 __vmul_u16
6511 #define vmul_u32 __vmul_u32
6512 #define vmul_u8 __vmul_u8
6513 #define vmulq_f32 __vmulq_f32
6514 #define vmulq_p8 __vmulq_p8
6515 #define vmulq_s16 __vmulq_s16
6516 #define vmulq_s32 __vmulq_s32
6517 #define vmulq_s8 __vmulq_s8
6518 #define vmulq_u16 __vmulq_u16
6519 #define vmulq_u32 __vmulq_u32
6520 #define vmulq_u8 __vmulq_u8
6521 #define vmul_n_f32 __vmul_n_f32
6522 #define vmulq_n_f32 __vmulq_n_f32
6523 #define vmul_lane_f32 __vmul_lane_f32
6524 #define vmul_lane_s16 __vmul_lane_s16
6525 #define vmul_lane_s32 __vmul_lane_s32
6526 #define vmul_lane_u16 __vmul_lane_u16
6527 #define vmul_lane_u32 __vmul_lane_u32
6528 #define vmulq_lane_f32 __vmulq_lane_f32
6529 #define vmulq_lane_s16 __vmulq_lane_s16
6530 #define vmulq_lane_s32 __vmulq_lane_s32
6531 #define vmulq_lane_u16 __vmulq_lane_u16
6532 #define vmulq_lane_u32 __vmulq_lane_u32
6533 #define vmull_p64 __vmull_p64
6534 #define vmull_p8 __vmull_p8
6535 #define vmull_s16 __vmull_s16
6536 #define vmull_s32 __vmull_s32
6537 #define vmull_s8 __vmull_s8
6538 #define vmull_u16 __vmull_u16
6539 #define vmull_u32 __vmull_u32
6540 #define vmull_u8 __vmull_u8
6541 #define vmull_lane_s16 __vmull_lane_s16
6542 #define vmull_lane_s32 __vmull_lane_s32
6543 #define vmull_lane_u16 __vmull_lane_u16
6544 #define vmull_lane_u32 __vmull_lane_u32
6545 #define vmvn_p16 __vmvn_p16
6546 #define vmvn_p8 __vmvn_p8
6547 #define vmvn_s16 __vmvn_s16
6548 #define vmvn_s32 __vmvn_s32
6549 #define vmvn_s8 __vmvn_s8
6550 #define vmvn_u16 __vmvn_u16
6551 #define vmvn_u32 __vmvn_u32
6552 #define vmvn_u8 __vmvn_u8
6553 #define vmvnq_p16 __vmvnq_p16
6554 #define vmvnq_p8 __vmvnq_p8
6555 #define vmvnq_s16 __vmvnq_s16
6556 #define vmvnq_s32 __vmvnq_s32
6557 #define vmvnq_s8 __vmvnq_s8
6558 #define vmvnq_u16 __vmvnq_u16
6559 #define vmvnq_u32 __vmvnq_u32
6560 #define vmvnq_u8 __vmvnq_u8
6561 #define vpadal_s16 __vpadal_s16
6562 #define vpadal_s32 __vpadal_s32
6563 #define vpadal_s8 __vpadal_s8
6564 #define vpadal_u16 __vpadal_u16
6565 #define vpadal_u32 __vpadal_u32
6566 #define vpadal_u8 __vpadal_u8
6567 #define vpadalq_s16 __vpadalq_s16
6568 #define vpadalq_s32 __vpadalq_s32
6569 #define vpadalq_s8 __vpadalq_s8
6570 #define vpadalq_u16 __vpadalq_u16
6571 #define vpadalq_u32 __vpadalq_u32
6572 #define vpadalq_u8 __vpadalq_u8
6573 #define vpadd_f32 __vpadd_f32
6574 #define vpadd_s16 __vpadd_s16
6575 #define vpadd_s32 __vpadd_s32
6576 #define vpadd_s8 __vpadd_s8
6577 #define vpadd_u16 __vpadd_u16
6578 #define vpadd_u32 __vpadd_u32
6579 #define vpadd_u8 __vpadd_u8
6580 #define vpaddl_s16 __vpaddl_s16
6581 #define vpaddl_s32 __vpaddl_s32
6582 #define vpaddl_s8 __vpaddl_s8
6583 #define vpaddl_u16 __vpaddl_u16
6584 #define vpaddl_u32 __vpaddl_u32
6585 #define vpaddl_u8 __vpaddl_u8
6586 #define vpaddlq_s16 __vpaddlq_s16
6587 #define vpaddlq_s32 __vpaddlq_s32
6588 #define vpaddlq_s8 __vpaddlq_s8
6589 #define vpaddlq_u16 __vpaddlq_u16
6590 #define vpaddlq_u32 __vpaddlq_u32
6591 #define vpaddlq_u8 __vpaddlq_u8
6592 #define vpmax_f32 __vpmax_f32
6593 #define vpmin_f32 __vpmin_f32
6594 #define vpmax_s16 __vpmax_s16
6595 #define vpmax_s32 __vpmax_s32
6596 #define vpmax_s8 __vpmax_s8
6597 #define vpmax_u16 __vpmax_u16
6598 #define vpmax_u32 __vpmax_u32
6599 #define vpmax_u8 __vpmax_u8
6600 #define vpmin_s16 __vpmin_s16
6601 #define vpmin_s32 __vpmin_s32
6602 #define vpmin_s8 __vpmin_s8
6603 #define vpmin_u16 __vpmin_u16
6604 #define vpmin_u32 __vpmin_u32
6605 #define vpmin_u8 __vpmin_u8
6606 #define vqabs_s16 __vqabs_s16
6607 #define vqabs_s32 __vqabs_s32
6608 #define vqabs_s8 __vqabs_s8
6609 #define vqneg_s16 __vqneg_s16
6610 #define vqneg_s32 __vqneg_s32
6611 #define vqneg_s8 __vqneg_s8
6612 #define vqabsq_s16 __vqabsq_s16
6613 #define vqabsq_s32 __vqabsq_s32
6614 #define vqabsq_s8 __vqabsq_s8
6615 #define vqnegq_s16 __vqnegq_s16
6616 #define vqnegq_s32 __vqnegq_s32
6617 #define vqnegq_s8 __vqnegq_s8
6618 #define vqadd_s16 __vqadd_s16
6619 #define vqadd_s32 __vqadd_s32
6620 #define vqadd_s64 __vqadd_s64
6621 #define vqadd_s8 __vqadd_s8
6622 #define vqadd_u16 __vqadd_u16
6623 #define vqadd_u32 __vqadd_u32
6624 #define vqadd_u64 __vqadd_u64
6625 #define vqadd_u8 __vqadd_u8
6626 #define vqaddq_s16 __vqaddq_s16
6627 #define vqaddq_s32 __vqaddq_s32
6628 #define vqaddq_s64 __vqaddq_s64
6629 #define vqaddq_s8 __vqaddq_s8
6630 #define vqaddq_u16 __vqaddq_u16
6631 #define vqaddq_u32 __vqaddq_u32
6632 #define vqaddq_u64 __vqaddq_u64
6633 #define vqaddq_u8 __vqaddq_u8
6634 #define vqdmlal_s16 __vqdmlal_s16
6635 #define vqdmlal_s32 __vqdmlal_s32
6636 #define vqdmlsl_s16 __vqdmlsl_s16
6637 #define vqdmlsl_s32 __vqdmlsl_s32
6638 #define vqdmlal_lane_s16 __vqdmlal_lane_s16
6639 #define vqdmlal_lane_s32 __vqdmlal_lane_s32
6640 #define vqdmlsl_lane_s16 __vqdmlsl_lane_s16
6641 #define vqdmlsl_lane_s32 __vqdmlsl_lane_s32
6642 #define vqdmulh_lane_s16 __vqdmulh_lane_s16
6643 #define vqdmulh_lane_s32 __vqdmulh_lane_s32
6644 #define vqrdmulh_lane_s16 __vqrdmulh_lane_s16
6645 #define vqrdmulh_lane_s32 __vqrdmulh_lane_s32
6646 #define vqdmulhq_lane_s16 __vqdmulhq_lane_s16
6647 #define vqdmulhq_lane_s32 __vqdmulhq_lane_s32
6648 #define vqrdmulhq_lane_s16 __vqrdmulhq_lane_s16
6649 #define vqrdmulhq_lane_s32 __vqrdmulhq_lane_s32
6650 #define vqdmulh_s16 __vqdmulh_s16
6651 #define vqdmulh_s32 __vqdmulh_s32
6652 #define vqrdmulh_s16 __vqrdmulh_s16
6653 #define vqrdmulh_s32 __vqrdmulh_s32
6654 #define vqdmulhq_s16 __vqdmulhq_s16
6655 #define vqdmulhq_s32 __vqdmulhq_s32
6656 #define vqrdmulhq_s16 __vqrdmulhq_s16
6657 #define vqrdmulhq_s32 __vqrdmulhq_s32
6658 #define vqdmull_s16 __vqdmull_s16
6659 #define vqdmull_s32 __vqdmull_s32
6660 #define vqdmull_lane_s16 __vqdmull_lane_s16
6661 #define vqdmull_lane_s32 __vqdmull_lane_s32
6662 #define vqmovn_s16 __vqmovn_s16
6663 #define vqmovn_s32 __vqmovn_s32
6664 #define vqmovn_s64 __vqmovn_s64
6665 #define vqmovn_u16 __vqmovn_u16
6666 #define vqmovn_u32 __vqmovn_u32
6667 #define vqmovn_u64 __vqmovn_u64
6668 #define vqmovun_s16 __vqmovun_s16
6669 #define vqmovun_s32 __vqmovun_s32
6670 #define vqmovun_s64 __vqmovun_s64
6671 #define vqshl_n_s16 __vqshl_n_s16
6672 #define vqshl_n_s32 __vqshl_n_s32
6673 #define vqshl_n_s64 __vqshl_n_s64
6674 #define vqshl_n_s8 __vqshl_n_s8
6675 #define vqshl_n_u16 __vqshl_n_u16
6676 #define vqshl_n_u32 __vqshl_n_u32
6677 #define vqshl_n_u64 __vqshl_n_u64
6678 #define vqshl_n_u8 __vqshl_n_u8
6679 #define vqshlu_n_s16 __vqshlu_n_s16
6680 #define vqshlu_n_s32 __vqshlu_n_s32
6681 #define vqshlu_n_s64 __vqshlu_n_s64
6682 #define vqshlu_n_s8 __vqshlu_n_s8
6683 #define vqshlq_n_s16 __vqshlq_n_s16
6684 #define vqshlq_n_s32 __vqshlq_n_s32
6685 #define vqshlq_n_s64 __vqshlq_n_s64
6686 #define vqshlq_n_s8 __vqshlq_n_s8
6687 #define vqshlq_n_u16 __vqshlq_n_u16
6688 #define vqshlq_n_u32 __vqshlq_n_u32
6689 #define vqshlq_n_u64 __vqshlq_n_u64
6690 #define vqshlq_n_u8 __vqshlq_n_u8
6691 #define vqshluq_n_s16 __vqshluq_n_s16
6692 #define vqshluq_n_s32 __vqshluq_n_s32
6693 #define vqshluq_n_s64 __vqshluq_n_s64
6694 #define vqshluq_n_s8 __vqshluq_n_s8
6695 #define vqrshrn_n_s16 __vqrshrn_n_s16
6696 #define vqrshrn_n_s32 __vqrshrn_n_s32
6697 #define vqrshrn_n_s64 __vqrshrn_n_s64
6698 #define vqrshrn_n_u16 __vqrshrn_n_u16
6699 #define vqrshrn_n_u32 __vqrshrn_n_u32
6700 #define vqrshrn_n_u64 __vqrshrn_n_u64
6701 #define vqrshrun_n_s16 __vqrshrun_n_s16
6702 #define vqrshrun_n_s32 __vqrshrun_n_s32
6703 #define vqrshrun_n_s64 __vqrshrun_n_s64
6704 #define vqshrn_n_s16 __vqshrn_n_s16
6705 #define vqshrn_n_s32 __vqshrn_n_s32
6706 #define vqshrn_n_s64 __vqshrn_n_s64
6707 #define vqshrn_n_u16 __vqshrn_n_u16
6708 #define vqshrn_n_u32 __vqshrn_n_u32
6709 #define vqshrn_n_u64 __vqshrn_n_u64
6710 #define vqshrun_n_s16 __vqshrun_n_s16
6711 #define vqshrun_n_s32 __vqshrun_n_s32
6712 #define vqshrun_n_s64 __vqshrun_n_s64
6713 #define vqsub_s16 __vqsub_s16
6714 #define vqsub_s32 __vqsub_s32
6715 #define vqsub_s64 __vqsub_s64
6716 #define vqsub_s8 __vqsub_s8
6717 #define vqsub_u16 __vqsub_u16
6718 #define vqsub_u32 __vqsub_u32
6719 #define vqsub_u64 __vqsub_u64
6720 #define vqsub_u8 __vqsub_u8
6721 #define vqsubq_s16 __vqsubq_s16
6722 #define vqsubq_s32 __vqsubq_s32
6723 #define vqsubq_s64 __vqsubq_s64
6724 #define vqsubq_s8 __vqsubq_s8
6725 #define vqsubq_u16 __vqsubq_u16
6726 #define vqsubq_u32 __vqsubq_u32
6727 #define vqsubq_u64 __vqsubq_u64
6728 #define vqsubq_u8 __vqsubq_u8
6729 #define vrecpe_f32 __vrecpe_f32
6730 #define vrecpe_u32 __vrecpe_u32
6731 #define vrsqrte_f32 __vrsqrte_f32
6732 #define vrsqrte_u32 __vrsqrte_u32
6733 #define vrecpeq_f32 __vrecpeq_f32
6734 #define vrecpeq_u32 __vrecpeq_u32
6735 #define vrsqrteq_f32 __vrsqrteq_f32
6736 #define vrsqrteq_u32 __vrsqrteq_u32
6737 #define vrecps_f32 __vrecps_f32
6738 #define vrecpsq_f32 __vrecpsq_f32
6739 #define vrev16_p8 __vrev16_p8
6740 #define vrev16_s8 __vrev16_s8
6741 #define vrev16_u8 __vrev16_u8
6742 #define vrev32_p16 __vrev32_p16
6743 #define vrev32_p8 __vrev32_p8
6744 #define vrev32_s16 __vrev32_s16
6745 #define vrev32_s8 __vrev32_s8
6746 #define vrev32_u16 __vrev32_u16
6747 #define vrev32_u8 __vrev32_u8
6748 #define vrev64_f32 __vrev64_f32
6749 #define vrev64_p16 __vrev64_p16
6750 #define vrev64_p8 __vrev64_p8
6751 #define vrev64_s16 __vrev64_s16
6752 #define vrev64_s32 __vrev64_s32
6753 #define vrev64_s8 __vrev64_s8
6754 #define vrev64_u16 __vrev64_u16
6755 #define vrev64_u32 __vrev64_u32
6756 #define vrev64_u8 __vrev64_u8
6757 #define vrev16q_p8 __vrev16q_p8
6758 #define vrev16q_s8 __vrev16q_s8
6759 #define vrev16q_u8 __vrev16q_u8
6760 #define vrev32q_p16 __vrev32q_p16
6761 #define vrev32q_p8 __vrev32q_p8
6762 #define vrev32q_s16 __vrev32q_s16
6763 #define vrev32q_s8 __vrev32q_s8
6764 #define vrev32q_u16 __vrev32q_u16
6765 #define vrev32q_u8 __vrev32q_u8
6766 #define vrev64q_f32 __vrev64q_f32
6767 #define vrev64q_p16 __vrev64q_p16
6768 #define vrev64q_p8 __vrev64q_p8
6769 #define vrev64q_s16 __vrev64q_s16
6770 #define vrev64q_s32 __vrev64q_s32
6771 #define vrev64q_s8 __vrev64q_s8
6772 #define vrev64q_u16 __vrev64q_u16
6773 #define vrev64q_u32 __vrev64q_u32
6774 #define vrev64q_u8 __vrev64q_u8
6775 #define vrnd_f32 __vrnd_f32
6776 #define vrnda_f32 __vrnda_f32
6777 #define vrndm_f32 __vrndm_f32
6778 #define vrndn_f32 __vrndn_f32
6779 #define vrndp_f32 __vrndp_f32
6780 #define vrndx_f32 __vrndx_f32
6781 #define vrndq_f32 __vrndq_f32
6782 #define vrndaq_f32 __vrndaq_f32
6783 #define vrndmq_f32 __vrndmq_f32
6784 #define vrndnq_f32 __vrndnq_f32
6785 #define vrndpq_f32 __vrndpq_f32
6786 #define vrndxq_f32 __vrndxq_f32
6787 #define vrsqrts_f32 __vrsqrts_f32
6788 #define vrsqrtsq_f32 __vrsqrtsq_f32
6789 #define vshl_n_s16 __vshl_n_s16
6790 #define vshl_n_s32 __vshl_n_s32
6791 #define vshl_n_s64 __vshl_n_s64
6792 #define vshl_n_s8 __vshl_n_s8
6793 #define vshl_n_u16 __vshl_n_u16
6794 #define vshl_n_u32 __vshl_n_u32
6795 #define vshl_n_u64 __vshl_n_u64
6796 #define vshl_n_u8 __vshl_n_u8
6797 #define vshlq_n_s16 __vshlq_n_s16
6798 #define vshlq_n_s32 __vshlq_n_s32
6799 #define vshlq_n_s64 __vshlq_n_s64
6800 #define vshlq_n_s8 __vshlq_n_s8
6801 #define vshlq_n_u16 __vshlq_n_u16
6802 #define vshlq_n_u32 __vshlq_n_u32
6803 #define vshlq_n_u64 __vshlq_n_u64
6804 #define vshlq_n_u8 __vshlq_n_u8
6805 #define vqrshl_s16 __vqrshl_s16
6806 #define vqrshl_s32 __vqrshl_s32
6807 #define vqrshl_s64 __vqrshl_s64
6808 #define vqrshl_s8 __vqrshl_s8
6809 #define vqrshl_u16 __vqrshl_u16
6810 #define vqrshl_u32 __vqrshl_u32
6811 #define vqrshl_u64 __vqrshl_u64
6812 #define vqrshl_u8 __vqrshl_u8
6813 #define vqshl_s16 __vqshl_s16
6814 #define vqshl_s32 __vqshl_s32
6815 #define vqshl_s64 __vqshl_s64
6816 #define vqshl_s8 __vqshl_s8
6817 #define vqshl_u16 __vqshl_u16
6818 #define vqshl_u32 __vqshl_u32
6819 #define vqshl_u64 __vqshl_u64
6820 #define vqshl_u8 __vqshl_u8
6821 #define vrshl_s16 __vrshl_s16
6822 #define vrshl_s32 __vrshl_s32
6823 #define vrshl_s64 __vrshl_s64
6824 #define vrshl_s8 __vrshl_s8
6825 #define vrshl_u16 __vrshl_u16
6826 #define vrshl_u32 __vrshl_u32
6827 #define vrshl_u64 __vrshl_u64
6828 #define vrshl_u8 __vrshl_u8
6829 #define vshl_s16 __vshl_s16
6830 #define vshl_s32 __vshl_s32
6831 #define vshl_s64 __vshl_s64
6832 #define vshl_s8 __vshl_s8
6833 #define vshl_u16 __vshl_u16
6834 #define vshl_u32 __vshl_u32
6835 #define vshl_u64 __vshl_u64
6836 #define vshl_u8 __vshl_u8
6837 #define vqrshlq_s16 __vqrshlq_s16
6838 #define vqrshlq_s32 __vqrshlq_s32
6839 #define vqrshlq_s64 __vqrshlq_s64
6840 #define vqrshlq_s8 __vqrshlq_s8
6841 #define vqrshlq_u16 __vqrshlq_u16
6842 #define vqrshlq_u32 __vqrshlq_u32
6843 #define vqrshlq_u64 __vqrshlq_u64
6844 #define vqrshlq_u8 __vqrshlq_u8
6845 #define vqshlq_s16 __vqshlq_s16
6846 #define vqshlq_s32 __vqshlq_s32
6847 #define vqshlq_s64 __vqshlq_s64
6848 #define vqshlq_s8 __vqshlq_s8
6849 #define vqshlq_u16 __vqshlq_u16
6850 #define vqshlq_u32 __vqshlq_u32
6851 #define vqshlq_u64 __vqshlq_u64
6852 #define vqshlq_u8 __vqshlq_u8
6853 #define vrshlq_s16 __vrshlq_s16
6854 #define vrshlq_s32 __vrshlq_s32
6855 #define vrshlq_s64 __vrshlq_s64
6856 #define vrshlq_s8 __vrshlq_s8
6857 #define vrshlq_u16 __vrshlq_u16
6858 #define vrshlq_u32 __vrshlq_u32
6859 #define vrshlq_u64 __vrshlq_u64
6860 #define vrshlq_u8 __vrshlq_u8
6861 #define vshlq_s16 __vshlq_s16
6862 #define vshlq_s32 __vshlq_s32
6863 #define vshlq_s64 __vshlq_s64
6864 #define vshlq_s8 __vshlq_s8
6865 #define vshlq_u16 __vshlq_u16
6866 #define vshlq_u32 __vshlq_u32
6867 #define vshlq_u64 __vshlq_u64
6868 #define vshlq_u8 __vshlq_u8
6869 #define vshll_n_s16 __vshll_n_s16
6870 #define vshll_n_s32 __vshll_n_s32
6871 #define vshll_n_s8 __vshll_n_s8
6872 #define vshll_n_u16 __vshll_n_u16
6873 #define vshll_n_u32 __vshll_n_u32
6874 #define vshll_n_u8 __vshll_n_u8
6875 #define vrshr_n_s16 __vrshr_n_s16
6876 #define vrshr_n_s32 __vrshr_n_s32
6877 #define vrshr_n_s64 __vrshr_n_s64
6878 #define vrshr_n_s8 __vrshr_n_s8
6879 #define vrshr_n_u16 __vrshr_n_u16
6880 #define vrshr_n_u32 __vrshr_n_u32
6881 #define vrshr_n_u64 __vrshr_n_u64
6882 #define vrshr_n_u8 __vrshr_n_u8
6883 #define vshr_n_s16 __vshr_n_s16
6884 #define vshr_n_s32 __vshr_n_s32
6885 #define vshr_n_s64 __vshr_n_s64
6886 #define vshr_n_s8 __vshr_n_s8
6887 #define vshr_n_u16 __vshr_n_u16
6888 #define vshr_n_u32 __vshr_n_u32
6889 #define vshr_n_u64 __vshr_n_u64
6890 #define vshr_n_u8 __vshr_n_u8
6891 #define vrshrq_n_s16 __vrshrq_n_s16
6892 #define vrshrq_n_s32 __vrshrq_n_s32
6893 #define vrshrq_n_s64 __vrshrq_n_s64
6894 #define vrshrq_n_s8 __vrshrq_n_s8
6895 #define vrshrq_n_u16 __vrshrq_n_u16
6896 #define vrshrq_n_u32 __vrshrq_n_u32
6897 #define vrshrq_n_u64 __vrshrq_n_u64
6898 #define vrshrq_n_u8 __vrshrq_n_u8
6899 #define vshrq_n_s16 __vshrq_n_s16
6900 #define vshrq_n_s32 __vshrq_n_s32
6901 #define vshrq_n_s64 __vshrq_n_s64
6902 #define vshrq_n_s8 __vshrq_n_s8
6903 #define vshrq_n_u16 __vshrq_n_u16
6904 #define vshrq_n_u32 __vshrq_n_u32
6905 #define vshrq_n_u64 __vshrq_n_u64
6906 #define vshrq_n_u8 __vshrq_n_u8
6907 #define vrshrn_n_s16 __vrshrn_n_s16
6908 #define vrshrn_n_s32 __vrshrn_n_s32
6909 #define vrshrn_n_s64 __vrshrn_n_s64
6910 #define vrshrn_n_u16 __vrshrn_n_u16
6911 #define vrshrn_n_u32 __vrshrn_n_u32
6912 #define vrshrn_n_u64 __vrshrn_n_u64
6913 #define vshrn_n_s16 __vshrn_n_s16
6914 #define vshrn_n_s32 __vshrn_n_s32
6915 #define vshrn_n_s64 __vshrn_n_s64
6916 #define vshrn_n_u16 __vshrn_n_u16
6917 #define vshrn_n_u32 __vshrn_n_u32
6918 #define vshrn_n_u64 __vshrn_n_u64
6919 #define vsli_n_p16 __vsli_n_p16
6920 #define vsli_n_p8 __vsli_n_p8
6921 #define vsli_n_s16 __vsli_n_s16
6922 #define vsli_n_s32 __vsli_n_s32
6923 #define vsli_n_s64 __vsli_n_s64
6924 #define vsli_n_s8 __vsli_n_s8
6925 #define vsli_n_u16 __vsli_n_u16
6926 #define vsli_n_u32 __vsli_n_u32
6927 #define vsli_n_u64 __vsli_n_u64
6928 #define vsli_n_u8 __vsli_n_u8
6929 #define vsliq_n_p16 __vsliq_n_p16
6930 #define vsliq_n_p8 __vsliq_n_p8
6931 #define vsliq_n_s16 __vsliq_n_s16
6932 #define vsliq_n_s32 __vsliq_n_s32
6933 #define vsliq_n_s64 __vsliq_n_s64
6934 #define vsliq_n_s8 __vsliq_n_s8
6935 #define vsliq_n_u16 __vsliq_n_u16
6936 #define vsliq_n_u32 __vsliq_n_u32
6937 #define vsliq_n_u64 __vsliq_n_u64
6938 #define vsliq_n_u8 __vsliq_n_u8
6939 #define vrsra_n_s16 __vrsra_n_s16
6940 #define vrsra_n_s32 __vrsra_n_s32
6941 #define vrsra_n_s64 __vrsra_n_s64
6942 #define vrsra_n_s8 __vrsra_n_s8
6943 #define vrsra_n_u16 __vrsra_n_u16
6944 #define vrsra_n_u32 __vrsra_n_u32
6945 #define vrsra_n_u64 __vrsra_n_u64
6946 #define vrsra_n_u8 __vrsra_n_u8
6947 #define vsra_n_s16 __vsra_n_s16
6948 #define vsra_n_s32 __vsra_n_s32
6949 #define vsra_n_s64 __vsra_n_s64
6950 #define vsra_n_s8 __vsra_n_s8
6951 #define vsra_n_u16 __vsra_n_u16
6952 #define vsra_n_u32 __vsra_n_u32
6953 #define vsra_n_u64 __vsra_n_u64
6954 #define vsra_n_u8 __vsra_n_u8
6955 #define vrsraq_n_s16 __vrsraq_n_s16
6956 #define vrsraq_n_s32 __vrsraq_n_s32
6957 #define vrsraq_n_s64 __vrsraq_n_s64
6958 #define vrsraq_n_s8 __vrsraq_n_s8
6959 #define vrsraq_n_u16 __vrsraq_n_u16
6960 #define vrsraq_n_u32 __vrsraq_n_u32
6961 #define vrsraq_n_u64 __vrsraq_n_u64
6962 #define vrsraq_n_u8 __vrsraq_n_u8
6963 #define vsraq_n_s16 __vsraq_n_s16
6964 #define vsraq_n_s32 __vsraq_n_s32
6965 #define vsraq_n_s64 __vsraq_n_s64
6966 #define vsraq_n_s8 __vsraq_n_s8
6967 #define vsraq_n_u16 __vsraq_n_u16
6968 #define vsraq_n_u32 __vsraq_n_u32
6969 #define vsraq_n_u64 __vsraq_n_u64
6970 #define vsraq_n_u8 __vsraq_n_u8
6971 #define vsri_n_p16 __vsri_n_p16
6972 #define vsri_n_p8 __vsri_n_p8
6973 #define vsri_n_s16 __vsri_n_s16
6974 #define vsri_n_s32 __vsri_n_s32
6975 #define vsri_n_s64 __vsri_n_s64
6976 #define vsri_n_s8 __vsri_n_s8
6977 #define vsri_n_u16 __vsri_n_u16
6978 #define vsri_n_u32 __vsri_n_u32
6979 #define vsri_n_u64 __vsri_n_u64
6980 #define vsri_n_u8 __vsri_n_u8
6981 #define vsriq_n_p16 __vsriq_n_p16
6982 #define vsriq_n_p8 __vsriq_n_p8
6983 #define vsriq_n_s16 __vsriq_n_s16
6984 #define vsriq_n_s32 __vsriq_n_s32
6985 #define vsriq_n_s64 __vsriq_n_s64
6986 #define vsriq_n_s8 __vsriq_n_s8
6987 #define vsriq_n_u16 __vsriq_n_u16
6988 #define vsriq_n_u32 __vsriq_n_u32
6989 #define vsriq_n_u64 __vsriq_n_u64
6990 #define vsriq_n_u8 __vsriq_n_u8
6991 #define vst1_f32 __vst1_f32
6992 #define vst1_p16 __vst1_p16
6993 #define vst1_p8 __vst1_p8
6994 #define vst1_s16 __vst1_s16
6995 #define vst1_s32 __vst1_s32
6996 #define vst1_s64 __vst1_s64
6997 #define vst1_s8 __vst1_s8
6998 #define vst1_u16 __vst1_u16
6999 #define vst1_u32 __vst1_u32
7000 #define vst1_u64 __vst1_u64
7001 #define vst1_u8 __vst1_u8
7002 #define vst1_f32_ex __vst1_f32_ex
7003 #define vst1_p16_ex __vst1_p16_ex
7004 #define vst1_p8_ex __vst1_p8_ex
7005 #define vst1_s16_ex __vst1_s16_ex
7006 #define vst1_s32_ex __vst1_s32_ex
7007 #define vst1_s64_ex __vst1_s64_ex
7008 #define vst1_s8_ex __vst1_s8_ex
7009 #define vst1_u16_ex __vst1_u16_ex
7010 #define vst1_u32_ex __vst1_u32_ex
7011 #define vst1_u64_ex __vst1_u64_ex
7012 #define vst1_u8_ex __vst1_u8_ex
7013 #define vst1q_f32 __vst1q_f32
7014 #define vst1q_p16 __vst1q_p16
7015 #define vst1q_p8 __vst1q_p8
7016 #define vst1q_s16 __vst1q_s16
7017 #define vst1q_s32 __vst1q_s32
7018 #define vst1q_s64 __vst1q_s64
7019 #define vst1q_s8 __vst1q_s8
7020 #define vst1q_u16 __vst1q_u16
7021 #define vst1q_u32 __vst1q_u32
7022 #define vst1q_u64 __vst1q_u64
7023 #define vst1q_u8 __vst1q_u8
7024 #define vst1q_f32_ex __vst1q_f32_ex
7025 #define vst1q_p16_ex __vst1q_p16_ex
7026 #define vst1q_p8_ex __vst1q_p8_ex
7027 #define vst1q_s16_ex __vst1q_s16_ex
7028 #define vst1q_s32_ex __vst1q_s32_ex
7029 #define vst1q_s64_ex __vst1q_s64_ex
7030 #define vst1q_s8_ex __vst1q_s8_ex
7031 #define vst1q_u16_ex __vst1q_u16_ex
7032 #define vst1q_u32_ex __vst1q_u32_ex
7033 #define vst1q_u64_ex __vst1q_u64_ex
7034 #define vst1q_u8_ex __vst1q_u8_ex
7035 #define vst1_lane_f32 __vst1_lane_f32
7036 #define vst1_lane_p16 __vst1_lane_p16
7037 #define vst1_lane_p8 __vst1_lane_p8
7038 #define vst1_lane_s16 __vst1_lane_s16
7039 #define vst1_lane_s32 __vst1_lane_s32
7040 #define vst1_lane_s8 __vst1_lane_s8
7041 #define vst1_lane_u16 __vst1_lane_u16
7042 #define vst1_lane_u32 __vst1_lane_u32
7043 #define vst1_lane_u8 __vst1_lane_u8
7044 #define vst1q_lane_f32 __vst1q_lane_f32
7045 #define vst1q_lane_p16 __vst1q_lane_p16
7046 #define vst1q_lane_p8 __vst1q_lane_p8
7047 #define vst1q_lane_s16 __vst1q_lane_s16
7048 #define vst1q_lane_s32 __vst1q_lane_s32
7049 #define vst1q_lane_s8 __vst1q_lane_s8
7050 #define vst1q_lane_u16 __vst1q_lane_u16
7051 #define vst1q_lane_u32 __vst1q_lane_u32
7052 #define vst1q_lane_u8 __vst1q_lane_u8
7053 #define vst1_lane_f32_ex __vst1_lane_f32_ex
7054 #define vst1_lane_p16_ex __vst1_lane_p16_ex
7055 #define vst1_lane_s16_ex __vst1_lane_s16_ex
7056 #define vst1_lane_s32_ex __vst1_lane_s32_ex
7057 #define vst1_lane_u16_ex __vst1_lane_u16_ex
7058 #define vst1_lane_u32_ex __vst1_lane_u32_ex
7059 #define vst1q_lane_f32_ex __vst1q_lane_f32_ex
7060 #define vst1q_lane_p16_ex __vst1q_lane_p16_ex
7061 #define vst1q_lane_s16_ex __vst1q_lane_s16_ex
7062 #define vst1q_lane_s32_ex __vst1q_lane_s32_ex
7063 #define vst1q_lane_u16_ex __vst1q_lane_u16_ex
7064 #define vst1q_lane_u32_ex __vst1q_lane_u32_ex
7065 #define vst2_f32 __vst2_f32
7066 #define vst2_p16 __vst2_p16
7067 #define vst2_p8 __vst2_p8
7068 #define vst2_s16 __vst2_s16
7069 #define vst2_s32 __vst2_s32
7070 #define vst2_s8 __vst2_s8
7071 #define vst2_u16 __vst2_u16
7072 #define vst2_u32 __vst2_u32
7073 #define vst2_u8 __vst2_u8
7074 #define vst2_s64 __vst2_s64
7075 #define vst2_u64 __vst2_u64
7076 #define vst2_s64_ex __vst2_s64_ex
7077 #define vst2_u64_ex __vst2_u64_ex
7078 #define vst2_f32_ex __vst2_f32_ex
7079 #define vst2_p16_ex __vst2_p16_ex
7080 #define vst2_p8_ex __vst2_p8_ex
7081 #define vst2_s16_ex __vst2_s16_ex
7082 #define vst2_s32_ex __vst2_s32_ex
7083 #define vst2_s8_ex __vst2_s8_ex
7084 #define vst2_u16_ex __vst2_u16_ex
7085 #define vst2_u32_ex __vst2_u32_ex
7086 #define vst2_u8_ex __vst2_u8_ex
7087 #define vst2q_f32 __vst2q_f32
7088 #define vst2q_p16 __vst2q_p16
7089 #define vst2q_p8 __vst2q_p8
7090 #define vst2q_s16 __vst2q_s16
7091 #define vst2q_s32 __vst2q_s32
7092 #define vst2q_s8 __vst2q_s8
7093 #define vst2q_u16 __vst2q_u16
7094 #define vst2q_u32 __vst2q_u32
7095 #define vst2q_u8 __vst2q_u8
7096 #define vst2q_f32_ex __vst2q_f32_ex
7097 #define vst2q_p16_ex __vst2q_p16_ex
7098 #define vst2q_p8_ex __vst2q_p8_ex
7099 #define vst2q_s16_ex __vst2q_s16_ex
7100 #define vst2q_s32_ex __vst2q_s32_ex
7101 #define vst2q_s8_ex __vst2q_s8_ex
7102 #define vst2q_u16_ex __vst2q_u16_ex
7103 #define vst2q_u32_ex __vst2q_u32_ex
7104 #define vst2q_u8_ex __vst2q_u8_ex
7105 #define vst2_lane_f32 __vst2_lane_f32
7106 #define vst2_lane_p16 __vst2_lane_p16
7107 #define vst2_lane_p8 __vst2_lane_p8
7108 #define vst2_lane_s16 __vst2_lane_s16
7109 #define vst2_lane_s32 __vst2_lane_s32
7110 #define vst2_lane_s8 __vst2_lane_s8
7111 #define vst2_lane_u16 __vst2_lane_u16
7112 #define vst2_lane_u32 __vst2_lane_u32
7113 #define vst2_lane_u8 __vst2_lane_u8
7114 #define vst2q_lane_f32 __vst2q_lane_f32
7115 #define vst2q_lane_p16 __vst2q_lane_p16
7116 #define vst2q_lane_s16 __vst2q_lane_s16
7117 #define vst2q_lane_s32 __vst2q_lane_s32
7118 #define vst2q_lane_u16 __vst2q_lane_u16
7119 #define vst2q_lane_u32 __vst2q_lane_u32
7120 #define vst2_lane_f32_ex __vst2_lane_f32_ex
7121 #define vst2_lane_p16_ex __vst2_lane_p16_ex
7122 #define vst2_lane_p8_ex __vst2_lane_p8_ex
7123 #define vst2_lane_s16_ex __vst2_lane_s16_ex
7124 #define vst2_lane_s32_ex __vst2_lane_s32_ex
7125 #define vst2_lane_s8_ex __vst2_lane_s8_ex
7126 #define vst2_lane_u16_ex __vst2_lane_u16_ex
7127 #define vst2_lane_u32_ex __vst2_lane_u32_ex
7128 #define vst2_lane_u8_ex __vst2_lane_u8_ex
7129 #define vst2q_lane_f32_ex __vst2q_lane_f32_ex
7130 #define vst2q_lane_p16_ex __vst2q_lane_p16_ex
7131 #define vst2q_lane_s16_ex __vst2q_lane_s16_ex
7132 #define vst2q_lane_s32_ex __vst2q_lane_s32_ex
7133 #define vst2q_lane_u16_ex __vst2q_lane_u16_ex
7134 #define vst2q_lane_u32_ex __vst2q_lane_u32_ex
7135 #define vst3_f32 __vst3_f32
7136 #define vst3_p16 __vst3_p16
7137 #define vst3_p8 __vst3_p8
7138 #define vst3_s16 __vst3_s16
7139 #define vst3_s32 __vst3_s32
7140 #define vst3_s8 __vst3_s8
7141 #define vst3_u16 __vst3_u16
7142 #define vst3_u32 __vst3_u32
7143 #define vst3_u8 __vst3_u8
7144 #define vst3_s64 __vst3_s64
7145 #define vst3_u64 __vst3_u64
7146 #define vst3_s64_ex __vst3_s64_ex
7147 #define vst3_u64_ex __vst3_u64_ex
7148 #define vst3_f32_ex __vst3_f32_ex
7149 #define vst3_p16_ex __vst3_p16_ex
7150 #define vst3_p8_ex __vst3_p8_ex
7151 #define vst3_s16_ex __vst3_s16_ex
7152 #define vst3_s32_ex __vst3_s32_ex
7153 #define vst3_s8_ex __vst3_s8_ex
7154 #define vst3_u16_ex __vst3_u16_ex
7155 #define vst3_u32_ex __vst3_u32_ex
7156 #define vst3_u8_ex __vst3_u8_ex
7157 #define vst3q_f32 __vst3q_f32
7158 #define vst3q_p16 __vst3q_p16
7159 #define vst3q_p8 __vst3q_p8
7160 #define vst3q_s16 __vst3q_s16
7161 #define vst3q_s32 __vst3q_s32
7162 #define vst3q_s8 __vst3q_s8
7163 #define vst3q_u16 __vst3q_u16
7164 #define vst3q_u32 __vst3q_u32
7165 #define vst3q_u8 __vst3q_u8
7166 #define vst3q_f32_ex __vst3q_f32_ex
7167 #define vst3q_p16_ex __vst3q_p16_ex
7168 #define vst3q_p8_ex __vst3q_p8_ex
7169 #define vst3q_s16_ex __vst3q_s16_ex
7170 #define vst3q_s32_ex __vst3q_s32_ex
7171 #define vst3q_s8_ex __vst3q_s8_ex
7172 #define vst3q_u16_ex __vst3q_u16_ex
7173 #define vst3q_u32_ex __vst3q_u32_ex
7174 #define vst3q_u8_ex __vst3q_u8_ex
7175 #define vst3_lane_f32 __vst3_lane_f32
7176 #define vst3_lane_p16 __vst3_lane_p16
7177 #define vst3_lane_p8 __vst3_lane_p8
7178 #define vst3_lane_s16 __vst3_lane_s16
7179 #define vst3_lane_s32 __vst3_lane_s32
7180 #define vst3_lane_s8 __vst3_lane_s8
7181 #define vst3_lane_u16 __vst3_lane_u16
7182 #define vst3_lane_u32 __vst3_lane_u32
7183 #define vst3_lane_u8 __vst3_lane_u8
7184 #define vst3q_lane_f32 __vst3q_lane_f32
7185 #define vst3q_lane_p16 __vst3q_lane_p16
7186 #define vst3q_lane_s16 __vst3q_lane_s16
7187 #define vst3q_lane_s32 __vst3q_lane_s32
7188 #define vst3q_lane_u16 __vst3q_lane_u16
7189 #define vst3q_lane_u32 __vst3q_lane_u32
7190 #define vst4_f32 __vst4_f32
7191 #define vst4_p16 __vst4_p16
7192 #define vst4_p8 __vst4_p8
7193 #define vst4_s16 __vst4_s16
7194 #define vst4_s32 __vst4_s32
7195 #define vst4_s8 __vst4_s8
7196 #define vst4_u16 __vst4_u16
7197 #define vst4_u32 __vst4_u32
7198 #define vst4_u8 __vst4_u8
7199 #define vst4_s64 __vst4_s64
7200 #define vst4_u64 __vst4_u64
7201 #define vst4_s64_ex __vst4_s64_ex
7202 #define vst4_u64_ex __vst4_u64_ex
7203 #define vst4_f32_ex __vst4_f32_ex
7204 #define vst4_p16_ex __vst4_p16_ex
7205 #define vst4_p8_ex __vst4_p8_ex
7206 #define vst4_s16_ex __vst4_s16_ex
7207 #define vst4_s32_ex __vst4_s32_ex
7208 #define vst4_s8_ex __vst4_s8_ex
7209 #define vst4_u16_ex __vst4_u16_ex
7210 #define vst4_u32_ex __vst4_u32_ex
7211 #define vst4_u8_ex __vst4_u8_ex
7212 #define vst4q_f32 __vst4q_f32
7213 #define vst4q_p16 __vst4q_p16
7214 #define vst4q_p8 __vst4q_p8
7215 #define vst4q_s16 __vst4q_s16
7216 #define vst4q_s32 __vst4q_s32
7217 #define vst4q_s8 __vst4q_s8
7218 #define vst4q_u16 __vst4q_u16
7219 #define vst4q_u32 __vst4q_u32
7220 #define vst4q_u8 __vst4q_u8
7221 #define vst4q_f32_ex __vst4q_f32_ex
7222 #define vst4q_p16_ex __vst4q_p16_ex
7223 #define vst4q_p8_ex __vst4q_p8_ex
7224 #define vst4q_s16_ex __vst4q_s16_ex
7225 #define vst4q_s32_ex __vst4q_s32_ex
7226 #define vst4q_s8_ex __vst4q_s8_ex
7227 #define vst4q_u16_ex __vst4q_u16_ex
7228 #define vst4q_u32_ex __vst4q_u32_ex
7229 #define vst4q_u8_ex __vst4q_u8_ex
7230 #define vst4_lane_f32 __vst4_lane_f32
7231 #define vst4_lane_p16 __vst4_lane_p16
7232 #define vst4_lane_p8 __vst4_lane_p8
7233 #define vst4_lane_s16 __vst4_lane_s16
7234 #define vst4_lane_s32 __vst4_lane_s32
7235 #define vst4_lane_s8 __vst4_lane_s8
7236 #define vst4_lane_u16 __vst4_lane_u16
7237 #define vst4_lane_u32 __vst4_lane_u32
7238 #define vst4_lane_u8 __vst4_lane_u8
7239 #define vst4q_lane_f32 __vst4q_lane_f32
7240 #define vst4q_lane_p16 __vst4q_lane_p16
7241 #define vst4q_lane_s16 __vst4q_lane_s16
7242 #define vst4q_lane_s32 __vst4q_lane_s32
7243 #define vst4q_lane_u16 __vst4q_lane_u16
7244 #define vst4q_lane_u32 __vst4q_lane_u32
7245 #define vst4_lane_f32_ex __vst4_lane_f32_ex
7246 #define vst4_lane_p16_ex __vst4_lane_p16_ex
7247 #define vst4_lane_p8_ex __vst4_lane_p8_ex
7248 #define vst4_lane_s16_ex __vst4_lane_s16_ex
7249 #define vst4_lane_s32_ex __vst4_lane_s32_ex
7250 #define vst4_lane_s8_ex __vst4_lane_s8_ex
7251 #define vst4_lane_u16_ex __vst4_lane_u16_ex
7252 #define vst4_lane_u32_ex __vst4_lane_u32_ex
7253 #define vst4_lane_u8_ex __vst4_lane_u8_ex
7254 #define vst4q_lane_f32_ex __vst4q_lane_f32_ex
7255 #define vst4q_lane_p16_ex __vst4q_lane_p16_ex
7256 #define vst4q_lane_s16_ex __vst4q_lane_s16_ex
7257 #define vst4q_lane_s32_ex __vst4q_lane_s32_ex
7258 #define vst4q_lane_u16_ex __vst4q_lane_u16_ex
7259 #define vst4q_lane_u32_ex __vst4q_lane_u32_ex
7260 #define vsub_f32 __vsub_f32
7261 #define vsub_s16 __vsub_s16
7262 #define vsub_s32 __vsub_s32
7263 #define vsub_s64 __vsub_s64
7264 #define vsub_s8 __vsub_s8
7265 #define vsub_u16 __vsub_u16
7266 #define vsub_u32 __vsub_u32
7267 #define vsub_u64 __vsub_u64
7268 #define vsub_u8 __vsub_u8
7269 #define vsubq_f32 __vsubq_f32
7270 #define vsubq_s16 __vsubq_s16
7271 #define vsubq_s32 __vsubq_s32
7272 #define vsubq_s64 __vsubq_s64
7273 #define vsubq_s8 __vsubq_s8
7274 #define vsubq_u16 __vsubq_u16
7275 #define vsubq_u32 __vsubq_u32
7276 #define vsubq_u64 __vsubq_u64
7277 #define vsubq_u8 __vsubq_u8
7278 #define vrsubhn_s16 __vrsubhn_s16
7279 #define vrsubhn_s32 __vrsubhn_s32
7280 #define vrsubhn_s64 __vrsubhn_s64
7281 #define vrsubhn_u16 __vrsubhn_u16
7282 #define vrsubhn_u32 __vrsubhn_u32
7283 #define vrsubhn_u64 __vrsubhn_u64
7284 #define vsubhn_s16 __vsubhn_s16
7285 #define vsubhn_s32 __vsubhn_s32
7286 #define vsubhn_s64 __vsubhn_s64
7287 #define vsubhn_u16 __vsubhn_u16
7288 #define vsubhn_u32 __vsubhn_u32
7289 #define vsubhn_u64 __vsubhn_u64
7290 #define vsubl_s16 __vsubl_s16
7291 #define vsubl_s32 __vsubl_s32
7292 #define vsubl_s8 __vsubl_s8
7293 #define vsubl_u16 __vsubl_u16
7294 #define vsubl_u32 __vsubl_u32
7295 #define vsubl_u8 __vsubl_u8
7296 #define vsubw_s16 __vsubw_s16
7297 #define vsubw_s32 __vsubw_s32
7298 #define vsubw_s8 __vsubw_s8
7299 #define vsubw_u16 __vsubw_u16
7300 #define vsubw_u32 __vsubw_u32
7301 #define vsubw_u8 __vsubw_u8
7302 #define vtbl2_p8 __vtbl2_p8
7303 #define vtbl2_s8 __vtbl2_s8
7304 #define vtbl2_u8 __vtbl2_u8
7305 #define vtbx2_p8 __vtbx2_p8
7306 #define vtbx2_s8 __vtbx2_s8
7307 #define vtbx2_u8 __vtbx2_u8
7308 #define vtbl3_p8 __vtbl3_p8
7309 #define vtbl3_s8 __vtbl3_s8
7310 #define vtbl3_u8 __vtbl3_u8
7311 #define vtbx3_p8 __vtbx3_p8
7312 #define vtbx3_s8 __vtbx3_s8
7313 #define vtbx3_u8 __vtbx3_u8
7314 #define vtbl4_p8 __vtbl4_p8
7315 #define vtbl4_s8 __vtbl4_s8
7316 #define vtbl4_u8 __vtbl4_u8
7317 #define vtbx4_p8 __vtbx4_p8
7318 #define vtbx4_s8 __vtbx4_s8
7319 #define vtbx4_u8 __vtbx4_u8
7320 #define vtbl1_p8 __vtbl1_p8
7321 #define vtbl1_s8 __vtbl1_s8
7322 #define vtbl1_u8 __vtbl1_u8
7323 #define vtbx1_p8 __vtbx1_p8
7324 #define vtbx1_s8 __vtbx1_s8
7325 #define vtbx1_u8 __vtbx1_u8
7326 #define vtrn_f32 __vtrn_f32
7327 #define vtrn_p16 __vtrn_p16
7328 #define vtrn_p8 __vtrn_p8
7329 #define vtrn_s16 __vtrn_s16
7330 #define vtrn_s32 __vtrn_s32
7331 #define vtrn_s8 __vtrn_s8
7332 #define vtrn_u16 __vtrn_u16
7333 #define vtrn_u32 __vtrn_u32
7334 #define vtrn_u8 __vtrn_u8
7335 #define vtrnq_f32 __vtrnq_f32
7336 #define vtrnq_p16 __vtrnq_p16
7337 #define vtrnq_p8 __vtrnq_p8
7338 #define vtrnq_s16 __vtrnq_s16
7339 #define vtrnq_s32 __vtrnq_s32
7340 #define vtrnq_s8 __vtrnq_s8
7341 #define vtrnq_u16 __vtrnq_u16
7342 #define vtrnq_u32 __vtrnq_u32
7343 #define vtrnq_u8 __vtrnq_u8
7344 #define vtrnq_s64 __vtrnq_s64
7345 #define vtrnq_u64 __vtrnq_u64
7346 #define vtst_p8 __vtst_p8
7347 #define vtst_s16 __vtst_s16
7348 #define vtst_s32 __vtst_s32
7349 #define vtst_s8 __vtst_s8
7350 #define vtst_u16 __vtst_u16
7351 #define vtst_u32 __vtst_u32
7352 #define vtst_u8 __vtst_u8
7353 #define vtstq_p8 __vtstq_p8
7354 #define vtstq_s16 __vtstq_s16
7355 #define vtstq_s32 __vtstq_s32
7356 #define vtstq_s8 __vtstq_s8
7357 #define vtstq_u16 __vtstq_u16
7358 #define vtstq_u32 __vtstq_u32
7359 #define vtstq_u8 __vtstq_u8
7360 #define vuzp_p16 __vuzp_p16
7361 #define vuzp_p8 __vuzp_p8
7362 #define vuzp_s16 __vuzp_s16
7363 #define vuzp_s8 __vuzp_s8
7364 #define vuzp_u16 __vuzp_u16
7365 #define vuzp_u8 __vuzp_u8
7366 #define vuzp_f32 __vuzp_f32
7367 #define vuzp_s32 __vuzp_s32
7368 #define vuzp_u32 __vuzp_u32
7369 #define vuzpq_f32 __vuzpq_f32
7370 #define vuzpq_p16 __vuzpq_p16
7371 #define vuzpq_p8 __vuzpq_p8
7372 #define vuzpq_s16 __vuzpq_s16
7373 #define vuzpq_s32 __vuzpq_s32
7374 #define vuzpq_s8 __vuzpq_s8
7375 #define vuzpq_u16 __vuzpq_u16
7376 #define vuzpq_u32 __vuzpq_u32
7377 #define vuzpq_u8 __vuzpq_u8
7378 #define vzip_p16 __vzip_p16
7379 #define vzip_p8 __vzip_p8
7380 #define vzip_s16 __vzip_s16
7381 #define vzip_s8 __vzip_s8
7382 #define vzip_u16 __vzip_u16
7383 #define vzip_u8 __vzip_u8
7384 #define vzip_f32 __vzip_f32
7385 #define vzip_s32 __vzip_s32
7386 #define vzip_u32 __vzip_u32
7387 #define vzipq_f32 __vzipq_f32
7388 #define vzipq_p16 __vzipq_p16
7389 #define vzipq_p8 __vzipq_p8
7390 #define vzipq_s16 __vzipq_s16
7391 #define vzipq_s32 __vzipq_s32
7392 #define vzipq_s8 __vzipq_s8
7393 #define vzipq_u16 __vzipq_u16
7394 #define vzipq_u32 __vzipq_u32
7395 #define vzipq_u8 __vzipq_u8
7396 
7397 #define vreinterpret_f32_s8 __vreinterpret_f32_s8
7398 #define vreinterpret_f32_s16 __vreinterpret_f32_s16
7399 #define vreinterpret_f32_s32 __vreinterpret_f32_s32
7400 #define vreinterpret_f32_s64 __vreinterpret_f32_s64
7401 #define vreinterpret_f32_p8 __vreinterpret_f32_p8
7402 #define vreinterpret_f32_p16 __vreinterpret_f32_p16
7403 #define vreinterpret_f32_u8 __vreinterpret_f32_u8
7404 #define vreinterpret_f32_u16 __vreinterpret_f32_u16
7405 #define vreinterpret_f32_u32 __vreinterpret_f32_u32
7406 #define vreinterpret_f32_u64 __vreinterpret_f32_u64
7407 #define vreinterpret_s8_f32 __vreinterpret_s8_f32
7408 #define vreinterpret_s8_s16 __vreinterpret_s8_s16
7409 #define vreinterpret_s8_s32 __vreinterpret_s8_s32
7410 #define vreinterpret_s8_s64 __vreinterpret_s8_s64
7411 #define vreinterpret_s8_p8 __vreinterpret_s8_p8
7412 #define vreinterpret_s8_p16 __vreinterpret_s8_p16
7413 #define vreinterpret_s8_u8 __vreinterpret_s8_u8
7414 #define vreinterpret_s8_u16 __vreinterpret_s8_u16
7415 #define vreinterpret_s8_u32 __vreinterpret_s8_u32
7416 #define vreinterpret_s8_u64 __vreinterpret_s8_u64
7417 #define vreinterpret_s16_f32 __vreinterpret_s16_f32
7418 #define vreinterpret_s16_s8 __vreinterpret_s16_s8
7419 #define vreinterpret_s16_s32 __vreinterpret_s16_s32
7420 #define vreinterpret_s16_s64 __vreinterpret_s16_s64
7421 #define vreinterpret_s16_p8 __vreinterpret_s16_p8
7422 #define vreinterpret_s16_p16 __vreinterpret_s16_p16
7423 #define vreinterpret_s16_u8 __vreinterpret_s16_u8
7424 #define vreinterpret_s16_u16 __vreinterpret_s16_u16
7425 #define vreinterpret_s16_u32 __vreinterpret_s16_u32
7426 #define vreinterpret_s16_u64 __vreinterpret_s16_u64
7427 #define vreinterpret_s32_f32 __vreinterpret_s32_f32
7428 #define vreinterpret_s32_s8 __vreinterpret_s32_s8
7429 #define vreinterpret_s32_s16 __vreinterpret_s32_s16
7430 #define vreinterpret_s32_s64 __vreinterpret_s32_s64
7431 #define vreinterpret_s32_p8 __vreinterpret_s32_p8
7432 #define vreinterpret_s32_p16 __vreinterpret_s32_p16
7433 #define vreinterpret_s32_u8 __vreinterpret_s32_u8
7434 #define vreinterpret_s32_u16 __vreinterpret_s32_u16
7435 #define vreinterpret_s32_u32 __vreinterpret_s32_u32
7436 #define vreinterpret_s32_u64 __vreinterpret_s32_u64
7437 #define vreinterpret_s64_f32 __vreinterpret_s64_f32
7438 #define vreinterpret_s64_s8 __vreinterpret_s64_s8
7439 #define vreinterpret_s64_s16 __vreinterpret_s64_s16
7440 #define vreinterpret_s64_s32 __vreinterpret_s64_s32
7441 #define vreinterpret_s64_p8 __vreinterpret_s64_p8
7442 #define vreinterpret_s64_p16 __vreinterpret_s64_p16
7443 #define vreinterpret_s64_u8 __vreinterpret_s64_u8
7444 #define vreinterpret_s64_u16 __vreinterpret_s64_u16
7445 #define vreinterpret_s64_u32 __vreinterpret_s64_u32
7446 #define vreinterpret_s64_u64 __vreinterpret_s64_u64
7447 #define vreinterpret_p8_f32 __vreinterpret_p8_f32
7448 #define vreinterpret_p8_s8 __vreinterpret_p8_s8
7449 #define vreinterpret_p8_s16 __vreinterpret_p8_s16
7450 #define vreinterpret_p8_s32 __vreinterpret_p8_s32
7451 #define vreinterpret_p8_s64 __vreinterpret_p8_s64
7452 #define vreinterpret_p8_p16 __vreinterpret_p8_p16
7453 #define vreinterpret_p8_u8 __vreinterpret_p8_u8
7454 #define vreinterpret_p8_u16 __vreinterpret_p8_u16
7455 #define vreinterpret_p8_u32 __vreinterpret_p8_u32
7456 #define vreinterpret_p8_u64 __vreinterpret_p8_u64
7457 #define vreinterpret_p16_f32 __vreinterpret_p16_f32
7458 #define vreinterpret_p16_s8 __vreinterpret_p16_s8
7459 #define vreinterpret_p16_s16 __vreinterpret_p16_s16
7460 #define vreinterpret_p16_s32 __vreinterpret_p16_s32
7461 #define vreinterpret_p16_s64 __vreinterpret_p16_s64
7462 #define vreinterpret_p16_p8 __vreinterpret_p16_p8
7463 #define vreinterpret_p16_u8 __vreinterpret_p16_u8
7464 #define vreinterpret_p16_u16 __vreinterpret_p16_u16
7465 #define vreinterpret_p16_u32 __vreinterpret_p16_u32
7466 #define vreinterpret_p16_u64 __vreinterpret_p16_u64
7467 #define vreinterpret_u8_f32 __vreinterpret_u8_f32
7468 #define vreinterpret_u8_s8 __vreinterpret_u8_s8
7469 #define vreinterpret_u8_s16 __vreinterpret_u8_s16
7470 #define vreinterpret_u8_s32 __vreinterpret_u8_s32
7471 #define vreinterpret_u8_s64 __vreinterpret_u8_s64
7472 #define vreinterpret_u8_p8 __vreinterpret_u8_p8
7473 #define vreinterpret_u8_p16 __vreinterpret_u8_p16
7474 #define vreinterpret_u8_u16 __vreinterpret_u8_u16
7475 #define vreinterpret_u8_u32 __vreinterpret_u8_u32
7476 #define vreinterpret_u8_u64 __vreinterpret_u8_u64
7477 #define vreinterpret_u16_f32 __vreinterpret_u16_f32
7478 #define vreinterpret_u16_s8 __vreinterpret_u16_s8
7479 #define vreinterpret_u16_s16 __vreinterpret_u16_s16
7480 #define vreinterpret_u16_s32 __vreinterpret_u16_s32
7481 #define vreinterpret_u16_s64 __vreinterpret_u16_s64
7482 #define vreinterpret_u16_p8 __vreinterpret_u16_p8
7483 #define vreinterpret_u16_p16 __vreinterpret_u16_p16
7484 #define vreinterpret_u16_u8 __vreinterpret_u16_u8
7485 #define vreinterpret_u16_u32 __vreinterpret_u16_u32
7486 #define vreinterpret_u16_u64 __vreinterpret_u16_u64
7487 #define vreinterpret_u32_f32 __vreinterpret_u32_f32
7488 #define vreinterpret_u32_s8 __vreinterpret_u32_s8
7489 #define vreinterpret_u32_s16 __vreinterpret_u32_s16
7490 #define vreinterpret_u32_s32 __vreinterpret_u32_s32
7491 #define vreinterpret_u32_s64 __vreinterpret_u32_s64
7492 #define vreinterpret_u32_p8 __vreinterpret_u32_p8
7493 #define vreinterpret_u32_p16 __vreinterpret_u32_p16
7494 #define vreinterpret_u32_u8 __vreinterpret_u32_u8
7495 #define vreinterpret_u32_u16 __vreinterpret_u32_u16
7496 #define vreinterpret_u32_u64 __vreinterpret_u32_u64
7497 #define vreinterpret_u64_f32 __vreinterpret_u64_f32
7498 #define vreinterpret_u64_s8 __vreinterpret_u64_s8
7499 #define vreinterpret_u64_s16 __vreinterpret_u64_s16
7500 #define vreinterpret_u64_s32 __vreinterpret_u64_s32
7501 #define vreinterpret_u64_s64 __vreinterpret_u64_s64
7502 #define vreinterpret_u64_p8 __vreinterpret_u64_p8
7503 #define vreinterpret_u64_p16 __vreinterpret_u64_p16
7504 #define vreinterpret_u64_u8 __vreinterpret_u64_u8
7505 #define vreinterpret_u64_u16 __vreinterpret_u64_u16
7506 #define vreinterpret_u64_u32 __vreinterpret_u64_u32
7507 #define vreinterpretq_f32_s8 __vreinterpretq_f32_s8
7508 #define vreinterpretq_f32_s16 __vreinterpretq_f32_s16
7509 #define vreinterpretq_f32_s32 __vreinterpretq_f32_s32
7510 #define vreinterpretq_f32_s64 __vreinterpretq_f32_s64
7511 #define vreinterpretq_f32_p8 __vreinterpretq_f32_p8
7512 #define vreinterpretq_f32_p16 __vreinterpretq_f32_p16
7513 #define vreinterpretq_f32_u8 __vreinterpretq_f32_u8
7514 #define vreinterpretq_f32_u16 __vreinterpretq_f32_u16
7515 #define vreinterpretq_f32_u32 __vreinterpretq_f32_u32
7516 #define vreinterpretq_f32_u64 __vreinterpretq_f32_u64
7517 #define vreinterpretq_s8_f32 __vreinterpretq_s8_f32
7518 #define vreinterpretq_s8_s16 __vreinterpretq_s8_s16
7519 #define vreinterpretq_s8_s32 __vreinterpretq_s8_s32
7520 #define vreinterpretq_s8_s64 __vreinterpretq_s8_s64
7521 #define vreinterpretq_s8_p8 __vreinterpretq_s8_p8
7522 #define vreinterpretq_s8_p16 __vreinterpretq_s8_p16
7523 #define vreinterpretq_s8_u8 __vreinterpretq_s8_u8
7524 #define vreinterpretq_s8_u16 __vreinterpretq_s8_u16
7525 #define vreinterpretq_s8_u32 __vreinterpretq_s8_u32
7526 #define vreinterpretq_s8_u64 __vreinterpretq_s8_u64
7527 #define vreinterpretq_s16_f32 __vreinterpretq_s16_f32
7528 #define vreinterpretq_s16_s8 __vreinterpretq_s16_s8
7529 #define vreinterpretq_s16_s32 __vreinterpretq_s16_s32
7530 #define vreinterpretq_s16_s64 __vreinterpretq_s16_s64
7531 #define vreinterpretq_s16_p8 __vreinterpretq_s16_p8
7532 #define vreinterpretq_s16_p16 __vreinterpretq_s16_p16
7533 #define vreinterpretq_s16_u8 __vreinterpretq_s16_u8
7534 #define vreinterpretq_s16_u16 __vreinterpretq_s16_u16
7535 #define vreinterpretq_s16_u32 __vreinterpretq_s16_u32
7536 #define vreinterpretq_s16_u64 __vreinterpretq_s16_u64
7537 #define vreinterpretq_s32_f32 __vreinterpretq_s32_f32
7538 #define vreinterpretq_s32_s8 __vreinterpretq_s32_s8
7539 #define vreinterpretq_s32_s16 __vreinterpretq_s32_s16
7540 #define vreinterpretq_s32_s64 __vreinterpretq_s32_s64
7541 #define vreinterpretq_s32_p8 __vreinterpretq_s32_p8
7542 #define vreinterpretq_s32_p16 __vreinterpretq_s32_p16
7543 #define vreinterpretq_s32_u8 __vreinterpretq_s32_u8
7544 #define vreinterpretq_s32_u16 __vreinterpretq_s32_u16
7545 #define vreinterpretq_s32_u32 __vreinterpretq_s32_u32
7546 #define vreinterpretq_s32_u64 __vreinterpretq_s32_u64
7547 #define vreinterpretq_s64_f32 __vreinterpretq_s64_f32
7548 #define vreinterpretq_s64_s8 __vreinterpretq_s64_s8
7549 #define vreinterpretq_s64_s16 __vreinterpretq_s64_s16
7550 #define vreinterpretq_s64_s32 __vreinterpretq_s64_s32
7551 #define vreinterpretq_s64_p8 __vreinterpretq_s64_p8
7552 #define vreinterpretq_s64_p16 __vreinterpretq_s64_p16
7553 #define vreinterpretq_s64_u8 __vreinterpretq_s64_u8
7554 #define vreinterpretq_s64_u16 __vreinterpretq_s64_u16
7555 #define vreinterpretq_s64_u32 __vreinterpretq_s64_u32
7556 #define vreinterpretq_s64_u64 __vreinterpretq_s64_u64
7557 #define vreinterpretq_p8_f32 __vreinterpretq_p8_f32
7558 #define vreinterpretq_p8_s8 __vreinterpretq_p8_s8
7559 #define vreinterpretq_p8_s16 __vreinterpretq_p8_s16
7560 #define vreinterpretq_p8_s32 __vreinterpretq_p8_s32
7561 #define vreinterpretq_p8_s64 __vreinterpretq_p8_s64
7562 #define vreinterpretq_p8_p16 __vreinterpretq_p8_p16
7563 #define vreinterpretq_p8_u8 __vreinterpretq_p8_u8
7564 #define vreinterpretq_p8_u16 __vreinterpretq_p8_u16
7565 #define vreinterpretq_p8_u32 __vreinterpretq_p8_u32
7566 #define vreinterpretq_p8_u64 __vreinterpretq_p8_u64
7567 #define vreinterpretq_p16_f32 __vreinterpretq_p16_f32
7568 #define vreinterpretq_p16_s8 __vreinterpretq_p16_s8
7569 #define vreinterpretq_p16_s16 __vreinterpretq_p16_s16
7570 #define vreinterpretq_p16_s32 __vreinterpretq_p16_s32
7571 #define vreinterpretq_p16_s64 __vreinterpretq_p16_s64
7572 #define vreinterpretq_p16_p8 __vreinterpretq_p16_p8
7573 #define vreinterpretq_p16_u8 __vreinterpretq_p16_u8
7574 #define vreinterpretq_p16_u16 __vreinterpretq_p16_u16
7575 #define vreinterpretq_p16_u32 __vreinterpretq_p16_u32
7576 #define vreinterpretq_p16_u64 __vreinterpretq_p16_u64
7577 #define vreinterpretq_u8_f32 __vreinterpretq_u8_f32
7578 #define vreinterpretq_u8_s8 __vreinterpretq_u8_s8
7579 #define vreinterpretq_u8_s16 __vreinterpretq_u8_s16
7580 #define vreinterpretq_u8_s32 __vreinterpretq_u8_s32
7581 #define vreinterpretq_u8_s64 __vreinterpretq_u8_s64
7582 #define vreinterpretq_u8_p8 __vreinterpretq_u8_p8
7583 #define vreinterpretq_u8_p16 __vreinterpretq_u8_p16
7584 #define vreinterpretq_u8_u16 __vreinterpretq_u8_u16
7585 #define vreinterpretq_u8_u32 __vreinterpretq_u8_u32
7586 #define vreinterpretq_u8_u64 __vreinterpretq_u8_u64
7587 #define vreinterpretq_u16_f32 __vreinterpretq_u16_f32
7588 #define vreinterpretq_u16_s8 __vreinterpretq_u16_s8
7589 #define vreinterpretq_u16_s16 __vreinterpretq_u16_s16
7590 #define vreinterpretq_u16_s32 __vreinterpretq_u16_s32
7591 #define vreinterpretq_u16_s64 __vreinterpretq_u16_s64
7592 #define vreinterpretq_u16_p8 __vreinterpretq_u16_p8
7593 #define vreinterpretq_u16_p16 __vreinterpretq_u16_p16
7594 #define vreinterpretq_u16_u8 __vreinterpretq_u16_u8
7595 #define vreinterpretq_u16_u32 __vreinterpretq_u16_u32
7596 #define vreinterpretq_u16_u64 __vreinterpretq_u16_u64
7597 #define vreinterpretq_u32_f32 __vreinterpretq_u32_f32
7598 #define vreinterpretq_u32_s8 __vreinterpretq_u32_s8
7599 #define vreinterpretq_u32_s16 __vreinterpretq_u32_s16
7600 #define vreinterpretq_u32_s32 __vreinterpretq_u32_s32
7601 #define vreinterpretq_u32_s64 __vreinterpretq_u32_s64
7602 #define vreinterpretq_u32_p8 __vreinterpretq_u32_p8
7603 #define vreinterpretq_u32_p16 __vreinterpretq_u32_p16
7604 #define vreinterpretq_u32_u8 __vreinterpretq_u32_u8
7605 #define vreinterpretq_u32_u16 __vreinterpretq_u32_u16
7606 #define vreinterpretq_u32_u64 __vreinterpretq_u32_u64
7607 #define vreinterpretq_u64_f32 __vreinterpretq_u64_f32
7608 #define vreinterpretq_u64_s8 __vreinterpretq_u64_s8
7609 #define vreinterpretq_u64_s16 __vreinterpretq_u64_s16
7610 #define vreinterpretq_u64_s32 __vreinterpretq_u64_s32
7611 #define vreinterpretq_u64_s64 __vreinterpretq_u64_s64
7612 #define vreinterpretq_u64_p8 __vreinterpretq_u64_p8
7613 #define vreinterpretq_u64_p16 __vreinterpretq_u64_p16
7614 #define vreinterpretq_u64_u8 __vreinterpretq_u64_u8
7615 #define vreinterpretq_u64_u16 __vreinterpretq_u64_u16
7616 #define vreinterpretq_u64_u32 __vreinterpretq_u64_u32
7617 
7618 #define vmul_n_s16 __vmul_n_s16
7619 #define vmul_n_s32 __vmul_n_s32
7620 #define vmul_n_u16 __vmul_n_u16
7621 #define vmul_n_u32 __vmul_n_u32
7622 #define vmulq_n_s16 __vmulq_n_s16
7623 #define vmulq_n_s32 __vmulq_n_s32
7624 #define vmulq_n_u16 __vmulq_n_u16
7625 #define vmulq_n_u32 __vmulq_n_u32
7626 #define vmull_n_s16 __vmull_n_s16
7627 #define vmull_n_s32 __vmull_n_s32
7628 #define vmull_n_u16 __vmull_n_u16
7629 #define vmull_n_u32 __vmull_n_u32
7630 #define vqdmulh_n_s16 __vqdmulh_n_s16
7631 #define vqdmulh_n_s32 __vqdmulh_n_s32
7632 #define vqdmulhq_n_s16 __vqdmulhq_n_s16
7633 #define vqdmulhq_n_s32 __vqdmulhq_n_s32
7634 #define vqdmull_n_s16 __vqdmull_n_s16
7635 #define vqdmull_n_s32 __vqdmull_n_s32
7636 #define vqrdmulh_n_s16 __vqrdmulh_n_s16
7637 #define vqrdmulh_n_s32 __vqrdmulh_n_s32
7638 #define vqrdmulhq_n_s16 __vqrdmulhq_n_s16
7639 #define vqrdmulhq_n_s32 __vqrdmulhq_n_s32
7640 
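The *_n multiply aliases above take a vector and a single scalar and multiply every lane by that scalar; the vmull_n forms widen each product, and the vqdmulh_n/vqrdmulh_n forms return the saturating doubling (optionally rounding) high half. A short sketch under the same assumptions, with illustrative helper names:

#include <arm_neon.h>

/* Per-lane v[i] * scale, truncated back to 16 bits. */
int16x8_t scale_lanes(int16x8_t v, int16_t scale)
{
    return vmulq_n_s16(v, scale);
}

/* Widening form: each 16-bit lane times the scalar produces a 32-bit lane,
   so the full product is kept. */
int32x4_t scale_lanes_widen(int16x4_t v, int16_t scale)
{
    return vmull_n_s16(v, scale);
}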
7641 #define vmla_n_s16 __vmla_n_s16
7642 #define vmla_n_s32 __vmla_n_s32
7643 #define vmla_n_u16 __vmla_n_u16
7644 #define vmla_n_u32 __vmla_n_u32
7645 #define vmlaq_n_s16 __vmlaq_n_s16
7646 #define vmlaq_n_s32 __vmlaq_n_s32
7647 #define vmlaq_n_u16 __vmlaq_n_u16
7648 #define vmlaq_n_u32 __vmlaq_n_u32
7649 #define vmlal_n_s16 __vmlal_n_s16
7650 #define vmlal_n_s32 __vmlal_n_s32
7651 #define vmlal_n_u16 __vmlal_n_u16
7652 #define vmlal_n_u32 __vmlal_n_u32
7653 #define vmls_n_s16 __vmls_n_s16
7654 #define vmls_n_s32 __vmls_n_s32
7655 #define vmls_n_u16 __vmls_n_u16
7656 #define vmls_n_u32 __vmls_n_u32
7657 #define vmlsq_n_s16 __vmlsq_n_s16
7658 #define vmlsq_n_s32 __vmlsq_n_s32
7659 #define vmlsq_n_u16 __vmlsq_n_u16
7660 #define vmlsq_n_u32 __vmlsq_n_u32
7661 #define vmlsl_n_s16 __vmlsl_n_s16
7662 #define vmlsl_n_s32 __vmlsl_n_s32
7663 #define vmlsl_n_u16 __vmlsl_n_u16
7664 #define vmlsl_n_u32 __vmlsl_n_u32
7665 #define vqdmlal_n_s16 __vqdmlal_n_s16
7666 #define vqdmlal_n_s32 __vqdmlal_n_s32
7667 #define vqdmlsl_n_s16 __vqdmlsl_n_s16
7668 #define vqdmlsl_n_s32 __vqdmlsl_n_s32
7669 
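The multiply-accumulate aliases fold the scalar multiply into an add or subtract: the vmla*_n forms compute acc + v * scalar per lane, the vmls*_n forms compute acc - v * scalar, and the *l_n forms accumulate into widened lanes. A hedged sketch (helper names are illustrative):

#include <arm_neon.h>

/* acc[i] + v[i] * scale for each of the eight 16-bit lanes. */
int16x8_t mac_lanes(int16x8_t acc, int16x8_t v, int16_t scale)
{
    return vmlaq_n_s16(acc, v, scale);
}

/* Widening multiply-subtract: acc[i] - v[i] * scale with 32-bit accumulators. */
int32x4_t msub_lanes_widen(int32x4_t acc, int16x4_t v, int16_t scale)
{
    return vmlsl_n_s16(acc, v, scale);
}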
7670 #define vdup_lane_s64 __vdup_lane_s64
7671 #define vdup_lane_u64 __vdup_lane_u64
7672 #define vdupq_lane_s64 __vdupq_lane_s64
7673 #define vdupq_lane_u64 __vdupq_lane_u64
7674 
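The 64-bit vdup_lane aliases broadcast one lane of a 64-bit vector; the q forms replicate it across both halves of a 128-bit vector. A minimal sketch (helper name and lane index are illustrative):

#include <arm_neon.h>

/* Replicate the single 64-bit lane of v into both lanes of a 128-bit vector. */
uint64x2_t broadcast_u64(uint64x1_t v)
{
    return vdupq_lane_u64(v, 0);
}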
7675 #endif // !defined(_ARM_ISO_COMPATIBLE_INTRINSIC_NAMES)
7676 
7677 #if defined (__cplusplus)
7678 }
7679 #endif /* defined (__cplusplus) */
7680 
7681 #endif