arm_neon.h
1 /***
2 * arm_neon.h - declarations/definitions for ARM NEON specific intrinsics
3 *
4 * Copyright (c) Microsoft Corporation. All rights reserved.
5 *
6 *Purpose:
7 * This include file contains the declarations for ARM NEON intrinsic functions
8 *
9 ****/
10 
11 #pragma once
12 
13 #include <stdint.h>
14 #include <sal.h>
15 
16 #if !defined (_M_ARM)
17 #error This header is specific to ARM targets
18 #endif /* !defined (_M_ARM) */
19 
20 
21 #if defined (__cplusplus)
22 extern "C" {
23 #endif /* defined (__cplusplus) */
24 
25 
27 //
28 #if !defined (_ADVSIMD_ALIGN)
29 #if defined (__midl)
30 #define _ADVSIMD_ALIGN(x)
31 #else /* defined (__midl) */
32 #define _ADVSIMD_ALIGN(x) __declspec(align(x))
33 #endif /* defined (__midl) */
34 #endif /* !defined (_ADVSIMD_ALIGN) */
35 
36 #ifndef DUMMYNEONSTRUCT
37 #define DUMMYNEONSTRUCT s
38 #endif /* DUMMYNEONSTRUCT */
39 
41 //
42 // ARM Advanced SIMD 64bit type
43 //
44 typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __n64
45 {
46  unsigned __int64 n64_u64[1];
47  unsigned __int32 n64_u32[2];
48  unsigned __int16 n64_u16[4];
49  unsigned __int8 n64_u8[8];
50  __int64 n64_i64[1];
51  __int32 n64_i32[2];
52  __int16 n64_i16[4];
53  __int8 n64_i8[8];
54  float n64_f32[2];
55 } __n64;
56 
57 
59 //
60 // ARM Advanced SIMD 128bit type
61 //
62 typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __n128
63 {
64  unsigned __int64 n128_u64[2];
65  unsigned __int32 n128_u32[4];
66  unsigned __int16 n128_u16[8];
67  unsigned __int8 n128_u8[16];
68  __int64 n128_i64[2];
69  __int32 n128_i32[4];
70  __int16 n128_i16[8];
71  __int8 n128_i8[16];
72  float n128_f32[4];
73 
74  struct
75  {
76  __n64 low64;
77  __n64 high64;
78  } DUMMYNEONSTRUCT;
79 
80 } __n128;
81 
82 typedef struct __n64x2
83 {
84  __n64 val[2];
85 } __n64x2;
86 
87 typedef struct __n64x3
88 {
89  __n64 val[3];
90 } __n64x3;
91 
92 typedef struct __n64x4
93 {
94  __n64 val[4];
95 } __n64x4;
96 
97 typedef struct __n128x2
98 {
99  __n128 val[2];
100 } __n128x2;
101 
102 typedef struct __n128x3
103 {
104  __n128 val[3];
105 } __n128x3;
106 
107 typedef struct __n128x4
108 {
109  __n128 val[4];
110 } __n128x4;
111 
113 //
114 typedef unsigned __int8 poly8_t;
115 typedef unsigned __int16 poly16_t;
116 
117 typedef float float32_t;
118 
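The two unions above are the storage for every vector type this header defines; the auto-generated typedefs further down (int8x8_t, uint8x8_t, and so on) are plain aliases for __n64 or __n128, so lanes of an intrinsic result can be inspected directly through the union members. A minimal illustrative sketch, not part of the header (the q.s access assumes the DUMMYNEONSTRUCT member of __n128 shown above):

    int16x4_t v = vdup_n_s16(7);        /* broadcast 7 into all four 16-bit lanes   */
    __int16 lane0 = v.n64_i16[0];       /* read lane 0 back through the union       */

    __n128 q = vdupq_n_u8(0xff);        /* 128-bit (Q register) result              */
    __n64 lo = q.s.low64;               /* low half; DUMMYNEONSTRUCT expands to 's' */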
119 
121 //
122 __inline _Post_equal_to_(p) __n64 *__int8ToN64(_In_ int8_t *p) { return (__n64 *)p; }
123 __inline _Post_equal_to_(p) __n64 *__int16ToN64(_In_ int16_t *p) { return (__n64 *)p; }
124 __inline _Post_equal_to_(p) __n64 *__int32ToN64(_In_ int32_t *p) { return (__n64 *)p; }
125 __inline _Post_equal_to_(p) __n64 *__int64ToN64(_In_ int64_t *p) { return (__n64 *)p; }
126 __inline _Post_equal_to_(p) __n64 *__uint8ToN64(_In_ uint8_t *p) { return (__n64 *)p; }
127 __inline _Post_equal_to_(p) __n64 *__uint16ToN64(_In_ uint16_t *p) { return (__n64 *)p; }
128 __inline _Post_equal_to_(p) __n64 *__uint32ToN64(_In_ uint32_t *p) { return (__n64 *)p; }
129 __inline _Post_equal_to_(p) __n64 *__uint64ToN64(_In_ uint64_t *p) { return (__n64 *)p; }
130 __inline _Post_equal_to_(p) __n64 *__poly8ToN64(_In_ poly8_t *p) { return (__n64 *)p; }
131 __inline _Post_equal_to_(p) __n64 *__poly16ToN64(_In_ poly16_t *p) { return (__n64 *)p; }
132 __inline _Post_equal_to_(p) __n64 *__float32ToN64(_In_ float32_t *p) { return (__n64 *)p; }
133 
134 __inline _Post_equal_to_(p) const __n64 *__int8ToN64_c(_In_ const int8_t *p) { return (const __n64 *)p; }
135 __inline _Post_equal_to_(p) const __n64 *__int16ToN64_c(_In_ const int16_t *p) { return (const __n64 *)p; }
136 __inline _Post_equal_to_(p) const __n64 *__int32ToN64_c(_In_ const int32_t *p) { return (const __n64 *)p; }
137 __inline _Post_equal_to_(p) const __n64 *__int64ToN64_c(_In_ const int64_t *p) { return (const __n64 *)p; }
138 __inline _Post_equal_to_(p) const __n64 *__uint8ToN64_c(_In_ const uint8_t *p) { return (const __n64 *)p; }
139 __inline _Post_equal_to_(p) const __n64 *__uint16ToN64_c(_In_ const uint16_t *p) { return (const __n64 *)p; }
140 __inline _Post_equal_to_(p) const __n64 *__uint32ToN64_c(_In_ const uint32_t *p) { return (const __n64 *)p; }
141 __inline _Post_equal_to_(p) const __n64 *__uint64ToN64_c(_In_ const uint64_t *p) { return (const __n64 *)p; }
142 __inline _Post_equal_to_(p) const __n64 *__poly8ToN64_c(_In_ const poly8_t *p) { return (const __n64 *)p; }
143 __inline _Post_equal_to_(p) const __n64 *__poly16ToN64_c(_In_ const poly16_t *p) { return (const __n64 *)p; }
144 __inline _Post_equal_to_(p) const __n64 *__float32ToN64_c(_In_ const float32_t *p) { return (const __n64 *)p; }
145 
146 __inline int32_t __int8ToInt32(int8_t i) { return (int32_t)i; }
147 __inline int32_t __int16ToInt32(int16_t i) { return (int32_t)i; }
148 __inline int32_t __int32ToInt32(int32_t i) { return (int32_t)i; }
149 __inline int64_t __int64ToInt64(int64_t i) { return (int64_t)i; }
150 
151 __inline int32_t __uint8ToInt32(uint8_t i) { return (int32_t)i; }
152 __inline int32_t __uint16ToInt32(uint16_t i) { return (int32_t)i; }
153 __inline int32_t __uint32ToInt32(uint32_t i) { return (int32_t)i; }
154 __inline int64_t __uint64ToInt64(uint64_t i) { return (int64_t)i; }
155 
156 __inline int32_t __poly8ToInt32(poly8_t i) { return (int32_t)i; }
157 __inline int32_t __poly16ToInt32(poly16_t i) { return (int32_t)i; }
158 
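These converters compile to nothing; they exist so the intrinsic macros can pass element-typed pointers and scalars to the generic __n64* / int prototypes below while the compiler still type-checks the caller's argument (the vdup_n_* macros later in this file use the scalar forms; the pointer forms are presumably used the same way by the load/store macros). A sketch of what they do:

    uint32_t buf[2] = { 1u, 2u };
    const __n64 *pv = __uint32ToN64_c(buf);     /* reinterpret a typed pointer as const __n64* */
    int32_t w = __uint16ToInt32(40000);         /* widen a scalar for the int-taking prototypes */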
160 //
161 #define vshll_n_s8(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) <= 8, "invalid shift amount"), ((shift_amount) == 8) ? __internal_vshll_n_t2_s8((Dm)) : __internal_vshll_n_t1_s8((Dm), (shift_amount)) )
162 #define vshll_n_s16(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) <= 16, "invalid shift amount"), ((shift_amount) == 16) ? __internal_vshll_n_t2_s16((Dm)) : __internal_vshll_n_t1_s16((Dm), (shift_amount)) )
163 #define vshll_n_s32(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) <= 32, "invalid shift amount"), ((shift_amount) == 32) ? __internal_vshll_n_t2_s32((Dm)) : __internal_vshll_n_t1_s32((Dm), (shift_amount)) )
164 #define vshll_n_u8(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) <= 8, "invalid shift amount"), ((shift_amount) == 8) ? __internal_vshll_n_t2_u8((Dm)) : __internal_vshll_n_t1_u8((Dm), (shift_amount)) )
165 #define vshll_n_u16(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) <= 16, "invalid shift amount"), ((shift_amount) == 16) ? __internal_vshll_n_t2_u16((Dm)) : __internal_vshll_n_t1_u16((Dm), (shift_amount)) )
166 #define vshll_n_u32(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) <= 32, "invalid shift amount"), ((shift_amount) == 32) ? __internal_vshll_n_t2_u32((Dm)) : __internal_vshll_n_t1_u32((Dm), (shift_amount)) )
167 
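shift_amount must be a compile-time constant: the __static_assert checks its range, and each macro selects between two internal helpers because VSHLL with a shift equal to the element width is a separate instruction encoding. A usage sketch (illustrative only; the result is a 128-bit vector of widened elements):

    int8x8_t d = vdup_n_s8(3);
    __n128 q = vshll_n_s8(d, 2);        /* each byte widened to 16 bits and shifted left by 2 */
    __n128 r = vshll_n_s8(d, 8);        /* shift == element width: the t2 form is selected    */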
168 
170 //
171 // { +++ auto-generated code begins (explicit types)
172 
177 typedef __n64 int8x8_t;
181 typedef __n64 int16x4_t;
185 typedef __n64 int32x2_t;
189 typedef __n64 int64x1_t;
193 typedef __n64 poly8x8_t;
201 typedef __n64 uint8x8_t;
261 
262 // } +++ auto-generated code ends (explicit types)
263 
264 
266 //
267 // { +++ auto-generated code begins (prototypes)
268 
269 __n64x2 __neon_DdDm_acc2(unsigned int _Enc, __n64, __n64);
270 __n64x2 __neon_Dx2Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
271 __n64x2 __neon_Dx2Adr_acc(unsigned int _Enc, __n64x2, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
272 __n64x3 __neon_Dx3Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
273 __n64x3 __neon_Dx3Adr_acc(unsigned int _Enc, __n64x3, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
274 __n64x4 __neon_Dx4Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
275 __n64x4 __neon_Dx4Adr_acc(unsigned int _Enc, __n64x4, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
276 __n64 __neon_DdDm(unsigned int _Enc, __n64);
277 __n64 __neon_DdDx2Dm(unsigned int _Enc, __n64x2, __n64);
278 __n64 __neon_DdDx2Dm_acc(unsigned int _Enc, __n64, __n64x2, __n64);
279 __n64 __neon_DdDx3Dm(unsigned int _Enc, __n64x3, __n64);
280 __n64 __neon_DdDx3Dm_acc(unsigned int _Enc, __n64, __n64x3, __n64);
281 __n64 __neon_DdDx4Dm(unsigned int _Enc, __n64x4, __n64);
282 __n64 __neon_DdDx4Dm_acc(unsigned int _Enc, __n64, __n64x4, __n64);
283 __n64 __neon_DdDm_acc(unsigned int _Enc, __n64, __n64);
284 __n64 __neon_DdDnDm(unsigned int _Enc, __n64, __n64);
285 __n64 __neon_DdDnDm_acc(unsigned int _Enc, __n64, __n64, __n64);
286 __n64 __neon_DdDnDmx(unsigned int _Enc, __n64, __n64);
287 __n64 __neon_DdDnDmx_acc(unsigned int _Enc, __n64, __n64, __n64);
288 __n64 __neon_DdDnFt(unsigned int, __n64, float);
289 __n64 __neon_DdDnFt_acc(unsigned int, __n64, __n64, float);
290 __n64 __neon_DdFt(unsigned int _Enc, float);
291 __n64 __neon_DdFt_acc(unsigned int _Enc, __n64, float);
292 __n64 __neon_D1Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
293 __n64 __neon_D1Adr_acc(unsigned int _Enc, __n64, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
294 __n64 __neon_DdQm(unsigned int _Enc, __n128);
295 __n64 __neon_DdQm_high(unsigned int _Enc, __n128);
296 __n64 __neon_DdQm_low(unsigned int _Enc, __n128);
297 __n64 __neon_DdQnQm(unsigned int _Enc, __n128, __n128);
298 __n64 __neon_DdRt(unsigned int _Enc, int);
299 __n64 __neon_DdRtRt2(unsigned int _Enc, __int64);
300 __n64 __neon_DdRtRt2_acc(unsigned int _Enc, __n64, __int64);
301 __n64 __neon_DdRt_acc(unsigned int _Enc, __n64, int);
302 float __neon_FtDn(unsigned int _Enc, __n64);
303 float __neon_FtQn(unsigned int _Enc, __n128);
304 __n128x2 __neon_Qx2Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
305 __n128x2 __neon_Qx2Adr_acc(unsigned int _Enc, __n128x2, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
306 __n128x2 __neon_QdQm_acc2(unsigned int _Enc, __n128, __n128);
307 __n128x2 __neon_QdQm_acc3(unsigned int _Enc, __n128, __n128);
308 __n128x3 __neon_Qx3Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
309 __n128x3 __neon_Qx3Adr_acc(unsigned int _Enc, __n128x3, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
310 __n128x4 __neon_Qx4Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
311 __n128x4 __neon_Qx4Adr_acc(unsigned int _Enc, __n128x4, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
312 __n128 __neon_QdDm(unsigned int _Enc, __n64);
313 __n128 __neon_QdDnDm(unsigned int _Enc, __n64, __n64);
314 __n128 __neon_QdDnDm_acc(unsigned int _Enc, __n128, __n64, __n64);
315 __n128 __neon_QdDnDm_merge(unsigned int _Enc, __n64, __n64);
316 __n128 __neon_QdDnDmx(unsigned int _Enc, __n64, __n64);
317 __n128 __neon_QdDnDmx_acc(unsigned int _Enc, __n128, __n64, __n64);
318 __n128 __neon_QdFt(unsigned int _Enc, float);
319 __n128 __neon_QdFt_acc(unsigned int _Enc, __n128, float);
320 __n128 __neon_Q1Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
321 __n128 __neon_Q1Adr_acc(unsigned int _Enc, __n128, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64*);
322 __n128 __neon_QdQm(unsigned int _Enc, __n128);
323 __n128 __neon_QdQm_acc(unsigned int _Enc, __n128, __n128);
324 __n128 __neon_QdQnDm(unsigned int _Enc, __n128, __n64);
325 __n128 __neon_QdQnDmx(unsigned int _Enc, __n128, __n64);
326 __n128 __neon_QdQnDmx_acc(unsigned int _Enc, __n128, __n128, __n64);
327 __n128 __neon_QdQnFt(unsigned int, __n128, float);
328 __n128 __neon_QdQnFt_acc(unsigned int, __n128, __n128, float);
329 __n128 __neon_QdQnQm(unsigned int _Enc, __n128, __n128);
330 __n128 __neon_QdQnQm_acc(unsigned int _Enc, __n128, __n128, __n128);
331 __n128 __neon_QdRt(unsigned int _Enc, int);
332 __n128 __neon_QdRtRt2_acc(unsigned int _Enc, __n128, __int64);
333 __n128 __neon_QdRtRt2_dup(unsigned int _Enc, __int64);
334 __n128 __neon_QdRt_acc(unsigned int _Enc, __n128, int);
335 __int64 __neon_RtRt2Dm(unsigned int _Enc, __n64);
336 __int64 __neon_RtRt2Qm(unsigned int _Enc, __n128);
337 int __neon_RtDn(unsigned int _Enc, __n64);
338 int __neon_RtQn(unsigned int _Enc, __n128);
339 void __neon_AdrD1(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n64);
340 void __neon_AdrDx2(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n64x2);
341 void __neon_AdrDx2x(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n64x2);
342 void __neon_AdrDx3(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n64x3);
343 void __neon_AdrDx3x(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n64x3);
344 void __neon_AdrDx4(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n64x4);
345 void __neon_AdrDx4x(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n64x4);
346 void __neon_AdrQ1(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n128);
347 void __neon_AdrQx2(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n128x2);
348 void __neon_AdrQx2x(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n128x2);
349 void __neon_AdrQx3(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n128x3);
350 void __neon_AdrQx3x(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n128x3);
351 void __neon_AdrQx4(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n128x4);
352 void __neon_AdrQx4x(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64*, __n128x4);
353 
354 // } +++ auto-generated code ends (prototypes)
355 
356 
357 #if defined (__cplusplus)
358 }
359 #endif /* defined (__cplusplus) */
360 
361 
363 //
364 // VLDx/VSTx alignment specifications
365 //
366 
367 
368 #define _NEON_ALIGN16(a) \
369  ( \
370  ((a) == 8) ? 0 : \
371  ((a) == 16) ? 1 : \
372  -1)
373 
374 #define _NEON_ALIGN32(a) \
375  ( \
376  ((a) == 8) ? 0 : \
377  ((a) == 32) ? 1 : \
378  -1)
379 
380 #define _NEON_ALIGN64(a) \
381  ( \
382  ((a) == 8) ? 0 : \
383  ((a) == 64) ? 1 : \
384  -1)
385 
386 #define _NEON_ALIGN64_128(a) \
387  ( \
388  ((a) == 8) ? 0 : \
389  ((a) == 64) ? 1 : \
390  ((a) == 128) ? 2 : \
391  -1)
392 
393 
394 #define _NEON_ALIGN64_128_256(a) \
395  ( \
396  ((a) == 8) ? 0 : \
397  ((a) == 64) ? 1 : \
398  ((a) == 128) ? 2 : \
399  ((a) == 256) ? 3 : \
400  -1)
401 
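These macros map an alignment hint, apparently expressed in bits (8 meaning no claim beyond element alignment), onto the small align field carried by the VLDn/VSTn encodings; a value the instruction cannot express maps to -1, presumably so the consuming macro can reject it at compile time. Example evaluations:

    _NEON_ALIGN64_128(128)   /* -> 2 */
    _NEON_ALIGN64_128(64)    /* -> 1 */
    _NEON_ALIGN64_128(8)     /* -> 0, no extra alignment claimed */
    _NEON_ALIGN64_128(32)    /* -> -1, not encodable */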
402 
404 //
405 // { +++ auto-generated code begins (encoding macros)
406 
407 #define _NENC_0(x) ((x) & 0x1)
408 #define _NENC_11_8(x) (((x) << 8) & 0xf00)
409 #define _NENC_12(x) (((x) << 12) & 0x1000)
410 #define _NENC_16(x) (((x) << 16) & 0x10000)
411 #define _NENC_18_16(x) (((x) << 16) & 0x70000)
412 #define _NENC_19(x) (((x) << 19) & 0x80000)
413 #define _NENC_19_16(x) (((x) << 16) & 0xf0000)
414 #define _NENC_19_17(x) (((x) << 17) & 0xe0000)
415 #define _NENC_19_18(x) (((x) << 18) & 0xc0000)
416 #define _NENC_20_16(x) (((x) << 16) & 0x1f0000)
417 #define _NENC_21(x) (((x) << 21) & 0x200000)
418 #define _NENC_21_16(x) (((x) << 16) & 0x3f0000)
419 #define _NENC_21x6(x) (((x) << 6) & 0x40 | ((x) << 20) & 0x200000)
420 #define _NENC_21x6_5(x) (((x) << 5) & 0x60 | ((x) << 19) & 0x200000)
421 #define _NENC_4(x) (((x) << 4) & 0x10)
422 #define _NENC_5(x) (((x) << 5) & 0x20)
423 #define _NENC_5_4(x) (((x) << 4) & 0x30)
424 #define _NENC_5x3(x) (((x) << 3) & 0x8 | ((x) << 4) & 0x20)
425 #define _NENC_7(x) (((x) << 7) & 0x80)
426 #define _NENC_7_5(x) (((x) << 5) & 0xe0)
427 #define _NENC_7_6(x) (((x) << 6) & 0xc0)
428 
429 // } +++ auto-generated code ends (encoding macros)
430 
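Each _NENC_<hi>_<lo>(x) macro shifts a field value into bits <hi>..<lo> of the 32-bit instruction word and masks it; the intrinsic macros below OR these fields into a fixed base opcode. For example, the VDUP (scalar) group further down folds the lane index into bits 19..18 of its base encoding:

    /* vdup_lane_s16(Dm, 2) expands, apart from the static assert, to: */
    __neon_DdDm( 0xf3b20c00 | _NENC_19_18(2), (Dm) )
    /* where _NENC_19_18(2) == ((2) << 18) & 0xc0000 == 0x80000 */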
431 
433 //
434 // { +++ auto-generated code begins (Neon macros)
435 
436 // AES
437 #define aesd_p8(Qm) ( __neon_QdQm( 0xf3b00340, (Qm)) )
438 #define aesd_s8(Qm) ( __neon_QdQm( 0xf3b00340, (Qm)) )
439 #define aesd_u8(Qm) ( __neon_QdQm( 0xf3b00340, (Qm)) )
440 #define aese_p8(Qm) ( __neon_QdQm( 0xf3b00300, (Qm)) )
441 #define aese_s8(Qm) ( __neon_QdQm( 0xf3b00300, (Qm)) )
442 #define aese_u8(Qm) ( __neon_QdQm( 0xf3b00300, (Qm)) )
443 #define aesimc_p8(Qm) ( __neon_QdQm( 0xf3b003c0, (Qm)) )
444 #define aesimc_s8(Qm) ( __neon_QdQm( 0xf3b003c0, (Qm)) )
445 #define aesimc_u8(Qm) ( __neon_QdQm( 0xf3b003c0, (Qm)) )
446 #define aesmc_p8(Qm) ( __neon_QdQm( 0xf3b00380, (Qm)) )
447 #define aesmc_s8(Qm) ( __neon_QdQm( 0xf3b00380, (Qm)) )
448 #define aesmc_u8(Qm) ( __neon_QdQm( 0xf3b00380, (Qm)) )
449 
450 // SHA (2-operand)
451 #define sha1h_f32(Qm) ( __neon_QdQm( 0xf3b902c0, (Qm)) )
452 #define sha1h_s32(Qm) ( __neon_QdQm( 0xf3b902c0, (Qm)) )
453 #define sha1h_u32(Qm) ( __neon_QdQm( 0xf3b902c0, (Qm)) )
454 #define sha1su1_f32(Qm) ( __neon_QdQm( 0xf3ba0380, (Qm)) )
455 #define sha1su1_s32(Qm) ( __neon_QdQm( 0xf3ba0380, (Qm)) )
456 #define sha1su1_u32(Qm) ( __neon_QdQm( 0xf3ba0380, (Qm)) )
457 #define sha256su0_f32(Qm) ( __neon_QdQm( 0xf3ba03c0, (Qm)) )
458 #define sha256su0_s32(Qm) ( __neon_QdQm( 0xf3ba03c0, (Qm)) )
459 #define sha256su0_u32(Qm) ( __neon_QdQm( 0xf3ba03c0, (Qm)) )
460 
461 // SHA (3-operand)
462 #define sha1c_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2000c40, (Qn), (Qm)) )
463 #define sha1c_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2000c40, (Qn), (Qm)) )
464 #define sha1c_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2000c40, (Qn), (Qm)) )
465 #define sha1m_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2200c40, (Qn), (Qm)) )
466 #define sha1m_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200c40, (Qn), (Qm)) )
467 #define sha1m_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2200c40, (Qn), (Qm)) )
468 #define sha1p_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2100c40, (Qn), (Qm)) )
469 #define sha1p_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2100c40, (Qn), (Qm)) )
470 #define sha1p_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2100c40, (Qn), (Qm)) )
471 #define sha1su0_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2300c40, (Qn), (Qm)) )
472 #define sha1su0_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2300c40, (Qn), (Qm)) )
473 #define sha1su0_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2300c40, (Qn), (Qm)) )
474 #define sha256h_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3000c40, (Qn), (Qm)) )
475 #define sha256h_s32(Qn, Qm) ( __neon_QdQnQm( 0xf3000c40, (Qn), (Qm)) )
476 #define sha256h_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3000c40, (Qn), (Qm)) )
477 #define sha256h2_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3100c40, (Qn), (Qm)) )
478 #define sha256h2_s32(Qn, Qm) ( __neon_QdQnQm( 0xf3100c40, (Qn), (Qm)) )
479 #define sha256h2_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3100c40, (Qn), (Qm)) )
480 #define sha256su1_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3200c40, (Qn), (Qm)) )
481 #define sha256su1_s32(Qn, Qm) ( __neon_QdQnQm( 0xf3200c40, (Qn), (Qm)) )
482 #define sha256su1_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200c40, (Qn), (Qm)) )
483 
484 // VABA, VABAL
485 #define vaba_s16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2100710, (Dd), (Dn), (Dm)) )
486 #define vaba_s32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2200710, (Dd), (Dn), (Dm)) )
487 #define vaba_s8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2000710, (Dd), (Dn), (Dm)) )
488 #define vaba_u16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100710, (Dd), (Dn), (Dm)) )
489 #define vaba_u32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200710, (Dd), (Dn), (Dm)) )
490 #define vaba_u8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3000710, (Dd), (Dn), (Dm)) )
491 #define vabal_s16(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2900500, (Qd), (Dn), (Dm)) )
492 #define vabal_s32(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2a00500, (Qd), (Dn), (Dm)) )
493 #define vabal_s8(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2800500, (Qd), (Dn), (Dm)) )
494 #define vabal_u16(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf3900500, (Qd), (Dn), (Dm)) )
495 #define vabal_u32(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf3a00500, (Qd), (Dn), (Dm)) )
496 #define vabal_u8(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf3800500, (Qd), (Dn), (Dm)) )
497 #define vabaq_s16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2100750, (Qd), (Qn), (Qm)) )
498 #define vabaq_s32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2200750, (Qd), (Qn), (Qm)) )
499 #define vabaq_s8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2000750, (Qd), (Qn), (Qm)) )
500 #define vabaq_u16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100750, (Qd), (Qn), (Qm)) )
501 #define vabaq_u32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200750, (Qd), (Qn), (Qm)) )
502 #define vabaq_u8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3000750, (Qd), (Qn), (Qm)) )
503 
504 // VABD (floating point)
505 #define vabd_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3200d00, (Dn), (Dm)) )
506 #define vabdq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3200d40, (Qn), (Qm)) )
507 
508 // VABD[L] (integer)
509 #define vabd_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100700, (Dn), (Dm)) )
510 #define vabd_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200700, (Dn), (Dm)) )
511 #define vabd_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000700, (Dn), (Dm)) )
512 #define vabd_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100700, (Dn), (Dm)) )
513 #define vabd_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200700, (Dn), (Dm)) )
514 #define vabd_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000700, (Dn), (Dm)) )
515 #define vabdl_s16(Dn, Dm) ( __neon_QdDnDm( 0xf2900700, (Dn), (Dm)) )
516 #define vabdl_s32(Dn, Dm) ( __neon_QdDnDm( 0xf2a00700, (Dn), (Dm)) )
517 #define vabdl_s8(Dn, Dm) ( __neon_QdDnDm( 0xf2800700, (Dn), (Dm)) )
518 #define vabdl_u16(Dn, Dm) ( __neon_QdDnDm( 0xf3900700, (Dn), (Dm)) )
519 #define vabdl_u32(Dn, Dm) ( __neon_QdDnDm( 0xf3a00700, (Dn), (Dm)) )
520 #define vabdl_u8(Dn, Dm) ( __neon_QdDnDm( 0xf3800700, (Dn), (Dm)) )
521 #define vabdq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100740, (Qn), (Qm)) )
522 #define vabdq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200740, (Qn), (Qm)) )
523 #define vabdq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000740, (Qn), (Qm)) )
524 #define vabdq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100740, (Qn), (Qm)) )
525 #define vabdq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200740, (Qn), (Qm)) )
526 #define vabdq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000740, (Qn), (Qm)) )
527 
528 // VABS, VNEG
529 #define vabs_f32(Dm) ( __neon_DdDm( 0xf3b90700, (Dm)) )
530 #define vabs_s16(Dm) ( __neon_DdDm( 0xf3b50300, (Dm)) )
531 #define vabs_s32(Dm) ( __neon_DdDm( 0xf3b90300, (Dm)) )
532 #define vabs_s8(Dm) ( __neon_DdDm( 0xf3b10300, (Dm)) )
533 #define vneg_f32(Dm) ( __neon_DdDm( 0xf3b90780, (Dm)) )
534 #define vneg_s16(Dm) ( __neon_DdDm( 0xf3b50380, (Dm)) )
535 #define vneg_s32(Dm) ( __neon_DdDm( 0xf3b90380, (Dm)) )
536 #define vneg_s8(Dm) ( __neon_DdDm( 0xf3b10380, (Dm)) )
537 #define vabsq_f32(Qm) ( __neon_QdQm( 0xf3b90740, (Qm)) )
538 #define vabsq_s16(Qm) ( __neon_QdQm( 0xf3b50340, (Qm)) )
539 #define vabsq_s32(Qm) ( __neon_QdQm( 0xf3b90340, (Qm)) )
540 #define vabsq_s8(Qm) ( __neon_QdQm( 0xf3b10340, (Qm)) )
541 #define vnegq_f32(Qm) ( __neon_QdQm( 0xf3b907c0, (Qm)) )
542 #define vnegq_s16(Qm) ( __neon_QdQm( 0xf3b503c0, (Qm)) )
543 #define vnegq_s32(Qm) ( __neon_QdQm( 0xf3b903c0, (Qm)) )
544 #define vnegq_s8(Qm) ( __neon_QdQm( 0xf3b103c0, (Qm)) )
545 
546 // VACGE, VACGT, VACLE, VACLT
547 #define vacge_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3000e10, (Dn), (Dm)) )
548 #define vacgt_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3200e10, (Dn), (Dm)) )
549 #define vacle_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3000e10, (Dm), (Dn)) )
550 #define vaclt_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3200e10, (Dm), (Dn)) )
551 #define vacgeq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3000e50, (Qn), (Qm)) )
552 #define vacgtq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3200e50, (Qn), (Qm)) )
553 #define vacleq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3000e50, (Qm), (Qn)) )
554 #define vacltq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3200e50, (Qm), (Qn)) )
555 
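Note that the ...le/...lt forms have no encodings of their own: as the argument order in the macros above shows, they reuse the greater-than encodings with the operands swapped (the same pattern recurs in the VCGE/VCLE and VCGT/VCLT register groups below). For instance:

    vacle_f32(a, b)     /* -> __neon_DdDnDm(0xf3000e10, (b), (a)) */
    vacge_f32(b, a)     /* -> __neon_DdDnDm(0xf3000e10, (b), (a)), the same instruction */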
556 // VADD
557 #define vadd_f32(Dn, Dm) ( __neon_DdDnDm( 0xf2000d00, (Dn), (Dm)) )
558 #define vadd_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100800, (Dn), (Dm)) )
559 #define vadd_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200800, (Dn), (Dm)) )
560 #define vadd_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2300800, (Dn), (Dm)) )
561 #define vadd_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000800, (Dn), (Dm)) )
562 #define vadd_u16(Dn, Dm) ( __neon_DdDnDm( 0xf2100800, (Dn), (Dm)) )
563 #define vadd_u32(Dn, Dm) ( __neon_DdDnDm( 0xf2200800, (Dn), (Dm)) )
564 #define vadd_u64(Dn, Dm) ( __neon_DdDnDm( 0xf2300800, (Dn), (Dm)) )
565 #define vadd_u8(Dn, Dm) ( __neon_DdDnDm( 0xf2000800, (Dn), (Dm)) )
566 #define vaddq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2000d40, (Qn), (Qm)) )
567 #define vaddq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100840, (Qn), (Qm)) )
568 #define vaddq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200840, (Qn), (Qm)) )
569 #define vaddq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2300840, (Qn), (Qm)) )
570 #define vaddq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000840, (Qn), (Qm)) )
571 #define vaddq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf2100840, (Qn), (Qm)) )
572 #define vaddq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2200840, (Qn), (Qm)) )
573 #define vaddq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf2300840, (Qn), (Qm)) )
574 #define vaddq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf2000840, (Qn), (Qm)) )
575 
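A short usage sketch (illustrative only): the plain VADD forms perform modular, element-wise addition, so lanes wrap on overflow.

    uint8x8_t a = vdup_n_u8(200);
    uint8x8_t b = vdup_n_u8(100);
    uint8x8_t c = vadd_u8(a, b);        /* every lane holds 44, i.e. (200 + 100) & 0xff */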
576 // VADDHN, VRADDHN
577 #define vaddhn_s16(Qn, Qm) ( __neon_DdQnQm( 0xf2800400, (Qn), (Qm)) )
578 #define vaddhn_s32(Qn, Qm) ( __neon_DdQnQm( 0xf2900400, (Qn), (Qm)) )
579 #define vaddhn_s64(Qn, Qm) ( __neon_DdQnQm( 0xf2a00400, (Qn), (Qm)) )
580 #define vaddhn_u16(Qn, Qm) ( __neon_DdQnQm( 0xf2800400, (Qn), (Qm)) )
581 #define vaddhn_u32(Qn, Qm) ( __neon_DdQnQm( 0xf2900400, (Qn), (Qm)) )
582 #define vaddhn_u64(Qn, Qm) ( __neon_DdQnQm( 0xf2a00400, (Qn), (Qm)) )
583 #define vraddhn_s16(Qn, Qm) ( __neon_DdQnQm( 0xf3800400, (Qn), (Qm)) )
584 #define vraddhn_s32(Qn, Qm) ( __neon_DdQnQm( 0xf3900400, (Qn), (Qm)) )
585 #define vraddhn_s64(Qn, Qm) ( __neon_DdQnQm( 0xf3a00400, (Qn), (Qm)) )
586 #define vraddhn_u16(Qn, Qm) ( __neon_DdQnQm( 0xf3800400, (Qn), (Qm)) )
587 #define vraddhn_u32(Qn, Qm) ( __neon_DdQnQm( 0xf3900400, (Qn), (Qm)) )
588 #define vraddhn_u64(Qn, Qm) ( __neon_DdQnQm( 0xf3a00400, (Qn), (Qm)) )
589 
590 // VADDL, VADDW
591 #define vaddl_s16(Dn, Dm) ( __neon_QdDnDm( 0xf2900000, (Dn), (Dm)) )
592 #define vaddl_s32(Dn, Dm) ( __neon_QdDnDm( 0xf2a00000, (Dn), (Dm)) )
593 #define vaddl_s8(Dn, Dm) ( __neon_QdDnDm( 0xf2800000, (Dn), (Dm)) )
594 #define vaddl_u16(Dn, Dm) ( __neon_QdDnDm( 0xf3900000, (Dn), (Dm)) )
595 #define vaddl_u32(Dn, Dm) ( __neon_QdDnDm( 0xf3a00000, (Dn), (Dm)) )
596 #define vaddl_u8(Dn, Dm) ( __neon_QdDnDm( 0xf3800000, (Dn), (Dm)) )
597 #define vaddw_s16(Qn, Dm) ( __neon_QdQnDm( 0xf2900100, (Qn), (Dm)) )
598 #define vaddw_s32(Qn, Dm) ( __neon_QdQnDm( 0xf2a00100, (Qn), (Dm)) )
599 #define vaddw_s8(Qn, Dm) ( __neon_QdQnDm( 0xf2800100, (Qn), (Dm)) )
600 #define vaddw_u16(Qn, Dm) ( __neon_QdQnDm( 0xf3900100, (Qn), (Dm)) )
601 #define vaddw_u32(Qn, Dm) ( __neon_QdQnDm( 0xf3a00100, (Qn), (Dm)) )
602 #define vaddw_u8(Qn, Dm) ( __neon_QdQnDm( 0xf3800100, (Qn), (Dm)) )
603 
604 // VAND, VORR
605 #define vand_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2000110, (Dn), (Dm)) )
606 #define vand_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2000110, (Dn), (Dm)) )
607 #define vand_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2000110, (Dn), (Dm)) )
608 #define vand_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000110, (Dn), (Dm)) )
609 #define vand_u16(Dn, Dm) ( __neon_DdDnDm( 0xf2000110, (Dn), (Dm)) )
610 #define vand_u32(Dn, Dm) ( __neon_DdDnDm( 0xf2000110, (Dn), (Dm)) )
611 #define vand_u64(Dn, Dm) ( __neon_DdDnDm( 0xf2000110, (Dn), (Dm)) )
612 #define vand_u8(Dn, Dm) ( __neon_DdDnDm( 0xf2000110, (Dn), (Dm)) )
613 #define vorr_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2200110, (Dn), (Dm)) )
614 #define vorr_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200110, (Dn), (Dm)) )
615 #define vorr_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2200110, (Dn), (Dm)) )
616 #define vorr_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2200110, (Dn), (Dm)) )
617 #define vorr_u16(Dn, Dm) ( __neon_DdDnDm( 0xf2200110, (Dn), (Dm)) )
618 #define vorr_u32(Dn, Dm) ( __neon_DdDnDm( 0xf2200110, (Dn), (Dm)) )
619 #define vorr_u64(Dn, Dm) ( __neon_DdDnDm( 0xf2200110, (Dn), (Dm)) )
620 #define vorr_u8(Dn, Dm) ( __neon_DdDnDm( 0xf2200110, (Dn), (Dm)) )
621 #define vandq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2000150, (Qn), (Qm)) )
622 #define vandq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2000150, (Qn), (Qm)) )
623 #define vandq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2000150, (Qn), (Qm)) )
624 #define vandq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000150, (Qn), (Qm)) )
625 #define vandq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf2000150, (Qn), (Qm)) )
626 #define vandq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2000150, (Qn), (Qm)) )
627 #define vandq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf2000150, (Qn), (Qm)) )
628 #define vandq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf2000150, (Qn), (Qm)) )
629 #define vorrq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2200150, (Qn), (Qm)) )
630 #define vorrq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200150, (Qn), (Qm)) )
631 #define vorrq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2200150, (Qn), (Qm)) )
632 #define vorrq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2200150, (Qn), (Qm)) )
633 #define vorrq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf2200150, (Qn), (Qm)) )
634 #define vorrq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2200150, (Qn), (Qm)) )
635 #define vorrq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf2200150, (Qn), (Qm)) )
636 #define vorrq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf2200150, (Qn), (Qm)) )
637 
638 // VBIF, VBIT, VBSL
639 #define vbif_f32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
640 #define vbif_p16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
641 #define vbif_p8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
642 #define vbif_s16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
643 #define vbif_s32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
644 #define vbif_s64(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
645 #define vbif_s8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
646 #define vbif_u16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
647 #define vbif_u32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
648 #define vbif_u64(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
649 #define vbif_u8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3300110, (Dd), (Dn), (Dm)) )
650 #define vbit_f32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
651 #define vbit_p16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
652 #define vbit_p8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
653 #define vbit_s16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
654 #define vbit_s32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
655 #define vbit_s64(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
656 #define vbit_s8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
657 #define vbit_u16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
658 #define vbit_u32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
659 #define vbit_u64(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
660 #define vbit_u8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200110, (Dd), (Dn), (Dm)) )
661 #define vbsl_f32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
662 #define vbsl_p16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
663 #define vbsl_p8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
664 #define vbsl_s16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
665 #define vbsl_s32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
666 #define vbsl_s64(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
667 #define vbsl_s8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
668 #define vbsl_u16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
669 #define vbsl_u32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
670 #define vbsl_u64(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
671 #define vbsl_u8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100110, (Dd), (Dn), (Dm)) )
672 #define vbifq_f32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
673 #define vbifq_p16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
674 #define vbifq_p8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
675 #define vbifq_s16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
676 #define vbifq_s32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
677 #define vbifq_s64(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
678 #define vbifq_s8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
679 #define vbifq_u16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
680 #define vbifq_u32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
681 #define vbifq_u64(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
682 #define vbifq_u8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3300150, (Qd), (Qn), (Qm)) )
683 #define vbitq_f32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
684 #define vbitq_p16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
685 #define vbitq_p8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
686 #define vbitq_s16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
687 #define vbitq_s32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
688 #define vbitq_s64(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
689 #define vbitq_s8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
690 #define vbitq_u16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
691 #define vbitq_u32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
692 #define vbitq_u64(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
693 #define vbitq_u8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200150, (Qd), (Qn), (Qm)) )
694 #define vbslq_f32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
695 #define vbslq_p16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
696 #define vbslq_p8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
697 #define vbslq_s16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
698 #define vbslq_s32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
699 #define vbslq_s64(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
700 #define vbslq_s8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
701 #define vbslq_u16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
702 #define vbslq_u32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
703 #define vbslq_u64(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
704 #define vbslq_u8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100150, (Qd), (Qn), (Qm)) )
705 
706 // VCEQ (immediate #0)
707 #define vceq_z_f32_ex(Dm) ( __neon_DdDm( 0xf3b90500, (Dm)) )
708 #define vceq_z_s16_ex(Dm) ( __neon_DdDm( 0xf3b50100, (Dm)) )
709 #define vceq_z_s32_ex(Dm) ( __neon_DdDm( 0xf3b90100, (Dm)) )
710 #define vceq_z_s8_ex(Dm) ( __neon_DdDm( 0xf3b10100, (Dm)) )
711 #define vceq_z_u16_ex(Dm) ( __neon_DdDm( 0xf3b50100, (Dm)) )
712 #define vceq_z_u32_ex(Dm) ( __neon_DdDm( 0xf3b90100, (Dm)) )
713 #define vceq_z_u8_ex(Dm) ( __neon_DdDm( 0xf3b10100, (Dm)) )
714 #define vceqq_z_f32_ex(Qm) ( __neon_QdQm( 0xf3b90540, (Qm)) )
715 #define vceqq_z_s16_ex(Qm) ( __neon_QdQm( 0xf3b50140, (Qm)) )
716 #define vceqq_z_s32_ex(Qm) ( __neon_QdQm( 0xf3b90140, (Qm)) )
717 #define vceqq_z_s8_ex(Qm) ( __neon_QdQm( 0xf3b10140, (Qm)) )
718 #define vceqq_z_u16_ex(Qm) ( __neon_QdQm( 0xf3b50140, (Qm)) )
719 #define vceqq_z_u32_ex(Qm) ( __neon_QdQm( 0xf3b90140, (Qm)) )
720 #define vceqq_z_u8_ex(Qm) ( __neon_QdQm( 0xf3b10140, (Qm)) )
721 
722 // VCEQ (register)
723 #define vceq_f32(Dn, Dm) ( __neon_DdDnDm( 0xf2000e00, (Dn), (Dm)) )
724 #define vceq_p8(Dn, Dm) ( __neon_DdDnDm( 0xf3000810, (Dn), (Dm)) )
725 #define vceq_s16(Dn, Dm) ( __neon_DdDnDm( 0xf3100810, (Dn), (Dm)) )
726 #define vceq_s32(Dn, Dm) ( __neon_DdDnDm( 0xf3200810, (Dn), (Dm)) )
727 #define vceq_s8(Dn, Dm) ( __neon_DdDnDm( 0xf3000810, (Dn), (Dm)) )
728 #define vceq_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100810, (Dn), (Dm)) )
729 #define vceq_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200810, (Dn), (Dm)) )
730 #define vceq_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000810, (Dn), (Dm)) )
731 #define vceqq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2000e40, (Qn), (Qm)) )
732 #define vceqq_p8(Qn, Qm) ( __neon_QdQnQm( 0xf3000850, (Qn), (Qm)) )
733 #define vceqq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf3100850, (Qn), (Qm)) )
734 #define vceqq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf3200850, (Qn), (Qm)) )
735 #define vceqq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf3000850, (Qn), (Qm)) )
736 #define vceqq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100850, (Qn), (Qm)) )
737 #define vceqq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200850, (Qn), (Qm)) )
738 #define vceqq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000850, (Qn), (Qm)) )
739 
740 // VCGE (immediate #0)
741 #define vcge_z_f32_ex(Dm) ( __neon_DdDm( 0xf3b90480, (Dm)) )
742 #define vcge_z_s16_ex(Dm) ( __neon_DdDm( 0xf3b50080, (Dm)) )
743 #define vcge_z_s32_ex(Dm) ( __neon_DdDm( 0xf3b90080, (Dm)) )
744 #define vcge_z_s8_ex(Dm) ( __neon_DdDm( 0xf3b10080, (Dm)) )
745 #define vcgeq_z_f32_ex(Qm) ( __neon_QdQm( 0xf3b904c0, (Qm)) )
746 #define vcgeq_z_s16_ex(Qm) ( __neon_QdQm( 0xf3b500c0, (Qm)) )
747 #define vcgeq_z_s32_ex(Qm) ( __neon_QdQm( 0xf3b900c0, (Qm)) )
748 #define vcgeq_z_s8_ex(Qm) ( __neon_QdQm( 0xf3b100c0, (Qm)) )
749 
750 // VCGE, VCLE (register)
751 #define vcge_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3000e00, (Dn), (Dm)) )
752 #define vcge_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100310, (Dn), (Dm)) )
753 #define vcge_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200310, (Dn), (Dm)) )
754 #define vcge_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000310, (Dn), (Dm)) )
755 #define vcge_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100310, (Dn), (Dm)) )
756 #define vcge_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200310, (Dn), (Dm)) )
757 #define vcge_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000310, (Dn), (Dm)) )
758 #define vcle_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3000e00, (Dm), (Dn)) )
759 #define vcle_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100310, (Dm), (Dn)) )
760 #define vcle_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200310, (Dm), (Dn)) )
761 #define vcle_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000310, (Dm), (Dn)) )
762 #define vcle_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100310, (Dm), (Dn)) )
763 #define vcle_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200310, (Dm), (Dn)) )
764 #define vcle_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000310, (Dm), (Dn)) )
765 #define vcgeq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3000e40, (Qn), (Qm)) )
766 #define vcgeq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100350, (Qn), (Qm)) )
767 #define vcgeq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200350, (Qn), (Qm)) )
768 #define vcgeq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000350, (Qn), (Qm)) )
769 #define vcgeq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100350, (Qn), (Qm)) )
770 #define vcgeq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200350, (Qn), (Qm)) )
771 #define vcgeq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000350, (Qn), (Qm)) )
772 #define vcleq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3000e40, (Qm), (Qn)) )
773 #define vcleq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100350, (Qm), (Qn)) )
774 #define vcleq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200350, (Qm), (Qn)) )
775 #define vcleq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000350, (Qm), (Qn)) )
776 #define vcleq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100350, (Qm), (Qn)) )
777 #define vcleq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200350, (Qm), (Qn)) )
778 #define vcleq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000350, (Qm), (Qn)) )
779 
780 // VCGT (immediate #0)
781 #define vcgt_z_f32_ex(Dm) ( __neon_DdDm( 0xf3b90400, (Dm)) )
782 #define vcgt_z_s16_ex(Dm) ( __neon_DdDm( 0xf3b50000, (Dm)) )
783 #define vcgt_z_s32_ex(Dm) ( __neon_DdDm( 0xf3b90000, (Dm)) )
784 #define vcgt_z_s8_ex(Dm) ( __neon_DdDm( 0xf3b10000, (Dm)) )
785 #define vcgtq_z_f32_ex(Qm) ( __neon_QdQm( 0xf3b90440, (Qm)) )
786 #define vcgtq_z_s16_ex(Qm) ( __neon_QdQm( 0xf3b50040, (Qm)) )
787 #define vcgtq_z_s32_ex(Qm) ( __neon_QdQm( 0xf3b90040, (Qm)) )
788 #define vcgtq_z_s8_ex(Qm) ( __neon_QdQm( 0xf3b10040, (Qm)) )
789 
790 // VCGT, VCLT (register)
791 #define vcgt_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3200e00, (Dn), (Dm)) )
792 #define vcgt_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100300, (Dn), (Dm)) )
793 #define vcgt_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200300, (Dn), (Dm)) )
794 #define vcgt_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000300, (Dn), (Dm)) )
795 #define vcgt_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100300, (Dn), (Dm)) )
796 #define vcgt_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200300, (Dn), (Dm)) )
797 #define vcgt_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000300, (Dn), (Dm)) )
798 #define vclt_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3200e00, (Dm), (Dn)) )
799 #define vclt_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100300, (Dm), (Dn)) )
800 #define vclt_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200300, (Dm), (Dn)) )
801 #define vclt_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000300, (Dm), (Dn)) )
802 #define vclt_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100300, (Dm), (Dn)) )
803 #define vclt_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200300, (Dm), (Dn)) )
804 #define vclt_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000300, (Dm), (Dn)) )
805 #define vcgtq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3200e40, (Qn), (Qm)) )
806 #define vcgtq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100340, (Qn), (Qm)) )
807 #define vcgtq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200340, (Qn), (Qm)) )
808 #define vcgtq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000340, (Qn), (Qm)) )
809 #define vcgtq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100340, (Qn), (Qm)) )
810 #define vcgtq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200340, (Qn), (Qm)) )
811 #define vcgtq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000340, (Qn), (Qm)) )
812 #define vcltq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3200e40, (Qm), (Qn)) )
813 #define vcltq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100340, (Qm), (Qn)) )
814 #define vcltq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200340, (Qm), (Qn)) )
815 #define vcltq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000340, (Qm), (Qn)) )
816 #define vcltq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100340, (Qm), (Qn)) )
817 #define vcltq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200340, (Qm), (Qn)) )
818 #define vcltq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000340, (Qm), (Qn)) )
819 
820 // VCLE (immediate #0)
821 #define vcle_z_f32_ex(Dm) ( __neon_DdDm( 0xf3b90580, (Dm)) )
822 #define vcle_z_s16_ex(Dm) ( __neon_DdDm( 0xf3b50180, (Dm)) )
823 #define vcle_z_s32_ex(Dm) ( __neon_DdDm( 0xf3b90180, (Dm)) )
824 #define vcle_z_s8_ex(Dm) ( __neon_DdDm( 0xf3b10180, (Dm)) )
825 #define vcleq_z_f32_ex(Qm) ( __neon_QdQm( 0xf3b905c0, (Qm)) )
826 #define vcleq_z_s16_ex(Qm) ( __neon_QdQm( 0xf3b501c0, (Qm)) )
827 #define vcleq_z_s32_ex(Qm) ( __neon_QdQm( 0xf3b901c0, (Qm)) )
828 #define vcleq_z_s8_ex(Qm) ( __neon_QdQm( 0xf3b101c0, (Qm)) )
829 
830 // VCLS, VCLZ
831 #define vcls_s16(Dm) ( __neon_DdDm( 0xf3b40400, (Dm)) )
832 #define vcls_s32(Dm) ( __neon_DdDm( 0xf3b80400, (Dm)) )
833 #define vcls_s8(Dm) ( __neon_DdDm( 0xf3b00400, (Dm)) )
834 #define vclz_s16(Dm) ( __neon_DdDm( 0xf3b40480, (Dm)) )
835 #define vclz_s32(Dm) ( __neon_DdDm( 0xf3b80480, (Dm)) )
836 #define vclz_s8(Dm) ( __neon_DdDm( 0xf3b00480, (Dm)) )
837 #define vclz_u16(Dm) ( __neon_DdDm( 0xf3b40480, (Dm)) )
838 #define vclz_u32(Dm) ( __neon_DdDm( 0xf3b80480, (Dm)) )
839 #define vclz_u8(Dm) ( __neon_DdDm( 0xf3b00480, (Dm)) )
840 #define vclsq_s16(Qm) ( __neon_QdQm( 0xf3b40440, (Qm)) )
841 #define vclsq_s32(Qm) ( __neon_QdQm( 0xf3b80440, (Qm)) )
842 #define vclsq_s8(Qm) ( __neon_QdQm( 0xf3b00440, (Qm)) )
843 #define vclzq_s16(Qm) ( __neon_QdQm( 0xf3b404c0, (Qm)) )
844 #define vclzq_s32(Qm) ( __neon_QdQm( 0xf3b804c0, (Qm)) )
845 #define vclzq_s8(Qm) ( __neon_QdQm( 0xf3b004c0, (Qm)) )
846 #define vclzq_u16(Qm) ( __neon_QdQm( 0xf3b404c0, (Qm)) )
847 #define vclzq_u32(Qm) ( __neon_QdQm( 0xf3b804c0, (Qm)) )
848 #define vclzq_u8(Qm) ( __neon_QdQm( 0xf3b004c0, (Qm)) )
849 
850 // VCLT (immediate #0)
851 #define vclt_z_f32_ex(Dm) ( __neon_DdDm( 0xf3b90600, (Dm)) )
852 #define vclt_z_s16_ex(Dm) ( __neon_DdDm( 0xf3b50200, (Dm)) )
853 #define vclt_z_s32_ex(Dm) ( __neon_DdDm( 0xf3b90200, (Dm)) )
854 #define vclt_z_s8_ex(Dm) ( __neon_DdDm( 0xf3b10200, (Dm)) )
855 #define vcltq_z_f32_ex(Qm) ( __neon_QdQm( 0xf3b90640, (Qm)) )
856 #define vcltq_z_s16_ex(Qm) ( __neon_QdQm( 0xf3b50240, (Qm)) )
857 #define vcltq_z_s32_ex(Qm) ( __neon_QdQm( 0xf3b90240, (Qm)) )
858 #define vcltq_z_s8_ex(Qm) ( __neon_QdQm( 0xf3b10240, (Qm)) )
859 
860 // VCNT
861 #define vcnt_p8(Dm) ( __neon_DdDm( 0xf3b00500, (Dm)) )
862 #define vcnt_s8(Dm) ( __neon_DdDm( 0xf3b00500, (Dm)) )
863 #define vcnt_u8(Dm) ( __neon_DdDm( 0xf3b00500, (Dm)) )
864 #define vcntq_p8(Qm) ( __neon_QdQm( 0xf3b00540, (Qm)) )
865 #define vcntq_s8(Qm) ( __neon_QdQm( 0xf3b00540, (Qm)) )
866 #define vcntq_u8(Qm) ( __neon_QdQm( 0xf3b00540, (Qm)) )
867 
868 // VCOMBINE (combine 2x64bit into a 128bit register)
869 #define vcombine_f32(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
870 #define vcombine_p16(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
871 #define vcombine_p8(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
872 #define vcombine_s16(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
873 #define vcombine_s32(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
874 #define vcombine_s64(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
875 #define vcombine_s8(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
876 #define vcombine_u16(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
877 #define vcombine_u32(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
878 #define vcombine_u64(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
879 #define vcombine_u8(Dn, Dm) ( __neon_QdDnDm_merge( 0x00000000, (Dn), (Dm)) )
880 
881 // VCREATE (ARM core register pair to Neon 64bit register)
882 #define vcreate_f32(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
883 #define vcreate_p16(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
884 #define vcreate_p8(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
885 #define vcreate_s16(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
886 #define vcreate_s32(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
887 #define vcreate_s64(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
888 #define vcreate_s8(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
889 #define vcreate_u16(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
890 #define vcreate_u32(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
891 #define vcreate_u64(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
892 #define vcreate_u8(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
893 
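vcreate_* moves a 64-bit value held in an ARM core register pair into a D register unchanged; on this little-endian target the low-order byte of the constant becomes lane 0. An illustrative sketch:

    uint8x8_t v = vcreate_u8(0x0706050403020100ULL);   /* lane i holds the value i, for i = 0..7 */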
894 // VCVT (between floating-point and fixed-point)
895 #define vcvt_n_f32_s32(Dm, fbits) ( __static_assert((fbits) >= 1 && (fbits) <= 32, "invalid fbits value"), __neon_DdDm( 0xf2800e10 | _NENC_21_16(64 - (fbits)), (Dm)) )
896 #define vcvt_n_f32_u32(Dm, fbits) ( __static_assert((fbits) >= 1 && (fbits) <= 32, "invalid fbits value"), __neon_DdDm( 0xf3800e10 | _NENC_21_16(64 - (fbits)), (Dm)) )
897 #define vcvt_n_s32_f32(Dm, fbits) ( __static_assert((fbits) >= 1 && (fbits) <= 32, "invalid fbits value"), __neon_DdDm( 0xf2800f10 | _NENC_21_16(64 - (fbits)), (Dm)) )
898 #define vcvt_n_u32_f32(Dm, fbits) ( __static_assert((fbits) >= 1 && (fbits) <= 32, "invalid fbits value"), __neon_DdDm( 0xf3800f10 | _NENC_21_16(64 - (fbits)), (Dm)) )
899 #define vcvtq_n_f32_s32(Qm, fbits) ( __static_assert((fbits) >= 1 && (fbits) <= 32, "invalid fbits value"), __neon_QdQm( 0xf2800e50 | _NENC_21_16(64 - (fbits)), (Qm)) )
900 #define vcvtq_n_f32_u32(Qm, fbits) ( __static_assert((fbits) >= 1 && (fbits) <= 32, "invalid fbits value"), __neon_QdQm( 0xf3800e50 | _NENC_21_16(64 - (fbits)), (Qm)) )
901 #define vcvtq_n_s32_f32(Qm, fbits) ( __static_assert((fbits) >= 1 && (fbits) <= 32, "invalid fbits value"), __neon_QdQm( 0xf2800f50 | _NENC_21_16(64 - (fbits)), (Qm)) )
902 #define vcvtq_n_u32_f32(Qm, fbits) ( __static_assert((fbits) >= 1 && (fbits) <= 32, "invalid fbits value"), __neon_QdQm( 0xf3800f50 | _NENC_21_16(64 - (fbits)), (Qm)) )
903 
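fbits selects the fixed-point format (the encodings store 64 - fbits in bits 21..16, as the macros show); converting from fixed point divides each lane by 2^fbits, and converting to fixed point multiplies by it. A sketch, assuming standard VCVT fixed-point semantics:

    __n64 d = vdup_n_s32(65536);
    __n64 f = vcvt_n_f32_s32(d, 16);    /* 16 fractional bits: 65536 / 2^16 = 1.0f per lane */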
904 // VCVT (between floating-point and integer with directed rounding)
905 #define vcvta_s32_f32(Dm) ( __neon_DdDm( 0xf3bb0000, (Dm)) )
906 #define vcvta_u32_f32(Dm) ( __neon_DdDm( 0xf3bb0080, (Dm)) )
907 #define vcvtm_s32_f32(Dm) ( __neon_DdDm( 0xf3bb0300, (Dm)) )
908 #define vcvtm_u32_f32(Dm) ( __neon_DdDm( 0xf3bb0380, (Dm)) )
909 #define vcvtn_s32_f32(Dm) ( __neon_DdDm( 0xf3bb0100, (Dm)) )
910 #define vcvtn_u32_f32(Dm) ( __neon_DdDm( 0xf3bb0180, (Dm)) )
911 #define vcvtp_s32_f32(Dm) ( __neon_DdDm( 0xf3bb0200, (Dm)) )
912 #define vcvtp_u32_f32(Dm) ( __neon_DdDm( 0xf3bb0280, (Dm)) )
913 #define vcvtaq_s32_f32(Qm) ( __neon_QdQm( 0xf3bb0040, (Qm)) )
914 #define vcvtaq_u32_f32(Qm) ( __neon_QdQm( 0xf3bb00c0, (Qm)) )
915 #define vcvtmq_s32_f32(Qm) ( __neon_QdQm( 0xf3bb0340, (Qm)) )
916 #define vcvtmq_u32_f32(Qm) ( __neon_QdQm( 0xf3bb03c0, (Qm)) )
917 #define vcvtnq_s32_f32(Qm) ( __neon_QdQm( 0xf3bb0140, (Qm)) )
918 #define vcvtnq_u32_f32(Qm) ( __neon_QdQm( 0xf3bb01c0, (Qm)) )
919 #define vcvtpq_s32_f32(Qm) ( __neon_QdQm( 0xf3bb0240, (Qm)) )
920 #define vcvtpq_u32_f32(Qm) ( __neon_QdQm( 0xf3bb02c0, (Qm)) )
921 
922 // VCVT (between floating-point and integer)
923 #define vcvt_f32_s32(Dm) ( __neon_DdDm( 0xf3bb0600, (Dm)) )
924 #define vcvt_f32_u32(Dm) ( __neon_DdDm( 0xf3bb0680, (Dm)) )
925 #define vcvt_s32_f32(Dm) ( __neon_DdDm( 0xf3bb0700, (Dm)) )
926 #define vcvt_u32_f32(Dm) ( __neon_DdDm( 0xf3bb0780, (Dm)) )
927 #define vcvtq_f32_s32(Qm) ( __neon_QdQm( 0xf3bb0640, (Qm)) )
928 #define vcvtq_f32_u32(Qm) ( __neon_QdQm( 0xf3bb06c0, (Qm)) )
929 #define vcvtq_s32_f32(Qm) ( __neon_QdQm( 0xf3bb0740, (Qm)) )
930 #define vcvtq_u32_f32(Qm) ( __neon_QdQm( 0xf3bb07c0, (Qm)) )
931 
932 // VDUP (scalar)
933 #define vdup_lane_f32(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDm( 0xf3b40c00 | _NENC_19(lane), (Dm)) )
934 #define vdup_lane_p16(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDm( 0xf3b20c00 | _NENC_19_18(lane), (Dm)) )
935 #define vdup_lane_p8(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_DdDm( 0xf3b10c00 | _NENC_19_17(lane), (Dm)) )
936 #define vdup_lane_s16(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDm( 0xf3b20c00 | _NENC_19_18(lane), (Dm)) )
937 #define vdup_lane_s32(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDm( 0xf3b40c00 | _NENC_19(lane), (Dm)) )
938 #define vdup_lane_s8(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_DdDm( 0xf3b10c00 | _NENC_19_17(lane), (Dm)) )
939 #define vdup_lane_u16(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDm( 0xf3b20c00 | _NENC_19_18(lane), (Dm)) )
940 #define vdup_lane_u32(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDm( 0xf3b40c00 | _NENC_19(lane), (Dm)) )
941 #define vdup_lane_u8(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_DdDm( 0xf3b10c00 | _NENC_19_17(lane), (Dm)) )
942 #define vdupq_lane_f32(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDm( 0xf3b40c40 | _NENC_19(lane), (Dm)) )
943 #define vdupq_lane_p16(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDm( 0xf3b20c40 | _NENC_19_18(lane), (Dm)) )
944 #define vdupq_lane_p8(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_QdDm( 0xf3b10c40 | _NENC_19_17(lane), (Dm)) )
945 #define vdupq_lane_s16(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDm( 0xf3b20c40 | _NENC_19_18(lane), (Dm)) )
946 #define vdupq_lane_s32(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDm( 0xf3b40c40 | _NENC_19(lane), (Dm)) )
947 #define vdupq_lane_s8(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_QdDm( 0xf3b10c40 | _NENC_19_17(lane), (Dm)) )
948 #define vdupq_lane_u16(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDm( 0xf3b20c40 | _NENC_19_18(lane), (Dm)) )
949 #define vdupq_lane_u32(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDm( 0xf3b40c40 | _NENC_19(lane), (Dm)) )
950 #define vdupq_lane_u8(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_QdDm( 0xf3b10c40 | _NENC_19_17(lane), (Dm)) )
951 
952 // VDUP, VMOV (ARM core register to Neon register)
953 #define vdup_n_f32(Ft) ( __neon_DdFt( 0xee800b10, (Ft)) )
954 #define vmov_n_f32(Ft) ( __neon_DdFt( 0xee800b10, (Ft)) )
955 #define vdup_n_p16(Rt) ( __neon_DdRt( 0xee800b30, __poly16ToInt32(Rt)) )
956 #define vdup_n_p8(Rt) ( __neon_DdRt( 0xeec00b10, __poly8ToInt32(Rt)) )
957 #define vdup_n_s16(Rt) ( __neon_DdRt( 0xee800b30, __int16ToInt32(Rt)) )
958 #define vdup_n_s32(Rt) ( __neon_DdRt( 0xee800b10, __int32ToInt32(Rt)) )
959 #define vdup_n_s8(Rt) ( __neon_DdRt( 0xeec00b10, __int8ToInt32(Rt)) )
960 #define vdup_n_u16(Rt) ( __neon_DdRt( 0xee800b30, __uint16ToInt32(Rt)) )
961 #define vdup_n_u32(Rt) ( __neon_DdRt( 0xee800b10, __uint32ToInt32(Rt)) )
962 #define vdup_n_u8(Rt) ( __neon_DdRt( 0xeec00b10, __uint8ToInt32(Rt)) )
963 #define vmov_n_p16(Rt) ( __neon_DdRt( 0xee800b30, __poly16ToInt32(Rt)) )
964 #define vmov_n_p8(Rt) ( __neon_DdRt( 0xeec00b10, __poly8ToInt32(Rt)) )
965 #define vmov_n_s16(Rt) ( __neon_DdRt( 0xee800b30, __int16ToInt32(Rt)) )
966 #define vmov_n_s32(Rt) ( __neon_DdRt( 0xee800b10, __int32ToInt32(Rt)) )
967 #define vmov_n_s8(Rt) ( __neon_DdRt( 0xeec00b10, __int8ToInt32(Rt)) )
968 #define vmov_n_u16(Rt) ( __neon_DdRt( 0xee800b30, __uint16ToInt32(Rt)) )
969 #define vmov_n_u32(Rt) ( __neon_DdRt( 0xee800b10, __uint32ToInt32(Rt)) )
970 #define vmov_n_u8(Rt) ( __neon_DdRt( 0xeec00b10, __uint8ToInt32(Rt)) )
971 #define vdupq_n_f32(Ft) ( __neon_QdFt( 0xeea00b10, (Ft)) )
972 #define vmovq_n_f32(Ft) ( __neon_QdFt( 0xeea00b10, (Ft)) )
973 #define vdupq_n_p16(Rt) ( __neon_QdRt( 0xeea00b30, __poly16ToInt32(Rt)) )
974 #define vdupq_n_p8(Rt) ( __neon_QdRt( 0xeee00b10, __poly8ToInt32(Rt)) )
975 #define vdupq_n_s16(Rt) ( __neon_QdRt( 0xeea00b30, __int16ToInt32(Rt)) )
976 #define vdupq_n_s32(Rt) ( __neon_QdRt( 0xeea00b10, __int32ToInt32(Rt)) )
977 #define vdupq_n_s8(Rt) ( __neon_QdRt( 0xeee00b10, __int8ToInt32(Rt)) )
978 #define vdupq_n_u16(Rt) ( __neon_QdRt( 0xeea00b30, __uint16ToInt32(Rt)) )
979 #define vdupq_n_u32(Rt) ( __neon_QdRt( 0xeea00b10, __uint32ToInt32(Rt)) )
980 #define vdupq_n_u8(Rt) ( __neon_QdRt( 0xeee00b10, __uint8ToInt32(Rt)) )
981 #define vmovq_n_p16(Rt) ( __neon_QdRt( 0xeea00b30, __poly16ToInt32(Rt)) )
982 #define vmovq_n_p8(Rt) ( __neon_QdRt( 0xeee00b10, __poly8ToInt32(Rt)) )
983 #define vmovq_n_s16(Rt) ( __neon_QdRt( 0xeea00b30, __int16ToInt32(Rt)) )
984 #define vmovq_n_s32(Rt) ( __neon_QdRt( 0xeea00b10, __int32ToInt32(Rt)) )
985 #define vmovq_n_s8(Rt) ( __neon_QdRt( 0xeee00b10, __int8ToInt32(Rt)) )
986 #define vmovq_n_u16(Rt) ( __neon_QdRt( 0xeea00b30, __uint16ToInt32(Rt)) )
987 #define vmovq_n_u32(Rt) ( __neon_QdRt( 0xeea00b10, __uint32ToInt32(Rt)) )
988 #define vmovq_n_u8(Rt) ( __neon_QdRt( 0xeee00b10, __uint8ToInt32(Rt)) )
989 
990 // VDUP.64, VMOV.64 (ARM core register pair to Neon registers)
991 #define vdup_n_s64(R64t) ( __neon_DdRtRt2( 0xec400b10, __int64ToInt64(R64t)) )
992 #define vdup_n_u64(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
993 #define vmov_n_s64(R64t) ( __neon_DdRtRt2( 0xec400b10, __int64ToInt64(R64t)) )
994 #define vmov_n_u64(R64t) ( __neon_DdRtRt2( 0xec400b10, __uint64ToInt64(R64t)) )
995 #define vdupq_n_s64(R64t) ( __neon_QdRtRt2_dup( 0xec400b10, __int64ToInt64(R64t)) )
996 #define vdupq_n_u64(R64t) ( __neon_QdRtRt2_dup( 0xec400b10, __uint64ToInt64(R64t)) )
997 #define vmovq_n_s64(R64t) ( __neon_QdRtRt2_dup( 0xec400b10, __int64ToInt64(R64t)) )
998 #define vmovq_n_u64(R64t) ( __neon_QdRtRt2_dup( 0xec400b10, __uint64ToInt64(R64t)) )
999 
1000 // VEOR, VBIC, VORN
1001 #define vbic_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100110, (Dn), (Dm)) )
1002 #define vbic_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2100110, (Dn), (Dm)) )
1003 #define vbic_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2100110, (Dn), (Dm)) )
1004 #define vbic_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2100110, (Dn), (Dm)) )
1005 #define vbic_u16(Dn, Dm) ( __neon_DdDnDm( 0xf2100110, (Dn), (Dm)) )
1006 #define vbic_u32(Dn, Dm) ( __neon_DdDnDm( 0xf2100110, (Dn), (Dm)) )
1007 #define vbic_u64(Dn, Dm) ( __neon_DdDnDm( 0xf2100110, (Dn), (Dm)) )
1008 #define vbic_u8(Dn, Dm) ( __neon_DdDnDm( 0xf2100110, (Dn), (Dm)) )
1009 #define veor_s16(Dn, Dm) ( __neon_DdDnDm( 0xf3000110, (Dn), (Dm)) )
1010 #define veor_s32(Dn, Dm) ( __neon_DdDnDm( 0xf3000110, (Dn), (Dm)) )
1011 #define veor_s64(Dn, Dm) ( __neon_DdDnDm( 0xf3000110, (Dn), (Dm)) )
1012 #define veor_s8(Dn, Dm) ( __neon_DdDnDm( 0xf3000110, (Dn), (Dm)) )
1013 #define veor_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3000110, (Dn), (Dm)) )
1014 #define veor_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3000110, (Dn), (Dm)) )
1015 #define veor_u64(Dn, Dm) ( __neon_DdDnDm( 0xf3000110, (Dn), (Dm)) )
1016 #define veor_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000110, (Dn), (Dm)) )
1017 #define vorn_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2300110, (Dn), (Dm)) )
1018 #define vorn_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2300110, (Dn), (Dm)) )
1019 #define vorn_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2300110, (Dn), (Dm)) )
1020 #define vorn_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2300110, (Dn), (Dm)) )
1021 #define vorn_u16(Dn, Dm) ( __neon_DdDnDm( 0xf2300110, (Dn), (Dm)) )
1022 #define vorn_u32(Dn, Dm) ( __neon_DdDnDm( 0xf2300110, (Dn), (Dm)) )
1023 #define vorn_u64(Dn, Dm) ( __neon_DdDnDm( 0xf2300110, (Dn), (Dm)) )
1024 #define vorn_u8(Dn, Dm) ( __neon_DdDnDm( 0xf2300110, (Dn), (Dm)) )
1025 #define vbicq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100150, (Qn), (Qm)) )
1026 #define vbicq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2100150, (Qn), (Qm)) )
1027 #define vbicq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2100150, (Qn), (Qm)) )
1028 #define vbicq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2100150, (Qn), (Qm)) )
1029 #define vbicq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf2100150, (Qn), (Qm)) )
1030 #define vbicq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2100150, (Qn), (Qm)) )
1031 #define vbicq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf2100150, (Qn), (Qm)) )
1032 #define vbicq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf2100150, (Qn), (Qm)) )
1033 #define veorq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf3000150, (Qn), (Qm)) )
1034 #define veorq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf3000150, (Qn), (Qm)) )
1035 #define veorq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf3000150, (Qn), (Qm)) )
1036 #define veorq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf3000150, (Qn), (Qm)) )
1037 #define veorq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3000150, (Qn), (Qm)) )
1038 #define veorq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3000150, (Qn), (Qm)) )
1039 #define veorq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf3000150, (Qn), (Qm)) )
1040 #define veorq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000150, (Qn), (Qm)) )
1041 #define vornq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2300150, (Qn), (Qm)) )
1042 #define vornq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2300150, (Qn), (Qm)) )
1043 #define vornq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2300150, (Qn), (Qm)) )
1044 #define vornq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2300150, (Qn), (Qm)) )
1045 #define vornq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf2300150, (Qn), (Qm)) )
1046 #define vornq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2300150, (Qn), (Qm)) )
1047 #define vornq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf2300150, (Qn), (Qm)) )
1048 #define vornq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf2300150, (Qn), (Qm)) )
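// Illustrative sketch, not part of the original header: the bitwise forms differ
// only in how the second operand is combined (VEOR: a ^ b, VBIC: a & ~b,
// VORN: a | ~b); the element-type suffix does not change the operation.
// Hypothetical helper name.
__inline __n64 example_clear_masked_bits(__n64 value, __n64 mask)
{
    return vbic_u8(value, mask); /* value & ~mask, independently in each byte lane */
}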
1049 
1050 // VEXT
1051 #define vext_f32(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 2, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8((pos) * 4), (Dn), (Dm)) )
1052 #define vext_p16(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 4, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8((pos) * 2), (Dn), (Dm)) )
1053 #define vext_p8(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 8, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8(pos), (Dn), (Dm)) )
1054 #define vext_s16(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 4, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8((pos) * 2), (Dn), (Dm)) )
1055 #define vext_s32(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 2, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8((pos) * 4), (Dn), (Dm)) )
1056 #define vext_s64(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 1, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8((pos) * 8), (Dn), (Dm)) )
1057 #define vext_s8(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 8, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8(pos), (Dn), (Dm)) )
1058 #define vext_u16(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 4, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8((pos) * 2), (Dn), (Dm)) )
1059 #define vext_u32(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 2, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8((pos) * 4), (Dn), (Dm)) )
1060 #define vext_u64(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 1, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8((pos) * 8), (Dn), (Dm)) )
1061 #define vext_u8(Dn, Dm, pos) ( __static_assert((pos) >= 0 && (pos) < 8, "invalid position value"), __neon_DdDnDm( 0xf2b00000 | _NENC_11_8(pos), (Dn), (Dm)) )
1062 #define vextq_f32(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 4, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8((pos) * 4), (Qn), (Qm)) )
1063 #define vextq_p16(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 8, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8((pos) * 2), (Qn), (Qm)) )
1064 #define vextq_p8(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 16, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8(pos), (Qn), (Qm)) )
1065 #define vextq_s16(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 8, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8((pos) * 2), (Qn), (Qm)) )
1066 #define vextq_s32(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 4, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8((pos) * 4), (Qn), (Qm)) )
1067 #define vextq_s64(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 2, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8((pos) * 8), (Qn), (Qm)) )
1068 #define vextq_s8(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 16, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8(pos), (Qn), (Qm)) )
1069 #define vextq_u16(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 8, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8((pos) * 2), (Qn), (Qm)) )
1070 #define vextq_u32(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 4, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8((pos) * 4), (Qn), (Qm)) )
1071 #define vextq_u64(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 2, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8((pos) * 8), (Qn), (Qm)) )
1072 #define vextq_u8(Qn, Qm, pos) ( __static_assert((pos) >= 0 && (pos) < 16, "invalid position value"), __neon_QdQnQm( 0xf2b00040 | _NENC_11_8(pos), (Qn), (Qm)) )
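// Illustrative sketch, not part of the original header: VEXT treats the two
// operands as one concatenated vector and extracts a result starting "pos"
// elements into the first operand, which is the usual way to express a sliding
// window across two registers. The position must be a compile-time constant in
// range, as the __static_assert above enforces. Hypothetical helper name.
__inline __n64 example_shift_in_three_bytes(__n64 lo, __n64 hi)
{
    return vext_u8(lo, hi, 3); /* bytes 3..7 of lo followed by bytes 0..2 of hi */
}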
1073 
1074 // VGET (access the 64bit high/low part of a 128bit register)
1075 #define vget_high_f32(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1076 #define vget_high_p16(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1077 #define vget_high_p8(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1078 #define vget_high_s16(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1079 #define vget_high_s32(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1080 #define vget_high_s64(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1081 #define vget_high_s8(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1082 #define vget_high_u16(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1083 #define vget_high_u32(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1084 #define vget_high_u64(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1085 #define vget_high_u8(Qm) ( __neon_DdQm_high( 0x00000000, (Qm)) )
1086 #define vget_low_f32(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1087 #define vget_low_p16(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1088 #define vget_low_p8(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1089 #define vget_low_s16(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1090 #define vget_low_s32(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1091 #define vget_low_s64(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1092 #define vget_low_s8(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1093 #define vget_low_u16(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1094 #define vget_low_u32(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1095 #define vget_low_u64(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
1096 #define vget_low_u8(Qm) ( __neon_DdQm_low( 0x00000000, (Qm)) )
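// Illustrative sketch, not part of the original header: vget_low_*/vget_high_*
// select the lower or upper 64-bit half of a 128-bit __n128 value as an __n64;
// on ARM this typically just names the corresponding D register. Hypothetical
// helper name.
__inline __n64 example_high_half(__n128 q)
{
    return vget_high_s16(q); /* upper four signed 16-bit lanes of q */
}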
1097 
1098 // VHADD, VRHADD, VHSUB
1099 #define vhadd_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100000, (Dn), (Dm)) )
1100 #define vhadd_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200000, (Dn), (Dm)) )
1101 #define vhadd_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000000, (Dn), (Dm)) )
1102 #define vhadd_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100000, (Dn), (Dm)) )
1103 #define vhadd_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200000, (Dn), (Dm)) )
1104 #define vhadd_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000000, (Dn), (Dm)) )
1105 #define vhsub_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100200, (Dn), (Dm)) )
1106 #define vhsub_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200200, (Dn), (Dm)) )
1107 #define vhsub_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000200, (Dn), (Dm)) )
1108 #define vhsub_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100200, (Dn), (Dm)) )
1109 #define vhsub_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200200, (Dn), (Dm)) )
1110 #define vhsub_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000200, (Dn), (Dm)) )
1111 #define vrhadd_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100100, (Dn), (Dm)) )
1112 #define vrhadd_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200100, (Dn), (Dm)) )
1113 #define vrhadd_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000100, (Dn), (Dm)) )
1114 #define vrhadd_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100100, (Dn), (Dm)) )
1115 #define vrhadd_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200100, (Dn), (Dm)) )
1116 #define vrhadd_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000100, (Dn), (Dm)) )
1117 #define vhaddq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100040, (Qn), (Qm)) )
1118 #define vhaddq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200040, (Qn), (Qm)) )
1119 #define vhaddq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000040, (Qn), (Qm)) )
1120 #define vhaddq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100040, (Qn), (Qm)) )
1121 #define vhaddq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200040, (Qn), (Qm)) )
1122 #define vhaddq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000040, (Qn), (Qm)) )
1123 #define vhsubq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100240, (Qn), (Qm)) )
1124 #define vhsubq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200240, (Qn), (Qm)) )
1125 #define vhsubq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000240, (Qn), (Qm)) )
1126 #define vhsubq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100240, (Qn), (Qm)) )
1127 #define vhsubq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200240, (Qn), (Qm)) )
1128 #define vhsubq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000240, (Qn), (Qm)) )
1129 #define vrhaddq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100140, (Qn), (Qm)) )
1130 #define vrhaddq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200140, (Qn), (Qm)) )
1131 #define vrhaddq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000140, (Qn), (Qm)) )
1132 #define vrhaddq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100140, (Qn), (Qm)) )
1133 #define vrhaddq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200140, (Qn), (Qm)) )
1134 #define vrhaddq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000140, (Qn), (Qm)) )
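// Illustrative sketch, not part of the original header: VHADD/VHSUB compute the
// halved sum or difference without intermediate overflow, and VRHADD rounds the
// halved sum instead of truncating. A common use is averaging two vectors of
// samples, as below. Hypothetical helper name.
__inline __n64 example_average_s16(__n64 a, __n64 b)
{
    return vrhadd_s16(a, b); /* (a + b + 1) >> 1 per signed 16-bit lane */
}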
1135 
1136 // VLD1 (multiple single elements)
1137 #define vld1_f32(pcD) ( __neon_D1Adr( 0xf420078f, __float32ToN64_c(pcD)) )
1138 #define vld1_p16(pcD) ( __neon_D1Adr( 0xf420074f, __poly16ToN64_c(pcD)) )
1139 #define vld1_p8(pcD) ( __neon_D1Adr( 0xf420070f, __poly8ToN64_c(pcD)) )
1140 #define vld1_s16(pcD) ( __neon_D1Adr( 0xf420074f, __int16ToN64_c(pcD)) )
1141 #define vld1_s32(pcD) ( __neon_D1Adr( 0xf420078f, __int32ToN64_c(pcD)) )
1142 #define vld1_s64(pcD) ( __neon_D1Adr( 0xf42007cf, __int64ToN64_c(pcD)) )
1143 #define vld1_s8(pcD) ( __neon_D1Adr( 0xf420070f, __int8ToN64_c(pcD)) )
1144 #define vld1_u16(pcD) ( __neon_D1Adr( 0xf420074f, __uint16ToN64_c(pcD)) )
1145 #define vld1_u32(pcD) ( __neon_D1Adr( 0xf420078f, __uint32ToN64_c(pcD)) )
1146 #define vld1_u64(pcD) ( __neon_D1Adr( 0xf42007cf, __uint64ToN64_c(pcD)) )
1147 #define vld1_u8(pcD) ( __neon_D1Adr( 0xf420070f, __uint8ToN64_c(pcD)) )
1148 #define vld1_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf420078f | _NENC_5_4(_NEON_ALIGN64(align)), __float32ToN64_c(pcD)) )
1149 #define vld1_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf420074f | _NENC_5_4(_NEON_ALIGN64(align)), __poly16ToN64_c(pcD)) )
1150 #define vld1_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf420070f | _NENC_5_4(_NEON_ALIGN64(align)), __poly8ToN64_c(pcD)) )
1151 #define vld1_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf420074f | _NENC_5_4(_NEON_ALIGN64(align)), __int16ToN64_c(pcD)) )
1152 #define vld1_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf420078f | _NENC_5_4(_NEON_ALIGN64(align)), __int32ToN64_c(pcD)) )
1153 #define vld1_s64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf42007cf | _NENC_5_4(_NEON_ALIGN64(align)), __int64ToN64_c(pcD)) )
1154 #define vld1_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf420070f | _NENC_5_4(_NEON_ALIGN64(align)), __int8ToN64_c(pcD)) )
1155 #define vld1_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf420074f | _NENC_5_4(_NEON_ALIGN64(align)), __uint16ToN64_c(pcD)) )
1156 #define vld1_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf420078f | _NENC_5_4(_NEON_ALIGN64(align)), __uint32ToN64_c(pcD)) )
1157 #define vld1_u64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf42007cf | _NENC_5_4(_NEON_ALIGN64(align)), __uint64ToN64_c(pcD)) )
1158 #define vld1_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_D1Adr( 0xf420070f | _NENC_5_4(_NEON_ALIGN64(align)), __uint8ToN64_c(pcD)) )
1159 #define vld1q_f32(pcD) ( __neon_Q1Adr( 0xf4200a8f, __float32ToN64_c(pcD)) )
1160 #define vld1q_p16(pcD) ( __neon_Q1Adr( 0xf4200a4f, __poly16ToN64_c(pcD)) )
1161 #define vld1q_p8(pcD) ( __neon_Q1Adr( 0xf4200a0f, __poly8ToN64_c(pcD)) )
1162 #define vld1q_s16(pcD) ( __neon_Q1Adr( 0xf4200a4f, __int16ToN64_c(pcD)) )
1163 #define vld1q_s32(pcD) ( __neon_Q1Adr( 0xf4200a8f, __int32ToN64_c(pcD)) )
1164 #define vld1q_s64(pcD) ( __neon_Q1Adr( 0xf4200acf, __int64ToN64_c(pcD)) )
1165 #define vld1q_s8(pcD) ( __neon_Q1Adr( 0xf4200a0f, __int8ToN64_c(pcD)) )
1166 #define vld1q_u16(pcD) ( __neon_Q1Adr( 0xf4200a4f, __uint16ToN64_c(pcD)) )
1167 #define vld1q_u32(pcD) ( __neon_Q1Adr( 0xf4200a8f, __uint32ToN64_c(pcD)) )
1168 #define vld1q_u64(pcD) ( __neon_Q1Adr( 0xf4200acf, __uint64ToN64_c(pcD)) )
1169 #define vld1q_u8(pcD) ( __neon_Q1Adr( 0xf4200a0f, __uint8ToN64_c(pcD)) )
1170 #define vld1q_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200a8f | _NENC_5_4(_NEON_ALIGN64_128(align)), __float32ToN64_c(pcD)) )
1171 #define vld1q_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200a4f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly16ToN64_c(pcD)) )
1172 #define vld1q_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200a0f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly8ToN64_c(pcD)) )
1173 #define vld1q_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200a4f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int16ToN64_c(pcD)) )
1174 #define vld1q_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200a8f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int32ToN64_c(pcD)) )
1175 #define vld1q_s64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __int64ToN64_c(pcD)) )
1176 #define vld1q_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200a0f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int8ToN64_c(pcD)) )
1177 #define vld1q_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200a4f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint16ToN64_c(pcD)) )
1178 #define vld1q_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200a8f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint32ToN64_c(pcD)) )
1179 #define vld1q_u64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint64ToN64_c(pcD)) )
1180 #define vld1q_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4200a0f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint8ToN64_c(pcD)) )
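// Illustrative sketch, not part of the original header: vld1/vld1q load a whole
// 64-bit or 128-bit vector of consecutive elements; the _ex variants additionally
// encode an alignment hint for the pointer. Hypothetical helper name.
__inline __n128 example_load_four_floats(const float32_t *p)
{
    return vld1q_f32(p); /* four consecutive single-precision values from p */
}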
1181 
1182 // VLD1 (single element to all lanes)
1183 #define vld1_dup_f32(pcD) ( __neon_D1Adr( 0xf4a00c8f, __float32ToN64_c(pcD)) )
1184 #define vld1_dup_p16(pcD) ( __neon_D1Adr( 0xf4a00c4f, __poly16ToN64_c(pcD)) )
1185 #define vld1_dup_p8(pcD) ( __neon_D1Adr( 0xf4a00c0f, __poly8ToN64_c(pcD)) )
1186 #define vld1_dup_s16(pcD) ( __neon_D1Adr( 0xf4a00c4f, __int16ToN64_c(pcD)) )
1187 #define vld1_dup_s32(pcD) ( __neon_D1Adr( 0xf4a00c8f, __int32ToN64_c(pcD)) )
1188 #define vld1_dup_s8(pcD) ( __neon_D1Adr( 0xf4a00c0f, __int8ToN64_c(pcD)) )
1189 #define vld1_dup_u16(pcD) ( __neon_D1Adr( 0xf4a00c4f, __uint16ToN64_c(pcD)) )
1190 #define vld1_dup_u32(pcD) ( __neon_D1Adr( 0xf4a00c8f, __uint32ToN64_c(pcD)) )
1191 #define vld1_dup_u8(pcD) ( __neon_D1Adr( 0xf4a00c0f, __uint8ToN64_c(pcD)) )
1192 #define vld1q_dup_f32(pcD) ( __neon_Q1Adr( 0xf4a00caf, __float32ToN64_c(pcD)) )
1193 #define vld1q_dup_p16(pcD) ( __neon_Q1Adr( 0xf4a00c6f, __poly16ToN64_c(pcD)) )
1194 #define vld1q_dup_p8(pcD) ( __neon_Q1Adr( 0xf4a00c2f, __poly8ToN64_c(pcD)) )
1195 #define vld1q_dup_s16(pcD) ( __neon_Q1Adr( 0xf4a00c6f, __int16ToN64_c(pcD)) )
1196 #define vld1q_dup_s32(pcD) ( __neon_Q1Adr( 0xf4a00caf, __int32ToN64_c(pcD)) )
1197 #define vld1q_dup_s8(pcD) ( __neon_Q1Adr( 0xf4a00c2f, __int8ToN64_c(pcD)) )
1198 #define vld1q_dup_u16(pcD) ( __neon_Q1Adr( 0xf4a00c6f, __uint16ToN64_c(pcD)) )
1199 #define vld1q_dup_u32(pcD) ( __neon_Q1Adr( 0xf4a00caf, __uint32ToN64_c(pcD)) )
1200 #define vld1q_dup_u8(pcD) ( __neon_Q1Adr( 0xf4a00c2f, __uint8ToN64_c(pcD)) )
1201 
1202 // VLD1 (single element to all lanes, aligned)
1203 #define vld1_dup_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_D1Adr( 0xf4a00c8f | _NENC_4(_NEON_ALIGN32(align)), __float32ToN64_c(pcD)) )
1204 #define vld1_dup_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_D1Adr( 0xf4a00c4f | _NENC_4(_NEON_ALIGN16(align)), __poly16ToN64_c(pcD)) )
1205 #define vld1_dup_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_D1Adr( 0xf4a00c4f | _NENC_4(_NEON_ALIGN16(align)), __int16ToN64_c(pcD)) )
1206 #define vld1_dup_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_D1Adr( 0xf4a00c8f | _NENC_4(_NEON_ALIGN32(align)), __int32ToN64_c(pcD)) )
1207 #define vld1_dup_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_D1Adr( 0xf4a00c4f | _NENC_4(_NEON_ALIGN16(align)), __uint16ToN64_c(pcD)) )
1208 #define vld1_dup_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_D1Adr( 0xf4a00c8f | _NENC_4(_NEON_ALIGN32(align)), __uint32ToN64_c(pcD)) )
1209 #define vld1q_dup_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4a00caf | _NENC_4(_NEON_ALIGN32(align)), __float32ToN64_c(pcD)) )
1210 #define vld1q_dup_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4a00c6f | _NENC_4(_NEON_ALIGN16(align)), __poly16ToN64_c(pcD)) )
1211 #define vld1q_dup_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4a00c6f | _NENC_4(_NEON_ALIGN16(align)), __int16ToN64_c(pcD)) )
1212 #define vld1q_dup_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4a00caf | _NENC_4(_NEON_ALIGN32(align)), __int32ToN64_c(pcD)) )
1213 #define vld1q_dup_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4a00c6f | _NENC_4(_NEON_ALIGN16(align)), __uint16ToN64_c(pcD)) )
1214 #define vld1q_dup_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Q1Adr( 0xf4a00caf | _NENC_4(_NEON_ALIGN32(align)), __uint32ToN64_c(pcD)) )
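// Illustrative sketch, not part of the original header: the dup forms read a single
// element from memory and replicate it into every lane, with optional _ex alignment
// hints for the 16- and 32-bit element sizes. Hypothetical helper name.
__inline __n128 example_broadcast_from_memory(const float32_t *p)
{
    return vld1q_dup_f32(p); /* *p replicated into all four float lanes */
}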
1215 
1216 // VLD1 (single element to one lane)
1217 #define vld1_lane_f32(pcD, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_D1Adr_acc( 0xf4a0080f | _NENC_7(lane), (Dd), __float32ToN64_c(pcD)) )
1218 #define vld1_lane_p16(pcD, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_D1Adr_acc( 0xf4a0040f | _NENC_7_6(lane), (Dd), __poly16ToN64_c(pcD)) )
1219 #define vld1_lane_p8(pcD, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_D1Adr_acc( 0xf4a0000f | _NENC_7_5(lane), (Dd), __poly8ToN64_c(pcD)) )
1220 #define vld1_lane_s16(pcD, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_D1Adr_acc( 0xf4a0040f | _NENC_7_6(lane), (Dd), __int16ToN64_c(pcD)) )
1221 #define vld1_lane_s32(pcD, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_D1Adr_acc( 0xf4a0080f | _NENC_7(lane), (Dd), __int32ToN64_c(pcD)) )
1222 #define vld1_lane_s8(pcD, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_D1Adr_acc( 0xf4a0000f | _NENC_7_5(lane), (Dd), __int8ToN64_c(pcD)) )
1223 #define vld1_lane_u16(pcD, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_D1Adr_acc( 0xf4a0040f | _NENC_7_6(lane), (Dd), __uint16ToN64_c(pcD)) )
1224 #define vld1_lane_u32(pcD, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_D1Adr_acc( 0xf4a0080f | _NENC_7(lane), (Dd), __uint32ToN64_c(pcD)) )
1225 #define vld1_lane_u8(pcD, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_D1Adr_acc( 0xf4a0000f | _NENC_7_5(lane), (Dd), __uint8ToN64_c(pcD)) )
1226 #define vld1q_lane_f32(pcD, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Q1Adr_acc( 0xf4a0080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Qd), __float32ToN64_c(pcD)) )
1227 #define vld1q_lane_p16(pcD, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Q1Adr_acc( 0xf4a0040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Qd), __poly16ToN64_c(pcD)) )
1228 #define vld1q_lane_p8(pcD, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), __neon_Q1Adr_acc( 0xf4a0000f | _NENC_7_5((lane) % 8) | _NENC_12((lane) >= 8 ? 1 : 0), (Qd), __poly8ToN64_c(pcD)) )
1229 #define vld1q_lane_s16(pcD, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Q1Adr_acc( 0xf4a0040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Qd), __int16ToN64_c(pcD)) )
1230 #define vld1q_lane_s32(pcD, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Q1Adr_acc( 0xf4a0080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Qd), __int32ToN64_c(pcD)) )
1231 #define vld1q_lane_s8(pcD, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), __neon_Q1Adr_acc( 0xf4a0000f | _NENC_7_5((lane) % 8) | _NENC_12((lane) >= 8 ? 1 : 0), (Qd), __int8ToN64_c(pcD)) )
1232 #define vld1q_lane_u16(pcD, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Q1Adr_acc( 0xf4a0040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Qd), __uint16ToN64_c(pcD)) )
1233 #define vld1q_lane_u32(pcD, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Q1Adr_acc( 0xf4a0080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Qd), __uint32ToN64_c(pcD)) )
1234 #define vld1q_lane_u8(pcD, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), __neon_Q1Adr_acc( 0xf4a0000f | _NENC_7_5((lane) % 8) | _NENC_12((lane) >= 8 ? 1 : 0), (Qd), __uint8ToN64_c(pcD)) )
1235 
1236 // VLD1 (single element to one lane, aligned)
1237 #define vld1_lane_f32_ex(pcD, Dd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_D1Adr_acc( 0xf4a0080f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), (Dd), __float32ToN64_c(pcD)) )
1238 #define vld1_lane_p16_ex(pcD, Dd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_D1Adr_acc( 0xf4a0040f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN16(align)), (Dd), __poly16ToN64_c(pcD)) )
1239 #define vld1_lane_s16_ex(pcD, Dd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_D1Adr_acc( 0xf4a0040f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN16(align)), (Dd), __int16ToN64_c(pcD)) )
1240 #define vld1_lane_s32_ex(pcD, Dd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_D1Adr_acc( 0xf4a0080f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), (Dd), __int32ToN64_c(pcD)) )
1241 #define vld1_lane_u16_ex(pcD, Dd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_D1Adr_acc( 0xf4a0040f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN16(align)), (Dd), __uint16ToN64_c(pcD)) )
1242 #define vld1_lane_u32_ex(pcD, Dd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_D1Adr_acc( 0xf4a0080f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), (Dd), __uint32ToN64_c(pcD)) )
1243 #define vld1q_lane_f32_ex(pcD, Qd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Q1Adr_acc( 0xf4a0080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), (Qd), __float32ToN64_c(pcD)) )
1244 #define vld1q_lane_p16_ex(pcD, Qd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Q1Adr_acc( 0xf4a0040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN16(align)), (Qd), __poly16ToN64_c(pcD)) )
1245 #define vld1q_lane_s16_ex(pcD, Qd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Q1Adr_acc( 0xf4a0040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN16(align)), (Qd), __int16ToN64_c(pcD)) )
1246 #define vld1q_lane_s32_ex(pcD, Qd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Q1Adr_acc( 0xf4a0080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), (Qd), __int32ToN64_c(pcD)) )
1247 #define vld1q_lane_u16_ex(pcD, Qd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Q1Adr_acc( 0xf4a0040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN16(align)), (Qd), __uint16ToN64_c(pcD)) )
1248 #define vld1q_lane_u32_ex(pcD, Qd, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Q1Adr_acc( 0xf4a0080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), (Qd), __uint32ToN64_c(pcD)) )
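// Illustrative sketch, not part of the original header: the lane forms load one
// element into a chosen lane of an existing vector and leave the other lanes
// untouched; the lane index must be a compile-time constant. Hypothetical helper
// name.
__inline __n64 example_insert_lane2(__n64 v, const int16_t *p)
{
    return vld1_lane_s16(p, v, 2); /* replace lane 2 of v with *p */
}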
1249 
1250 // VLD2 (multiple 2-element structures)
1251 #define vld2_f32(pcD) ( __neon_Dx2Adr( 0xf420088f, __float32ToN64_c(pcD)) )
1252 #define vld2_p16(pcD) ( __neon_Dx2Adr( 0xf420084f, __poly16ToN64_c(pcD)) )
1253 #define vld2_p8(pcD) ( __neon_Dx2Adr( 0xf420080f, __poly8ToN64_c(pcD)) )
1254 #define vld2_s16(pcD) ( __neon_Dx2Adr( 0xf420084f, __int16ToN64_c(pcD)) )
1255 #define vld2_s32(pcD) ( __neon_Dx2Adr( 0xf420088f, __int32ToN64_c(pcD)) )
1256 #define vld2_s8(pcD) ( __neon_Dx2Adr( 0xf420080f, __int8ToN64_c(pcD)) )
1257 #define vld2_u16(pcD) ( __neon_Dx2Adr( 0xf420084f, __uint16ToN64_c(pcD)) )
1258 #define vld2_u32(pcD) ( __neon_Dx2Adr( 0xf420088f, __uint32ToN64_c(pcD)) )
1259 #define vld2_u8(pcD) ( __neon_Dx2Adr( 0xf420080f, __uint8ToN64_c(pcD)) )
1260 #define vld2_s64(pcD) ( __neon_Dx2Adr( 0xf4200acf, __int64ToN64_c(pcD)) )
1261 #define vld2_u64(pcD) ( __neon_Dx2Adr( 0xf4200acf, __uint64ToN64_c(pcD)) )
1262 #define vld2_s64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4200acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __int64ToN64_c(pcD)) )
1263 #define vld2_u64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4200acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint64ToN64_c(pcD)) )
1264 #define vld2_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf420088f | _NENC_5_4(_NEON_ALIGN64_128(align)), __float32ToN64_c(pcD)) )
1265 #define vld2_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf420084f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly16ToN64_c(pcD)) )
1266 #define vld2_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf420080f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly8ToN64_c(pcD)) )
1267 #define vld2_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf420084f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int16ToN64_c(pcD)) )
1268 #define vld2_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf420088f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int32ToN64_c(pcD)) )
1269 #define vld2_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf420080f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int8ToN64_c(pcD)) )
1270 #define vld2_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf420084f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint16ToN64_c(pcD)) )
1271 #define vld2_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf420088f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint32ToN64_c(pcD)) )
1272 #define vld2_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf420080f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint8ToN64_c(pcD)) )
1273 #define vld2q_f32(pcD) ( __neon_Qx2Adr( 0xf420098f, __float32ToN64_c(pcD)) )
1274 #define vld2q_p16(pcD) ( __neon_Qx2Adr( 0xf420094f, __poly16ToN64_c(pcD)) )
1275 #define vld2q_p8(pcD) ( __neon_Qx2Adr( 0xf420090f, __poly8ToN64_c(pcD)) )
1276 #define vld2q_s16(pcD) ( __neon_Qx2Adr( 0xf420094f, __int16ToN64_c(pcD)) )
1277 #define vld2q_s32(pcD) ( __neon_Qx2Adr( 0xf420098f, __int32ToN64_c(pcD)) )
1278 #define vld2q_s8(pcD) ( __neon_Qx2Adr( 0xf420090f, __int8ToN64_c(pcD)) )
1279 #define vld2q_u16(pcD) ( __neon_Qx2Adr( 0xf420094f, __uint16ToN64_c(pcD)) )
1280 #define vld2q_u32(pcD) ( __neon_Qx2Adr( 0xf420098f, __uint32ToN64_c(pcD)) )
1281 #define vld2q_u8(pcD) ( __neon_Qx2Adr( 0xf420090f, __uint8ToN64_c(pcD)) )
1282 #define vld2q_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx2Adr( 0xf420098f | _NENC_5_4(_NEON_ALIGN64_128(align)), __float32ToN64_c(pcD)) )
1283 #define vld2q_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx2Adr( 0xf420094f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly16ToN64_c(pcD)) )
1284 #define vld2q_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx2Adr( 0xf420090f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly8ToN64_c(pcD)) )
1285 #define vld2q_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx2Adr( 0xf420094f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int16ToN64_c(pcD)) )
1286 #define vld2q_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx2Adr( 0xf420098f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int32ToN64_c(pcD)) )
1287 #define vld2q_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx2Adr( 0xf420090f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int8ToN64_c(pcD)) )
1288 #define vld2q_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx2Adr( 0xf420094f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint16ToN64_c(pcD)) )
1289 #define vld2q_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx2Adr( 0xf420098f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint32ToN64_c(pcD)) )
1290 #define vld2q_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx2Adr( 0xf420090f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint8ToN64_c(pcD)) )
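// Illustrative sketch, not part of the original header: vld2/vld2q de-interleave
// consecutive 2-element structures, so pairs such as {x,y} stored contiguously come
// back as one vector of x values (val[0]) and one of y values (val[1]).
// Hypothetical helper name.
__inline __n64x2 example_deinterleave_pairs(const uint8_t *p)
{
    return vld2_u8(p); /* val[0] = even-indexed bytes, val[1] = odd-indexed bytes */
}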
1291 
1292 // VLD2 (single 2-element structure to all lanes)
1293 #define vld2_dup_f32(pcD) ( __neon_Dx2Adr( 0xf4a00d8f, __float32ToN64_c(pcD)) )
1294 #define vld2_dup_p16(pcD) ( __neon_Dx2Adr( 0xf4a00d4f, __poly16ToN64_c(pcD)) )
1295 #define vld2_dup_p8(pcD) ( __neon_Dx2Adr( 0xf4a00d0f, __poly8ToN64_c(pcD)) )
1296 #define vld2_dup_s16(pcD) ( __neon_Dx2Adr( 0xf4a00d4f, __int16ToN64_c(pcD)) )
1297 #define vld2_dup_s32(pcD) ( __neon_Dx2Adr( 0xf4a00d8f, __int32ToN64_c(pcD)) )
1298 #define vld2_dup_s8(pcD) ( __neon_Dx2Adr( 0xf4a00d0f, __int8ToN64_c(pcD)) )
1299 #define vld2_dup_u16(pcD) ( __neon_Dx2Adr( 0xf4a00d4f, __uint16ToN64_c(pcD)) )
1300 #define vld2_dup_u32(pcD) ( __neon_Dx2Adr( 0xf4a00d8f, __uint32ToN64_c(pcD)) )
1301 #define vld2_dup_u8(pcD) ( __neon_Dx2Adr( 0xf4a00d0f, __uint8ToN64_c(pcD)) )
1302 #define vld2_dup_s64(pcD) ( __neon_Dx2Adr( 0xf4200acf, __int64ToN64_c(pcD)) )
1303 #define vld2_dup_u64(pcD) ( __neon_Dx2Adr( 0xf4200acf, __uint64ToN64_c(pcD)) )
1304 
1305 // VLD2 (single 2-element structure to all lanes, aligned)
1306 #define vld2_dup_s64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4200acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __int64ToN64_c(pcD)) )
1307 #define vld2_dup_u64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4200acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint64ToN64_c(pcD)) )
1308 #define vld2_dup_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4a00d8f | _NENC_4(_NEON_ALIGN64(align)), __float32ToN64_c(pcD)) )
1309 #define vld2_dup_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4a00d4f | _NENC_4(_NEON_ALIGN32(align)), __poly16ToN64_c(pcD)) )
1310 #define vld2_dup_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4a00d0f | _NENC_4(_NEON_ALIGN16(align)), __poly8ToN64_c(pcD)) )
1311 #define vld2_dup_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4a00d4f | _NENC_4(_NEON_ALIGN32(align)), __int16ToN64_c(pcD)) )
1312 #define vld2_dup_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4a00d8f | _NENC_4(_NEON_ALIGN64(align)), __int32ToN64_c(pcD)) )
1313 #define vld2_dup_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4a00d0f | _NENC_4(_NEON_ALIGN16(align)), __int8ToN64_c(pcD)) )
1314 #define vld2_dup_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4a00d4f | _NENC_4(_NEON_ALIGN32(align)), __uint16ToN64_c(pcD)) )
1315 #define vld2_dup_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4a00d8f | _NENC_4(_NEON_ALIGN64(align)), __uint32ToN64_c(pcD)) )
1316 #define vld2_dup_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Dx2Adr( 0xf4a00d0f | _NENC_4(_NEON_ALIGN16(align)), __uint8ToN64_c(pcD)) )
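// Illustrative sketch, not part of the original header: the dup forms load one
// 2-element structure and replicate its members across all lanes of the two result
// vectors. Hypothetical helper name.
__inline __n64x2 example_broadcast_pair(const float32_t *p)
{
    return vld2_dup_f32(p); /* val[0] = { p[0], p[0] }, val[1] = { p[1], p[1] } */
}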
1317 
1318 // VLD2 (single 2-element structure to one lane)
1319 #define vld2_lane_f32(pcD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_Dx2Adr_acc( 0xf4a0090f | _NENC_7(lane), (D2), __float32ToN64_c(pcD)) )
1320 #define vld2_lane_p16(pcD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Dx2Adr_acc( 0xf4a0050f | _NENC_7_6(lane), (D2), __poly16ToN64_c(pcD)) )
1321 #define vld2_lane_p8(pcD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Dx2Adr_acc( 0xf4a0010f | _NENC_7_5(lane), (D2), __poly8ToN64_c(pcD)) )
1322 #define vld2_lane_s16(pcD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Dx2Adr_acc( 0xf4a0050f | _NENC_7_6(lane), (D2), __int16ToN64_c(pcD)) )
1323 #define vld2_lane_s32(pcD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_Dx2Adr_acc( 0xf4a0090f | _NENC_7(lane), (D2), __int32ToN64_c(pcD)) )
1324 #define vld2_lane_s8(pcD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Dx2Adr_acc( 0xf4a0010f | _NENC_7_5(lane), (D2), __int8ToN64_c(pcD)) )
1325 #define vld2_lane_u16(pcD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Dx2Adr_acc( 0xf4a0050f | _NENC_7_6(lane), (D2), __uint16ToN64_c(pcD)) )
1326 #define vld2_lane_u32(pcD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_Dx2Adr_acc( 0xf4a0090f | _NENC_7(lane), (D2), __uint32ToN64_c(pcD)) )
1327 #define vld2_lane_u8(pcD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Dx2Adr_acc( 0xf4a0010f | _NENC_7_5(lane), (D2), __uint8ToN64_c(pcD)) )
1328 #define vld2q_lane_f32(pcD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Qx2Adr_acc( 0xf4a0094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Q2), __float32ToN64_c(pcD)) )
1329 #define vld2q_lane_p16(pcD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Qx2Adr_acc( 0xf4a0052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Q2), __poly16ToN64_c(pcD)) )
1330 #define vld2q_lane_s16(pcD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Qx2Adr_acc( 0xf4a0052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Q2), __int16ToN64_c(pcD)) )
1331 #define vld2q_lane_s32(pcD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Qx2Adr_acc( 0xf4a0094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Q2), __int32ToN64_c(pcD)) )
1332 #define vld2q_lane_u16(pcD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Qx2Adr_acc( 0xf4a0052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Q2), __uint16ToN64_c(pcD)) )
1333 #define vld2q_lane_u32(pcD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Qx2Adr_acc( 0xf4a0094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Q2), __uint32ToN64_c(pcD)) )
1334 
1335 // VLD2 (single 2-element structure to one lane, aligned)
1336 #define vld2_lane_f32_ex(pcD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx2Adr_acc( 0xf4a0090f | _NENC_7(lane) | _NENC_4(_NEON_ALIGN64(align)), (D2), __float32ToN64_c(pcD)) )
1337 #define vld2_lane_p16_ex(pcD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx2Adr_acc( 0xf4a0050f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN32(align)), (D2), __poly16ToN64_c(pcD)) )
1338 #define vld2_lane_p8_ex(pcD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Dx2Adr_acc( 0xf4a0010f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN16(align)), (D2), __poly8ToN64_c(pcD)) )
1339 #define vld2_lane_s16_ex(pcD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx2Adr_acc( 0xf4a0050f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN32(align)), (D2), __int16ToN64_c(pcD)) )
1340 #define vld2_lane_s32_ex(pcD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx2Adr_acc( 0xf4a0090f | _NENC_7(lane) | _NENC_4(_NEON_ALIGN64(align)), (D2), __int32ToN64_c(pcD)) )
1341 #define vld2_lane_s8_ex(pcD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Dx2Adr_acc( 0xf4a0010f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN16(align)), (D2), __int8ToN64_c(pcD)) )
1342 #define vld2_lane_u16_ex(pcD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx2Adr_acc( 0xf4a0050f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN32(align)), (D2), __uint16ToN64_c(pcD)) )
1343 #define vld2_lane_u32_ex(pcD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx2Adr_acc( 0xf4a0090f | _NENC_7(lane) | _NENC_4(_NEON_ALIGN64(align)), (D2), __uint32ToN64_c(pcD)) )
1344 #define vld2_lane_u8_ex(pcD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_Dx2Adr_acc( 0xf4a0010f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN16(align)), (D2), __uint8ToN64_c(pcD)) )
1345 #define vld2q_lane_f32_ex(pcD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx2Adr_acc( 0xf4a0094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), (Q2), __float32ToN64_c(pcD)) )
1346 #define vld2q_lane_p16_ex(pcD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Qx2Adr_acc( 0xf4a0052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN32(align)), (Q2), __poly16ToN64_c(pcD)) )
1347 #define vld2q_lane_s16_ex(pcD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Qx2Adr_acc( 0xf4a0052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN32(align)), (Q2), __int16ToN64_c(pcD)) )
1348 #define vld2q_lane_s32_ex(pcD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx2Adr_acc( 0xf4a0094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), (Q2), __int32ToN64_c(pcD)) )
1349 #define vld2q_lane_u16_ex(pcD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Qx2Adr_acc( 0xf4a0052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN32(align)), (Q2), __uint16ToN64_c(pcD)) )
1350 #define vld2q_lane_u32_ex(pcD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx2Adr_acc( 0xf4a0094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), (Q2), __uint32ToN64_c(pcD)) )
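// Illustrative sketch, not part of the original header: the lane forms load one
// 2-element structure into the selected lane of an existing register pair, leaving
// all other lanes unchanged. Hypothetical helper name.
__inline __n64x2 example_load_pair_into_lane1(__n64x2 acc, const int16_t *p)
{
    return vld2_lane_s16(p, acc, 1); /* p[0] -> lane 1 of val[0], p[1] -> lane 1 of val[1] */
}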
1351 
1352 // VLD3 (multiple 3-element structures)
1353 #define vld3_f32(pcD) ( __neon_Dx3Adr( 0xf420048f, __float32ToN64_c(pcD)) )
1354 #define vld3_p16(pcD) ( __neon_Dx3Adr( 0xf420044f, __poly16ToN64_c(pcD)) )
1355 #define vld3_p8(pcD) ( __neon_Dx3Adr( 0xf420040f, __poly8ToN64_c(pcD)) )
1356 #define vld3_s16(pcD) ( __neon_Dx3Adr( 0xf420044f, __int16ToN64_c(pcD)) )
1357 #define vld3_s32(pcD) ( __neon_Dx3Adr( 0xf420048f, __int32ToN64_c(pcD)) )
1358 #define vld3_s8(pcD) ( __neon_Dx3Adr( 0xf420040f, __int8ToN64_c(pcD)) )
1359 #define vld3_u16(pcD) ( __neon_Dx3Adr( 0xf420044f, __uint16ToN64_c(pcD)) )
1360 #define vld3_u32(pcD) ( __neon_Dx3Adr( 0xf420048f, __uint32ToN64_c(pcD)) )
1361 #define vld3_u8(pcD) ( __neon_Dx3Adr( 0xf420040f, __uint8ToN64_c(pcD)) )
1362 #define vld3_s64(pcD) ( __neon_Dx3Adr( 0xf42006cf, __int64ToN64_c(pcD)) )
1363 #define vld3_u64(pcD) ( __neon_Dx3Adr( 0xf42006cf, __uint64ToN64_c(pcD)) )
1364 #define vld3_s64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf42006cf | _NENC_5_4(_NEON_ALIGN64(align)), __int64ToN64_c(pcD)) )
1365 #define vld3_u64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf42006cf | _NENC_5_4(_NEON_ALIGN64(align)), __uint64ToN64_c(pcD)) )
1366 #define vld3_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf420048f | _NENC_5_4(_NEON_ALIGN64(align)), __float32ToN64_c(pcD)) )
1367 #define vld3_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf420044f | _NENC_5_4(_NEON_ALIGN64(align)), __poly16ToN64_c(pcD)) )
1368 #define vld3_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf420040f | _NENC_5_4(_NEON_ALIGN64(align)), __poly8ToN64_c(pcD)) )
1369 #define vld3_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf420044f | _NENC_5_4(_NEON_ALIGN64(align)), __int16ToN64_c(pcD)) )
1370 #define vld3_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf420048f | _NENC_5_4(_NEON_ALIGN64(align)), __int32ToN64_c(pcD)) )
1371 #define vld3_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf420040f | _NENC_5_4(_NEON_ALIGN64(align)), __int8ToN64_c(pcD)) )
1372 #define vld3_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf420044f | _NENC_5_4(_NEON_ALIGN64(align)), __uint16ToN64_c(pcD)) )
1373 #define vld3_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf420048f | _NENC_5_4(_NEON_ALIGN64(align)), __uint32ToN64_c(pcD)) )
1374 #define vld3_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx3Adr( 0xf420040f | _NENC_5_4(_NEON_ALIGN64(align)), __uint8ToN64_c(pcD)) )
1375 #define vld3q_f32(pcD) ( __neon_Qx3Adr( 0xf420058f, __float32ToN64_c(pcD)) )
1376 #define vld3q_p16(pcD) ( __neon_Qx3Adr( 0xf420054f, __poly16ToN64_c(pcD)) )
1377 #define vld3q_p8(pcD) ( __neon_Qx3Adr( 0xf420050f, __poly8ToN64_c(pcD)) )
1378 #define vld3q_s16(pcD) ( __neon_Qx3Adr( 0xf420054f, __int16ToN64_c(pcD)) )
1379 #define vld3q_s32(pcD) ( __neon_Qx3Adr( 0xf420058f, __int32ToN64_c(pcD)) )
1380 #define vld3q_s8(pcD) ( __neon_Qx3Adr( 0xf420050f, __int8ToN64_c(pcD)) )
1381 #define vld3q_u16(pcD) ( __neon_Qx3Adr( 0xf420054f, __uint16ToN64_c(pcD)) )
1382 #define vld3q_u32(pcD) ( __neon_Qx3Adr( 0xf420058f, __uint32ToN64_c(pcD)) )
1383 #define vld3q_u8(pcD) ( __neon_Qx3Adr( 0xf420050f, __uint8ToN64_c(pcD)) )
1384 #define vld3q_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx3Adr( 0xf420058f | _NENC_5_4(_NEON_ALIGN64(align)), __float32ToN64_c(pcD)) )
1385 #define vld3q_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx3Adr( 0xf420054f | _NENC_5_4(_NEON_ALIGN64(align)), __poly16ToN64_c(pcD)) )
1386 #define vld3q_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx3Adr( 0xf420050f | _NENC_5_4(_NEON_ALIGN64(align)), __poly8ToN64_c(pcD)) )
1387 #define vld3q_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx3Adr( 0xf420054f | _NENC_5_4(_NEON_ALIGN64(align)), __int16ToN64_c(pcD)) )
1388 #define vld3q_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx3Adr( 0xf420058f | _NENC_5_4(_NEON_ALIGN64(align)), __int32ToN64_c(pcD)) )
1389 #define vld3q_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx3Adr( 0xf420050f | _NENC_5_4(_NEON_ALIGN64(align)), __int8ToN64_c(pcD)) )
1390 #define vld3q_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx3Adr( 0xf420054f | _NENC_5_4(_NEON_ALIGN64(align)), __uint16ToN64_c(pcD)) )
1391 #define vld3q_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx3Adr( 0xf420058f | _NENC_5_4(_NEON_ALIGN64(align)), __uint32ToN64_c(pcD)) )
1392 #define vld3q_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx3Adr( 0xf420050f | _NENC_5_4(_NEON_ALIGN64(align)), __uint8ToN64_c(pcD)) )
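// Illustrative sketch, not part of the original header: vld3/vld3q extend the same
// de-interleaving pattern to 3-element structures (e.g. packed RGB), returning
// three planar vectors; the dup and lane variants below follow the VLD2 forms
// above. Hypothetical helper name.
__inline __n64x3 example_deinterleave_rgb(const uint8_t *p)
{
    return vld3_u8(p); /* val[0]=R, val[1]=G, val[2]=B for 8 pixels */
}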
1393 
1394 // VLD3 (single 3-element structure to all lanes)
1395 #define vld3_dup_f32(pcD) ( __neon_Dx3Adr( 0xf4a00e8f, __float32ToN64_c(pcD)) )
1396 #define vld3_dup_p16(pcD) ( __neon_Dx3Adr( 0xf4a00e4f, __poly16ToN64_c(pcD)) )
1397 #define vld3_dup_p8(pcD) ( __neon_Dx3Adr( 0xf4a00e0f, __poly8ToN64_c(pcD)) )
1398 #define vld3_dup_s16(pcD) ( __neon_Dx3Adr( 0xf4a00e4f, __int16ToN64_c(pcD)) )
1399 #define vld3_dup_s32(pcD) ( __neon_Dx3Adr( 0xf4a00e8f, __int32ToN64_c(pcD)) )
1400 #define vld3_dup_s8(pcD) ( __neon_Dx3Adr( 0xf4a00e0f, __int8ToN64_c(pcD)) )
1401 #define vld3_dup_u16(pcD) ( __neon_Dx3Adr( 0xf4a00e4f, __uint16ToN64_c(pcD)) )
1402 #define vld3_dup_u32(pcD) ( __neon_Dx3Adr( 0xf4a00e8f, __uint32ToN64_c(pcD)) )
1403 #define vld3_dup_u8(pcD) ( __neon_Dx3Adr( 0xf4a00e0f, __uint8ToN64_c(pcD)) )
1404 #define vld3_dup_s64(pcD) ( __neon_Dx3Adr( 0xf42006cf, __int64ToN64_c(pcD)) )
1405 #define vld3_dup_u64(pcD) ( __neon_Dx3Adr( 0xf42006cf, __uint64ToN64_c(pcD)) )
1406 
1407 // VLD3 (single 3-element structure to one lane)
1408 #define vld3_lane_f32(pcD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_Dx3Adr_acc( 0xf4a00a0f | _NENC_7(lane), (D3), __float32ToN64_c(pcD)) )
1409 #define vld3_lane_p16(pcD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Dx3Adr_acc( 0xf4a0060f | _NENC_7_6(lane), (D3), __poly16ToN64_c(pcD)) )
1410 #define vld3_lane_p8(pcD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Dx3Adr_acc( 0xf4a0020f | _NENC_7_5(lane), (D3), __poly8ToN64_c(pcD)) )
1411 #define vld3_lane_s16(pcD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Dx3Adr_acc( 0xf4a0060f | _NENC_7_6(lane), (D3), __int16ToN64_c(pcD)) )
1412 #define vld3_lane_s32(pcD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_Dx3Adr_acc( 0xf4a00a0f | _NENC_7(lane), (D3), __int32ToN64_c(pcD)) )
1413 #define vld3_lane_s8(pcD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Dx3Adr_acc( 0xf4a0020f | _NENC_7_5(lane), (D3), __int8ToN64_c(pcD)) )
1414 #define vld3_lane_u16(pcD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Dx3Adr_acc( 0xf4a0060f | _NENC_7_6(lane), (D3), __uint16ToN64_c(pcD)) )
1415 #define vld3_lane_u32(pcD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_Dx3Adr_acc( 0xf4a00a0f | _NENC_7(lane), (D3), __uint32ToN64_c(pcD)) )
1416 #define vld3_lane_u8(pcD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Dx3Adr_acc( 0xf4a0020f | _NENC_7_5(lane), (D3), __uint8ToN64_c(pcD)) )
1417 #define vld3q_lane_f32(pcD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Qx3Adr_acc( 0xf4a00a4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Q3), __float32ToN64_c(pcD)) )
1418 #define vld3q_lane_p16(pcD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Qx3Adr_acc( 0xf4a0062f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Q3), __poly16ToN64_c(pcD)) )
1419 #define vld3q_lane_s16(pcD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Qx3Adr_acc( 0xf4a0062f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Q3), __int16ToN64_c(pcD)) )
1420 #define vld3q_lane_s32(pcD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Qx3Adr_acc( 0xf4a00a4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Q3), __int32ToN64_c(pcD)) )
1421 #define vld3q_lane_u16(pcD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Qx3Adr_acc( 0xf4a0062f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Q3), __uint16ToN64_c(pcD)) )
1422 #define vld3q_lane_u32(pcD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Qx3Adr_acc( 0xf4a00a4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Q3), __uint32ToN64_c(pcD)) )
1423 
1424 // VLD4 (multiple 4-element structures)
1425 #define vld4_f32(pcD) ( __neon_Dx4Adr( 0xf420008f, __float32ToN64_c(pcD)) )
1426 #define vld4_p16(pcD) ( __neon_Dx4Adr( 0xf420004f, __poly16ToN64_c(pcD)) )
1427 #define vld4_p8(pcD) ( __neon_Dx4Adr( 0xf420000f, __poly8ToN64_c(pcD)) )
1428 #define vld4_s16(pcD) ( __neon_Dx4Adr( 0xf420004f, __int16ToN64_c(pcD)) )
1429 #define vld4_s32(pcD) ( __neon_Dx4Adr( 0xf420008f, __int32ToN64_c(pcD)) )
1430 #define vld4_s8(pcD) ( __neon_Dx4Adr( 0xf420000f, __int8ToN64_c(pcD)) )
1431 #define vld4_u16(pcD) ( __neon_Dx4Adr( 0xf420004f, __uint16ToN64_c(pcD)) )
1432 #define vld4_u32(pcD) ( __neon_Dx4Adr( 0xf420008f, __uint32ToN64_c(pcD)) )
1433 #define vld4_u8(pcD) ( __neon_Dx4Adr( 0xf420000f, __uint8ToN64_c(pcD)) )
1434 #define vld4_s64(pcD) ( __neon_Dx4Adr( 0xf42002cf, __int64ToN64_c(pcD)) )
1435 #define vld4_u64(pcD) ( __neon_Dx4Adr( 0xf42002cf, __uint64ToN64_c(pcD)) )
1436 #define vld4_s64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf42002cf | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int64ToN64_c(pcD)) )
1437 #define vld4_u64_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf42002cf | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint64ToN64_c(pcD)) )
1438 #define vld4_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf420008f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __float32ToN64_c(pcD)) )
1439 #define vld4_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf420004f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __poly16ToN64_c(pcD)) )
1440 #define vld4_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf420000f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __poly8ToN64_c(pcD)) )
1441 #define vld4_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf420004f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int16ToN64_c(pcD)) )
1442 #define vld4_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf420008f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int32ToN64_c(pcD)) )
1443 #define vld4_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf420000f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int8ToN64_c(pcD)) )
1444 #define vld4_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf420004f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint16ToN64_c(pcD)) )
1445 #define vld4_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf420008f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint32ToN64_c(pcD)) )
1446 #define vld4_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf420000f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint8ToN64_c(pcD)) )
1447 #define vld4q_f32(pcD) ( __neon_Qx4Adr( 0xf420018f, __float32ToN64_c(pcD)) )
1448 #define vld4q_p16(pcD) ( __neon_Qx4Adr( 0xf420014f, __poly16ToN64_c(pcD)) )
1449 #define vld4q_p8(pcD) ( __neon_Qx4Adr( 0xf420010f, __poly8ToN64_c(pcD)) )
1450 #define vld4q_s16(pcD) ( __neon_Qx4Adr( 0xf420014f, __int16ToN64_c(pcD)) )
1451 #define vld4q_s32(pcD) ( __neon_Qx4Adr( 0xf420018f, __int32ToN64_c(pcD)) )
1452 #define vld4q_s8(pcD) ( __neon_Qx4Adr( 0xf420010f, __int8ToN64_c(pcD)) )
1453 #define vld4q_u16(pcD) ( __neon_Qx4Adr( 0xf420014f, __uint16ToN64_c(pcD)) )
1454 #define vld4q_u32(pcD) ( __neon_Qx4Adr( 0xf420018f, __uint32ToN64_c(pcD)) )
1455 #define vld4q_u8(pcD) ( __neon_Qx4Adr( 0xf420010f, __uint8ToN64_c(pcD)) )
1456 #define vld4q_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Qx4Adr( 0xf420018f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __float32ToN64_c(pcD)) )
1457 #define vld4q_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Qx4Adr( 0xf420014f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __poly16ToN64_c(pcD)) )
1458 #define vld4q_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Qx4Adr( 0xf420010f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __poly8ToN64_c(pcD)) )
1459 #define vld4q_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Qx4Adr( 0xf420014f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int16ToN64_c(pcD)) )
1460 #define vld4q_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Qx4Adr( 0xf420018f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int32ToN64_c(pcD)) )
1461 #define vld4q_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Qx4Adr( 0xf420010f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int8ToN64_c(pcD)) )
1462 #define vld4q_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Qx4Adr( 0xf420014f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint16ToN64_c(pcD)) )
1463 #define vld4q_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Qx4Adr( 0xf420018f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint32ToN64_c(pcD)) )
1464 #define vld4q_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_Qx4Adr( 0xf420010f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint8ToN64_c(pcD)) )
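
// Usage sketch (illustrative only; the helper name below is hypothetical):
// vld4q_u8 reads 64 interleaved bytes and de-interleaves them into the four
// uint8x16_t members of the returned structure, which is the usual way to
// split packed RGBA pixels into planes. Assumes the standard uint8x16_t /
// uint8x16x4_t typedefs and the VST1 intrinsics declared elsewhere in this header.
__inline void __example_split_rgba16(const uint8_t *rgba, uint8_t *r, uint8_t *g, uint8_t *b, uint8_t *a)
{
    uint8x16x4_t px = vld4q_u8(rgba);   // px.val[0]=R, val[1]=G, val[2]=B, val[3]=A
    vst1q_u8(r, px.val[0]);
    vst1q_u8(g, px.val[1]);
    vst1q_u8(b, px.val[2]);
    vst1q_u8(a, px.val[3]);
}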
1465 
1466 // VLD4 (single 4-element structure to all lanes)
1467 #define vld4_dup_f32(pcD) ( __neon_Dx4Adr( 0xf4a00f8f, __float32ToN64_c(pcD)) )
1468 #define vld4_dup_p16(pcD) ( __neon_Dx4Adr( 0xf4a00f4f, __poly16ToN64_c(pcD)) )
1469 #define vld4_dup_p8(pcD) ( __neon_Dx4Adr( 0xf4a00f0f, __poly8ToN64_c(pcD)) )
1470 #define vld4_dup_s16(pcD) ( __neon_Dx4Adr( 0xf4a00f4f, __int16ToN64_c(pcD)) )
1471 #define vld4_dup_s32(pcD) ( __neon_Dx4Adr( 0xf4a00f8f, __int32ToN64_c(pcD)) )
1472 #define vld4_dup_s8(pcD) ( __neon_Dx4Adr( 0xf4a00f0f, __int8ToN64_c(pcD)) )
1473 #define vld4_dup_u16(pcD) ( __neon_Dx4Adr( 0xf4a00f4f, __uint16ToN64_c(pcD)) )
1474 #define vld4_dup_u32(pcD) ( __neon_Dx4Adr( 0xf4a00f8f, __uint32ToN64_c(pcD)) )
1475 #define vld4_dup_u8(pcD) ( __neon_Dx4Adr( 0xf4a00f0f, __uint8ToN64_c(pcD)) )
1476 #define vld4_dup_s64(pcD) ( __neon_Dx4Adr( 0xf42002cf, __int64ToN64_c(pcD)) )
1477 #define vld4_dup_u64(pcD) ( __neon_Dx4Adr( 0xf42002cf, __uint64ToN64_c(pcD)) )
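
// Usage sketch (illustrative only; the helper name below is hypothetical):
// vld4_dup_f32 reads one 4-element structure (four floats) and replicates
// element i across both lanes of result.val[i], e.g. to broadcast a small
// coefficient set before a multiply-accumulate loop. Assumes the standard
// float32x2x4_t typedef declared elsewhere in this header.
__inline float32x2x4_t __example_broadcast_coeffs(const float32_t *coeffs /* 4 floats */)
{
    return vld4_dup_f32(coeffs);        // val[0]={c0,c0}, val[1]={c1,c1}, ...
}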
1478 
1479 // VLD4 (single 4-element structure to all lanes, aligned)
1480 #define vld4_dup_f32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf4a00f0f | _NENC_7_6(_NEON_ALIGN64_128(align) > 1 ? 3 : 2) | _NENC_4(_NEON_ALIGN64_128(align) > 0 ? 1 : 0), __float32ToN64_c(pcD)) )
1481 #define vld4_dup_p16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf4a00f4f | _NENC_4(_NEON_ALIGN64(align)), __poly16ToN64_c(pcD)) )
1482 #define vld4_dup_p8_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf4a00f0f | _NENC_4(_NEON_ALIGN32(align)), __poly8ToN64_c(pcD)) )
1483 #define vld4_dup_s16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf4a00f4f | _NENC_4(_NEON_ALIGN64(align)), __int16ToN64_c(pcD)) )
1484 #define vld4_dup_s32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf4a00f0f | _NENC_7_6(_NEON_ALIGN64_128(align) > 1 ? 3 : 2) | _NENC_4(_NEON_ALIGN64_128(align) > 0 ? 1 : 0), __int32ToN64_c(pcD)) )
1485 #define vld4_dup_s8_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf4a00f0f | _NENC_4(_NEON_ALIGN32(align)), __int8ToN64_c(pcD)) )
1486 #define vld4_dup_u16_ex(pcD, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf4a00f4f | _NENC_4(_NEON_ALIGN64(align)), __uint16ToN64_c(pcD)) )
1487 #define vld4_dup_u32_ex(pcD, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf4a00f0f | _NENC_7_6(_NEON_ALIGN64_128(align) > 1 ? 3 : 2) | _NENC_4(_NEON_ALIGN64_128(align) > 0 ? 1 : 0), __uint32ToN64_c(pcD)) )
1488 #define vld4_dup_u8_ex(pcD, align) ( __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx4Adr( 0xf4a00f0f | _NENC_4(_NEON_ALIGN32(align)), __uint8ToN64_c(pcD)) )
1489 
1490 // VLD4 (single 4-element structure to one lane)
1491 #define vld4_lane_f32(pcD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_Dx4Adr_acc( 0xf4a00b0f | _NENC_7(lane), (D4), __float32ToN64_c(pcD)) )
1492 #define vld4_lane_p16(pcD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Dx4Adr_acc( 0xf4a0070f | _NENC_7_6(lane), (D4), __poly16ToN64_c(pcD)) )
1493 #define vld4_lane_p8(pcD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Dx4Adr_acc( 0xf4a0030f | _NENC_7_5(lane), (D4), __poly8ToN64_c(pcD)) )
1494 #define vld4_lane_s16(pcD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Dx4Adr_acc( 0xf4a0070f | _NENC_7_6(lane), (D4), __int16ToN64_c(pcD)) )
1495 #define vld4_lane_s32(pcD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_Dx4Adr_acc( 0xf4a00b0f | _NENC_7(lane), (D4), __int32ToN64_c(pcD)) )
1496 #define vld4_lane_s8(pcD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Dx4Adr_acc( 0xf4a0030f | _NENC_7_5(lane), (D4), __int8ToN64_c(pcD)) )
1497 #define vld4_lane_u16(pcD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Dx4Adr_acc( 0xf4a0070f | _NENC_7_6(lane), (D4), __uint16ToN64_c(pcD)) )
1498 #define vld4_lane_u32(pcD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_Dx4Adr_acc( 0xf4a00b0f | _NENC_7(lane), (D4), __uint32ToN64_c(pcD)) )
1499 #define vld4_lane_u8(pcD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Dx4Adr_acc( 0xf4a0030f | _NENC_7_5(lane), (D4), __uint8ToN64_c(pcD)) )
1500 #define vld4q_lane_f32(pcD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Qx4Adr_acc( 0xf4a00b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Q4), __float32ToN64_c(pcD)) )
1501 #define vld4q_lane_p16(pcD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Qx4Adr_acc( 0xf4a0072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Q4), __poly16ToN64_c(pcD)) )
1502 #define vld4q_lane_s16(pcD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Qx4Adr_acc( 0xf4a0072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Q4), __int16ToN64_c(pcD)) )
1503 #define vld4q_lane_s32(pcD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Qx4Adr_acc( 0xf4a00b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Q4), __int32ToN64_c(pcD)) )
1504 #define vld4q_lane_u16(pcD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_Qx4Adr_acc( 0xf4a0072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), (Q4), __uint16ToN64_c(pcD)) )
1505 #define vld4q_lane_u32(pcD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_Qx4Adr_acc( 0xf4a00b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), (Q4), __uint32ToN64_c(pcD)) )
1506 
1507 // VLD4 (single 4-element structure to one lane, aligned)
1508 #define vld4_lane_f32_ex(pcD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx4Adr_acc( 0xf4a00b0f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN64_128(align)), (D4), __float32ToN64_c(pcD)) )
1509 #define vld4_lane_p16_ex(pcD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx4Adr_acc( 0xf4a0070f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN64(align)), (D4), __poly16ToN64_c(pcD)) )
1510 #define vld4_lane_p8_ex(pcD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx4Adr_acc( 0xf4a0030f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN32(align)), (D4), __poly8ToN64_c(pcD)) )
1511 #define vld4_lane_s16_ex(pcD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx4Adr_acc( 0xf4a0070f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN64(align)), (D4), __int16ToN64_c(pcD)) )
1512 #define vld4_lane_s32_ex(pcD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx4Adr_acc( 0xf4a00b0f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN64_128(align)), (D4), __int32ToN64_c(pcD)) )
1513 #define vld4_lane_s8_ex(pcD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx4Adr_acc( 0xf4a0030f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN32(align)), (D4), __int8ToN64_c(pcD)) )
1514 #define vld4_lane_u16_ex(pcD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Dx4Adr_acc( 0xf4a0070f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN64(align)), (D4), __uint16ToN64_c(pcD)) )
1515 #define vld4_lane_u32_ex(pcD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Dx4Adr_acc( 0xf4a00b0f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN64_128(align)), (D4), __uint32ToN64_c(pcD)) )
1516 #define vld4_lane_u8_ex(pcD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_Dx4Adr_acc( 0xf4a0030f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN32(align)), (D4), __uint8ToN64_c(pcD)) )
1517 #define vld4q_lane_f32_ex(pcD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx4Adr_acc( 0xf4a00b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN64_128(align)), (Q4), __float32ToN64_c(pcD)) )
1518 #define vld4q_lane_p16_ex(pcD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx4Adr_acc( 0xf4a0072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), (Q4), __poly16ToN64_c(pcD)) )
1519 #define vld4q_lane_s16_ex(pcD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx4Adr_acc( 0xf4a0072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), (Q4), __int16ToN64_c(pcD)) )
1520 #define vld4q_lane_s32_ex(pcD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx4Adr_acc( 0xf4a00b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN64_128(align)), (Q4), __int32ToN64_c(pcD)) )
1521 #define vld4q_lane_u16_ex(pcD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_Qx4Adr_acc( 0xf4a0072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), (Q4), __uint16ToN64_c(pcD)) )
1522 #define vld4q_lane_u32_ex(pcD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_Qx4Adr_acc( 0xf4a00b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN64_128(align)), (Q4), __uint32ToN64_c(pcD)) )
1523 
1524 // VMAX, VMIN, VMAXNM, VMINNM (floating point)
1525 #define vmax_f32(Dn, Dm) ( __neon_DdDnDm( 0xf2000f00, (Dn), (Dm)) )
1526 #define vmaxnm_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3000f10, (Dn), (Dm)) )
1527 #define vmin_f32(Dn, Dm) ( __neon_DdDnDm( 0xf2200f00, (Dn), (Dm)) )
1528 #define vminnm_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3200f10, (Dn), (Dm)) )
1529 #define vmaxq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2000f40, (Qn), (Qm)) )
1530 #define vmaxnmq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3000f50, (Qn), (Qm)) )
1531 #define vminq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2200f40, (Qn), (Qm)) )
1532 #define vminnmq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3200f50, (Qn), (Qm)) )
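
// Usage sketch (illustrative only; the helper name below is hypothetical):
// vmaxq_f32/vminq_f32 give a branch-free per-lane clamp; the *NM forms differ
// only in that a quiet NaN operand loses against a numeric one. Assumes
// float32x4_t and vdupq_n_f32 declared elsewhere in this header.
__inline float32x4_t __example_clamp_f32(float32x4_t v, float32_t lo, float32_t hi)
{
    v = vmaxq_f32(v, vdupq_n_f32(lo));  // raise lanes below lo
    v = vminq_f32(v, vdupq_n_f32(hi));  // lower lanes above hi
    return v;
}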
1533 
1534 // VMAX, VMIN (integer)
1535 #define vmax_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100600, (Dn), (Dm)) )
1536 #define vmax_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200600, (Dn), (Dm)) )
1537 #define vmax_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000600, (Dn), (Dm)) )
1538 #define vmax_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100600, (Dn), (Dm)) )
1539 #define vmax_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200600, (Dn), (Dm)) )
1540 #define vmax_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000600, (Dn), (Dm)) )
1541 #define vmin_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100610, (Dn), (Dm)) )
1542 #define vmin_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200610, (Dn), (Dm)) )
1543 #define vmin_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000610, (Dn), (Dm)) )
1544 #define vmin_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100610, (Dn), (Dm)) )
1545 #define vmin_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200610, (Dn), (Dm)) )
1546 #define vmin_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000610, (Dn), (Dm)) )
1547 #define vmaxq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100640, (Qn), (Qm)) )
1548 #define vmaxq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200640, (Qn), (Qm)) )
1549 #define vmaxq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000640, (Qn), (Qm)) )
1550 #define vmaxq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100640, (Qn), (Qm)) )
1551 #define vmaxq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200640, (Qn), (Qm)) )
1552 #define vmaxq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000640, (Qn), (Qm)) )
1553 #define vminq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100650, (Qn), (Qm)) )
1554 #define vminq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200650, (Qn), (Qm)) )
1555 #define vminq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000650, (Qn), (Qm)) )
1556 #define vminq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100650, (Qn), (Qm)) )
1557 #define vminq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200650, (Qn), (Qm)) )
1558 #define vminq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000650, (Qn), (Qm)) )
1559 
1560 // VMLA, VMLS (by scalar: vector element selected by lane)
1561 #define vmla_lane_f32(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2a00140 | _NENC_5(lane), (Dd), (Dn), (Dm)) )
1562 #define vmla_lane_s16(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2900040 | _NENC_5x3(lane), (Dd), (Dn), (Dm)) )
1563 #define vmla_lane_s32(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2a00040 | _NENC_5(lane), (Dd), (Dn), (Dm)) )
1564 #define vmla_lane_u16(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2900040 | _NENC_5x3(lane), (Dd), (Dn), (Dm)) )
1565 #define vmla_lane_u32(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2a00040 | _NENC_5(lane), (Dd), (Dn), (Dm)) )
1566 #define vmls_lane_f32(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2a00540 | _NENC_5(lane), (Dd), (Dn), (Dm)) )
1567 #define vmls_lane_s16(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2900440 | _NENC_5x3(lane), (Dd), (Dn), (Dm)) )
1568 #define vmls_lane_s32(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2a00440 | _NENC_5(lane), (Dd), (Dn), (Dm)) )
1569 #define vmls_lane_u16(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2900440 | _NENC_5x3(lane), (Dd), (Dn), (Dm)) )
1570 #define vmls_lane_u32(Dd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx_acc( 0xf2a00440 | _NENC_5(lane), (Dd), (Dn), (Dm)) )
1571 #define vmlaq_lane_f32(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3a00140 | _NENC_5(lane), (Qd), (Qn), (Dm)) )
1572 #define vmlaq_lane_s16(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3900040 | _NENC_5x3(lane), (Qd), (Qn), (Dm)) )
1573 #define vmlaq_lane_s32(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3a00040 | _NENC_5(lane), (Qd), (Qn), (Dm)) )
1574 #define vmlaq_lane_u16(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3900040 | _NENC_5x3(lane), (Qd), (Qn), (Dm)) )
1575 #define vmlaq_lane_u32(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3a00040 | _NENC_5(lane), (Qd), (Qn), (Dm)) )
1576 #define vmlsq_lane_f32(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3a00540 | _NENC_5(lane), (Qd), (Qn), (Dm)) )
1577 #define vmlsq_lane_s16(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3900440 | _NENC_5x3(lane), (Qd), (Qn), (Dm)) )
1578 #define vmlsq_lane_s32(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3a00440 | _NENC_5(lane), (Qd), (Qn), (Dm)) )
1579 #define vmlsq_lane_u16(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3900440 | _NENC_5x3(lane), (Qd), (Qn), (Dm)) )
1580 #define vmlsq_lane_u32(Qd, Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx_acc( 0xf3a00440 | _NENC_5(lane), (Qd), (Qn), (Dm)) )
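
// Usage sketch (illustrative only; the helper name below is hypothetical):
// the lane forms let one D register hold several scalars. Here a column-major
// 4x4 matrix is applied to a vector: each column is scaled by one lane of the
// vector and accumulated with vmlaq_lane_f32. Assumes float32x2_t/float32x4_t
// and the VLD1 and vmulq_lane_f32 intrinsics declared elsewhere in this header.
__inline float32x4_t __example_mat4_mul_vec4(const float32_t *m /* 16 floats, column-major */,
                                             const float32_t *v /* 4 floats */)
{
    float32x2_t v01 = vld1_f32(v);                                 // { v0, v1 }
    float32x2_t v23 = vld1_f32(v + 2);                             // { v2, v3 }
    float32x4_t acc = vmulq_lane_f32(vld1q_f32(m + 0), v01, 0);    // col0 * v0
    acc = vmlaq_lane_f32(acc, vld1q_f32(m + 4),  v01, 1);          // += col1 * v1
    acc = vmlaq_lane_f32(acc, vld1q_f32(m + 8),  v23, 0);          // += col2 * v2
    acc = vmlaq_lane_f32(acc, vld1q_f32(m + 12), v23, 1);          // += col3 * v3
    return acc;
}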
1581 
1582 // VMLA, VMLS (float, by scalar)
1583 #define vmla_n_f32(Dd, Dn, Ft) ( __neon_DdDnFt_acc( 0xf2a00140, (Dd), (Dn), (Ft)) )
1584 #define vmls_n_f32(Dd, Dn, Ft) ( __neon_DdDnFt_acc( 0xf2a00540, (Dd), (Dn), (Ft)) )
1585 #define vmlaq_n_f32(Qd, Qn, Ft) ( __neon_QdQnFt_acc( 0xf3a00140, (Qd), (Qn), (Ft)) )
1586 #define vmlsq_n_f32(Qd, Qn, Ft) ( __neon_QdQnFt_acc( 0xf3a00540, (Qd), (Qn), (Ft)) )
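
// Usage sketch (illustrative only; the helper name below is hypothetical):
// vmlaq_n_f32 multiplies every lane by the same floating-point value and
// accumulates, which maps directly onto an "axpy" update (y += alpha * x) four
// elements at a time. Assumes float32x4_t and the VLD1/VST1 intrinsics
// declared elsewhere in this header.
__inline void __example_axpy4(float32_t *y, const float32_t *x, float32_t alpha)
{
    float32x4_t vy = vld1q_f32(y);
    vy = vmlaq_n_f32(vy, vld1q_f32(x), alpha);  // vy += alpha * x, per lane
    vst1q_f32(y, vy);
}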
1587 
1588 // VMLA, VMLS (floating point)
1589 #define vmla_f32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2000d10, (Dd), (Dn), (Dm)) )
1590 #define vmls_f32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2200d10, (Dd), (Dn), (Dm)) )
1591 #define vmlaq_f32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2000d50, (Qd), (Qn), (Qm)) )
1592 #define vmlsq_f32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2200d50, (Qd), (Qn), (Qm)) )
1593 
1594 // VMLA, VMLS (integer)
1595 #define vmla_s16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2100900, (Dd), (Dn), (Dm)) )
1596 #define vmla_s32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2200900, (Dd), (Dn), (Dm)) )
1597 #define vmla_s8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2000900, (Dd), (Dn), (Dm)) )
1598 #define vmla_u16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2100900, (Dd), (Dn), (Dm)) )
1599 #define vmla_u32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2200900, (Dd), (Dn), (Dm)) )
1600 #define vmla_u8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf2000900, (Dd), (Dn), (Dm)) )
1601 #define vmls_s16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100900, (Dd), (Dn), (Dm)) )
1602 #define vmls_s32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200900, (Dd), (Dn), (Dm)) )
1603 #define vmls_s8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3000900, (Dd), (Dn), (Dm)) )
1604 #define vmls_u16(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3100900, (Dd), (Dn), (Dm)) )
1605 #define vmls_u32(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3200900, (Dd), (Dn), (Dm)) )
1606 #define vmls_u8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3000900, (Dd), (Dn), (Dm)) )
1607 #define vmlaq_s16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2100940, (Qd), (Qn), (Qm)) )
1608 #define vmlaq_s32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2200940, (Qd), (Qn), (Qm)) )
1609 #define vmlaq_s8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2000940, (Qd), (Qn), (Qm)) )
1610 #define vmlaq_u16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2100940, (Qd), (Qn), (Qm)) )
1611 #define vmlaq_u32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2200940, (Qd), (Qn), (Qm)) )
1612 #define vmlaq_u8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf2000940, (Qd), (Qn), (Qm)) )
1613 #define vmlsq_s16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100940, (Qd), (Qn), (Qm)) )
1614 #define vmlsq_s32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200940, (Qd), (Qn), (Qm)) )
1615 #define vmlsq_s8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3000940, (Qd), (Qn), (Qm)) )
1616 #define vmlsq_u16(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3100940, (Qd), (Qn), (Qm)) )
1617 #define vmlsq_u32(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3200940, (Qd), (Qn), (Qm)) )
1618 #define vmlsq_u8(Qd, Qn, Qm) ( __neon_QdQnQm_acc( 0xf3000940, (Qd), (Qn), (Qm)) )
1619 
1620 // VMLAL, VMLSL
1621 #define vmlal_s16(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2900800, (Qd), (Dn), (Dm)) )
1622 #define vmlal_s32(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2a00800, (Qd), (Dn), (Dm)) )
1623 #define vmlal_s8(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2800800, (Qd), (Dn), (Dm)) )
1624 #define vmlal_u16(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf3900800, (Qd), (Dn), (Dm)) )
1625 #define vmlal_u32(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf3a00800, (Qd), (Dn), (Dm)) )
1626 #define vmlal_u8(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf3800800, (Qd), (Dn), (Dm)) )
1627 #define vmlsl_s16(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2900a00, (Qd), (Dn), (Dm)) )
1628 #define vmlsl_s32(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2a00a00, (Qd), (Dn), (Dm)) )
1629 #define vmlsl_s8(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2800a00, (Qd), (Dn), (Dm)) )
1630 #define vmlsl_u16(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf3900a00, (Qd), (Dn), (Dm)) )
1631 #define vmlsl_u32(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf3a00a00, (Qd), (Dn), (Dm)) )
1632 #define vmlsl_u8(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf3800a00, (Qd), (Dn), (Dm)) )
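
// Usage sketch (illustrative only; the helper name below is hypothetical):
// the "L" (long) forms widen as they accumulate, so 8-bit products can be
// summed in 16-bit lanes without overflowing after only a few additions.
// Assumes uint8x8_t/uint16x8_t and the VLD1 intrinsics declared elsewhere in
// this header.
__inline uint16x8_t __example_acc_products_u8(uint16x8_t acc, const uint8_t *a, const uint8_t *b)
{
    return vmlal_u8(acc, vld1_u8(a), vld1_u8(b));   // acc[i] += a[i] * b[i], widened to 16 bits
}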
1633 
1634 // VMLAL, VMLSL (by scalar)
1635 #define vmlal_lane_s16(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDnDmx_acc( 0xf2900240 | _NENC_5x3(lane), (Qd), (Dn), (Dm)) )
1636 #define vmlal_lane_s32(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDnDmx_acc( 0xf2a00240 | _NENC_5(lane), (Qd), (Dn), (Dm)) )
1637 #define vmlal_lane_u16(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDnDmx_acc( 0xf3900240 | _NENC_5x3(lane), (Qd), (Dn), (Dm)) )
1638 #define vmlal_lane_u32(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDnDmx_acc( 0xf3a00240 | _NENC_5(lane), (Qd), (Dn), (Dm)) )
1639 #define vmlsl_lane_s16(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDnDmx_acc( 0xf2900640 | _NENC_5x3(lane), (Qd), (Dn), (Dm)) )
1640 #define vmlsl_lane_s32(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDnDmx_acc( 0xf2a00640 | _NENC_5(lane), (Qd), (Dn), (Dm)) )
1641 #define vmlsl_lane_u16(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDnDmx_acc( 0xf3900640 | _NENC_5x3(lane), (Qd), (Dn), (Dm)) )
1642 #define vmlsl_lane_u32(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDnDmx_acc( 0xf3a00640 | _NENC_5(lane), (Qd), (Dn), (Dm)) )
1643 
1644 // VMOV (ARM core register to scalar)
1645 #define vset_lane_f32(Ft, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdFt_acc( 0xee000b10 | _NENC_21(lane), (Dd), (Ft)) )
1646 #define vset_lane_p16(Rt, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdRt_acc( 0xee000b30 | _NENC_21x6(lane), (Dd), __poly16ToInt32(Rt)) )
1647 #define vset_lane_p8(Rt, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_DdRt_acc( 0xee400b10 | _NENC_21x6_5(lane), (Dd), __poly8ToInt32(Rt)) )
1648 #define vset_lane_s16(Rt, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdRt_acc( 0xee000b30 | _NENC_21x6(lane), (Dd), __int16ToInt32(Rt)) )
1649 #define vset_lane_s32(Rt, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdRt_acc( 0xee000b10 | _NENC_21(lane), (Dd), __int32ToInt32(Rt)) )
1650 #define vset_lane_s8(Rt, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_DdRt_acc( 0xee400b10 | _NENC_21x6_5(lane), (Dd), __int8ToInt32(Rt)) )
1651 #define vset_lane_u16(Rt, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdRt_acc( 0xee000b30 | _NENC_21x6(lane), (Dd), __uint16ToInt32(Rt)) )
1652 #define vset_lane_u32(Rt, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdRt_acc( 0xee000b10 | _NENC_21(lane), (Dd), __uint32ToInt32(Rt)) )
1653 #define vset_lane_u8(Rt, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_DdRt_acc( 0xee400b10 | _NENC_21x6_5(lane), (Dd), __uint8ToInt32(Rt)) )
1654 
1655 // VMOV (scalar to ARM core register)
1656 #define vget_lane_f32(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_FtDn( 0xee100b10 | _NENC_21(lane), (Dm)) )
1657 #define vget_lane_p16(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), (poly16_t)__neon_RtDn( 0xee900b30 | _NENC_21x6(lane), (Dm)) )
1658 #define vget_lane_p8(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), (poly8_t)__neon_RtDn( 0xeed00b10 | _NENC_21x6_5(lane), (Dm)) )
1659 #define vget_lane_s16(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), (int16_t)__neon_RtDn( 0xee100b30 | _NENC_21x6(lane), (Dm)) )
1660 #define vget_lane_s8(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), (int8_t)__neon_RtDn( 0xee500b10 | _NENC_21x6_5(lane), (Dm)) )
1661 #define vget_lane_s32(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), (int32_t)__neon_RtDn( 0xee100b10 | _NENC_21(lane), (Dm)) )
1662 #define vget_lane_u16(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), (uint16_t)__neon_RtDn( 0xee900b30 | _NENC_21x6(lane), (Dm)) )
1663 #define vget_lane_u8(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), (uint8_t)__neon_RtDn( 0xeed00b10 | _NENC_21x6_5(lane), (Dm)) )
1664 #define vget_lane_u32(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), (uint32_t)__neon_RtDn( 0xee100b10 | _NENC_21(lane), (Dm)) )
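
// Usage sketch (illustrative only; the helper name below is hypothetical):
// vget_lane_*/vset_lane_* move a single element between a NEON register and an
// ARM core register; the lane index must be a compile-time constant. A real
// lane swap would use vrev64_f32 instead; this only exercises the API.
// Assumes float32x2_t declared elsewhere in this header.
__inline float32x2_t __example_swap_pair(float32x2_t v)
{
    float32_t lo = vget_lane_f32(v, 0);     // extract lane 0
    float32_t hi = vget_lane_f32(v, 1);     // extract lane 1
    v = vset_lane_f32(hi, v, 0);            // write hi into lane 0
    v = vset_lane_f32(lo, v, 1);            // write lo into lane 1
    return v;
}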
1665 
1666 // VMOV.64 (ARM core register pair to scalar)
1667 #define vset_lane_s64(R64t, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 1, "invalid lane index"), __neon_DdRtRt2_acc( 0xec400b10, (Dd), __int64ToInt64(R64t)) )
1668 #define vset_lane_u64(R64t, Dd, lane) ( __static_assert((lane) >= 0 && (lane) < 1, "invalid lane index"), __neon_DdRtRt2_acc( 0xec400b10, (Dd), __uint64ToInt64(R64t)) )
1669 #define vsetq_lane_s64(R64t, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdRtRt2_acc( 0xec400b10 | _NENC_0(lane), (Qd), __int64ToInt64(R64t)) )
1670 #define vsetq_lane_u64(R64t, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdRtRt2_acc( 0xec400b10 | _NENC_0(lane), (Qd), __uint64ToInt64(R64t)) )
1671 
1672 // VMOV.64 (scalar to ARM core register pair)
1673 #define vget_lane_s64(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 1, "invalid lane index"), (int64_t)__neon_RtRt2Dm( 0xec500b10, (Dm)) )
1674 #define vget_lane_u64(Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 1, "invalid lane index"), (uint64_t)__neon_RtRt2Dm( 0xec500b10, (Dm)) )
1675 #define vgetq_lane_s64(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), (int64_t)__neon_RtRt2Qm( 0xec500b10 | _NENC_0(lane), (Qm)) )
1676 #define vgetq_lane_u64(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), (uint64_t)__neon_RtRt2Qm( 0xec500b10 | _NENC_0(lane), (Qm)) )
1677 
1678 // VMOV.Q (ARM core register to scalar)
1679 #define vsetq_lane_f32(Ft, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdFt_acc( 0xee000b10 | _NENC_16((lane) >= 2 ? 1 : 0) | _NENC_21((lane) % 2), (Qd), (Ft)) )
1680 #define vsetq_lane_p16(Rt, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_QdRt_acc( 0xee000b30 | _NENC_16((lane) >= 4 ? 1 : 0) | _NENC_21x6((lane) % 4), (Qd), __poly16ToInt32(Rt)) )
1681 #define vsetq_lane_p8(Rt, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), __neon_QdRt_acc( 0xee400b10 | _NENC_16((lane) >= 8 ? 1 : 0) | _NENC_21x6_5((lane) % 8), (Qd), __poly8ToInt32(Rt)) )
1682 #define vsetq_lane_s16(Rt, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_QdRt_acc( 0xee000b30 | _NENC_16((lane) >= 4 ? 1 : 0) | _NENC_21x6((lane) % 4), (Qd), __int16ToInt32(Rt)) )
1683 #define vsetq_lane_s32(Rt, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdRt_acc( 0xee000b10 | _NENC_16((lane) >= 2 ? 1 : 0) | _NENC_21((lane) % 2), (Qd), __int32ToInt32(Rt)) )
1684 #define vsetq_lane_s8(Rt, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), __neon_QdRt_acc( 0xee400b10 | _NENC_16((lane) >= 8 ? 1 : 0) | _NENC_21x6_5((lane) % 8), (Qd), __int8ToInt32(Rt)) )
1685 #define vsetq_lane_u16(Rt, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_QdRt_acc( 0xee000b30 | _NENC_16((lane) >= 4 ? 1 : 0) | _NENC_21x6((lane) % 4), (Qd), __uint16ToInt32(Rt)) )
1686 #define vsetq_lane_u32(Rt, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdRt_acc( 0xee000b10 | _NENC_16((lane) >= 2 ? 1 : 0) | _NENC_21((lane) % 2), (Qd), __uint32ToInt32(Rt)) )
1687 #define vsetq_lane_u8(Rt, Qd, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), __neon_QdRt_acc( 0xee400b10 | _NENC_16((lane) >= 8 ? 1 : 0) | _NENC_21x6_5((lane) % 8), (Qd), __uint8ToInt32(Rt)) )
1688 
1689 // VMOV.Q (scalar to ARM core register)
1690 #define vgetq_lane_f32(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_FtQn( 0xee100b10 | _NENC_16((lane) >= 2 ? 1 : 0) | _NENC_21((lane) % 2), (Qm)) )
1691 #define vgetq_lane_p16(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), (poly16_t)__neon_RtQn( 0xee900b30 | _NENC_16((lane) >= 4 ? 1 : 0) | _NENC_21x6((lane) % 4), (Qm)) )
1692 #define vgetq_lane_p8(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), (poly8_t)__neon_RtQn( 0xeed00b10 | _NENC_16((lane) >= 8 ? 1 : 0) | _NENC_21x6_5((lane) % 8), (Qm)) )
1693 #define vgetq_lane_s16(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), (int16_t)__neon_RtQn( 0xee100b30 | _NENC_16((lane) >= 4 ? 1 : 0) | _NENC_21x6((lane) % 4), (Qm)) )
1694 #define vgetq_lane_s8(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), (int8_t)__neon_RtQn( 0xee500b10 | _NENC_16((lane) >= 8 ? 1 : 0) | _NENC_21x6_5((lane) % 8), (Qm)) )
1695 #define vgetq_lane_s32(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), (int32_t)__neon_RtQn( 0xee100b10 | _NENC_16((lane) >= 2 ? 1 : 0) | _NENC_21((lane) % 2), (Qm)) )
1696 #define vgetq_lane_u16(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), (uint16_t)__neon_RtQn( 0xee900b30 | _NENC_16((lane) >= 4 ? 1 : 0) | _NENC_21x6((lane) % 4), (Qm)) )
1697 #define vgetq_lane_u8(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), (uint8_t)__neon_RtQn( 0xeed00b10 | _NENC_16((lane) >= 8 ? 1 : 0) | _NENC_21x6_5((lane) % 8), (Qm)) )
1698 #define vgetq_lane_u32(Qm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), (uint32_t)__neon_RtQn( 0xee100b10 | _NENC_16((lane) >= 2 ? 1 : 0) | _NENC_21((lane) % 2), (Qm)) )
1699 
1700 // VMOVL
1701 #define vmovl_s16(Dm) ( __neon_QdDm( 0xf2900a10, (Dm)) )
1702 #define vmovl_s32(Dm) ( __neon_QdDm( 0xf2a00a10, (Dm)) )
1703 #define vmovl_s8(Dm) ( __neon_QdDm( 0xf2880a10, (Dm)) )
1704 #define vmovl_u16(Dm) ( __neon_QdDm( 0xf3900a10, (Dm)) )
1705 #define vmovl_u32(Dm) ( __neon_QdDm( 0xf3a00a10, (Dm)) )
1706 #define vmovl_u8(Dm) ( __neon_QdDm( 0xf3880a10, (Dm)) )
1707 
1708 // VMOVN
1709 #define vmovn_s16(Qm) ( __neon_DdQm( 0xf3b20200, (Qm)) )
1710 #define vmovn_s32(Qm) ( __neon_DdQm( 0xf3b60200, (Qm)) )
1711 #define vmovn_s64(Qm) ( __neon_DdQm( 0xf3ba0200, (Qm)) )
1712 #define vmovn_u16(Qm) ( __neon_DdQm( 0xf3b20200, (Qm)) )
1713 #define vmovn_u32(Qm) ( __neon_DdQm( 0xf3b60200, (Qm)) )
1714 #define vmovn_u64(Qm) ( __neon_DdQm( 0xf3ba0200, (Qm)) )
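
// Usage sketch (illustrative only; the helper name below is hypothetical):
// a common pattern is to widen with VMOVL, work with the extra headroom, then
// truncate back with VMOVN. Here two byte vectors are averaged in 16-bit
// lanes (vhadd_u8 does this in one instruction; the point is the round trip).
// Assumes uint8x8_t/uint16x8_t, vaddq_u16, vshrq_n_u16 and the VLD1/VST1
// intrinsics declared elsewhere in this header.
__inline void __example_average_u8(uint8_t *dst, const uint8_t *a, const uint8_t *b)
{
    uint16x8_t wa  = vmovl_u8(vld1_u8(a));              // widen to 16 bits
    uint16x8_t wb  = vmovl_u8(vld1_u8(b));
    uint16x8_t avg = vshrq_n_u16(vaddq_u16(wa, wb), 1); // (a + b) / 2 without overflow
    vst1_u8(dst, vmovn_u16(avg));                        // truncate back to bytes
}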
1715 
1716 // VMUL
1717 #define vmul_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3000d10, (Dn), (Dm)) )
1718 #define vmul_p8(Dn, Dm) ( __neon_DdDnDm( 0xf3000910, (Dn), (Dm)) )
1719 #define vmul_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100910, (Dn), (Dm)) )
1720 #define vmul_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200910, (Dn), (Dm)) )
1721 #define vmul_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000910, (Dn), (Dm)) )
1722 #define vmul_u16(Dn, Dm) ( __neon_DdDnDm( 0xf2100910, (Dn), (Dm)) )
1723 #define vmul_u32(Dn, Dm) ( __neon_DdDnDm( 0xf2200910, (Dn), (Dm)) )
1724 #define vmul_u8(Dn, Dm) ( __neon_DdDnDm( 0xf2000910, (Dn), (Dm)) )
1725 #define vmulq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf3000d50, (Qn), (Qm)) )
1726 #define vmulq_p8(Qn, Qm) ( __neon_QdQnQm( 0xf3000950, (Qn), (Qm)) )
1727 #define vmulq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100950, (Qn), (Qm)) )
1728 #define vmulq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200950, (Qn), (Qm)) )
1729 #define vmulq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000950, (Qn), (Qm)) )
1730 #define vmulq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf2100950, (Qn), (Qm)) )
1731 #define vmulq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2200950, (Qn), (Qm)) )
1732 #define vmulq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf2000950, (Qn), (Qm)) )
1733 
1734 // VMUL (by scalar - float)
1735 #define vmul_n_f32(Dn, Ft) ( __neon_DdDnFt( 0xf2a00940, (Dn), (Ft)) )
1736 #define vmulq_n_f32(Qn, Ft) ( __neon_QdQnFt( 0xf3a00940, (Qn), (Ft)) )
1737 
1738 // VMUL (by scalar)
1739 #define vmul_lane_f32(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx( 0xf2a00940 | _NENC_5(lane), (Dn), (Dm)) )
1740 #define vmul_lane_s16(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDnDmx( 0xf2900840 | _NENC_5x3(lane), (Dn), (Dm)) )
1741 #define vmul_lane_s32(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx( 0xf2a00840 | _NENC_5(lane), (Dn), (Dm)) )
1742 #define vmul_lane_u16(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDnDmx( 0xf2900840 | _NENC_5x3(lane), (Dn), (Dm)) )
1743 #define vmul_lane_u32(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx( 0xf2a00840 | _NENC_5(lane), (Dn), (Dm)) )
1744 #define vmulq_lane_f32(Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx( 0xf3a00940 | _NENC_5(lane), (Qn), (Dm)) )
1745 #define vmulq_lane_s16(Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdQnDmx( 0xf3900840 | _NENC_5x3(lane), (Qn), (Dm)) )
1746 #define vmulq_lane_s32(Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx( 0xf3a00840 | _NENC_5(lane), (Qn), (Dm)) )
1747 #define vmulq_lane_u16(Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdQnDmx( 0xf3900840 | _NENC_5x3(lane), (Qn), (Dm)) )
1748 #define vmulq_lane_u32(Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx( 0xf3a00840 | _NENC_5(lane), (Qn), (Dm)) )
1749 
1750 // VMULL
1751 #define vmull_p64(Dn, Dm) ( __neon_QdDnDm( 0xf2a00e00, (Dn), (Dm)) )
1752 #define vmull_p8(Dn, Dm) ( __neon_QdDnDm( 0xf2800e00, (Dn), (Dm)) )
1753 #define vmull_s16(Dn, Dm) ( __neon_QdDnDm( 0xf2900c00, (Dn), (Dm)) )
1754 #define vmull_s32(Dn, Dm) ( __neon_QdDnDm( 0xf2a00c00, (Dn), (Dm)) )
1755 #define vmull_s8(Dn, Dm) ( __neon_QdDnDm( 0xf2800c00, (Dn), (Dm)) )
1756 #define vmull_u16(Dn, Dm) ( __neon_QdDnDm( 0xf3900c00, (Dn), (Dm)) )
1757 #define vmull_u32(Dn, Dm) ( __neon_QdDnDm( 0xf3a00c00, (Dn), (Dm)) )
1758 #define vmull_u8(Dn, Dm) ( __neon_QdDnDm( 0xf3800c00, (Dn), (Dm)) )
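
// Usage sketch (illustrative only; the helper name below is hypothetical):
// VMULL returns the full-width product, so four 16x16 multiplications yield
// four exact 32-bit results in one Q register, which is what fixed-point code
// needs when the high half of the product matters. Assumes int16x4_t/
// int32x4_t and the VLD1 intrinsics declared elsewhere in this header.
__inline int32x4_t __example_full_products_s16(const int16_t *a, const int16_t *b)
{
    return vmull_s16(vld1_s16(a), vld1_s16(b));     // 16x16 -> 32, no truncation
}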
1759 
1760 // VMULL (by scalar)
1761 #define vmull_lane_s16(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDnDmx( 0xf2900a40 | _NENC_5x3(lane), (Dn), (Dm)) )
1762 #define vmull_lane_s32(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDnDmx( 0xf2a00a40 | _NENC_5(lane), (Dn), (Dm)) )
1763 #define vmull_lane_u16(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDnDmx( 0xf3900a40 | _NENC_5x3(lane), (Dn), (Dm)) )
1764 #define vmull_lane_u32(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDnDmx( 0xf3a00a40 | _NENC_5(lane), (Dn), (Dm)) )
1765 
1766 // VMVN
1767 #define vmvn_p16(Dm) ( __neon_DdDm( 0xf3b00580, (Dm)) )
1768 #define vmvn_p8(Dm) ( __neon_DdDm( 0xf3b00580, (Dm)) )
1769 #define vmvn_s16(Dm) ( __neon_DdDm( 0xf3b00580, (Dm)) )
1770 #define vmvn_s32(Dm) ( __neon_DdDm( 0xf3b00580, (Dm)) )
1771 #define vmvn_s8(Dm) ( __neon_DdDm( 0xf3b00580, (Dm)) )
1772 #define vmvn_u16(Dm) ( __neon_DdDm( 0xf3b00580, (Dm)) )
1773 #define vmvn_u32(Dm) ( __neon_DdDm( 0xf3b00580, (Dm)) )
1774 #define vmvn_u8(Dm) ( __neon_DdDm( 0xf3b00580, (Dm)) )
1775 #define vmvnq_p16(Qm) ( __neon_QdQm( 0xf3b005c0, (Qm)) )
1776 #define vmvnq_p8(Qm) ( __neon_QdQm( 0xf3b005c0, (Qm)) )
1777 #define vmvnq_s16(Qm) ( __neon_QdQm( 0xf3b005c0, (Qm)) )
1778 #define vmvnq_s32(Qm) ( __neon_QdQm( 0xf3b005c0, (Qm)) )
1779 #define vmvnq_s8(Qm) ( __neon_QdQm( 0xf3b005c0, (Qm)) )
1780 #define vmvnq_u16(Qm) ( __neon_QdQm( 0xf3b005c0, (Qm)) )
1781 #define vmvnq_u32(Qm) ( __neon_QdQm( 0xf3b005c0, (Qm)) )
1782 #define vmvnq_u8(Qm) ( __neon_QdQm( 0xf3b005c0, (Qm)) )
1783 
1784 // VPADAL
1785 #define vpadal_s16(Dd, Dm) ( __neon_DdDm_acc( 0xf3b40600, (Dd), (Dm)) )
1786 #define vpadal_s32(Dd, Dm) ( __neon_DdDm_acc( 0xf3b80600, (Dd), (Dm)) )
1787 #define vpadal_s8(Dd, Dm) ( __neon_DdDm_acc( 0xf3b00600, (Dd), (Dm)) )
1788 #define vpadal_u16(Dd, Dm) ( __neon_DdDm_acc( 0xf3b40680, (Dd), (Dm)) )
1789 #define vpadal_u32(Dd, Dm) ( __neon_DdDm_acc( 0xf3b80680, (Dd), (Dm)) )
1790 #define vpadal_u8(Dd, Dm) ( __neon_DdDm_acc( 0xf3b00680, (Dd), (Dm)) )
1791 #define vpadalq_s16(Qd, Qm) ( __neon_QdQm_acc( 0xf3b40640, (Qd), (Qm)) )
1792 #define vpadalq_s32(Qd, Qm) ( __neon_QdQm_acc( 0xf3b80640, (Qd), (Qm)) )
1793 #define vpadalq_s8(Qd, Qm) ( __neon_QdQm_acc( 0xf3b00640, (Qd), (Qm)) )
1794 #define vpadalq_u16(Qd, Qm) ( __neon_QdQm_acc( 0xf3b406c0, (Qd), (Qm)) )
1795 #define vpadalq_u32(Qd, Qm) ( __neon_QdQm_acc( 0xf3b806c0, (Qd), (Qm)) )
1796 #define vpadalq_u8(Qd, Qm) ( __neon_QdQm_acc( 0xf3b006c0, (Qd), (Qm)) )
1797 
1798 // VPADD (floating point)
1799 #define vpadd_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3000d00, (Dn), (Dm)) )
1800 
1801 // VPADD (integer)
1802 #define vpadd_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100b10, (Dn), (Dm)) )
1803 #define vpadd_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200b10, (Dn), (Dm)) )
1804 #define vpadd_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000b10, (Dn), (Dm)) )
1805 #define vpadd_u16(Dn, Dm) ( __neon_DdDnDm( 0xf2100b10, (Dn), (Dm)) )
1806 #define vpadd_u32(Dn, Dm) ( __neon_DdDnDm( 0xf2200b10, (Dn), (Dm)) )
1807 #define vpadd_u8(Dn, Dm) ( __neon_DdDnDm( 0xf2000b10, (Dn), (Dm)) )
1808 
1809 // VPADDL
1810 #define vpaddl_s16(Dm) ( __neon_DdDm( 0xf3b40200, (Dm)) )
1811 #define vpaddl_s32(Dm) ( __neon_DdDm( 0xf3b80200, (Dm)) )
1812 #define vpaddl_s8(Dm) ( __neon_DdDm( 0xf3b00200, (Dm)) )
1813 #define vpaddl_u16(Dm) ( __neon_DdDm( 0xf3b40280, (Dm)) )
1814 #define vpaddl_u32(Dm) ( __neon_DdDm( 0xf3b80280, (Dm)) )
1815 #define vpaddl_u8(Dm) ( __neon_DdDm( 0xf3b00280, (Dm)) )
1816 #define vpaddlq_s16(Qm) ( __neon_QdQm( 0xf3b40240, (Qm)) )
1817 #define vpaddlq_s32(Qm) ( __neon_QdQm( 0xf3b80240, (Qm)) )
1818 #define vpaddlq_s8(Qm) ( __neon_QdQm( 0xf3b00240, (Qm)) )
1819 #define vpaddlq_u16(Qm) ( __neon_QdQm( 0xf3b402c0, (Qm)) )
1820 #define vpaddlq_u32(Qm) ( __neon_QdQm( 0xf3b802c0, (Qm)) )
1821 #define vpaddlq_u8(Qm) ( __neon_QdQm( 0xf3b002c0, (Qm)) )
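
// Usage sketch (illustrative only; the helper name below is hypothetical):
// VPADDL widens while adding adjacent pairs, and VPADAL does the same while
// also accumulating, so together they make a cheap running sum of bytes that
// does not overflow quickly. Assumes uint8x16_t/uint16x8_t/uint32x4_t and the
// VLD1 intrinsics declared elsewhere in this header.
__inline uint32x4_t __example_sum_bytes(uint32x4_t acc, const uint8_t *p)
{
    uint16x8_t pairs = vpaddlq_u8(vld1q_u8(p));     // 16 bytes -> 8 pair sums
    return vpadalq_u16(acc, pairs);                 // accumulate into 32-bit lanes
}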
1822 
1823 // VPMAX, VPMIN (floating point)
1824 #define vpmax_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3000f00, (Dn), (Dm)) )
1825 #define vpmin_f32(Dn, Dm) ( __neon_DdDnDm( 0xf3200f00, (Dn), (Dm)) )
1826 
1827 // VPMAX, VPMIN (integer)
1828 #define vpmax_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100a00, (Dn), (Dm)) )
1829 #define vpmax_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200a00, (Dn), (Dm)) )
1830 #define vpmax_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000a00, (Dn), (Dm)) )
1831 #define vpmax_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100a00, (Dn), (Dm)) )
1832 #define vpmax_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200a00, (Dn), (Dm)) )
1833 #define vpmax_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000a00, (Dn), (Dm)) )
1834 #define vpmin_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100a10, (Dn), (Dm)) )
1835 #define vpmin_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200a10, (Dn), (Dm)) )
1836 #define vpmin_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000a10, (Dn), (Dm)) )
1837 #define vpmin_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100a10, (Dn), (Dm)) )
1838 #define vpmin_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200a10, (Dn), (Dm)) )
1839 #define vpmin_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000a10, (Dn), (Dm)) )
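
// Usage sketch (illustrative only; the helper name below is hypothetical):
// VPMAX reduces adjacent pairs, so applying it twice collapses four floats to
// their maximum, which is then read out with vget_lane_f32. Assumes
// float32x2_t/float32x4_t and vget_low_f32/vget_high_f32 declared elsewhere in
// this header.
__inline float32_t __example_horizontal_max(float32x4_t v)
{
    float32x2_t m = vpmax_f32(vget_low_f32(v), vget_high_f32(v));  // 4 lanes -> 2
    m = vpmax_f32(m, m);                                           // 2 lanes -> 1 (in both lanes)
    return vget_lane_f32(m, 0);
}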
1840 
1841 // VQABS, VQNEG
1842 #define vqabs_s16(Dm) ( __neon_DdDm( 0xf3b40700, (Dm)) )
1843 #define vqabs_s32(Dm) ( __neon_DdDm( 0xf3b80700, (Dm)) )
1844 #define vqabs_s8(Dm) ( __neon_DdDm( 0xf3b00700, (Dm)) )
1845 #define vqneg_s16(Dm) ( __neon_DdDm( 0xf3b40780, (Dm)) )
1846 #define vqneg_s32(Dm) ( __neon_DdDm( 0xf3b80780, (Dm)) )
1847 #define vqneg_s8(Dm) ( __neon_DdDm( 0xf3b00780, (Dm)) )
1848 #define vqabsq_s16(Qm) ( __neon_QdQm( 0xf3b40740, (Qm)) )
1849 #define vqabsq_s32(Qm) ( __neon_QdQm( 0xf3b80740, (Qm)) )
1850 #define vqabsq_s8(Qm) ( __neon_QdQm( 0xf3b00740, (Qm)) )
1851 #define vqnegq_s16(Qm) ( __neon_QdQm( 0xf3b407c0, (Qm)) )
1852 #define vqnegq_s32(Qm) ( __neon_QdQm( 0xf3b807c0, (Qm)) )
1853 #define vqnegq_s8(Qm) ( __neon_QdQm( 0xf3b007c0, (Qm)) )
1854 
1855 // VQADD
1856 #define vqadd_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100010, (Dn), (Dm)) )
1857 #define vqadd_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200010, (Dn), (Dm)) )
1858 #define vqadd_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2300010, (Dn), (Dm)) )
1859 #define vqadd_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000010, (Dn), (Dm)) )
1860 #define vqadd_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100010, (Dn), (Dm)) )
1861 #define vqadd_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200010, (Dn), (Dm)) )
1862 #define vqadd_u64(Dn, Dm) ( __neon_DdDnDm( 0xf3300010, (Dn), (Dm)) )
1863 #define vqadd_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000010, (Dn), (Dm)) )
1864 #define vqaddq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100050, (Qn), (Qm)) )
1865 #define vqaddq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200050, (Qn), (Qm)) )
1866 #define vqaddq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2300050, (Qn), (Qm)) )
1867 #define vqaddq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000050, (Qn), (Qm)) )
1868 #define vqaddq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100050, (Qn), (Qm)) )
1869 #define vqaddq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200050, (Qn), (Qm)) )
1870 #define vqaddq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf3300050, (Qn), (Qm)) )
1871 #define vqaddq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000050, (Qn), (Qm)) )
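
// Usage sketch (illustrative only; the helper name below is hypothetical):
// VQADD clamps instead of wrapping, so adding a constant to 8-bit pixel data
// cannot overflow past 255. Assumes uint8x16_t, vdupq_n_u8 and the VLD1/VST1
// intrinsics declared elsewhere in this header.
__inline void __example_brighten_u8(uint8_t *px, uint8_t delta)
{
    vst1q_u8(px, vqaddq_u8(vld1q_u8(px), vdupq_n_u8(delta)));   // saturates at 255
}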
1872 
1873 // VQDMLAL, VQDMLSL
1874 #define vqdmlal_s16(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2900900, (Qd), (Dn), (Dm)) )
1875 #define vqdmlal_s32(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2a00900, (Qd), (Dn), (Dm)) )
1876 #define vqdmlsl_s16(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2900b00, (Qd), (Dn), (Dm)) )
1877 #define vqdmlsl_s32(Qd, Dn, Dm) ( __neon_QdDnDm_acc( 0xf2a00b00, (Qd), (Dn), (Dm)) )
1878 
1879 // VQDMLAL, VQDMLSL (by scalar)
1880 #define vqdmlal_lane_s16(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDnDmx_acc( 0xf2900340 | _NENC_5x3(lane), (Qd), (Dn), (Dm)) )
1881 #define vqdmlal_lane_s32(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDnDmx_acc( 0xf2a00340 | _NENC_5(lane), (Qd), (Dn), (Dm)) )
1882 #define vqdmlsl_lane_s16(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDnDmx_acc( 0xf2900740 | _NENC_5x3(lane), (Qd), (Dn), (Dm)) )
1883 #define vqdmlsl_lane_s32(Qd, Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDnDmx_acc( 0xf2a00740 | _NENC_5(lane), (Qd), (Dn), (Dm)) )
1884 
1885 // VQDMULH (by scalar)
1886 #define vqdmulh_lane_s16(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDnDmx( 0xf2900c40 | _NENC_5x3(lane), (Dn), (Dm)) )
1887 #define vqdmulh_lane_s32(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx( 0xf2a00c40 | _NENC_5(lane), (Dn), (Dm)) )
1888 #define vqrdmulh_lane_s16(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_DdDnDmx( 0xf2900d40 | _NENC_5x3(lane), (Dn), (Dm)) )
1889 #define vqrdmulh_lane_s32(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_DdDnDmx( 0xf2a00d40 | _NENC_5(lane), (Dn), (Dm)) )
1890 #define vqdmulhq_lane_s16(Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdQnDmx( 0xf3900c40 | _NENC_5x3(lane), (Qn), (Dm)) )
1891 #define vqdmulhq_lane_s32(Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx( 0xf3a00c40 | _NENC_5(lane), (Qn), (Dm)) )
1892 #define vqrdmulhq_lane_s16(Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdQnDmx( 0xf3900d40 | _NENC_5x3(lane), (Qn), (Dm)) )
1893 #define vqrdmulhq_lane_s32(Qn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdQnDmx( 0xf3a00d40 | _NENC_5(lane), (Qn), (Dm)) )
1894 
1895 // VQDMULH, VQRDMULH
1896 #define vqdmulh_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100b00, (Dn), (Dm)) )
1897 #define vqdmulh_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200b00, (Dn), (Dm)) )
1898 #define vqrdmulh_s16(Dn, Dm) ( __neon_DdDnDm( 0xf3100b00, (Dn), (Dm)) )
1899 #define vqrdmulh_s32(Dn, Dm) ( __neon_DdDnDm( 0xf3200b00, (Dn), (Dm)) )
1900 #define vqdmulhq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100b40, (Qn), (Qm)) )
1901 #define vqdmulhq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200b40, (Qn), (Qm)) )
1902 #define vqrdmulhq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf3100b40, (Qn), (Qm)) )
1903 #define vqrdmulhq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf3200b40, (Qn), (Qm)) )
1904 
1905 // VQDMULL
1906 #define vqdmull_s16(Dn, Dm) ( __neon_QdDnDm( 0xf2900d00, (Dn), (Dm)) )
1907 #define vqdmull_s32(Dn, Dm) ( __neon_QdDnDm( 0xf2a00d00, (Dn), (Dm)) )
1908 
1909 // VQDMULL (by scalar)
1910 #define vqdmull_lane_s16(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_QdDnDmx( 0xf2900b40 | _NENC_5x3(lane), (Dn), (Dm)) )
1911 #define vqdmull_lane_s32(Dn, Dm, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_QdDnDmx( 0xf2a00b40 | _NENC_5(lane), (Dn), (Dm)) )
1912 
1913 // VQMOVN, VQMOVUN
1914 #define vqmovn_s16(Qm) ( __neon_DdQm( 0xf3b20280, (Qm)) )
1915 #define vqmovn_s32(Qm) ( __neon_DdQm( 0xf3b60280, (Qm)) )
1916 #define vqmovn_s64(Qm) ( __neon_DdQm( 0xf3ba0280, (Qm)) )
1917 #define vqmovn_u16(Qm) ( __neon_DdQm( 0xf3b202c0, (Qm)) )
1918 #define vqmovn_u32(Qm) ( __neon_DdQm( 0xf3b602c0, (Qm)) )
1919 #define vqmovn_u64(Qm) ( __neon_DdQm( 0xf3ba02c0, (Qm)) )
1920 #define vqmovun_s16(Qm) ( __neon_DdQm( 0xf3b20240, (Qm)) )
1921 #define vqmovun_s32(Qm) ( __neon_DdQm( 0xf3b60240, (Qm)) )
1922 #define vqmovun_s64(Qm) ( __neon_DdQm( 0xf3ba0240, (Qm)) )
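
// Usage sketch (illustrative only; the helper name below is hypothetical):
// VQMOVUN narrows signed lanes to unsigned ones with saturation, the usual
// last step when signed 16-bit filter results have to become 0..255 pixel
// bytes. Assumes int16x8_t/uint8x8_t and the VST1 intrinsics declared
// elsewhere in this header.
__inline void __example_store_pixels(uint8_t *dst, int16x8_t filtered)
{
    vst1_u8(dst, vqmovun_s16(filtered));    // < 0 -> 0, > 255 -> 255
}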
1923 
1924 // VQSHL, VQSHLU (immediate)
1925 #define vqshl_n_s16(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_DdDm( 0xf2900710 | _NENC_19_16(shift_amount), (Dm)) )
1926 #define vqshl_n_s32(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_DdDm( 0xf2a00710 | _NENC_20_16(shift_amount), (Dm)) )
1927 #define vqshl_n_s64(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_DdDm( 0xf2800790 | _NENC_21_16(shift_amount), (Dm)) )
1928 #define vqshl_n_s8(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_DdDm( 0xf2880710 | _NENC_18_16(shift_amount), (Dm)) )
1929 #define vqshl_n_u16(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_DdDm( 0xf3900710 | _NENC_19_16(shift_amount), (Dm)) )
1930 #define vqshl_n_u32(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_DdDm( 0xf3a00710 | _NENC_20_16(shift_amount), (Dm)) )
1931 #define vqshl_n_u64(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_DdDm( 0xf3800790 | _NENC_21_16(shift_amount), (Dm)) )
1932 #define vqshl_n_u8(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_DdDm( 0xf3880710 | _NENC_18_16(shift_amount), (Dm)) )
1933 #define vqshlu_n_s16(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_DdDm( 0xf3900610 | _NENC_19_16(shift_amount), (Dm)) )
1934 #define vqshlu_n_s32(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_DdDm( 0xf3a00610 | _NENC_20_16(shift_amount), (Dm)) )
1935 #define vqshlu_n_s64(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_DdDm( 0xf3800690 | _NENC_21_16(shift_amount), (Dm)) )
1936 #define vqshlu_n_s8(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_DdDm( 0xf3880610 | _NENC_18_16(shift_amount), (Dm)) )
1937 #define vqshlq_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_QdQm( 0xf2900750 | _NENC_19_16(shift_amount), (Qm)) )
1938 #define vqshlq_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_QdQm( 0xf2a00750 | _NENC_20_16(shift_amount), (Qm)) )
1939 #define vqshlq_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_QdQm( 0xf28007d0 | _NENC_21_16(shift_amount), (Qm)) )
1940 #define vqshlq_n_s8(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_QdQm( 0xf2880750 | _NENC_18_16(shift_amount), (Qm)) )
1941 #define vqshlq_n_u16(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_QdQm( 0xf3900750 | _NENC_19_16(shift_amount), (Qm)) )
1942 #define vqshlq_n_u32(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_QdQm( 0xf3a00750 | _NENC_20_16(shift_amount), (Qm)) )
1943 #define vqshlq_n_u64(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_QdQm( 0xf38007d0 | _NENC_21_16(shift_amount), (Qm)) )
1944 #define vqshlq_n_u8(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_QdQm( 0xf3880750 | _NENC_18_16(shift_amount), (Qm)) )
1945 #define vqshluq_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_QdQm( 0xf3900650 | _NENC_19_16(shift_amount), (Qm)) )
1946 #define vqshluq_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_QdQm( 0xf3a00650 | _NENC_20_16(shift_amount), (Qm)) )
1947 #define vqshluq_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_QdQm( 0xf38006d0 | _NENC_21_16(shift_amount), (Qm)) )
1948 #define vqshluq_n_s8(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_QdQm( 0xf3880650 | _NENC_18_16(shift_amount), (Qm)) )
1949 
1950 // VQSHRN, VQSHRUN, VQRSHRN, VQRSHRUN (immediate)
1951 #define vqrshrn_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf2880950 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
1952 #define vqrshrn_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf2900950 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
1953 #define vqrshrn_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf2a00950 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
1954 #define vqrshrn_n_u16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf3880950 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
1955 #define vqrshrn_n_u32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf3900950 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
1956 #define vqrshrn_n_u64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf3a00950 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
1957 #define vqrshrun_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf3880850 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
1958 #define vqrshrun_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf3900850 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
1959 #define vqrshrun_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf3a00850 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
1960 #define vqshrn_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf2880910 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
1961 #define vqshrn_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf2900910 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
1962 #define vqshrn_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf2a00910 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
1963 #define vqshrn_n_u16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf3880910 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
1964 #define vqshrn_n_u32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf3900910 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
1965 #define vqshrn_n_u64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf3a00910 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
1966 #define vqshrun_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf3880810 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
1967 #define vqshrun_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf3900810 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
1968 #define vqshrun_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf3a00810 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
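// Usage sketch (illustrative; not part of the original header): these narrowing
// shifts take a Q (128-bit) source and return a D (64-bit) result with half the
// element width, e.g. converting Q16.16 fixed point back to saturated 16-bit
// integers. Typedefs and the helper name are assumptions.
__inline int16x4_t example_narrow_q16_to_s16(int32x4_t fixed_q16)
{
    // round, shift right by 16, then saturate each 32-bit lane into 16 bits
    return vqrshrn_n_s32(fixed_q16, 16);
}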
1969 
1970 // VQSUB
1971 #define vqsub_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100210, (Dn), (Dm)) )
1972 #define vqsub_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200210, (Dn), (Dm)) )
1973 #define vqsub_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2300210, (Dn), (Dm)) )
1974 #define vqsub_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000210, (Dn), (Dm)) )
1975 #define vqsub_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100210, (Dn), (Dm)) )
1976 #define vqsub_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200210, (Dn), (Dm)) )
1977 #define vqsub_u64(Dn, Dm) ( __neon_DdDnDm( 0xf3300210, (Dn), (Dm)) )
1978 #define vqsub_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000210, (Dn), (Dm)) )
1979 #define vqsubq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100250, (Qn), (Qm)) )
1980 #define vqsubq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200250, (Qn), (Qm)) )
1981 #define vqsubq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2300250, (Qn), (Qm)) )
1982 #define vqsubq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000250, (Qn), (Qm)) )
1983 #define vqsubq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100250, (Qn), (Qm)) )
1984 #define vqsubq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200250, (Qn), (Qm)) )
1985 #define vqsubq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf3300250, (Qn), (Qm)) )
1986 #define vqsubq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000250, (Qn), (Qm)) )
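// Usage sketch (illustrative; not part of the original header): VQSUB subtracts
// per lane and clamps to the element type's range instead of wrapping, e.g. for
// pixel arithmetic that must not underflow. Typedefs/helper name are assumptions.
__inline uint8x16_t example_sub_clamp_u8(uint8x16_t a, uint8x16_t b)
{
    return vqsubq_u8(a, b);   // each lane: max(a - b, 0)
}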
1987 
1988 // VRECPE, VRSQRTE
1989 #define vrecpe_f32(Dm) ( __neon_DdDm( 0xf3bb0500, (Dm)) )
1990 #define vrecpe_u32(Dm) ( __neon_DdDm( 0xf3bb0400, (Dm)) )
1991 #define vrsqrte_f32(Dm) ( __neon_DdDm( 0xf3bb0580, (Dm)) )
1992 #define vrsqrte_u32(Dm) ( __neon_DdDm( 0xf3bb0480, (Dm)) )
1993 #define vrecpeq_f32(Qm) ( __neon_QdQm( 0xf3bb0540, (Qm)) )
1994 #define vrecpeq_u32(Qm) ( __neon_QdQm( 0xf3bb0440, (Qm)) )
1995 #define vrsqrteq_f32(Qm) ( __neon_QdQm( 0xf3bb05c0, (Qm)) )
1996 #define vrsqrteq_u32(Qm) ( __neon_QdQm( 0xf3bb04c0, (Qm)) )
1997 
1998 // VRECPS
1999 #define vrecps_f32(Dn, Dm) ( __neon_DdDnDm( 0xf2000f10, (Dn), (Dm)) )
2000 #define vrecpsq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2000f50, (Qn), (Qm)) )
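// Usage sketch (illustrative; not part of the original header): VRECPE gives a
// low-precision reciprocal estimate and VRECPS supplies the Newton-Raphson
// correction factor (2 - d*x), so each refinement step is a VRECPS followed by a
// multiply. vmulq_f32 and float32x4_t are assumed to come from elsewhere in this
// header; the helper name is hypothetical.
__inline float32x4_t example_reciprocal_f32(float32x4_t d)
{
    float32x4_t x = vrecpeq_f32(d);          // rough estimate of 1/d
    x = vmulq_f32(x, vrecpsq_f32(d, x));     // one Newton-Raphson step
    x = vmulq_f32(x, vrecpsq_f32(d, x));     // second step for near-full precision
    return x;
}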
2001 
2002 // VREV
2003 #define vrev16_p8(Dm) ( __neon_DdDm( 0xf3b00100, (Dm)) )
2004 #define vrev16_s8(Dm) ( __neon_DdDm( 0xf3b00100, (Dm)) )
2005 #define vrev16_u8(Dm) ( __neon_DdDm( 0xf3b00100, (Dm)) )
2006 #define vrev32_p16(Dm) ( __neon_DdDm( 0xf3b40080, (Dm)) )
2007 #define vrev32_p8(Dm) ( __neon_DdDm( 0xf3b00080, (Dm)) )
2008 #define vrev32_s16(Dm) ( __neon_DdDm( 0xf3b40080, (Dm)) )
2009 #define vrev32_s8(Dm) ( __neon_DdDm( 0xf3b00080, (Dm)) )
2010 #define vrev32_u16(Dm) ( __neon_DdDm( 0xf3b40080, (Dm)) )
2011 #define vrev32_u8(Dm) ( __neon_DdDm( 0xf3b00080, (Dm)) )
2012 #define vrev64_f32(Dm) ( __neon_DdDm( 0xf3b80000, (Dm)) )
2013 #define vrev64_p16(Dm) ( __neon_DdDm( 0xf3b40000, (Dm)) )
2014 #define vrev64_p8(Dm) ( __neon_DdDm( 0xf3b00000, (Dm)) )
2015 #define vrev64_s16(Dm) ( __neon_DdDm( 0xf3b40000, (Dm)) )
2016 #define vrev64_s32(Dm) ( __neon_DdDm( 0xf3b80000, (Dm)) )
2017 #define vrev64_s8(Dm) ( __neon_DdDm( 0xf3b00000, (Dm)) )
2018 #define vrev64_u16(Dm) ( __neon_DdDm( 0xf3b40000, (Dm)) )
2019 #define vrev64_u32(Dm) ( __neon_DdDm( 0xf3b80000, (Dm)) )
2020 #define vrev64_u8(Dm) ( __neon_DdDm( 0xf3b00000, (Dm)) )
2021 #define vrev16q_p8(Qm) ( __neon_QdQm( 0xf3b00140, (Qm)) )
2022 #define vrev16q_s8(Qm) ( __neon_QdQm( 0xf3b00140, (Qm)) )
2023 #define vrev16q_u8(Qm) ( __neon_QdQm( 0xf3b00140, (Qm)) )
2024 #define vrev32q_p16(Qm) ( __neon_QdQm( 0xf3b400c0, (Qm)) )
2025 #define vrev32q_p8(Qm) ( __neon_QdQm( 0xf3b000c0, (Qm)) )
2026 #define vrev32q_s16(Qm) ( __neon_QdQm( 0xf3b400c0, (Qm)) )
2027 #define vrev32q_s8(Qm) ( __neon_QdQm( 0xf3b000c0, (Qm)) )
2028 #define vrev32q_u16(Qm) ( __neon_QdQm( 0xf3b400c0, (Qm)) )
2029 #define vrev32q_u8(Qm) ( __neon_QdQm( 0xf3b000c0, (Qm)) )
2030 #define vrev64q_f32(Qm) ( __neon_QdQm( 0xf3b80040, (Qm)) )
2031 #define vrev64q_p16(Qm) ( __neon_QdQm( 0xf3b40040, (Qm)) )
2032 #define vrev64q_p8(Qm) ( __neon_QdQm( 0xf3b00040, (Qm)) )
2033 #define vrev64q_s16(Qm) ( __neon_QdQm( 0xf3b40040, (Qm)) )
2034 #define vrev64q_s32(Qm) ( __neon_QdQm( 0xf3b80040, (Qm)) )
2035 #define vrev64q_s8(Qm) ( __neon_QdQm( 0xf3b00040, (Qm)) )
2036 #define vrev64q_u16(Qm) ( __neon_QdQm( 0xf3b40040, (Qm)) )
2037 #define vrev64q_u32(Qm) ( __neon_QdQm( 0xf3b80040, (Qm)) )
2038 #define vrev64q_u8(Qm) ( __neon_QdQm( 0xf3b00040, (Qm)) )
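// Usage sketch (illustrative; not part of the original header): VREV reverses the
// elements inside 16-, 32- or 64-bit containers; for example, vrev16q_u8 swaps the
// two bytes of every halfword, i.e. a vectorized 16-bit byte-order swap.
// Typedefs/helper name are assumptions.
__inline uint8x16_t example_bswap16_lanes(uint8x16_t bytes)
{
    return vrev16q_u8(bytes);   // byte-swap each 16-bit halfword
}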
2039 
2040 // VRINT
2041 #define vrnd_f32(Dm) ( __neon_DdDm( 0xf3ba0580, (Dm)) )
2042 #define vrnda_f32(Dm) ( __neon_DdDm( 0xf3ba0500, (Dm)) )
2043 #define vrndm_f32(Dm) ( __neon_DdDm( 0xf3ba0680, (Dm)) )
2044 #define vrndn_f32(Dm) ( __neon_DdDm( 0xf3ba0400, (Dm)) )
2045 #define vrndp_f32(Dm) ( __neon_DdDm( 0xf3ba0780, (Dm)) )
2046 #define vrndx_f32(Dm) ( __neon_DdDm( 0xf3ba0480, (Dm)) )
2047 #define vrndq_f32(Qm) ( __neon_QdQm( 0xf3ba05c0, (Qm)) )
2048 #define vrndaq_f32(Qm) ( __neon_QdQm( 0xf3ba0540, (Qm)) )
2049 #define vrndmq_f32(Qm) ( __neon_QdQm( 0xf3ba06c0, (Qm)) )
2050 #define vrndnq_f32(Qm) ( __neon_QdQm( 0xf3ba0440, (Qm)) )
2051 #define vrndpq_f32(Qm) ( __neon_QdQm( 0xf3ba07c0, (Qm)) )
2052 #define vrndxq_f32(Qm) ( __neon_QdQm( 0xf3ba04c0, (Qm)) )
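// Usage sketch (illustrative; not part of the original header): the VRINT forms
// round to an integral value while staying in floating point; the suffix selects
// the rounding mode (a = nearest, ties away; m = toward -inf; n = nearest even;
// p = toward +inf; x = current mode, signalling inexact; no suffix = toward zero).
// Typedefs/helper name are assumptions.
__inline float32x4_t example_floor_f32(float32x4_t v)
{
    return vrndmq_f32(v);   // round each lane toward minus infinity
}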
2053 
2054 // VRSQRTS
2055 #define vrsqrts_f32(Dn, Dm) ( __neon_DdDnDm( 0xf2200f10, (Dn), (Dm)) )
2056 #define vrsqrtsq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2200f50, (Qn), (Qm)) )
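// Usage sketch (illustrative; not part of the original header): VRSQRTS is the
// Newton-Raphson correction term (3 - a*b)/2 used to refine a VRSQRTE estimate of
// 1/sqrt(d). vmulq_f32 and float32x4_t are assumed from elsewhere in this header;
// the helper name is hypothetical.
__inline float32x4_t example_inv_sqrt_f32(float32x4_t d)
{
    float32x4_t x = vrsqrteq_f32(d);                       // rough 1/sqrt(d)
    x = vmulq_f32(x, vrsqrtsq_f32(vmulq_f32(d, x), x));    // refinement step
    x = vmulq_f32(x, vrsqrtsq_f32(vmulq_f32(d, x), x));    // second refinement step
    return x;
}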
2057 
2058 // VSHL (immediate)
2059 #define vshl_n_s16(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_DdDm( 0xf2900510 | _NENC_19_16(shift_amount), (Dm)) )
2060 #define vshl_n_s32(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_DdDm( 0xf2a00510 | _NENC_20_16(shift_amount), (Dm)) )
2061 #define vshl_n_s64(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_DdDm( 0xf2800590 | _NENC_21_16(shift_amount), (Dm)) )
2062 #define vshl_n_s8(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_DdDm( 0xf2880510 | _NENC_18_16(shift_amount), (Dm)) )
2063 #define vshl_n_u16(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_DdDm( 0xf2900510 | _NENC_19_16(shift_amount), (Dm)) )
2064 #define vshl_n_u32(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_DdDm( 0xf2a00510 | _NENC_20_16(shift_amount), (Dm)) )
2065 #define vshl_n_u64(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_DdDm( 0xf2800590 | _NENC_21_16(shift_amount), (Dm)) )
2066 #define vshl_n_u8(Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_DdDm( 0xf2880510 | _NENC_18_16(shift_amount), (Dm)) )
2067 #define vshlq_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_QdQm( 0xf2900550 | _NENC_19_16(shift_amount), (Qm)) )
2068 #define vshlq_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_QdQm( 0xf2a00550 | _NENC_20_16(shift_amount), (Qm)) )
2069 #define vshlq_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_QdQm( 0xf28005d0 | _NENC_21_16(shift_amount), (Qm)) )
2070 #define vshlq_n_s8(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_QdQm( 0xf2880550 | _NENC_18_16(shift_amount), (Qm)) )
2071 #define vshlq_n_u16(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_QdQm( 0xf2900550 | _NENC_19_16(shift_amount), (Qm)) )
2072 #define vshlq_n_u32(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_QdQm( 0xf2a00550 | _NENC_20_16(shift_amount), (Qm)) )
2073 #define vshlq_n_u64(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_QdQm( 0xf28005d0 | _NENC_21_16(shift_amount), (Qm)) )
2074 #define vshlq_n_u8(Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_QdQm( 0xf2880550 | _NENC_18_16(shift_amount), (Qm)) )
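// Usage sketch (illustrative; not part of the original header): the immediate forms
// shift every lane left by the same constant, which the macros validate at compile
// time through __static_assert. Typedefs/helper name are assumptions.
__inline uint16x8_t example_scale_by_8(uint16x8_t v)
{
    return vshlq_n_u16(v, 3);   // each 16-bit lane: v << 3 (multiply by 8, wrapping)
}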
2075 
2076 // VSHL, VQSHL, VRSHL, VQRSHL (register)
2077 #define vqrshl_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100510, (Dm), (Dn)) )
2078 #define vqrshl_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200510, (Dm), (Dn)) )
2079 #define vqrshl_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2300510, (Dm), (Dn)) )
2080 #define vqrshl_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000510, (Dm), (Dn)) )
2081 #define vqrshl_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100510, (Dm), (Dn)) )
2082 #define vqrshl_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200510, (Dm), (Dn)) )
2083 #define vqrshl_u64(Dn, Dm) ( __neon_DdDnDm( 0xf3300510, (Dm), (Dn)) )
2084 #define vqrshl_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000510, (Dm), (Dn)) )
2085 #define vqshl_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100410, (Dm), (Dn)) )
2086 #define vqshl_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200410, (Dm), (Dn)) )
2087 #define vqshl_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2300410, (Dm), (Dn)) )
2088 #define vqshl_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000410, (Dm), (Dn)) )
2089 #define vqshl_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100410, (Dm), (Dn)) )
2090 #define vqshl_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200410, (Dm), (Dn)) )
2091 #define vqshl_u64(Dn, Dm) ( __neon_DdDnDm( 0xf3300410, (Dm), (Dn)) )
2092 #define vqshl_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000410, (Dm), (Dn)) )
2093 #define vrshl_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100500, (Dm), (Dn)) )
2094 #define vrshl_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200500, (Dm), (Dn)) )
2095 #define vrshl_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2300500, (Dm), (Dn)) )
2096 #define vrshl_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000500, (Dm), (Dn)) )
2097 #define vrshl_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100500, (Dm), (Dn)) )
2098 #define vrshl_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200500, (Dm), (Dn)) )
2099 #define vrshl_u64(Dn, Dm) ( __neon_DdDnDm( 0xf3300500, (Dm), (Dn)) )
2100 #define vrshl_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000500, (Dm), (Dn)) )
2101 #define vshl_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100400, (Dm), (Dn)) )
2102 #define vshl_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200400, (Dm), (Dn)) )
2103 #define vshl_s64(Dn, Dm) ( __neon_DdDnDm( 0xf2300400, (Dm), (Dn)) )
2104 #define vshl_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000400, (Dm), (Dn)) )
2105 #define vshl_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100400, (Dm), (Dn)) )
2106 #define vshl_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200400, (Dm), (Dn)) )
2107 #define vshl_u64(Dn, Dm) ( __neon_DdDnDm( 0xf3300400, (Dm), (Dn)) )
2108 #define vshl_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000400, (Dm), (Dn)) )
2109 #define vqrshlq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100550, (Qm), (Qn)) )
2110 #define vqrshlq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200550, (Qm), (Qn)) )
2111 #define vqrshlq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2300550, (Qm), (Qn)) )
2112 #define vqrshlq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000550, (Qm), (Qn)) )
2113 #define vqrshlq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100550, (Qm), (Qn)) )
2114 #define vqrshlq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200550, (Qm), (Qn)) )
2115 #define vqrshlq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf3300550, (Qm), (Qn)) )
2116 #define vqrshlq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000550, (Qm), (Qn)) )
2117 #define vqshlq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100450, (Qm), (Qn)) )
2118 #define vqshlq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200450, (Qm), (Qn)) )
2119 #define vqshlq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2300450, (Qm), (Qn)) )
2120 #define vqshlq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000450, (Qm), (Qn)) )
2121 #define vqshlq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100450, (Qm), (Qn)) )
2122 #define vqshlq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200450, (Qm), (Qn)) )
2123 #define vqshlq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf3300450, (Qm), (Qn)) )
2124 #define vqshlq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000450, (Qm), (Qn)) )
2125 #define vrshlq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100540, (Qm), (Qn)) )
2126 #define vrshlq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200540, (Qm), (Qn)) )
2127 #define vrshlq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2300540, (Qm), (Qn)) )
2128 #define vrshlq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000540, (Qm), (Qn)) )
2129 #define vrshlq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100540, (Qm), (Qn)) )
2130 #define vrshlq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200540, (Qm), (Qn)) )
2131 #define vrshlq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf3300540, (Qm), (Qn)) )
2132 #define vrshlq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000540, (Qm), (Qn)) )
2133 #define vshlq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100440, (Qm), (Qn)) )
2134 #define vshlq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200440, (Qm), (Qn)) )
2135 #define vshlq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf2300440, (Qm), (Qn)) )
2136 #define vshlq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000440, (Qm), (Qn)) )
2137 #define vshlq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100440, (Qm), (Qn)) )
2138 #define vshlq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200440, (Qm), (Qn)) )
2139 #define vshlq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf3300440, (Qm), (Qn)) )
2140 #define vshlq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000440, (Qm), (Qn)) )
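// Usage sketch (illustrative; not part of the original header): in the register
// forms the per-lane shift count is signed, so a negative count shifts right; note
// that the macros swap the operands so the first argument is the value being
// shifted, matching the usual vshl(value, counts) intrinsic signature.
// Typedefs/helper name are assumptions.
__inline int16x8_t example_variable_shift(int16x8_t values, int16x8_t counts)
{
    return vshlq_s16(values, counts);   // lane i: c >= 0 ? v << c : v >> -c
}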
2141 
2142 // VSHLL (shift_amount != size)
2143 #define __internal_vshll_n_t1_s16(Dm, shift_amount) ( __neon_QdDm( 0xf2900a10 | _NENC_19_16(shift_amount), (Dm)) )
2144 #define __internal_vshll_n_t1_s32(Dm, shift_amount) ( __neon_QdDm( 0xf2a00a10 | _NENC_20_16(shift_amount), (Dm)) )
2145 #define __internal_vshll_n_t1_s8(Dm, shift_amount) ( __neon_QdDm( 0xf2880a10 | _NENC_18_16(shift_amount), (Dm)) )
2146 #define __internal_vshll_n_t1_u16(Dm, shift_amount) ( __neon_QdDm( 0xf3900a10 | _NENC_19_16(shift_amount), (Dm)) )
2147 #define __internal_vshll_n_t1_u32(Dm, shift_amount) ( __neon_QdDm( 0xf3a00a10 | _NENC_20_16(shift_amount), (Dm)) )
2148 #define __internal_vshll_n_t1_u8(Dm, shift_amount) ( __neon_QdDm( 0xf3880a10 | _NENC_18_16(shift_amount), (Dm)) )
2149 
2150 // VSHLL (shift_amount == size)
2151 #define __internal_vshll_n_t2_s16(Dm) ( __neon_QdDm( 0xf3b60300, (Dm)) )
2152 #define __internal_vshll_n_t2_s32(Dm) ( __neon_QdDm( 0xf3ba0300, (Dm)) )
2153 #define __internal_vshll_n_t2_s8(Dm) ( __neon_QdDm( 0xf3b20300, (Dm)) )
2154 #define __internal_vshll_n_t2_u16(Dm) ( __neon_QdDm( 0xf3b60300, (Dm)) )
2155 #define __internal_vshll_n_t2_u32(Dm) ( __neon_QdDm( 0xf3ba0300, (Dm)) )
2156 #define __internal_vshll_n_t2_u8(Dm) ( __neon_QdDm( 0xf3b20300, (Dm)) )
2157 
2158 // VSHR, VRSHR (immediate)
2159 #define vrshr_n_s16(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm( 0xf2900210 | _NENC_19_16(16 - (shift_amount)), (Dm)) )
2160 #define vrshr_n_s32(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm( 0xf2a00210 | _NENC_20_16(32 - (shift_amount)), (Dm)) )
2161 #define vrshr_n_s64(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm( 0xf2800290 | _NENC_21_16(64 - (shift_amount)), (Dm)) )
2162 #define vrshr_n_s8(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm( 0xf2880210 | _NENC_18_16(8 - (shift_amount)), (Dm)) )
2163 #define vrshr_n_u16(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm( 0xf3900210 | _NENC_19_16(16 - (shift_amount)), (Dm)) )
2164 #define vrshr_n_u32(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm( 0xf3a00210 | _NENC_20_16(32 - (shift_amount)), (Dm)) )
2165 #define vrshr_n_u64(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm( 0xf3800290 | _NENC_21_16(64 - (shift_amount)), (Dm)) )
2166 #define vrshr_n_u8(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm( 0xf3880210 | _NENC_18_16(8 - (shift_amount)), (Dm)) )
2167 #define vshr_n_s16(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm( 0xf2900010 | _NENC_19_16(16 - (shift_amount)), (Dm)) )
2168 #define vshr_n_s32(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm( 0xf2a00010 | _NENC_20_16(32 - (shift_amount)), (Dm)) )
2169 #define vshr_n_s64(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm( 0xf2800090 | _NENC_21_16(64 - (shift_amount)), (Dm)) )
2170 #define vshr_n_s8(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm( 0xf2880010 | _NENC_18_16(8 - (shift_amount)), (Dm)) )
2171 #define vshr_n_u16(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm( 0xf3900010 | _NENC_19_16(16 - (shift_amount)), (Dm)) )
2172 #define vshr_n_u32(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm( 0xf3a00010 | _NENC_20_16(32 - (shift_amount)), (Dm)) )
2173 #define vshr_n_u64(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm( 0xf3800090 | _NENC_21_16(64 - (shift_amount)), (Dm)) )
2174 #define vshr_n_u8(Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm( 0xf3880010 | _NENC_18_16(8 - (shift_amount)), (Dm)) )
2175 #define vrshrq_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm( 0xf2900250 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
2176 #define vrshrq_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm( 0xf2a00250 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
2177 #define vrshrq_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm( 0xf28002d0 | _NENC_21_16(64 - (shift_amount)), (Qm)) )
2178 #define vrshrq_n_s8(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm( 0xf2880250 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
2179 #define vrshrq_n_u16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm( 0xf3900250 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
2180 #define vrshrq_n_u32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm( 0xf3a00250 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
2181 #define vrshrq_n_u64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm( 0xf38002d0 | _NENC_21_16(64 - (shift_amount)), (Qm)) )
2182 #define vrshrq_n_u8(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm( 0xf3880250 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
2183 #define vshrq_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm( 0xf2900050 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
2184 #define vshrq_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm( 0xf2a00050 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
2185 #define vshrq_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm( 0xf28000d0 | _NENC_21_16(64 - (shift_amount)), (Qm)) )
2186 #define vshrq_n_s8(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm( 0xf2880050 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
2187 #define vshrq_n_u16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm( 0xf3900050 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
2188 #define vshrq_n_u32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm( 0xf3a00050 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
2189 #define vshrq_n_u64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm( 0xf38000d0 | _NENC_21_16(64 - (shift_amount)), (Qm)) )
2190 #define vshrq_n_u8(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm( 0xf3880050 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
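// Usage sketch (illustrative; not part of the original header): VSHR shifts every
// lane right by a constant (arithmetic for signed, logical for unsigned types);
// the VRSHR forms add the rounding constant first, which suits averaging or
// rescaling accumulators. Typedefs/helper name are assumptions.
__inline uint8x16_t example_halve_rounded(uint8x16_t v)
{
    return vrshrq_n_u8(v, 1);   // each lane: (v + 1) >> 1
}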
2191 
2192 // VSHRN, VRSHRN (immediate)
2193 #define vrshrn_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf2880850 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
2194 #define vrshrn_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf2900850 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
2195 #define vrshrn_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf2a00850 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
2196 #define vrshrn_n_u16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf2880850 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
2197 #define vrshrn_n_u32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf2900850 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
2198 #define vrshrn_n_u64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf2a00850 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
2199 #define vshrn_n_s16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf2880810 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
2200 #define vshrn_n_s32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf2900810 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
2201 #define vshrn_n_s64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf2a00810 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
2202 #define vshrn_n_u16(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdQm( 0xf2880810 | _NENC_18_16(8 - (shift_amount)), (Qm)) )
2203 #define vshrn_n_u32(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdQm( 0xf2900810 | _NENC_19_16(16 - (shift_amount)), (Qm)) )
2204 #define vshrn_n_u64(Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdQm( 0xf2a00810 | _NENC_20_16(32 - (shift_amount)), (Qm)) )
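// Usage sketch (illustrative; not part of the original header): VSHRN shifts each
// Q-register lane right and keeps only the low half of every element, narrowing
// e.g. 32-bit lanes to 16-bit without saturation (compare the VQSHRN group above).
// Typedefs/helper name are assumptions.
__inline int16x4_t example_narrow_truncate(int32x4_t v)
{
    return vshrn_n_s32(v, 16);   // each lane: (int16_t)(v >> 16)
}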
2205 
2206 // VSLI (immediate)
2207 #define vsli_n_p16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_DdDm_acc( 0xf3900510 | _NENC_19_16(shift_amount), (Dd), (Dm)) )
2208 #define vsli_n_p8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_DdDm_acc( 0xf3880510 | _NENC_18_16(shift_amount), (Dd), (Dm)) )
2209 #define vsli_n_s16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_DdDm_acc( 0xf3900510 | _NENC_19_16(shift_amount), (Dd), (Dm)) )
2210 #define vsli_n_s32(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_DdDm_acc( 0xf3a00510 | _NENC_20_16(shift_amount), (Dd), (Dm)) )
2211 #define vsli_n_s64(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_DdDm_acc( 0xf3800590 | _NENC_21_16(shift_amount), (Dd), (Dm)) )
2212 #define vsli_n_s8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_DdDm_acc( 0xf3880510 | _NENC_18_16(shift_amount), (Dd), (Dm)) )
2213 #define vsli_n_u16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_DdDm_acc( 0xf3900510 | _NENC_19_16(shift_amount), (Dd), (Dm)) )
2214 #define vsli_n_u32(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_DdDm_acc( 0xf3a00510 | _NENC_20_16(shift_amount), (Dd), (Dm)) )
2215 #define vsli_n_u64(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_DdDm_acc( 0xf3800590 | _NENC_21_16(shift_amount), (Dd), (Dm)) )
2216 #define vsli_n_u8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_DdDm_acc( 0xf3880510 | _NENC_18_16(shift_amount), (Dd), (Dm)) )
2217 #define vsliq_n_p16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_QdQm_acc( 0xf3900550 | _NENC_19_16(shift_amount), (Qd), (Qm)) )
2218 #define vsliq_n_p8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_QdQm_acc( 0xf3880550 | _NENC_18_16(shift_amount), (Qd), (Qm)) )
2219 #define vsliq_n_s16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_QdQm_acc( 0xf3900550 | _NENC_19_16(shift_amount), (Qd), (Qm)) )
2220 #define vsliq_n_s32(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_QdQm_acc( 0xf3a00550 | _NENC_20_16(shift_amount), (Qd), (Qm)) )
2221 #define vsliq_n_s64(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_QdQm_acc( 0xf38005d0 | _NENC_21_16(shift_amount), (Qd), (Qm)) )
2222 #define vsliq_n_s8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_QdQm_acc( 0xf3880550 | _NENC_18_16(shift_amount), (Qd), (Qm)) )
2223 #define vsliq_n_u16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 16, "invalid shift amount"), __neon_QdQm_acc( 0xf3900550 | _NENC_19_16(shift_amount), (Qd), (Qm)) )
2224 #define vsliq_n_u32(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 32, "invalid shift amount"), __neon_QdQm_acc( 0xf3a00550 | _NENC_20_16(shift_amount), (Qd), (Qm)) )
2225 #define vsliq_n_u64(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 64, "invalid shift amount"), __neon_QdQm_acc( 0xf38005d0 | _NENC_21_16(shift_amount), (Qd), (Qm)) )
2226 #define vsliq_n_u8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 0 && (shift_amount) < 8, "invalid shift amount"), __neon_QdQm_acc( 0xf3880550 | _NENC_18_16(shift_amount), (Qd), (Qm)) )
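// Usage sketch (illustrative; not part of the original header): VSLI shifts the
// source left and inserts it into the destination while preserving the destination's
// low shift_amount bits, which makes it handy for packing bit fields.
// Typedefs/helper name are assumptions.
__inline uint32x4_t example_pack_fields(uint32x4_t low8, uint32x4_t high)
{
    // each lane: (high << 8) merged with the low 8 bits of low8
    return vsliq_n_u32(low8, high, 8);
}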
2227 
2228 // VSRA, VRSRA (immediate)
2229 #define vrsra_n_s16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm_acc( 0xf2900310 | _NENC_19_16(16 - (shift_amount)), (Dd), (Dm)) )
2230 #define vrsra_n_s32(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm_acc( 0xf2a00310 | _NENC_20_16(32 - (shift_amount)), (Dd), (Dm)) )
2231 #define vrsra_n_s64(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm_acc( 0xf2800390 | _NENC_21_16(64 - (shift_amount)), (Dd), (Dm)) )
2232 #define vrsra_n_s8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm_acc( 0xf2880310 | _NENC_18_16(8 - (shift_amount)), (Dd), (Dm)) )
2233 #define vrsra_n_u16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm_acc( 0xf3900310 | _NENC_19_16(16 - (shift_amount)), (Dd), (Dm)) )
2234 #define vrsra_n_u32(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm_acc( 0xf3a00310 | _NENC_20_16(32 - (shift_amount)), (Dd), (Dm)) )
2235 #define vrsra_n_u64(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm_acc( 0xf3800390 | _NENC_21_16(64 - (shift_amount)), (Dd), (Dm)) )
2236 #define vrsra_n_u8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm_acc( 0xf3880310 | _NENC_18_16(8 - (shift_amount)), (Dd), (Dm)) )
2237 #define vsra_n_s16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm_acc( 0xf2900110 | _NENC_19_16(16 - (shift_amount)), (Dd), (Dm)) )
2238 #define vsra_n_s32(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm_acc( 0xf2a00110 | _NENC_20_16(32 - (shift_amount)), (Dd), (Dm)) )
2239 #define vsra_n_s64(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm_acc( 0xf2800190 | _NENC_21_16(64 - (shift_amount)), (Dd), (Dm)) )
2240 #define vsra_n_s8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm_acc( 0xf2880110 | _NENC_18_16(8 - (shift_amount)), (Dd), (Dm)) )
2241 #define vsra_n_u16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm_acc( 0xf3900110 | _NENC_19_16(16 - (shift_amount)), (Dd), (Dm)) )
2242 #define vsra_n_u32(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm_acc( 0xf3a00110 | _NENC_20_16(32 - (shift_amount)), (Dd), (Dm)) )
2243 #define vsra_n_u64(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm_acc( 0xf3800190 | _NENC_21_16(64 - (shift_amount)), (Dd), (Dm)) )
2244 #define vsra_n_u8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm_acc( 0xf3880110 | _NENC_18_16(8 - (shift_amount)), (Dd), (Dm)) )
2245 #define vrsraq_n_s16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm_acc( 0xf2900350 | _NENC_19_16(16 - (shift_amount)), (Qd), (Qm)) )
2246 #define vrsraq_n_s32(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm_acc( 0xf2a00350 | _NENC_20_16(32 - (shift_amount)), (Qd), (Qm)) )
2247 #define vrsraq_n_s64(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm_acc( 0xf28003d0 | _NENC_21_16(64 - (shift_amount)), (Qd), (Qm)) )
2248 #define vrsraq_n_s8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm_acc( 0xf2880350 | _NENC_18_16(8 - (shift_amount)), (Qd), (Qm)) )
2249 #define vrsraq_n_u16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm_acc( 0xf3900350 | _NENC_19_16(16 - (shift_amount)), (Qd), (Qm)) )
2250 #define vrsraq_n_u32(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm_acc( 0xf3a00350 | _NENC_20_16(32 - (shift_amount)), (Qd), (Qm)) )
2251 #define vrsraq_n_u64(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm_acc( 0xf38003d0 | _NENC_21_16(64 - (shift_amount)), (Qd), (Qm)) )
2252 #define vrsraq_n_u8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm_acc( 0xf3880350 | _NENC_18_16(8 - (shift_amount)), (Qd), (Qm)) )
2253 #define vsraq_n_s16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm_acc( 0xf2900150 | _NENC_19_16(16 - (shift_amount)), (Qd), (Qm)) )
2254 #define vsraq_n_s32(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm_acc( 0xf2a00150 | _NENC_20_16(32 - (shift_amount)), (Qd), (Qm)) )
2255 #define vsraq_n_s64(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm_acc( 0xf28001d0 | _NENC_21_16(64 - (shift_amount)), (Qd), (Qm)) )
2256 #define vsraq_n_s8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm_acc( 0xf2880150 | _NENC_18_16(8 - (shift_amount)), (Qd), (Qm)) )
2257 #define vsraq_n_u16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm_acc( 0xf3900150 | _NENC_19_16(16 - (shift_amount)), (Qd), (Qm)) )
2258 #define vsraq_n_u32(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm_acc( 0xf3a00150 | _NENC_20_16(32 - (shift_amount)), (Qd), (Qm)) )
2259 #define vsraq_n_u64(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm_acc( 0xf38001d0 | _NENC_21_16(64 - (shift_amount)), (Qd), (Qm)) )
2260 #define vsraq_n_u8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm_acc( 0xf3880150 | _NENC_18_16(8 - (shift_amount)), (Qd), (Qm)) )
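// Usage sketch (illustrative; not part of the original header): VSRA shifts the
// source right and accumulates it into the destination, e.g. adding a scaled-down
// correction term onto a running sum; the VRSRA forms round before adding.
// Typedefs/helper name are assumptions.
__inline int32x4_t example_accumulate_scaled(int32x4_t acc, int32x4_t delta)
{
    return vsraq_n_s32(acc, delta, 4);   // each lane: acc + (delta >> 4)
}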
2261 
2262 // VSRI (immediate)
2263 #define vsri_n_p16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm_acc( 0xf3900410 | _NENC_19_16(16 - (shift_amount)), (Dd), (Dm)) )
2264 #define vsri_n_p8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm_acc( 0xf3880410 | _NENC_18_16(8 - (shift_amount)), (Dd), (Dm)) )
2265 #define vsri_n_s16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm_acc( 0xf3900410 | _NENC_19_16(16 - (shift_amount)), (Dd), (Dm)) )
2266 #define vsri_n_s32(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm_acc( 0xf3a00410 | _NENC_20_16(32 - (shift_amount)), (Dd), (Dm)) )
2267 #define vsri_n_s64(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm_acc( 0xf3800490 | _NENC_21_16(64 - (shift_amount)), (Dd), (Dm)) )
2268 #define vsri_n_s8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm_acc( 0xf3880410 | _NENC_18_16(8 - (shift_amount)), (Dd), (Dm)) )
2269 #define vsri_n_u16(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_DdDm_acc( 0xf3900410 | _NENC_19_16(16 - (shift_amount)), (Dd), (Dm)) )
2270 #define vsri_n_u32(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_DdDm_acc( 0xf3a00410 | _NENC_20_16(32 - (shift_amount)), (Dd), (Dm)) )
2271 #define vsri_n_u64(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_DdDm_acc( 0xf3800490 | _NENC_21_16(64 - (shift_amount)), (Dd), (Dm)) )
2272 #define vsri_n_u8(Dd, Dm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_DdDm_acc( 0xf3880410 | _NENC_18_16(8 - (shift_amount)), (Dd), (Dm)) )
2273 #define vsriq_n_p16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm_acc( 0xf3900450 | _NENC_19_16(16 - (shift_amount)), (Qd), (Qm)) )
2274 #define vsriq_n_p8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm_acc( 0xf3880450 | _NENC_18_16(8 - (shift_amount)), (Qd), (Qm)) )
2275 #define vsriq_n_s16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm_acc( 0xf3900450 | _NENC_19_16(16 - (shift_amount)), (Qd), (Qm)) )
2276 #define vsriq_n_s32(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm_acc( 0xf3a00450 | _NENC_20_16(32 - (shift_amount)), (Qd), (Qm)) )
2277 #define vsriq_n_s64(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm_acc( 0xf38004d0 | _NENC_21_16(64 - (shift_amount)), (Qd), (Qm)) )
2278 #define vsriq_n_s8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm_acc( 0xf3880450 | _NENC_18_16(8 - (shift_amount)), (Qd), (Qm)) )
2279 #define vsriq_n_u16(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 16, "invalid shift amount"), __neon_QdQm_acc( 0xf3900450 | _NENC_19_16(16 - (shift_amount)), (Qd), (Qm)) )
2280 #define vsriq_n_u32(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 32, "invalid shift amount"), __neon_QdQm_acc( 0xf3a00450 | _NENC_20_16(32 - (shift_amount)), (Qd), (Qm)) )
2281 #define vsriq_n_u64(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 64, "invalid shift amount"), __neon_QdQm_acc( 0xf38004d0 | _NENC_21_16(64 - (shift_amount)), (Qd), (Qm)) )
2282 #define vsriq_n_u8(Qd, Qm, shift_amount) ( __static_assert((shift_amount) >= 1 && (shift_amount) <= 8, "invalid shift amount"), __neon_QdQm_acc( 0xf3880450 | _NENC_18_16(8 - (shift_amount)), (Qd), (Qm)) )
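// Usage sketch (illustrative; not part of the original header): VSRI is the
// right-shift counterpart of VSLI: the source is shifted right and inserted while
// the destination keeps its top shift_amount bits, so a VSHL/VSRI pair gives a
// per-lane rotation. Typedefs/helper name are assumptions.
__inline uint32x4_t example_rotate_right_8(uint32x4_t v)
{
    uint32x4_t wrapped = vshlq_n_u32(v, 24);   // low 8 bits moved to the top
    return vsriq_n_u32(wrapped, v, 8);         // insert v >> 8 below them
}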
2283 
2284 // VST1 (multiple single elements)
2285 #define vst1_f32(pD, D) ( __neon_AdrD1( 0xf400078f, __float32ToN64(pD), (D)) )
2286 #define vst1_p16(pD, D) ( __neon_AdrD1( 0xf400074f, __poly16ToN64(pD), (D)) )
2287 #define vst1_p8(pD, D) ( __neon_AdrD1( 0xf400070f, __poly8ToN64(pD), (D)) )
2288 #define vst1_s16(pD, D) ( __neon_AdrD1( 0xf400074f, __int16ToN64(pD), (D)) )
2289 #define vst1_s32(pD, D) ( __neon_AdrD1( 0xf400078f, __int32ToN64(pD), (D)) )
2290 #define vst1_s64(pD, D) ( __neon_AdrD1( 0xf40007cf, __int64ToN64(pD), (D)) )
2291 #define vst1_s8(pD, D) ( __neon_AdrD1( 0xf400070f, __int8ToN64(pD), (D)) )
2292 #define vst1_u16(pD, D) ( __neon_AdrD1( 0xf400074f, __uint16ToN64(pD), (D)) )
2293 #define vst1_u32(pD, D) ( __neon_AdrD1( 0xf400078f, __uint32ToN64(pD), (D)) )
2294 #define vst1_u64(pD, D) ( __neon_AdrD1( 0xf40007cf, __uint64ToN64(pD), (D)) )
2295 #define vst1_u8(pD, D) ( __neon_AdrD1( 0xf400070f, __uint8ToN64(pD), (D)) )
2296 #define vst1_f32_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf400078f | _NENC_5_4(_NEON_ALIGN64(align)), __float32ToN64(pD), (D)) )
2297 #define vst1_p16_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf400074f | _NENC_5_4(_NEON_ALIGN64(align)), __poly16ToN64(pD), (D)) )
2298 #define vst1_p8_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf400070f | _NENC_5_4(_NEON_ALIGN64(align)), __poly8ToN64(pD), (D)) )
2299 #define vst1_s16_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf400074f | _NENC_5_4(_NEON_ALIGN64(align)), __int16ToN64(pD), (D)) )
2300 #define vst1_s32_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf400078f | _NENC_5_4(_NEON_ALIGN64(align)), __int32ToN64(pD), (D)) )
2301 #define vst1_s64_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf40007cf | _NENC_5_4(_NEON_ALIGN64(align)), __int64ToN64(pD), (D)) )
2302 #define vst1_s8_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf400070f | _NENC_5_4(_NEON_ALIGN64(align)), __int8ToN64(pD), (D)) )
2303 #define vst1_u16_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf400074f | _NENC_5_4(_NEON_ALIGN64(align)), __uint16ToN64(pD), (D)) )
2304 #define vst1_u32_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf400078f | _NENC_5_4(_NEON_ALIGN64(align)), __uint32ToN64(pD), (D)) )
2305 #define vst1_u64_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf40007cf | _NENC_5_4(_NEON_ALIGN64(align)), __uint64ToN64(pD), (D)) )
2306 #define vst1_u8_ex(pD, D, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrD1( 0xf400070f | _NENC_5_4(_NEON_ALIGN64(align)), __uint8ToN64(pD), (D)) )
2307 #define vst1q_f32(pD, Q) ( __neon_AdrQ1( 0xf4000a8f, __float32ToN64(pD), (Q)) )
2308 #define vst1q_p16(pD, Q) ( __neon_AdrQ1( 0xf4000a4f, __poly16ToN64(pD), (Q)) )
2309 #define vst1q_p8(pD, Q) ( __neon_AdrQ1( 0xf4000a0f, __poly8ToN64(pD), (Q)) )
2310 #define vst1q_s16(pD, Q) ( __neon_AdrQ1( 0xf4000a4f, __int16ToN64(pD), (Q)) )
2311 #define vst1q_s32(pD, Q) ( __neon_AdrQ1( 0xf4000a8f, __int32ToN64(pD), (Q)) )
2312 #define vst1q_s64(pD, Q) ( __neon_AdrQ1( 0xf4000acf, __int64ToN64(pD), (Q)) )
2313 #define vst1q_s8(pD, Q) ( __neon_AdrQ1( 0xf4000a0f, __int8ToN64(pD), (Q)) )
2314 #define vst1q_u16(pD, Q) ( __neon_AdrQ1( 0xf4000a4f, __uint16ToN64(pD), (Q)) )
2315 #define vst1q_u32(pD, Q) ( __neon_AdrQ1( 0xf4000a8f, __uint32ToN64(pD), (Q)) )
2316 #define vst1q_u64(pD, Q) ( __neon_AdrQ1( 0xf4000acf, __uint64ToN64(pD), (Q)) )
2317 #define vst1q_u8(pD, Q) ( __neon_AdrQ1( 0xf4000a0f, __uint8ToN64(pD), (Q)) )
2318 #define vst1q_f32_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000a8f | _NENC_5_4(_NEON_ALIGN64_128(align)), __float32ToN64(pD), (Q)) )
2319 #define vst1q_p16_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000a4f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly16ToN64(pD), (Q)) )
2320 #define vst1q_p8_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000a0f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly8ToN64(pD), (Q)) )
2321 #define vst1q_s16_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000a4f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int16ToN64(pD), (Q)) )
2322 #define vst1q_s32_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000a8f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int32ToN64(pD), (Q)) )
2323 #define vst1q_s64_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __int64ToN64(pD), (Q)) )
2324 #define vst1q_s8_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000a0f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int8ToN64(pD), (Q)) )
2325 #define vst1q_u16_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000a4f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint16ToN64(pD), (Q)) )
2326 #define vst1q_u32_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000a8f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint32ToN64(pD), (Q)) )
2327 #define vst1q_u64_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint64ToN64(pD), (Q)) )
2328 #define vst1q_u8_ex(pD, Q, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf4000a0f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint8ToN64(pD), (Q)) )
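// Usage sketch (illustrative; not part of the original header): vst1/vst1q store a
// whole D or Q register to memory; the _ex variants add an alignment hint that is
// encoded into the instruction and validated by __static_assert. vld1q_f32 and
// float32x4_t are assumed to come from elsewhere in this header; the helper and
// pointer names are hypothetical.
__inline void example_copy_4_floats(float32_t *dst, const float32_t *src)
{
    float32x4_t v = vld1q_f32(src);   // load four consecutive floats
    vst1q_f32(dst, v);                // store them back out
}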
2329 
2330 // VST1 (single element from one lane)
2331 #define vst1_lane_f32(pD, D, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrD1( 0xf480080f | _NENC_7(lane), __float32ToN64(pD), (D)) )
2332 #define vst1_lane_p16(pD, D, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrD1( 0xf480040f | _NENC_7_6(lane), __poly16ToN64(pD), (D)) )
2333 #define vst1_lane_p8(pD, D, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrD1( 0xf480000f | _NENC_7_5(lane), __poly8ToN64(pD), (D)) )
2334 #define vst1_lane_s16(pD, D, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrD1( 0xf480040f | _NENC_7_6(lane), __int16ToN64(pD), (D)) )
2335 #define vst1_lane_s32(pD, D, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrD1( 0xf480080f | _NENC_7(lane), __int32ToN64(pD), (D)) )
2336 #define vst1_lane_s8(pD, D, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrD1( 0xf480000f | _NENC_7_5(lane), __int8ToN64(pD), (D)) )
2337 #define vst1_lane_u16(pD, D, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrD1( 0xf480040f | _NENC_7_6(lane), __uint16ToN64(pD), (D)) )
2338 #define vst1_lane_u32(pD, D, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrD1( 0xf480080f | _NENC_7(lane), __uint32ToN64(pD), (D)) )
2339 #define vst1_lane_u8(pD, D, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrD1( 0xf480000f | _NENC_7_5(lane), __uint8ToN64(pD), (D)) )
2340 #define vst1q_lane_f32(pD, Q, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQ1( 0xf480080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __float32ToN64(pD), (Q)) )
2341 #define vst1q_lane_p16(pD, Q, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQ1( 0xf480040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __poly16ToN64(pD), (Q)) )
2342 #define vst1q_lane_p8(pD, Q, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), __neon_AdrQ1( 0xf480000f | _NENC_7_5((lane) % 8) | _NENC_12((lane) >= 8 ? 1 : 0), __poly8ToN64(pD), (Q)) )
2343 #define vst1q_lane_s16(pD, Q, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQ1( 0xf480040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __int16ToN64(pD), (Q)) )
2344 #define vst1q_lane_s32(pD, Q, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQ1( 0xf480080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __int32ToN64(pD), (Q)) )
2345 #define vst1q_lane_s8(pD, Q, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), __neon_AdrQ1( 0xf480000f | _NENC_7_5((lane) % 8) | _NENC_12((lane) >= 8 ? 1 : 0), __int8ToN64(pD), (Q)) )
2346 #define vst1q_lane_u16(pD, Q, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQ1( 0xf480040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __uint16ToN64(pD), (Q)) )
2347 #define vst1q_lane_u32(pD, Q, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQ1( 0xf480080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __uint32ToN64(pD), (Q)) )
2348 #define vst1q_lane_u8(pD, Q, lane) ( __static_assert((lane) >= 0 && (lane) < 16, "invalid lane index"), __neon_AdrQ1( 0xf480000f | _NENC_7_5((lane) % 8) | _NENC_12((lane) >= 8 ? 1 : 0), __uint8ToN64(pD), (Q)) )
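// Usage sketch (illustrative; not part of the original header): the lane forms
// store a single element of a D or Q register, with the lane index checked at
// compile time. Typedefs, the helper name and the pointer name are assumptions.
__inline void example_store_third_lane(float32_t *dst, float32x4_t v)
{
    vst1q_lane_f32(dst, v, 2);   // write only lane 2 (the third float) to *dst
}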
2349 
2350 // VST1 (single element from one lane, aligned)
2351 #define vst1_lane_f32_ex(pD, D, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrD1( 0xf480080f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), __float32ToN64(pD), (D)) )
2352 #define vst1_lane_p16_ex(pD, D, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_AdrD1( 0xf480040f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN16(align)), __poly16ToN64(pD), (D)) )
2353 #define vst1_lane_s16_ex(pD, D, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_AdrD1( 0xf480040f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN16(align)), __int16ToN64(pD), (D)) )
2354 #define vst1_lane_s32_ex(pD, D, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrD1( 0xf480080f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), __int32ToN64(pD), (D)) )
2355 #define vst1_lane_u16_ex(pD, D, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_AdrD1( 0xf480040f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN16(align)), __uint16ToN64(pD), (D)) )
2356 #define vst1_lane_u32_ex(pD, D, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrD1( 0xf480080f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), __uint32ToN64(pD), (D)) )
2357 #define vst1q_lane_f32_ex(pD, Q, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf480080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), __float32ToN64(pD), (Q)) )
2358 #define vst1q_lane_p16_ex(pD, Q, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf480040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN16(align)), __poly16ToN64(pD), (Q)) )
2359 #define vst1q_lane_s16_ex(pD, Q, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf480040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN16(align)), __int16ToN64(pD), (Q)) )
2360 #define vst1q_lane_s32_ex(pD, Q, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf480080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), __int32ToN64(pD), (Q)) )
2361 #define vst1q_lane_u16_ex(pD, Q, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf480040f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN16(align)), __uint16ToN64(pD), (Q)) )
2362 #define vst1q_lane_u32_ex(pD, Q, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrQ1( 0xf480080f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN32(align) > 0 ? 3 : 0), __uint32ToN64(pD), (Q)) )
2363 
2364 // VST2 (multiple 2-element structures)
2365 #define vst2_f32(pD, D2) ( __neon_AdrDx2( 0xf400088f, __float32ToN64(pD), (D2)) )
2366 #define vst2_p16(pD, D2) ( __neon_AdrDx2( 0xf400084f, __poly16ToN64(pD), (D2)) )
2367 #define vst2_p8(pD, D2) ( __neon_AdrDx2( 0xf400080f, __poly8ToN64(pD), (D2)) )
2368 #define vst2_s16(pD, D2) ( __neon_AdrDx2( 0xf400084f, __int16ToN64(pD), (D2)) )
2369 #define vst2_s32(pD, D2) ( __neon_AdrDx2( 0xf400088f, __int32ToN64(pD), (D2)) )
2370 #define vst2_s8(pD, D2) ( __neon_AdrDx2( 0xf400080f, __int8ToN64(pD), (D2)) )
2371 #define vst2_u16(pD, D2) ( __neon_AdrDx2( 0xf400084f, __uint16ToN64(pD), (D2)) )
2372 #define vst2_u32(pD, D2) ( __neon_AdrDx2( 0xf400088f, __uint32ToN64(pD), (D2)) )
2373 #define vst2_u8(pD, D2) ( __neon_AdrDx2( 0xf400080f, __uint8ToN64(pD), (D2)) )
2374 #define vst2_s64(pD, D2) ( __neon_AdrDx2( 0xf4000acf, __int64ToN64(pD), (D2)) )
2375 #define vst2_u64(pD, D2) ( __neon_AdrDx2( 0xf4000acf, __uint64ToN64(pD), (D2)) )
2376 #define vst2_s64_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf4000acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __int64ToN64(pD), (D2)) )
2377 #define vst2_u64_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf4000acf | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint64ToN64(pD), (D2)) )
2378 #define vst2_f32_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf400088f | _NENC_5_4(_NEON_ALIGN64_128(align)), __float32ToN64(pD), (D2)) )
2379 #define vst2_p16_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf400084f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly16ToN64(pD), (D2)) )
2380 #define vst2_p8_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf400080f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly8ToN64(pD), (D2)) )
2381 #define vst2_s16_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf400084f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int16ToN64(pD), (D2)) )
2382 #define vst2_s32_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf400088f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int32ToN64(pD), (D2)) )
2383 #define vst2_s8_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf400080f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int8ToN64(pD), (D2)) )
2384 #define vst2_u16_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf400084f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint16ToN64(pD), (D2)) )
2385 #define vst2_u32_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf400088f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint32ToN64(pD), (D2)) )
2386 #define vst2_u8_ex(pD, D2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx2( 0xf400080f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint8ToN64(pD), (D2)) )
2387 #define vst2q_f32(pD, Q2) ( __neon_AdrQx2( 0xf400098f, __float32ToN64(pD), (Q2)) )
2388 #define vst2q_p16(pD, Q2) ( __neon_AdrQx2( 0xf400094f, __poly16ToN64(pD), (Q2)) )
2389 #define vst2q_p8(pD, Q2) ( __neon_AdrQx2( 0xf400090f, __poly8ToN64(pD), (Q2)) )
2390 #define vst2q_s16(pD, Q2) ( __neon_AdrQx2( 0xf400094f, __int16ToN64(pD), (Q2)) )
2391 #define vst2q_s32(pD, Q2) ( __neon_AdrQx2( 0xf400098f, __int32ToN64(pD), (Q2)) )
2392 #define vst2q_s8(pD, Q2) ( __neon_AdrQx2( 0xf400090f, __int8ToN64(pD), (Q2)) )
2393 #define vst2q_u16(pD, Q2) ( __neon_AdrQx2( 0xf400094f, __uint16ToN64(pD), (Q2)) )
2394 #define vst2q_u32(pD, Q2) ( __neon_AdrQx2( 0xf400098f, __uint32ToN64(pD), (Q2)) )
2395 #define vst2q_u8(pD, Q2) ( __neon_AdrQx2( 0xf400090f, __uint8ToN64(pD), (Q2)) )
2396 #define vst2q_f32_ex(pD, Q2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx2( 0xf400098f | _NENC_5_4(_NEON_ALIGN64_128(align)), __float32ToN64(pD), (Q2)) )
2397 #define vst2q_p16_ex(pD, Q2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx2( 0xf400094f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly16ToN64(pD), (Q2)) )
2398 #define vst2q_p8_ex(pD, Q2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx2( 0xf400090f | _NENC_5_4(_NEON_ALIGN64_128(align)), __poly8ToN64(pD), (Q2)) )
2399 #define vst2q_s16_ex(pD, Q2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx2( 0xf400094f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int16ToN64(pD), (Q2)) )
2400 #define vst2q_s32_ex(pD, Q2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx2( 0xf400098f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int32ToN64(pD), (Q2)) )
2401 #define vst2q_s8_ex(pD, Q2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx2( 0xf400090f | _NENC_5_4(_NEON_ALIGN64_128(align)), __int8ToN64(pD), (Q2)) )
2402 #define vst2q_u16_ex(pD, Q2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx2( 0xf400094f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint16ToN64(pD), (Q2)) )
2403 #define vst2q_u32_ex(pD, Q2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx2( 0xf400098f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint32ToN64(pD), (Q2)) )
2404 #define vst2q_u8_ex(pD, Q2, align) ( __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx2( 0xf400090f | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint8ToN64(pD), (Q2)) )
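// Usage sketch: vst2q_f32 writes two float32x4_t vectors back out interleaved,
// e.g. turning separate x/y arrays into (x,y) pairs. Assumes the standard
// vdupq_n_f32 initializer; all names below are illustrative.
//
//     void store_xy_pairs(float xy[8])
//     {
//         float32x4x2_t v;
//         v.val[0] = vdupq_n_f32(1.0f);   // x components
//         v.val[1] = vdupq_n_f32(2.0f);   // y components
//         vst2q_f32(xy, v);               // xy = { 1,2, 1,2, 1,2, 1,2 }
//     }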
2405 
2406 // VST2 (single 2-element structure from one lane)
2407 #define vst2_lane_f32(pD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrDx2x( 0xf480090f | _NENC_7(lane), __float32ToN64(pD), (D2)) )
2408 #define vst2_lane_p16(pD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrDx2x( 0xf480050f | _NENC_7_6(lane), __poly16ToN64(pD), (D2)) )
2409 #define vst2_lane_p8(pD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrDx2x( 0xf480010f | _NENC_7_5(lane), __poly8ToN64(pD), (D2)) )
2410 #define vst2_lane_s16(pD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrDx2x( 0xf480050f | _NENC_7_6(lane), __int16ToN64(pD), (D2)) )
2411 #define vst2_lane_s32(pD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrDx2x( 0xf480090f | _NENC_7(lane), __int32ToN64(pD), (D2)) )
2412 #define vst2_lane_s8(pD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrDx2x( 0xf480010f | _NENC_7_5(lane), __int8ToN64(pD), (D2)) )
2413 #define vst2_lane_u16(pD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrDx2x( 0xf480050f | _NENC_7_6(lane), __uint16ToN64(pD), (D2)) )
2414 #define vst2_lane_u32(pD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrDx2x( 0xf480090f | _NENC_7(lane), __uint32ToN64(pD), (D2)) )
2415 #define vst2_lane_u8(pD, D2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrDx2x( 0xf480010f | _NENC_7_5(lane), __uint8ToN64(pD), (D2)) )
2416 #define vst2q_lane_f32(pD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQx2x( 0xf480094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __float32ToN64(pD), (Q2)) )
2417 #define vst2q_lane_p16(pD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQx2x( 0xf480052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __poly16ToN64(pD), (Q2)) )
2418 #define vst2q_lane_s16(pD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQx2x( 0xf480052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __int16ToN64(pD), (Q2)) )
2419 #define vst2q_lane_s32(pD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQx2x( 0xf480094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __int32ToN64(pD), (Q2)) )
2420 #define vst2q_lane_u16(pD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQx2x( 0xf480052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __uint16ToN64(pD), (Q2)) )
2421 #define vst2q_lane_u32(pD, Q2, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQx2x( 0xf480094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __uint32ToN64(pD), (Q2)) )
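// Usage sketch: vst2_lane_f32 stores only the selected lane of each of the two
// D registers, here lane 1, as one consecutive (x,y) pair. Assumes vdup_n_f32;
// names are illustrative.
//
//     void store_second_pair(float xy[2])
//     {
//         float32x2x2_t v;
//         v.val[0] = vdup_n_f32(1.0f);    // x components
//         v.val[1] = vdup_n_f32(2.0f);    // y components
//         vst2_lane_f32(xy, v, 1);        // xy[0] = lane 1 of val[0], xy[1] = lane 1 of val[1]
//     }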
2422 
2423 // VST2 (single 2-element structure from one lane, aligned)
2424 #define vst2_lane_f32_ex(pD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx2x( 0xf480090f | _NENC_7(lane) | _NENC_4(_NEON_ALIGN64(align)), __float32ToN64(pD), (D2)) )
2425 #define vst2_lane_p16_ex(pD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrDx2x( 0xf480050f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN32(align)), __poly16ToN64(pD), (D2)) )
2426 #define vst2_lane_p8_ex(pD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_AdrDx2x( 0xf480010f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN16(align)), __poly8ToN64(pD), (D2)) )
2427 #define vst2_lane_s16_ex(pD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrDx2x( 0xf480050f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN32(align)), __int16ToN64(pD), (D2)) )
2428 #define vst2_lane_s32_ex(pD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx2x( 0xf480090f | _NENC_7(lane) | _NENC_4(_NEON_ALIGN64(align)), __int32ToN64(pD), (D2)) )
2429 #define vst2_lane_s8_ex(pD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_AdrDx2x( 0xf480010f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN16(align)), __int8ToN64(pD), (D2)) )
2430 #define vst2_lane_u16_ex(pD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrDx2x( 0xf480050f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN32(align)), __uint16ToN64(pD), (D2)) )
2431 #define vst2_lane_u32_ex(pD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx2x( 0xf480090f | _NENC_7(lane) | _NENC_4(_NEON_ALIGN64(align)), __uint32ToN64(pD), (D2)) )
2432 #define vst2_lane_u8_ex(pD, D2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN16(align) >= 0, "invalid align"), __neon_AdrDx2x( 0xf480010f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN16(align)), __uint8ToN64(pD), (D2)) )
2433 #define vst2q_lane_f32_ex(pD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx2x( 0xf480094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), __float32ToN64(pD), (Q2)) )
2434 #define vst2q_lane_p16_ex(pD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrQx2x( 0xf480052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN32(align)), __poly16ToN64(pD), (Q2)) )
2435 #define vst2q_lane_s16_ex(pD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrQx2x( 0xf480052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN32(align)), __int16ToN64(pD), (Q2)) )
2436 #define vst2q_lane_s32_ex(pD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx2x( 0xf480094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), __int32ToN64(pD), (Q2)) )
2437 #define vst2q_lane_u16_ex(pD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrQx2x( 0xf480052f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN32(align)), __uint16ToN64(pD), (Q2)) )
2438 #define vst2q_lane_u32_ex(pD, Q2, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx2x( 0xf480094f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), __uint32ToN64(pD), (Q2)) )
2439 
2440 // VST3 (multiple 3-element structures)
2441 #define vst3_f32(pD, D3) ( __neon_AdrDx3( 0xf400048f, __float32ToN64(pD), (D3)) )
2442 #define vst3_p16(pD, D3) ( __neon_AdrDx3( 0xf400044f, __poly16ToN64(pD), (D3)) )
2443 #define vst3_p8(pD, D3) ( __neon_AdrDx3( 0xf400040f, __poly8ToN64(pD), (D3)) )
2444 #define vst3_s16(pD, D3) ( __neon_AdrDx3( 0xf400044f, __int16ToN64(pD), (D3)) )
2445 #define vst3_s32(pD, D3) ( __neon_AdrDx3( 0xf400048f, __int32ToN64(pD), (D3)) )
2446 #define vst3_s8(pD, D3) ( __neon_AdrDx3( 0xf400040f, __int8ToN64(pD), (D3)) )
2447 #define vst3_u16(pD, D3) ( __neon_AdrDx3( 0xf400044f, __uint16ToN64(pD), (D3)) )
2448 #define vst3_u32(pD, D3) ( __neon_AdrDx3( 0xf400048f, __uint32ToN64(pD), (D3)) )
2449 #define vst3_u8(pD, D3) ( __neon_AdrDx3( 0xf400040f, __uint8ToN64(pD), (D3)) )
2450 #define vst3_s64(pD, D3) ( __neon_AdrDx3( 0xf40006cf, __int64ToN64(pD), (D3)) )
2451 #define vst3_u64(pD, D3) ( __neon_AdrDx3( 0xf40006cf, __uint64ToN64(pD), (D3)) )
2452 #define vst3_s64_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf40006cf | _NENC_4(_NEON_ALIGN64(align)), __int64ToN64(pD), (D3)) )
2453 #define vst3_u64_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf40006cf | _NENC_4(_NEON_ALIGN64(align)), __uint64ToN64(pD), (D3)) )
2454 #define vst3_f32_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf400048f | _NENC_4(_NEON_ALIGN64(align)), __float32ToN64(pD), (D3)) )
2455 #define vst3_p16_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf400044f | _NENC_4(_NEON_ALIGN64(align)), __poly16ToN64(pD), (D3)) )
2456 #define vst3_p8_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf400040f | _NENC_4(_NEON_ALIGN64(align)), __poly8ToN64(pD), (D3)) )
2457 #define vst3_s16_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf400044f | _NENC_4(_NEON_ALIGN64(align)), __int16ToN64(pD), (D3)) )
2458 #define vst3_s32_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf400048f | _NENC_4(_NEON_ALIGN64(align)), __int32ToN64(pD), (D3)) )
2459 #define vst3_s8_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf400040f | _NENC_4(_NEON_ALIGN64(align)), __int8ToN64(pD), (D3)) )
2460 #define vst3_u16_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf400044f | _NENC_4(_NEON_ALIGN64(align)), __uint16ToN64(pD), (D3)) )
2461 #define vst3_u32_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf400048f | _NENC_4(_NEON_ALIGN64(align)), __uint32ToN64(pD), (D3)) )
2462 #define vst3_u8_ex(pD, D3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx3( 0xf400040f | _NENC_4(_NEON_ALIGN64(align)), __uint8ToN64(pD), (D3)) )
2463 #define vst3q_f32(pD, Q3) ( __neon_AdrQx3( 0xf400058f, __float32ToN64(pD), (Q3)) )
2464 #define vst3q_p16(pD, Q3) ( __neon_AdrQx3( 0xf400054f, __poly16ToN64(pD), (Q3)) )
2465 #define vst3q_p8(pD, Q3) ( __neon_AdrQx3( 0xf400050f, __poly8ToN64(pD), (Q3)) )
2466 #define vst3q_s16(pD, Q3) ( __neon_AdrQx3( 0xf400054f, __int16ToN64(pD), (Q3)) )
2467 #define vst3q_s32(pD, Q3) ( __neon_AdrQx3( 0xf400058f, __int32ToN64(pD), (Q3)) )
2468 #define vst3q_s8(pD, Q3) ( __neon_AdrQx3( 0xf400050f, __int8ToN64(pD), (Q3)) )
2469 #define vst3q_u16(pD, Q3) ( __neon_AdrQx3( 0xf400054f, __uint16ToN64(pD), (Q3)) )
2470 #define vst3q_u32(pD, Q3) ( __neon_AdrQx3( 0xf400058f, __uint32ToN64(pD), (Q3)) )
2471 #define vst3q_u8(pD, Q3) ( __neon_AdrQx3( 0xf400050f, __uint8ToN64(pD), (Q3)) )
2472 #define vst3q_f32_ex(pD, Q3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx3( 0xf400058f | _NENC_4(_NEON_ALIGN64(align)), __float32ToN64(pD), (Q3)) )
2473 #define vst3q_p16_ex(pD, Q3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx3( 0xf400054f | _NENC_4(_NEON_ALIGN64(align)), __poly16ToN64(pD), (Q3)) )
2474 #define vst3q_p8_ex(pD, Q3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx3( 0xf400050f | _NENC_4(_NEON_ALIGN64(align)), __poly8ToN64(pD), (Q3)) )
2475 #define vst3q_s16_ex(pD, Q3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx3( 0xf400054f | _NENC_4(_NEON_ALIGN64(align)), __int16ToN64(pD), (Q3)) )
2476 #define vst3q_s32_ex(pD, Q3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx3( 0xf400058f | _NENC_4(_NEON_ALIGN64(align)), __int32ToN64(pD), (Q3)) )
2477 #define vst3q_s8_ex(pD, Q3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx3( 0xf400050f | _NENC_4(_NEON_ALIGN64(align)), __int8ToN64(pD), (Q3)) )
2478 #define vst3q_u16_ex(pD, Q3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx3( 0xf400054f | _NENC_4(_NEON_ALIGN64(align)), __uint16ToN64(pD), (Q3)) )
2479 #define vst3q_u32_ex(pD, Q3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx3( 0xf400058f | _NENC_4(_NEON_ALIGN64(align)), __uint32ToN64(pD), (Q3)) )
2480 #define vst3q_u8_ex(pD, Q3, align) ( __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx3( 0xf400050f | _NENC_4(_NEON_ALIGN64(align)), __uint8ToN64(pD), (Q3)) )
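// Usage sketch: vst3_u8 interleaves three planar byte vectors on store, a common
// way to write RGB pixels held in separate R/G/B registers. Assumes vdup_n_u8;
// names are illustrative.
//
//     void store_rgb(uint8_t rgb[24])
//     {
//         uint8x8x3_t pix;
//         pix.val[0] = vdup_n_u8(0xff);   // R
//         pix.val[1] = vdup_n_u8(0x80);   // G
//         pix.val[2] = vdup_n_u8(0x00);   // B
//         vst3_u8(rgb, pix);              // rgb = { R,G,B, R,G,B, ... }
//     }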
2481 
2482 // VST3 (single 3-element structure from one lane)
2483 #define vst3_lane_f32(pD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrDx3x( 0xf4800a0f | _NENC_7(lane), __float32ToN64(pD), (D3)) )
2484 #define vst3_lane_p16(pD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrDx3x( 0xf480060f | _NENC_7_6(lane), __poly16ToN64(pD), (D3)) )
2485 #define vst3_lane_p8(pD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrDx3x( 0xf480020f | _NENC_7_5(lane), __poly8ToN64(pD), (D3)) )
2486 #define vst3_lane_s16(pD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrDx3x( 0xf480060f | _NENC_7_6(lane), __int16ToN64(pD), (D3)) )
2487 #define vst3_lane_s32(pD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrDx3x( 0xf4800a0f | _NENC_7(lane), __int32ToN64(pD), (D3)) )
2488 #define vst3_lane_s8(pD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrDx3x( 0xf480020f | _NENC_7_5(lane), __int8ToN64(pD), (D3)) )
2489 #define vst3_lane_u16(pD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrDx3x( 0xf480060f | _NENC_7_6(lane), __uint16ToN64(pD), (D3)) )
2490 #define vst3_lane_u32(pD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrDx3x( 0xf4800a0f | _NENC_7(lane), __uint32ToN64(pD), (D3)) )
2491 #define vst3_lane_u8(pD, D3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrDx3x( 0xf480020f | _NENC_7_5(lane), __uint8ToN64(pD), (D3)) )
2492 #define vst3q_lane_f32(pD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQx3x( 0xf4800a4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __float32ToN64(pD), (Q3)) )
2493 #define vst3q_lane_p16(pD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQx3x( 0xf480062f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __poly16ToN64(pD), (Q3)) )
2494 #define vst3q_lane_s16(pD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQx3x( 0xf480062f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __int16ToN64(pD), (Q3)) )
2495 #define vst3q_lane_s32(pD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQx3x( 0xf4800a4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __int32ToN64(pD), (Q3)) )
2496 #define vst3q_lane_u16(pD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQx3x( 0xf480062f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __uint16ToN64(pD), (Q3)) )
2497 #define vst3q_lane_u32(pD, Q3, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQx3x( 0xf4800a4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __uint32ToN64(pD), (Q3)) )
2498 
2499 // VST4 (multiple 4-element structures)
2500 #define vst4_f32(pD, D4) ( __neon_AdrDx4( 0xf400008f, __float32ToN64(pD), (D4)) )
2501 #define vst4_p16(pD, D4) ( __neon_AdrDx4( 0xf400004f, __poly16ToN64(pD), (D4)) )
2502 #define vst4_p8(pD, D4) ( __neon_AdrDx4( 0xf400000f, __poly8ToN64(pD), (D4)) )
2503 #define vst4_s16(pD, D4) ( __neon_AdrDx4( 0xf400004f, __int16ToN64(pD), (D4)) )
2504 #define vst4_s32(pD, D4) ( __neon_AdrDx4( 0xf400008f, __int32ToN64(pD), (D4)) )
2505 #define vst4_s8(pD, D4) ( __neon_AdrDx4( 0xf400000f, __int8ToN64(pD), (D4)) )
2506 #define vst4_u16(pD, D4) ( __neon_AdrDx4( 0xf400004f, __uint16ToN64(pD), (D4)) )
2507 #define vst4_u32(pD, D4) ( __neon_AdrDx4( 0xf400008f, __uint32ToN64(pD), (D4)) )
2508 #define vst4_u8(pD, D4) ( __neon_AdrDx4( 0xf400000f, __uint8ToN64(pD), (D4)) )
2509 #define vst4_s64(pD, D4) ( __neon_AdrDx4( 0xf40002cf, __int64ToN64(pD), (D4)) )
2510 #define vst4_u64(pD, D4) ( __neon_AdrDx4( 0xf40002cf, __uint64ToN64(pD), (D4)) )
2511 #define vst4_s64_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf40002cf | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int64ToN64(pD), (D4)) )
2512 #define vst4_u64_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf40002cf | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint64ToN64(pD), (D4)) )
2513 #define vst4_f32_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf400008f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __float32ToN64(pD), (D4)) )
2514 #define vst4_p16_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf400004f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __poly16ToN64(pD), (D4)) )
2515 #define vst4_p8_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf400000f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __poly8ToN64(pD), (D4)) )
2516 #define vst4_s16_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf400004f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int16ToN64(pD), (D4)) )
2517 #define vst4_s32_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf400008f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int32ToN64(pD), (D4)) )
2518 #define vst4_s8_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf400000f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int8ToN64(pD), (D4)) )
2519 #define vst4_u16_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf400004f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint16ToN64(pD), (D4)) )
2520 #define vst4_u32_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf400008f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint32ToN64(pD), (D4)) )
2521 #define vst4_u8_ex(pD, D4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrDx4( 0xf400000f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint8ToN64(pD), (D4)) )
2522 #define vst4q_f32(pD, Q4) ( __neon_AdrQx4( 0xf400018f, __float32ToN64(pD), (Q4)) )
2523 #define vst4q_p16(pD, Q4) ( __neon_AdrQx4( 0xf400014f, __poly16ToN64(pD), (Q4)) )
2524 #define vst4q_p8(pD, Q4) ( __neon_AdrQx4( 0xf400010f, __poly8ToN64(pD), (Q4)) )
2525 #define vst4q_s16(pD, Q4) ( __neon_AdrQx4( 0xf400014f, __int16ToN64(pD), (Q4)) )
2526 #define vst4q_s32(pD, Q4) ( __neon_AdrQx4( 0xf400018f, __int32ToN64(pD), (Q4)) )
2527 #define vst4q_s8(pD, Q4) ( __neon_AdrQx4( 0xf400010f, __int8ToN64(pD), (Q4)) )
2528 #define vst4q_u16(pD, Q4) ( __neon_AdrQx4( 0xf400014f, __uint16ToN64(pD), (Q4)) )
2529 #define vst4q_u32(pD, Q4) ( __neon_AdrQx4( 0xf400018f, __uint32ToN64(pD), (Q4)) )
2530 #define vst4q_u8(pD, Q4) ( __neon_AdrQx4( 0xf400010f, __uint8ToN64(pD), (Q4)) )
2531 #define vst4q_f32_ex(pD, Q4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrQx4( 0xf400018f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __float32ToN64(pD), (Q4)) )
2532 #define vst4q_p16_ex(pD, Q4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrQx4( 0xf400014f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __poly16ToN64(pD), (Q4)) )
2533 #define vst4q_p8_ex(pD, Q4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrQx4( 0xf400010f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __poly8ToN64(pD), (Q4)) )
2534 #define vst4q_s16_ex(pD, Q4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrQx4( 0xf400014f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int16ToN64(pD), (Q4)) )
2535 #define vst4q_s32_ex(pD, Q4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrQx4( 0xf400018f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int32ToN64(pD), (Q4)) )
2536 #define vst4q_s8_ex(pD, Q4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrQx4( 0xf400010f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __int8ToN64(pD), (Q4)) )
2537 #define vst4q_u16_ex(pD, Q4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrQx4( 0xf400014f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint16ToN64(pD), (Q4)) )
2538 #define vst4q_u32_ex(pD, Q4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrQx4( 0xf400018f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint32ToN64(pD), (Q4)) )
2539 #define vst4q_u8_ex(pD, Q4, align) ( __static_assert(_NEON_ALIGN64_128_256(align) >= 0, "invalid align"), __neon_AdrQx4( 0xf400010f | _NENC_5_4(_NEON_ALIGN64_128_256(align)), __uint8ToN64(pD), (Q4)) )
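// Usage sketch: the _ex variants add a compile-time alignment hint, expressed in
// bits (64, 128, or 256 here, matching the _NEON_ALIGN64_128_256 check above);
// the pointer must actually satisfy that alignment. Names are illustrative.
//
//     void store_quads(uint8x8x4_t quad)
//     {
//         __declspec(align(8)) static uint8_t dst[32];
//         vst4_u8_ex(dst, quad, 64);      // 64-bit-aligned interleaved store
//     }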
2540 
2541 // VST4 (single 4-element structure from one lane)
2542 #define vst4_lane_f32(pD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrDx4x( 0xf4800b0f | _NENC_7(lane), __float32ToN64(pD), (D4)) )
2543 #define vst4_lane_p16(pD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrDx4x( 0xf480070f | _NENC_7_6(lane), __poly16ToN64(pD), (D4)) )
2544 #define vst4_lane_p8(pD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrDx4x( 0xf480030f | _NENC_7_5(lane), __poly8ToN64(pD), (D4)) )
2545 #define vst4_lane_s16(pD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrDx4x( 0xf480070f | _NENC_7_6(lane), __int16ToN64(pD), (D4)) )
2546 #define vst4_lane_s32(pD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrDx4x( 0xf4800b0f | _NENC_7(lane), __int32ToN64(pD), (D4)) )
2547 #define vst4_lane_s8(pD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrDx4x( 0xf480030f | _NENC_7_5(lane), __int8ToN64(pD), (D4)) )
2548 #define vst4_lane_u16(pD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrDx4x( 0xf480070f | _NENC_7_6(lane), __uint16ToN64(pD), (D4)) )
2549 #define vst4_lane_u32(pD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __neon_AdrDx4x( 0xf4800b0f | _NENC_7(lane), __uint32ToN64(pD), (D4)) )
2550 #define vst4_lane_u8(pD, D4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrDx4x( 0xf480030f | _NENC_7_5(lane), __uint8ToN64(pD), (D4)) )
2551 #define vst4q_lane_f32(pD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQx4x( 0xf4800b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __float32ToN64(pD), (Q4)) )
2552 #define vst4q_lane_p16(pD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQx4x( 0xf480072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __poly16ToN64(pD), (Q4)) )
2553 #define vst4q_lane_s16(pD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQx4x( 0xf480072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __int16ToN64(pD), (Q4)) )
2554 #define vst4q_lane_s32(pD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQx4x( 0xf4800b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __int32ToN64(pD), (Q4)) )
2555 #define vst4q_lane_u16(pD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __neon_AdrQx4x( 0xf480072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0), __uint16ToN64(pD), (Q4)) )
2556 #define vst4q_lane_u32(pD, Q4, lane) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __neon_AdrQx4x( 0xf4800b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0), __uint32ToN64(pD), (Q4)) )
2557 
2558 // VST4 (single 4-element structure from one lane, aligned)
2559 #define vst4_lane_f32_ex(pD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx4x( 0xf4800b0f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN64_128(align)), __float32ToN64(pD), (D4)) )
2560 #define vst4_lane_p16_ex(pD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx4x( 0xf480070f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN64(align)), __poly16ToN64(pD), (D4)) )
2561 #define vst4_lane_p8_ex(pD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrDx4x( 0xf480030f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN32(align)), __poly8ToN64(pD), (D4)) )
2562 #define vst4_lane_s16_ex(pD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx4x( 0xf480070f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN64(align)), __int16ToN64(pD), (D4)) )
2563 #define vst4_lane_s32_ex(pD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx4x( 0xf4800b0f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN64_128(align)), __int32ToN64(pD), (D4)) )
2564 #define vst4_lane_s8_ex(pD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrDx4x( 0xf480030f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN32(align)), __int8ToN64(pD), (D4)) )
2565 #define vst4_lane_u16_ex(pD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrDx4x( 0xf480070f | _NENC_7_6(lane) | _NENC_4(_NEON_ALIGN64(align)), __uint16ToN64(pD), (D4)) )
2566 #define vst4_lane_u32_ex(pD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 2, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrDx4x( 0xf4800b0f | _NENC_7(lane) | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint32ToN64(pD), (D4)) )
2567 #define vst4_lane_u8_ex(pD, D4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN32(align) >= 0, "invalid align"), __neon_AdrDx4x( 0xf480030f | _NENC_7_5(lane) | _NENC_4(_NEON_ALIGN32(align)), __uint8ToN64(pD), (D4)) )
2568 #define vst4q_lane_f32_ex(pD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx4x( 0xf4800b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN64_128(align)), __float32ToN64(pD), (Q4)) )
2569 #define vst4q_lane_p16_ex(pD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx4x( 0xf480072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), __poly16ToN64(pD), (Q4)) )
2570 #define vst4q_lane_s16_ex(pD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx4x( 0xf480072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), __int16ToN64(pD), (Q4)) )
2571 #define vst4q_lane_s32_ex(pD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx4x( 0xf4800b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN64_128(align)), __int32ToN64(pD), (Q4)) )
2572 #define vst4q_lane_u16_ex(pD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 8, "invalid lane index"), __static_assert(_NEON_ALIGN64(align) >= 0, "invalid align"), __neon_AdrQx4x( 0xf480072f | _NENC_7_6((lane) % 4) | _NENC_12((lane) >= 4 ? 1 : 0) | _NENC_4(_NEON_ALIGN64(align)), __uint16ToN64(pD), (Q4)) )
2573 #define vst4q_lane_u32_ex(pD, Q4, lane, align) ( __static_assert((lane) >= 0 && (lane) < 4, "invalid lane index"), __static_assert(_NEON_ALIGN64_128(align) >= 0, "invalid align"), __neon_AdrQx4x( 0xf4800b4f | _NENC_7((lane) % 2) | _NENC_12((lane) >= 2 ? 1 : 0) | _NENC_5_4(_NEON_ALIGN64_128(align)), __uint32ToN64(pD), (Q4)) )
2574 
2575 // VSUB
2576 #define vsub_f32(Dn, Dm) ( __neon_DdDnDm( 0xf2200d00, (Dn), (Dm)) )
2577 #define vsub_s16(Dn, Dm) ( __neon_DdDnDm( 0xf3100800, (Dn), (Dm)) )
2578 #define vsub_s32(Dn, Dm) ( __neon_DdDnDm( 0xf3200800, (Dn), (Dm)) )
2579 #define vsub_s64(Dn, Dm) ( __neon_DdDnDm( 0xf3300800, (Dn), (Dm)) )
2580 #define vsub_s8(Dn, Dm) ( __neon_DdDnDm( 0xf3000800, (Dn), (Dm)) )
2581 #define vsub_u16(Dn, Dm) ( __neon_DdDnDm( 0xf3100800, (Dn), (Dm)) )
2582 #define vsub_u32(Dn, Dm) ( __neon_DdDnDm( 0xf3200800, (Dn), (Dm)) )
2583 #define vsub_u64(Dn, Dm) ( __neon_DdDnDm( 0xf3300800, (Dn), (Dm)) )
2584 #define vsub_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3000800, (Dn), (Dm)) )
2585 #define vsubq_f32(Qn, Qm) ( __neon_QdQnQm( 0xf2200d40, (Qn), (Qm)) )
2586 #define vsubq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf3100840, (Qn), (Qm)) )
2587 #define vsubq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf3200840, (Qn), (Qm)) )
2588 #define vsubq_s64(Qn, Qm) ( __neon_QdQnQm( 0xf3300840, (Qn), (Qm)) )
2589 #define vsubq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf3000840, (Qn), (Qm)) )
2590 #define vsubq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf3100840, (Qn), (Qm)) )
2591 #define vsubq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf3200840, (Qn), (Qm)) )
2592 #define vsubq_u64(Qn, Qm) ( __neon_QdQnQm( 0xf3300840, (Qn), (Qm)) )
2593 #define vsubq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf3000840, (Qn), (Qm)) )
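// Usage sketch: plain lane-wise subtraction; each result lane is Qn - Qm for the
// corresponding input lanes. Names are illustrative.
//
//     float32x4_t diff_example(void)
//     {
//         float32x4_t a = vdupq_n_f32(3.0f);
//         float32x4_t b = vdupq_n_f32(1.0f);
//         return vsubq_f32(a, b);         // every lane holds 2.0f
//     }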
2594 
2595 // VSUBHN, VRSUBHN
2596 #define vrsubhn_s16(Qn, Qm) ( __neon_DdQnQm( 0xf3800600, (Qn), (Qm)) )
2597 #define vrsubhn_s32(Qn, Qm) ( __neon_DdQnQm( 0xf3900600, (Qn), (Qm)) )
2598 #define vrsubhn_s64(Qn, Qm) ( __neon_DdQnQm( 0xf3a00600, (Qn), (Qm)) )
2599 #define vrsubhn_u16(Qn, Qm) ( __neon_DdQnQm( 0xf3800600, (Qn), (Qm)) )
2600 #define vrsubhn_u32(Qn, Qm) ( __neon_DdQnQm( 0xf3900600, (Qn), (Qm)) )
2601 #define vrsubhn_u64(Qn, Qm) ( __neon_DdQnQm( 0xf3a00600, (Qn), (Qm)) )
2602 #define vsubhn_s16(Qn, Qm) ( __neon_DdQnQm( 0xf2800600, (Qn), (Qm)) )
2603 #define vsubhn_s32(Qn, Qm) ( __neon_DdQnQm( 0xf2900600, (Qn), (Qm)) )
2604 #define vsubhn_s64(Qn, Qm) ( __neon_DdQnQm( 0xf2a00600, (Qn), (Qm)) )
2605 #define vsubhn_u16(Qn, Qm) ( __neon_DdQnQm( 0xf2800600, (Qn), (Qm)) )
2606 #define vsubhn_u32(Qn, Qm) ( __neon_DdQnQm( 0xf2900600, (Qn), (Qm)) )
2607 #define vsubhn_u64(Qn, Qm) ( __neon_DdQnQm( 0xf2a00600, (Qn), (Qm)) )
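// Usage sketch: vsubhn_* subtracts at full width and keeps only the high half of
// each difference, halving the element size; vrsubhn_* rounds before narrowing.
// Names are illustrative.
//
//     int16x4_t high_half_diff(void)
//     {
//         int32x4_t x = vdupq_n_s32(0x00030000);
//         int32x4_t y = vdupq_n_s32(0x00010000);
//         return vsubhn_s32(x, y);        // each lane = 0x00020000 >> 16 = 2
//     }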
2608 
2609 // VSUBL, VSUBW
2610 #define vsubl_s16(Dn, Dm) ( __neon_QdDnDm( 0xf2900200, (Dn), (Dm)) )
2611 #define vsubl_s32(Dn, Dm) ( __neon_QdDnDm( 0xf2a00200, (Dn), (Dm)) )
2612 #define vsubl_s8(Dn, Dm) ( __neon_QdDnDm( 0xf2800200, (Dn), (Dm)) )
2613 #define vsubl_u16(Dn, Dm) ( __neon_QdDnDm( 0xf3900200, (Dn), (Dm)) )
2614 #define vsubl_u32(Dn, Dm) ( __neon_QdDnDm( 0xf3a00200, (Dn), (Dm)) )
2615 #define vsubl_u8(Dn, Dm) ( __neon_QdDnDm( 0xf3800200, (Dn), (Dm)) )
2616 #define vsubw_s16(Qn, Dm) ( __neon_QdQnDm( 0xf2900300, (Qn), (Dm)) )
2617 #define vsubw_s32(Qn, Dm) ( __neon_QdQnDm( 0xf2a00300, (Qn), (Dm)) )
2618 #define vsubw_s8(Qn, Dm) ( __neon_QdQnDm( 0xf2800300, (Qn), (Dm)) )
2619 #define vsubw_u16(Qn, Dm) ( __neon_QdQnDm( 0xf3900300, (Qn), (Dm)) )
2620 #define vsubw_u32(Qn, Dm) ( __neon_QdQnDm( 0xf3a00300, (Qn), (Dm)) )
2621 #define vsubw_u8(Qn, Dm) ( __neon_QdQnDm( 0xf3800300, (Qn), (Dm)) )
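// Usage sketch: vsubl_* widens both operands before subtracting, so the result
// cannot wrap at the narrow element size; vsubw_* widens only the second
// operand. Names are illustrative.
//
//     int16x8_t widening_diff(void)
//     {
//         int8x8_t a = vdup_n_s8(100);
//         int8x8_t b = vdup_n_s8(-100);
//         return vsubl_s8(a, b);          // every lane = 200, no 8-bit overflow
//     }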
2622 
2623 // VTBL, VTBX
2624 #define vtbl2_p8(D2, Dm) ( __neon_DdDx2Dm( 0xf3b00900, (D2), (Dm)) )
2625 #define vtbl2_s8(D2, Dm) ( __neon_DdDx2Dm( 0xf3b00900, (D2), (Dm)) )
2626 #define vtbl2_u8(D2, Dm) ( __neon_DdDx2Dm( 0xf3b00900, (D2), (Dm)) )
2627 #define vtbx2_p8(Dd, D2, Dm) ( __neon_DdDx2Dm_acc( 0xf3b00940, (Dd), (D2), (Dm)) )
2628 #define vtbx2_s8(Dd, D2, Dm) ( __neon_DdDx2Dm_acc( 0xf3b00940, (Dd), (D2), (Dm)) )
2629 #define vtbx2_u8(Dd, D2, Dm) ( __neon_DdDx2Dm_acc( 0xf3b00940, (Dd), (D2), (Dm)) )
2630 #define vtbl3_p8(D3, Dm) ( __neon_DdDx3Dm( 0xf3b00a00, (D3), (Dm)) )
2631 #define vtbl3_s8(D3, Dm) ( __neon_DdDx3Dm( 0xf3b00a00, (D3), (Dm)) )
2632 #define vtbl3_u8(D3, Dm) ( __neon_DdDx3Dm( 0xf3b00a00, (D3), (Dm)) )
2633 #define vtbx3_p8(Dd, D3, Dm) ( __neon_DdDx3Dm_acc( 0xf3b00a40, (Dd), (D3), (Dm)) )
2634 #define vtbx3_s8(Dd, D3, Dm) ( __neon_DdDx3Dm_acc( 0xf3b00a40, (Dd), (D3), (Dm)) )
2635 #define vtbx3_u8(Dd, D3, Dm) ( __neon_DdDx3Dm_acc( 0xf3b00a40, (Dd), (D3), (Dm)) )
2636 #define vtbl4_p8(D4, Dm) ( __neon_DdDx4Dm( 0xf3b00b00, (D4), (Dm)) )
2637 #define vtbl4_s8(D4, Dm) ( __neon_DdDx4Dm( 0xf3b00b00, (D4), (Dm)) )
2638 #define vtbl4_u8(D4, Dm) ( __neon_DdDx4Dm( 0xf3b00b00, (D4), (Dm)) )
2639 #define vtbx4_p8(Dd, D4, Dm) ( __neon_DdDx4Dm_acc( 0xf3b00b40, (Dd), (D4), (Dm)) )
2640 #define vtbx4_s8(Dd, D4, Dm) ( __neon_DdDx4Dm_acc( 0xf3b00b40, (Dd), (D4), (Dm)) )
2641 #define vtbx4_u8(Dd, D4, Dm) ( __neon_DdDx4Dm_acc( 0xf3b00b40, (Dd), (D4), (Dm)) )
2642 #define vtbl1_p8(Dn, Dm) ( __neon_DdDnDm( 0xf3b00800, (Dn), (Dm)) )
2643 #define vtbl1_s8(Dn, Dm) ( __neon_DdDnDm( 0xf3b00800, (Dn), (Dm)) )
2644 #define vtbl1_u8(Dn, Dm) ( __neon_DdDnDm( 0xf3b00800, (Dn), (Dm)) )
2645 #define vtbx1_p8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3b00840, (Dd), (Dn), (Dm)) )
2646 #define vtbx1_s8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3b00840, (Dd), (Dn), (Dm)) )
2647 #define vtbx1_u8(Dd, Dn, Dm) ( __neon_DdDnDm_acc( 0xf3b00840, (Dd), (Dn), (Dm)) )
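// Usage sketch: vtbl1_u8 uses each byte of the index vector to pick a byte from
// the 8-byte table; out-of-range indices produce 0, while the vtbx* forms keep
// the destination byte instead. Names are illustrative.
//
//     uint8x8_t lookup_example(void)
//     {
//         uint8_t bytes[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
//         uint8x8_t table = vld1_u8(bytes);
//         uint8x8_t idx   = vdup_n_u8(3);
//         return vtbl1_u8(table, idx);    // every lane = bytes[3] = 13
//     }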
2648 
2649 // VTRN
2650 #define vtrn_f32(Dd, Dm) ( __neon_DdDm_acc2( 0xf3ba0080, (Dd), (Dm)) )
2651 #define vtrn_p16(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b60080, (Dd), (Dm)) )
2652 #define vtrn_p8(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b20080, (Dd), (Dm)) )
2653 #define vtrn_s16(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b60080, (Dd), (Dm)) )
2654 #define vtrn_s32(Dd, Dm) ( __neon_DdDm_acc2( 0xf3ba0080, (Dd), (Dm)) )
2655 #define vtrn_s8(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b20080, (Dd), (Dm)) )
2656 #define vtrn_u16(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b60080, (Dd), (Dm)) )
2657 #define vtrn_u32(Dd, Dm) ( __neon_DdDm_acc2( 0xf3ba0080, (Dd), (Dm)) )
2658 #define vtrn_u8(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b20080, (Dd), (Dm)) )
2659 #define vtrnq_f32(Qd, Qm) ( __neon_QdQm_acc2( 0xf3ba00c0, (Qd), (Qm)) )
2660 #define vtrnq_p16(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b600c0, (Qd), (Qm)) )
2661 #define vtrnq_p8(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b200c0, (Qd), (Qm)) )
2662 #define vtrnq_s16(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b600c0, (Qd), (Qm)) )
2663 #define vtrnq_s32(Qd, Qm) ( __neon_QdQm_acc2( 0xf3ba00c0, (Qd), (Qm)) )
2664 #define vtrnq_s8(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b200c0, (Qd), (Qm)) )
2665 #define vtrnq_u16(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b600c0, (Qd), (Qm)) )
2666 #define vtrnq_u32(Qd, Qm) ( __neon_QdQm_acc2( 0xf3ba00c0, (Qd), (Qm)) )
2667 #define vtrnq_u8(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b200c0, (Qd), (Qm)) )
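// Usage sketch: vtrn_* transposes 2x2 element blocks across the two inputs and
// returns both updated vectors as a pair. Names are illustrative.
//
//     uint16x4x2_t transpose_example(void)
//     {
//         uint16x4_t a = vdup_n_u16(1);   // { 1, 1, 1, 1 }
//         uint16x4_t b = vdup_n_u16(2);   // { 2, 2, 2, 2 }
//         return vtrn_u16(a, b);          // val[0] = { 1,2,1,2 }, val[1] = { 1,2,1,2 }
//     }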
2668 
2669 // VTRNQ64
2670 #define vtrnq_s64(Qd, Qm) ( __neon_QdQm_acc3( 0x00000000, (Qd), (Qm)) )
2671 #define vtrnq_u64(Qd, Qm) ( __neon_QdQm_acc3( 0x00000000, (Qd), (Qm)) )
2672 
2673 // VTST
2674 #define vtst_p8(Dn, Dm) ( __neon_DdDnDm( 0xf2000810, (Dn), (Dm)) )
2675 #define vtst_s16(Dn, Dm) ( __neon_DdDnDm( 0xf2100810, (Dn), (Dm)) )
2676 #define vtst_s32(Dn, Dm) ( __neon_DdDnDm( 0xf2200810, (Dn), (Dm)) )
2677 #define vtst_s8(Dn, Dm) ( __neon_DdDnDm( 0xf2000810, (Dn), (Dm)) )
2678 #define vtst_u16(Dn, Dm) ( __neon_DdDnDm( 0xf2100810, (Dn), (Dm)) )
2679 #define vtst_u32(Dn, Dm) ( __neon_DdDnDm( 0xf2200810, (Dn), (Dm)) )
2680 #define vtst_u8(Dn, Dm) ( __neon_DdDnDm( 0xf2000810, (Dn), (Dm)) )
2681 #define vtstq_p8(Qn, Qm) ( __neon_QdQnQm( 0xf2000850, (Qn), (Qm)) )
2682 #define vtstq_s16(Qn, Qm) ( __neon_QdQnQm( 0xf2100850, (Qn), (Qm)) )
2683 #define vtstq_s32(Qn, Qm) ( __neon_QdQnQm( 0xf2200850, (Qn), (Qm)) )
2684 #define vtstq_s8(Qn, Qm) ( __neon_QdQnQm( 0xf2000850, (Qn), (Qm)) )
2685 #define vtstq_u16(Qn, Qm) ( __neon_QdQnQm( 0xf2100850, (Qn), (Qm)) )
2686 #define vtstq_u32(Qn, Qm) ( __neon_QdQnQm( 0xf2200850, (Qn), (Qm)) )
2687 #define vtstq_u8(Qn, Qm) ( __neon_QdQnQm( 0xf2000850, (Qn), (Qm)) )
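// Usage sketch: vtst_* ANDs corresponding lanes and sets the result lane to all
// ones when the AND is non-zero, otherwise to zero. Names are illustrative.
//
//     uint8x8_t test_bits_example(void)
//     {
//         uint8x8_t v    = vdup_n_u8(0x0f);
//         uint8x8_t mask = vdup_n_u8(0x01);
//         return vtst_u8(v, mask);        // every lane = 0xff
//     }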
2688 
2689 // VUZP
2690 #define vuzp_p16(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b60100, (Dd), (Dm)) )
2691 #define vuzp_p8(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b20100, (Dd), (Dm)) )
2692 #define vuzp_s16(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b60100, (Dd), (Dm)) )
2693 #define vuzp_s8(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b20100, (Dd), (Dm)) )
2694 #define vuzp_u16(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b60100, (Dd), (Dm)) )
2695 #define vuzp_u8(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b20100, (Dd), (Dm)) )
2696 #define vuzp_f32(Dd, Dm) ( __neon_DdDm_acc2( 0xf3ba0080, (Dd), (Dm)) )
2697 #define vuzp_s32(Dd, Dm) ( __neon_DdDm_acc2( 0xf3ba0080, (Dd), (Dm)) )
2698 #define vuzp_u32(Dd, Dm) ( __neon_DdDm_acc2( 0xf3ba0080, (Dd), (Dm)) )
2699 #define vuzpq_f32(Qd, Qm) ( __neon_QdQm_acc2( 0xf3ba0140, (Qd), (Qm)) )
2700 #define vuzpq_p16(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b60140, (Qd), (Qm)) )
2701 #define vuzpq_p8(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b20140, (Qd), (Qm)) )
2702 #define vuzpq_s16(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b60140, (Qd), (Qm)) )
2703 #define vuzpq_s32(Qd, Qm) ( __neon_QdQm_acc2( 0xf3ba0140, (Qd), (Qm)) )
2704 #define vuzpq_s8(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b20140, (Qd), (Qm)) )
2705 #define vuzpq_u16(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b60140, (Qd), (Qm)) )
2706 #define vuzpq_u32(Qd, Qm) ( __neon_QdQm_acc2( 0xf3ba0140, (Qd), (Qm)) )
2707 #define vuzpq_u8(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b20140, (Qd), (Qm)) )
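// Usage sketch: vuzp_* de-interleaves; val[0] collects the even-indexed lanes of
// the two inputs (first Dd, then Dm) and val[1] the odd-indexed lanes, the
// inverse of vzip. Names are illustrative.
//
//     uint8x8x2_t deinterleave_example(uint8x8_t in0, uint8x8_t in1)
//     {
//         return vuzp_u8(in0, in1);       // val[0] = even lanes, val[1] = odd lanes
//     }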
2708 
2709 // VZIP
2710 #define vzip_p16(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b60180, (Dd), (Dm)) )
2711 #define vzip_p8(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b20180, (Dd), (Dm)) )
2712 #define vzip_s16(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b60180, (Dd), (Dm)) )
2713 #define vzip_s8(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b20180, (Dd), (Dm)) )
2714 #define vzip_u16(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b60180, (Dd), (Dm)) )
2715 #define vzip_u8(Dd, Dm) ( __neon_DdDm_acc2( 0xf3b20180, (Dd), (Dm)) )
2716 #define vzip_f32(Dd, Dm) ( __neon_DdDm_acc2( 0xf3ba0080, (Dd), (Dm)) )
2717 #define vzip_s32(Dd, Dm) ( __neon_DdDm_acc2( 0xf3ba0080, (Dd), (Dm)) )
2718 #define vzip_u32(Dd, Dm) ( __neon_DdDm_acc2( 0xf3ba0080, (Dd), (Dm)) )
2719 #define vzipq_f32(Qd, Qm) ( __neon_QdQm_acc2( 0xf3ba01c0, (Qd), (Qm)) )
2720 #define vzipq_p16(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b601c0, (Qd), (Qm)) )
2721 #define vzipq_p8(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b201c0, (Qd), (Qm)) )
2722 #define vzipq_s16(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b601c0, (Qd), (Qm)) )
2723 #define vzipq_s32(Qd, Qm) ( __neon_QdQm_acc2( 0xf3ba01c0, (Qd), (Qm)) )
2724 #define vzipq_s8(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b201c0, (Qd), (Qm)) )
2725 #define vzipq_u16(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b601c0, (Qd), (Qm)) )
2726 #define vzipq_u32(Qd, Qm) ( __neon_QdQm_acc2( 0xf3ba01c0, (Qd), (Qm)) )
2727 #define vzipq_u8(Qd, Qm) ( __neon_QdQm_acc2( 0xf3b201c0, (Qd), (Qm)) )
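// Usage sketch: vzip_* interleaves the lanes of its two inputs, alternating
// between them in the result pair. Names are illustrative.
//
//     uint8x8x2_t interleave_example(void)
//     {
//         uint8x8_t lo = vdup_n_u8(0xaa);
//         uint8x8_t hi = vdup_n_u8(0xbb);
//         return vzip_u8(lo, hi);         // val[0] = { aa,bb,aa,bb, ... }
//     }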
2728 
2729 // } +++ auto-generated code ends (Neon macros)
2730 
2731 
2732 
2734 //
2735 // { +++ auto-generated code begins (vreinterpret macros)
2736 
2737 #define vreinterpret_f32_s8(a) (a)
2738 #define vreinterpret_f32_s16(a) (a)
2739 #define vreinterpret_f32_s32(a) (a)
2740 #define vreinterpret_f32_s64(a) (a)
2741 #define vreinterpret_f32_p8(a) (a)
2742 #define vreinterpret_f32_p16(a) (a)
2743 #define vreinterpret_f32_u8(a) (a)
2744 #define vreinterpret_f32_u16(a) (a)
2745 #define vreinterpret_f32_u32(a) (a)
2746 #define vreinterpret_f32_u64(a) (a)
2747 #define vreinterpret_s8_f32(a) (a)
2748 #define vreinterpret_s8_s16(a) (a)
2749 #define vreinterpret_s8_s32(a) (a)
2750 #define vreinterpret_s8_s64(a) (a)
2751 #define vreinterpret_s8_p8(a) (a)
2752 #define vreinterpret_s8_p16(a) (a)
2753 #define vreinterpret_s8_u8(a) (a)
2754 #define vreinterpret_s8_u16(a) (a)
2755 #define vreinterpret_s8_u32(a) (a)
2756 #define vreinterpret_s8_u64(a) (a)
2757 #define vreinterpret_s16_f32(a) (a)
2758 #define vreinterpret_s16_s8(a) (a)
2759 #define vreinterpret_s16_s32(a) (a)
2760 #define vreinterpret_s16_s64(a) (a)
2761 #define vreinterpret_s16_p8(a) (a)
2762 #define vreinterpret_s16_p16(a) (a)
2763 #define vreinterpret_s16_u8(a) (a)
2764 #define vreinterpret_s16_u16(a) (a)
2765 #define vreinterpret_s16_u32(a) (a)
2766 #define vreinterpret_s16_u64(a) (a)
2767 #define vreinterpret_s32_f32(a) (a)
2768 #define vreinterpret_s32_s8(a) (a)
2769 #define vreinterpret_s32_s16(a) (a)
2770 #define vreinterpret_s32_s64(a) (a)
2771 #define vreinterpret_s32_p8(a) (a)
2772 #define vreinterpret_s32_p16(a) (a)
2773 #define vreinterpret_s32_u8(a) (a)
2774 #define vreinterpret_s32_u16(a) (a)
2775 #define vreinterpret_s32_u32(a) (a)
2776 #define vreinterpret_s32_u64(a) (a)
2777 #define vreinterpret_s64_f32(a) (a)
2778 #define vreinterpret_s64_s8(a) (a)
2779 #define vreinterpret_s64_s16(a) (a)
2780 #define vreinterpret_s64_s32(a) (a)
2781 #define vreinterpret_s64_p8(a) (a)
2782 #define vreinterpret_s64_p16(a) (a)
2783 #define vreinterpret_s64_u8(a) (a)
2784 #define vreinterpret_s64_u16(a) (a)
2785 #define vreinterpret_s64_u32(a) (a)
2786 #define vreinterpret_s64_u64(a) (a)
2787 #define vreinterpret_p8_f32(a) (a)
2788 #define vreinterpret_p8_s8(a) (a)
2789 #define vreinterpret_p8_s16(a) (a)
2790 #define vreinterpret_p8_s32(a) (a)
2791 #define vreinterpret_p8_s64(a) (a)
2792 #define vreinterpret_p8_p16(a) (a)
2793 #define vreinterpret_p8_u8(a) (a)
2794 #define vreinterpret_p8_u16(a) (a)
2795 #define vreinterpret_p8_u32(a) (a)
2796 #define vreinterpret_p8_u64(a) (a)
2797 #define vreinterpret_p16_f32(a) (a)
2798 #define vreinterpret_p16_s8(a) (a)
2799 #define vreinterpret_p16_s16(a) (a)
2800 #define vreinterpret_p16_s32(a) (a)
2801 #define vreinterpret_p16_s64(a) (a)
2802 #define vreinterpret_p16_p8(a) (a)
2803 #define vreinterpret_p16_u8(a) (a)
2804 #define vreinterpret_p16_u16(a) (a)
2805 #define vreinterpret_p16_u32(a) (a)
2806 #define vreinterpret_p16_u64(a) (a)
2807 #define vreinterpret_u8_f32(a) (a)
2808 #define vreinterpret_u8_s8(a) (a)
2809 #define vreinterpret_u8_s16(a) (a)
2810 #define vreinterpret_u8_s32(a) (a)
2811 #define vreinterpret_u8_s64(a) (a)
2812 #define vreinterpret_u8_p8(a) (a)
2813 #define vreinterpret_u8_p16(a) (a)
2814 #define vreinterpret_u8_u16(a) (a)
2815 #define vreinterpret_u8_u32(a) (a)
2816 #define vreinterpret_u8_u64(a) (a)
2817 #define vreinterpret_u16_f32(a) (a)
2818 #define vreinterpret_u16_s8(a) (a)
2819 #define vreinterpret_u16_s16(a) (a)
2820 #define vreinterpret_u16_s32(a) (a)
2821 #define vreinterpret_u16_s64(a) (a)
2822 #define vreinterpret_u16_p8(a) (a)
2823 #define vreinterpret_u16_p16(a) (a)
2824 #define vreinterpret_u16_u8(a) (a)
2825 #define vreinterpret_u16_u32(a) (a)
2826 #define vreinterpret_u16_u64(a) (a)
2827 #define vreinterpret_u32_f32(a) (a)
2828 #define vreinterpret_u32_s8(a) (a)
2829 #define vreinterpret_u32_s16(a) (a)
2830 #define vreinterpret_u32_s32(a) (a)
2831 #define vreinterpret_u32_s64(a) (a)
2832 #define vreinterpret_u32_p8(a) (a)
2833 #define vreinterpret_u32_p16(a) (a)
2834 #define vreinterpret_u32_u8(a) (a)
2835 #define vreinterpret_u32_u16(a) (a)
2836 #define vreinterpret_u32_u64(a) (a)
2837 #define vreinterpret_u64_f32(a) (a)
2838 #define vreinterpret_u64_s8(a) (a)
2839 #define vreinterpret_u64_s16(a) (a)
2840 #define vreinterpret_u64_s32(a) (a)
2841 #define vreinterpret_u64_s64(a) (a)
2842 #define vreinterpret_u64_p8(a) (a)
2843 #define vreinterpret_u64_p16(a) (a)
2844 #define vreinterpret_u64_u8(a) (a)
2845 #define vreinterpret_u64_u16(a) (a)
2846 #define vreinterpret_u64_u32(a) (a)
2847 #define vreinterpretq_f32_s8(a) (a)
2848 #define vreinterpretq_f32_s16(a) (a)
2849 #define vreinterpretq_f32_s32(a) (a)
2850 #define vreinterpretq_f32_s64(a) (a)
2851 #define vreinterpretq_f32_p8(a) (a)
2852 #define vreinterpretq_f32_p16(a) (a)
2853 #define vreinterpretq_f32_u8(a) (a)
2854 #define vreinterpretq_f32_u16(a) (a)
2855 #define vreinterpretq_f32_u32(a) (a)
2856 #define vreinterpretq_f32_u64(a) (a)
2857 #define vreinterpretq_s8_f32(a) (a)
2858 #define vreinterpretq_s8_s16(a) (a)
2859 #define vreinterpretq_s8_s32(a) (a)
2860 #define vreinterpretq_s8_s64(a) (a)
2861 #define vreinterpretq_s8_p8(a) (a)
2862 #define vreinterpretq_s8_p16(a) (a)
2863 #define vreinterpretq_s8_u8(a) (a)
2864 #define vreinterpretq_s8_u16(a) (a)
2865 #define vreinterpretq_s8_u32(a) (a)
2866 #define vreinterpretq_s8_u64(a) (a)
2867 #define vreinterpretq_s16_f32(a) (a)
2868 #define vreinterpretq_s16_s8(a) (a)
2869 #define vreinterpretq_s16_s32(a) (a)
2870 #define vreinterpretq_s16_s64(a) (a)
2871 #define vreinterpretq_s16_p8(a) (a)
2872 #define vreinterpretq_s16_p16(a) (a)
2873 #define vreinterpretq_s16_u8(a) (a)
2874 #define vreinterpretq_s16_u16(a) (a)
2875 #define vreinterpretq_s16_u32(a) (a)
2876 #define vreinterpretq_s16_u64(a) (a)
2877 #define vreinterpretq_s32_f32(a) (a)
2878 #define vreinterpretq_s32_s8(a) (a)
2879 #define vreinterpretq_s32_s16(a) (a)
2880 #define vreinterpretq_s32_s64(a) (a)
2881 #define vreinterpretq_s32_p8(a) (a)
2882 #define vreinterpretq_s32_p16(a) (a)
2883 #define vreinterpretq_s32_u8(a) (a)
2884 #define vreinterpretq_s32_u16(a) (a)
2885 #define vreinterpretq_s32_u32(a) (a)
2886 #define vreinterpretq_s32_u64(a) (a)
2887 #define vreinterpretq_s64_f32(a) (a)
2888 #define vreinterpretq_s64_s8(a) (a)
2889 #define vreinterpretq_s64_s16(a) (a)
2890 #define vreinterpretq_s64_s32(a) (a)
2891 #define vreinterpretq_s64_p8(a) (a)
2892 #define vreinterpretq_s64_p16(a) (a)
2893 #define vreinterpretq_s64_u8(a) (a)
2894 #define vreinterpretq_s64_u16(a) (a)
2895 #define vreinterpretq_s64_u32(a) (a)
2896 #define vreinterpretq_s64_u64(a) (a)
2897 #define vreinterpretq_p8_f32(a) (a)
2898 #define vreinterpretq_p8_s8(a) (a)
2899 #define vreinterpretq_p8_s16(a) (a)
2900 #define vreinterpretq_p8_s32(a) (a)
2901 #define vreinterpretq_p8_s64(a) (a)
2902 #define vreinterpretq_p8_p16(a) (a)
2903 #define vreinterpretq_p8_u8(a) (a)
2904 #define vreinterpretq_p8_u16(a) (a)
2905 #define vreinterpretq_p8_u32(a) (a)
2906 #define vreinterpretq_p8_u64(a) (a)
2907 #define vreinterpretq_p16_f32(a) (a)
2908 #define vreinterpretq_p16_s8(a) (a)
2909 #define vreinterpretq_p16_s16(a) (a)
2910 #define vreinterpretq_p16_s32(a) (a)
2911 #define vreinterpretq_p16_s64(a) (a)
2912 #define vreinterpretq_p16_p8(a) (a)
2913 #define vreinterpretq_p16_u8(a) (a)
2914 #define vreinterpretq_p16_u16(a) (a)
2915 #define vreinterpretq_p16_u32(a) (a)
2916 #define vreinterpretq_p16_u64(a) (a)
2917 #define vreinterpretq_u8_f32(a) (a)
2918 #define vreinterpretq_u8_s8(a) (a)
2919 #define vreinterpretq_u8_s16(a) (a)
2920 #define vreinterpretq_u8_s32(a) (a)
2921 #define vreinterpretq_u8_s64(a) (a)
2922 #define vreinterpretq_u8_p8(a) (a)
2923 #define vreinterpretq_u8_p16(a) (a)
2924 #define vreinterpretq_u8_u16(a) (a)
2925 #define vreinterpretq_u8_u32(a) (a)
2926 #define vreinterpretq_u8_u64(a) (a)
2927 #define vreinterpretq_u16_f32(a) (a)
2928 #define vreinterpretq_u16_s8(a) (a)
2929 #define vreinterpretq_u16_s16(a) (a)
2930 #define vreinterpretq_u16_s32(a) (a)
2931 #define vreinterpretq_u16_s64(a) (a)
2932 #define vreinterpretq_u16_p8(a) (a)
2933 #define vreinterpretq_u16_p16(a) (a)
2934 #define vreinterpretq_u16_u8(a) (a)
2935 #define vreinterpretq_u16_u32(a) (a)
2936 #define vreinterpretq_u16_u64(a) (a)
2937 #define vreinterpretq_u32_f32(a) (a)
2938 #define vreinterpretq_u32_s8(a) (a)
2939 #define vreinterpretq_u32_s16(a) (a)
2940 #define vreinterpretq_u32_s32(a) (a)
2941 #define vreinterpretq_u32_s64(a) (a)
2942 #define vreinterpretq_u32_p8(a) (a)
2943 #define vreinterpretq_u32_p16(a) (a)
2944 #define vreinterpretq_u32_u8(a) (a)
2945 #define vreinterpretq_u32_u16(a) (a)
2946 #define vreinterpretq_u32_u64(a) (a)
2947 #define vreinterpretq_u64_f32(a) (a)
2948 #define vreinterpretq_u64_s8(a) (a)
2949 #define vreinterpretq_u64_s16(a) (a)
2950 #define vreinterpretq_u64_s32(a) (a)
2951 #define vreinterpretq_u64_s64(a) (a)
2952 #define vreinterpretq_u64_p8(a) (a)
2953 #define vreinterpretq_u64_p16(a) (a)
2954 #define vreinterpretq_u64_u8(a) (a)
2955 #define vreinterpretq_u64_u16(a) (a)
2956 #define vreinterpretq_u64_u32(a) (a)
2957 
2958 // } +++ auto-generated code ends (vreinterpret macros)
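// Usage sketch: because every vector type maps onto the same __n64/__n128
// unions, the vreinterpret* macros above are pure type-level casts; the bit
// pattern is reused unchanged. Names are illustrative.
//
//     uint32x2_t float_bits_example(void)
//     {
//         float32x2_t f = vdup_n_f32(1.0f);
//         return vreinterpret_u32_f32(f); // each lane = 0x3f800000
//     }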
2959 
2960 // { +++ auto-generated code begins (Pseudo intrinsics)
2961 
2962 // Multiply by scalar
2963 #define vmul_n_s16(Vd, Rt) vmul_lane_s16((Vd), vmov_n_s16(Rt), 0)
2964 #define vmul_n_s32(Vd, Rt) vmul_lane_s32((Vd), vmov_n_s32(Rt), 0)
2965 #define vmul_n_u16(Vd, Rt) vmul_lane_u16((Vd), vmov_n_u16(Rt), 0)
2966 #define vmul_n_u32(Vd, Rt) vmul_lane_u32((Vd), vmov_n_u32(Rt), 0)
2967 #define vmulq_n_s16(Vd, Rt) vmulq_lane_s16((Vd), vmov_n_s16(Rt), 0)
2968 #define vmulq_n_s32(Vd, Rt) vmulq_lane_s32((Vd), vmov_n_s32(Rt), 0)
2969 #define vmulq_n_u16(Vd, Rt) vmulq_lane_u16((Vd), vmov_n_u16(Rt), 0)
2970 #define vmulq_n_u32(Vd, Rt) vmulq_lane_u32((Vd), vmov_n_u32(Rt), 0)
2971 #define vmull_n_s16(Vd, Rt) vmull_lane_s16((Vd), vmov_n_s16(Rt), 0)
2972 #define vmull_n_s32(Vd, Rt) vmull_lane_s32((Vd), vmov_n_s32(Rt), 0)
2973 #define vmull_n_u16(Vd, Rt) vmull_lane_u16((Vd), vmov_n_u16(Rt), 0)
2974 #define vmull_n_u32(Vd, Rt) vmull_lane_u32((Vd), vmov_n_u32(Rt), 0)
2975 #define vqdmulh_n_s16(Vd, Rt) vqdmulh_lane_s16((Vd), vmov_n_s16(Rt), 0)
2976 #define vqdmulh_n_s32(Vd, Rt) vqdmulh_lane_s32((Vd), vmov_n_s32(Rt), 0)
2977 #define vqdmulhq_n_s16(Vd, Rt) vqdmulhq_lane_s16((Vd), vmov_n_s16(Rt), 0)
2978 #define vqdmulhq_n_s32(Vd, Rt) vqdmulhq_lane_s32((Vd), vmov_n_s32(Rt), 0)
2979 #define vqdmull_n_s16(Vd, Rt) vqdmull_lane_s16((Vd), vmov_n_s16(Rt), 0)
2980 #define vqdmull_n_s32(Vd, Rt) vqdmull_lane_s32((Vd), vmov_n_s32(Rt), 0)
2981 #define vqrdmulh_n_s16(Vd, Rt) vqrdmulh_lane_s16((Vd), vmov_n_s16(Rt), 0)
2982 #define vqrdmulh_n_s32(Vd, Rt) vqrdmulh_lane_s32((Vd), vmov_n_s32(Rt), 0)
2983 #define vqrdmulhq_n_s16(Vd, Rt) vqrdmulhq_lane_s16((Vd), vmov_n_s16(Rt), 0)
2984 #define vqrdmulhq_n_s32(Vd, Rt) vqrdmulhq_lane_s32((Vd), vmov_n_s32(Rt), 0)
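// Usage sketch: the *_n_* multiply forms broadcast the scalar with vmov_n_* and
// reuse the corresponding *_lane_* intrinsic, exactly as the expansions above
// show. Names are illustrative.
//
//     int16x4_t scale_example(void)
//     {
//         int16x4_t v = vdup_n_s16(3);
//         return vmul_n_s16(v, 5);        // every lane = 15
//     }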
2985 
2986 // Multiply by scalar with accumulate
2987 #define vmla_n_s16(Vd, Vn, Rt) vmla_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
2988 #define vmla_n_s32(Vd, Vn, Rt) vmla_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
2989 #define vmla_n_u16(Vd, Vn, Rt) vmla_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
2990 #define vmla_n_u32(Vd, Vn, Rt) vmla_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
2991 #define vmlaq_n_s16(Vd, Vn, Rt) vmlaq_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
2992 #define vmlaq_n_s32(Vd, Vn, Rt) vmlaq_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
2993 #define vmlaq_n_u16(Vd, Vn, Rt) vmlaq_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
2994 #define vmlaq_n_u32(Vd, Vn, Rt) vmlaq_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
2995 #define vmlal_n_s16(Vd, Vn, Rt) vmlal_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
2996 #define vmlal_n_s32(Vd, Vn, Rt) vmlal_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
2997 #define vmlal_n_u16(Vd, Vn, Rt) vmlal_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
2998 #define vmlal_n_u32(Vd, Vn, Rt) vmlal_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
2999 #define vmls_n_s16(Vd, Vn, Rt) vmls_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
3000 #define vmls_n_s32(Vd, Vn, Rt) vmls_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
3001 #define vmls_n_u16(Vd, Vn, Rt) vmls_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
3002 #define vmls_n_u32(Vd, Vn, Rt) vmls_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
3003 #define vmlsq_n_s16(Vd, Vn, Rt) vmlsq_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
3004 #define vmlsq_n_s32(Vd, Vn, Rt) vmlsq_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
3005 #define vmlsq_n_u16(Vd, Vn, Rt) vmlsq_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
3006 #define vmlsq_n_u32(Vd, Vn, Rt) vmlsq_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
3007 #define vmlsl_n_s16(Vd, Vn, Rt) vmlsl_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
3008 #define vmlsl_n_s32(Vd, Vn, Rt) vmlsl_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
3009 #define vmlsl_n_u16(Vd, Vn, Rt) vmlsl_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
3010 #define vmlsl_n_u32(Vd, Vn, Rt) vmlsl_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
3011 #define vqdmlal_n_s16(Vd, Vn, Rt) vqdmlal_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
3012 #define vqdmlal_n_s32(Vd, Vn, Rt) vqdmlal_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
3013 #define vqdmlsl_n_s16(Vd, Vn, Rt) vqdmlsl_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
3014 #define vqdmlsl_n_s32(Vd, Vn, Rt) vqdmlsl_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
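// Usage sketch: the accumulate forms compute Vd +/- Vn * Rt per lane, with the
// scalar broadcast as above. Names are illustrative.
//
//     int16x4_t accumulate_by_scalar_example(void)
//     {
//         int16x4_t acc = vdup_n_s16(1);
//         int16x4_t v   = vdup_n_s16(2);
//         return vmla_n_s16(acc, v, 10);  // every lane = 1 + 2*10 = 21
//     }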
3015 
3016 // VDUP.64 (scalar)
3017 #define vdup_lane_s64(Dn, lane) ( __static_assert(lane == 0, "invalid lane index"), (Dn) )
3018 #define vdup_lane_u64(Dn, lane) ( __static_assert(lane == 0, "invalid lane index"), (Dn) )
3019 
3020 // VDUP.W.64 (scalar)
3021 #define vdupq_lane_s64(Dn, lane) ( __static_assert(lane == 0, "invalid lane index"), vcombine_s64((Dn), (Dn)) )
3022 #define vdupq_lane_u64(Dn, lane) ( __static_assert(lane == 0, "invalid lane index"), vcombine_u64((Dn), (Dn)) )
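// Usage sketch: for 64-bit elements a D register holds a single lane, so the
// Q-register form simply combines the input with itself. Assumes vdup_n_s64 is
// available; names are illustrative.
//
//     int64x2_t dup64_example(void)
//     {
//         int64x1_t d = vdup_n_s64(42);
//         return vdupq_lane_s64(d, 0);    // both lanes = 42
//     }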
3023 
3024 // } +++ auto-generated code ends (Pseudo intrinsics)
Definition: arm_neon.h:218
__n64 __neon_D1Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64 *)
__int64 __neon_RtRt2Dm(unsigned int _Enc, __n64)
__n128x2 uint32x4x2_t
Definition: arm_neon.h:254
__n128x3 __neon_Qx3Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64 *)
__n128 int32x4_t
Definition: arm_neon.h:229
int __neon_RtQn(unsigned int _Enc, __n128)
__n128 __neon_QdQnQm(unsigned int _Enc, __n128, __n128)
__n128 __neon_QdQnDm(unsigned int _Enc, __n128, __n64)
__n64 __neon_DdQm(unsigned int _Enc, __n128)
__n128 int64x2_t
Definition: arm_neon.h:233
__n128x4 poly16x8x4_t
Definition: arm_neon.h:244
unsigned int uint32_t
Definition: stdint.h:14
__n128 __neon_Q1Adr_acc(unsigned int _Enc, __n128, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64 *)
struct __n64x4 __n64x4
__inline int32_t __uint32ToInt32(uint32_t i)
Definition: arm_neon.h:153
__n128x3 int8x16x3_t
Definition: arm_neon.h:223
__n128x4 uint32x4x4_t
Definition: arm_neon.h:256
unsigned long long uint64_t
Definition: stdint.h:15
struct __n128x4 __n128x4
__n64 int64x1_t
Definition: arm_neon.h:189
__n128
Definition: arm_neon.h:80
__n64 uint8x8_t
Definition: arm_neon.h:201
__n128x4 poly8x16x4_t
Definition: arm_neon.h:240
__n64x2 uint16x4x2_t
Definition: arm_neon.h:206
__n64x2 uint8x8x2_t
Definition: arm_neon.h:202
__n128x2 int16x8x2_t
Definition: arm_neon.h:226
__n128x2 uint16x8x2_t
Definition: arm_neon.h:250
Definition: arm_neon.h:82
__n128 val[4]
Definition: arm_neon.h:109
__n64 int16x4_t
Definition: arm_neon.h:181
__n64x3 uint8x8x3_t
Definition: arm_neon.h:203
__n64x4 __neon_Dx4Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64 *)
__n128 int16x8_t
Definition: arm_neon.h:225
__inline int32_t __uint8ToInt32(uint8_t i)
Definition: arm_neon.h:151
__n64x4 int64x1x4_t
Definition: arm_neon.h:192
__inline int32_t __poly16ToInt32(poly16_t i)
Definition: arm_neon.h:157
signed char int8_t
Definition: stdint.h:8
__n128x4 uint16x8x4_t
Definition: arm_neon.h:252
struct __n128x3 __n128x3
__n128 __neon_QdQnFt(unsigned int, __n128, float)
void __neon_AdrDx2x(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64 *, __n64x2)
__n64x3 int16x4x3_t
Definition: arm_neon.h:183
__n64 __neon_DdRtRt2(unsigned int _Enc, __int64)
__n64 val[2]
Definition: arm_neon.h:84
__n64 __neon_DdFt_acc(unsigned int _Enc, __n64, float)
void __neon_AdrDx3x(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64 *, __n64x3)
__n128x3 float32x4x3_t
Definition: arm_neon.h:219
__n128 uint8x16_t
Definition: arm_neon.h:245
__n64 uint32x2_t
Definition: arm_neon.h:209
__n128x3 uint8x16x3_t
Definition: arm_neon.h:247
float float32_t
Definition: arm_neon.h:117
#define _ADVSIMD_ALIGN(x)
Definition: arm_neon.h:32
__n64
Definition: arm_neon.h:55
__n64 __neon_DdDx4Dm(unsigned int _Enc, __n64x4, __n64)
void __neon_AdrDx4x(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64 *, __n64x4)
__n64 __neon_DdDm(unsigned int _Enc, __n64)
__n64 __neon_DdFt(unsigned int _Enc, float)
__n64 int8x8_t
Definition: arm_neon.h:177
__inline int32_t __int8ToInt32(int8_t i)
Definition: arm_neon.h:146
__n64x3 int8x8x3_t
Definition: arm_neon.h:179
__n64x2 __neon_DdDm_acc2(unsigned int _Enc, __n64, __n64)
__n64 val[3]
Definition: arm_neon.h:89
__n128 uint16x8_t
Definition: arm_neon.h:249
__n128x4 int64x2x4_t
Definition: arm_neon.h:236
__n64x4 poly16x4x4_t
Definition: arm_neon.h:200
void __neon_AdrQx2(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64 *, __n128x2)
__n64 __neon_D1Adr_acc(unsigned int _Enc, __n64, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64 *)
__n64 __neon_DdDx3Dm_acc(unsigned int _Enc, __n64, __n64x3, __n64)
__int64 __neon_RtRt2Qm(unsigned int _Enc, __n128)
void __neon_AdrDx2(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64 *, __n64x2)
__inline int64_t __uint64ToInt64(uint64_t i)
Definition: arm_neon.h:154
void __neon_AdrQ1(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64 *, __n128)
int int32_t
Definition: stdint.h:10
__n128x4 uint64x2x4_t
Definition: arm_neon.h:260
__n128x2 uint8x16x2_t
Definition: arm_neon.h:246
__n64 __neon_DdQm_low(unsigned int _Enc, __n128)
__n64 __neon_DdDx4Dm_acc(unsigned int _Enc, __n64, __n64x4, __n64)
__n128 __neon_QdDnDm_acc(unsigned int _Enc, __n128, __n64, __n64)
__n128 poly16x8_t
Definition: arm_neon.h:241
__n128x2 __neon_QdQm_acc2(unsigned int _Enc, __n128, __n128)
__n64 __neon_DdDx3Dm(unsigned int _Enc, __n64x3, __n64)
__n128 uint32x4_t
Definition: arm_neon.h:253
unsigned __int8 poly8_t
Definition: arm_neon.h:114
__n128 val[2]
Definition: arm_neon.h:99
__inline int32_t __int16ToInt32(int16_t i)
Definition: arm_neon.h:147
__n128x2 __neon_Qx2Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64 *)
void __neon_AdrQx4x(unsigned int _Enc, _Out_writes_bytes_(_Inexpressible_(_Enc)) __n64 *, __n128x4)
__n64x2 __neon_Dx2Adr_acc(unsigned int _Enc, __n64x2, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64 *)
__n128 __neon_QdQm(unsigned int _Enc, __n128)
__n64x4 uint16x4x4_t
Definition: arm_neon.h:208
__n128 __neon_QdDnDm_merge(unsigned int _Enc, __n64, __n64)
__n64x2 poly16x4x2_t
Definition: arm_neon.h:198
__n128x3 int32x4x3_t
Definition: arm_neon.h:231
__n128x2 __neon_Qx2Adr_acc(unsigned int _Enc, __n128x2, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64 *)
__n128 __neon_QdRt_acc(unsigned int _Enc, __n128, int)
__n128x2 poly8x16x2_t
Definition: arm_neon.h:238
__n64 __neon_DdDx2Dm_acc(unsigned int _Enc, __n64, __n64x2, __n64)
__n128x2 poly16x8x2_t
Definition: arm_neon.h:242
__n64x3 uint32x2x3_t
Definition: arm_neon.h:211
float __neon_FtQn(unsigned int _Enc, __n128)
__n128 __neon_QdQnDmx(unsigned int _Enc, __n128, __n64)
__n128x4 __neon_Qx4Adr_acc(unsigned int _Enc, __n128x4, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64 *)
__n128x3 __neon_Qx3Adr_acc(unsigned int _Enc, __n128x3, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64 *)
__n64x3 __neon_Dx3Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64 *)
__n128x2 uint64x2x2_t
Definition: arm_neon.h:258
__n64x3 __neon_Dx3Adr_acc(unsigned int _Enc, __n64x3, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64 *)
__n128x3 uint32x4x3_t
Definition: arm_neon.h:255
__n128x4 int8x16x4_t
Definition: arm_neon.h:224
__n64 __neon_DdDnDmx_acc(unsigned int _Enc, __n64, __n64, __n64)
Definition: arm_neon.h:92
__n64x2 int32x2x2_t
Definition: arm_neon.h:186
Definition: arm_neon.h:97
__n64 poly16x4_t
Definition: arm_neon.h:197
__n128 __neon_Q1Adr(unsigned int _Enc, _In_reads_bytes_(_Inexpressible_(_Enc)) const __n64 *)
__n64 uint16x4_t
Definition: arm_neon.h:205
__n64x3 uint64x1x3_t
Definition: arm_neon.h:215
__n64 __neon_DdDnDmx(unsigned int _Enc, __n64, __n64)
__n128 __neon_QdQnQm_acc(unsigned int _Enc, __n128, __n128, __n128)
__n64 __neon_DdDm_acc(unsigned int _Enc, __n64, __n64)