Github User Fetcher 1.0.0
C Application with Server and GUI
Loading...
Searching...
No Matches
graphene-simd4f.h
Go to the documentation of this file.
1/* graphene-simd4f.h: SIMD wrappers and operations
2 *
3 * SPDX-License-Identifier: MIT
4 *
5 * Copyright 2014 Emmanuele Bassi
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 * THE SOFTWARE.
24 */
25
26#pragma once
27
28#if !defined(GRAPHENE_H_INSIDE) && !defined(GRAPHENE_COMPILATION)
29#error "Only graphene.h can be included directly."
30#endif
31
32/* needed for memcpy() */
33#include <string.h>
34#include <math.h>
35#include <float.h>
36
37#include "graphene-config.h"
38#include "graphene-macros.h"
40
42
43/* Platform specific operations */
44
47 float y,
48 float z,
49 float w);
58
61 float *v);
64 float *v);
67 float *v);
68
71 unsigned int i);
80
91
94 const graphene_simd4f_t b);
97 const graphene_simd4f_t b);
100 const graphene_simd4f_t b);
103 const graphene_simd4f_t b);
104
111
114 const graphene_simd4f_t b);
117 const graphene_simd4f_t b);
120 const graphene_simd4f_t b);
121
124 const graphene_simd4f_t b);
127 const graphene_simd4f_t b);
128
135
140
143 const graphene_simd4f_t b);
146 const graphene_simd4f_t b);
149 float v);
150
155
158 const graphene_simd4f_t b);
161 const graphene_simd4f_t b);
164 const graphene_simd4f_t b);
167 const graphene_simd4f_t b);
170 const graphene_simd4f_t b);
173 const graphene_simd4f_t b);
176
177#if !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_SSE)
178
179/* SSE2 implementation of SIMD 4f */
180
181/* Union type used to do single lane reading without memcpy */
182typedef union {
184 float f[4];
185} graphene_simd4f_union_t;
186
187/* On GCC, we use __extension__ macros to avoid a static inline */
188# if defined(__GNUC__)
189
190/* Use GCC statement __extension__ to inline all these functions */
191
192# define graphene_simd4f_init(x,y,z,w) \
193 (__extension__ ({ \
194 (graphene_simd4f_t) { (x), (y), (z), (w) }; \
195 }))
196
197# define graphene_simd4f_init_zero() \
198 (__extension__ ({ \
199 (graphene_simd4f_t) _mm_setzero_ps(); \
200 }))
201
202# define graphene_simd4f_init_4f(v) \
203 (__extension__ ({ \
204 (graphene_simd4f_t) _mm_loadu_ps (v); \
205 }))
206
/* Builds a vector from the first three floats found at `v`; the fourth lane
 * is set to zero. The argument is evaluated exactly once, so expressions with
 * side effects (e.g. `p++`) are safe.
 */
# define graphene_simd4f_init_3f(v) \
  (__extension__ ({ \
    const float *__init3f_v = (v); \
    (graphene_simd4f_t) { __init3f_v[0], __init3f_v[1], __init3f_v[2], 0.f }; \
  }))
211
/* Builds a vector from the first two floats found at `v`; the third and
 * fourth lanes are set to zero. The argument is evaluated exactly once.
 */
# define graphene_simd4f_init_2f(v) \
  (__extension__ ({ \
    const float *__init2f_v = (v); \
    (graphene_simd4f_t) { __init2f_v[0], __init2f_v[1], 0.f, 0.f }; \
  }))
216
217# define graphene_simd4f_dup_4f(s,v) \
218 (__extension__ ({ \
219 _mm_storeu_ps ((v), (s)); \
220 }))
221
222# define graphene_simd4f_dup_3f(s,v) \
223 (__extension__ ({ \
224 memcpy ((v), &(s), sizeof (float) * 3); \
225 }))
226
227# define graphene_simd4f_dup_2f(s,v) \
228 (__extension__ ({ \
229 memcpy ((v), &(s), sizeof (float) * 2); \
230 }))
231
232# define graphene_simd4f_get(s,i) \
233 (__extension__ ({ \
234 graphene_simd4f_union_t __u = { (s) }; \
235 (float) __u.f[(i)]; \
236 }))
237
238# define graphene_simd4f_get_x(s) graphene_simd4f_get (s, 0)
239# define graphene_simd4f_get_y(s) graphene_simd4f_get (s, 1)
240# define graphene_simd4f_get_z(s) graphene_simd4f_get (s, 2)
241# define graphene_simd4f_get_w(s) graphene_simd4f_get (s, 3)
242
243# define graphene_simd4f_splat(v) \
244 (__extension__ ({ \
245 (graphene_simd4f_t) _mm_set1_ps ((v)); \
246 }))
247
248# define graphene_simd4f_splat_x(v) \
249 (__extension__ ({ \
250 (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (0, 0, 0, 0)); \
251 }))
252
253# define graphene_simd4f_splat_y(v) \
254 (__extension__ ({ \
255 (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (1, 1, 1, 1)); \
256 }))
257
258# define graphene_simd4f_splat_z(v) \
259 (__extension__ ({ \
260 (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (2, 2, 2, 2)); \
261 }))
262
263# define graphene_simd4f_splat_w(v) \
264 (__extension__ ({ \
265 (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (3, 3, 3, 3)); \
266 }))
267
268# define graphene_simd4f_add(a,b) \
269 (__extension__ ({ \
270 (graphene_simd4f_t) _mm_add_ps ((a), (b)); \
271 }))
272
273# define graphene_simd4f_sub(a,b) \
274 (__extension__ ({ \
275 (graphene_simd4f_t) _mm_sub_ps ((a), (b)); \
276 }))
277
278# define graphene_simd4f_mul(a,b) \
279 (__extension__ ({ \
280 (graphene_simd4f_t) _mm_mul_ps ((a), (b)); \
281 }))
282
283# define graphene_simd4f_div(a,b) \
284 (__extension__ ({ \
285 (graphene_simd4f_t) _mm_div_ps ((a), (b)); \
286 }))
287
288# define graphene_simd4f_sqrt(v) \
289 (__extension__ ({ \
290 (graphene_simd4f_t) _mm_sqrt_ps ((v)); \
291 }))
292
/* Per-lane reciprocal of `v`.
 *
 * Starts from the _mm_rcp_ps() estimate `s` and refines it as s * (2 - v * m),
 * where `m` is `s` with every lane cleared in which `v` compares equal to
 * zero (presumably so that a zero input does not feed 0 * inf into the
 * refinement -- confirm against the test suite).
 *
 * The argument is captured once in a local, so it is not re-evaluated by each
 * intrinsic that consumes it.
 */
# define graphene_simd4f_reciprocal(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __rcp_v = (v); \
    const graphene_simd4f_t __zero = graphene_simd4f_init (0.0f, 0.0f, 0.0f, 0.0f); \
    const graphene_simd4f_t __two = graphene_simd4f_init (2.0f, 2.0f, 2.0f, 2.0f); \
    const graphene_simd4f_t __s = _mm_rcp_ps (__rcp_v); \
    const graphene_simd4f_t __m = graphene_simd4f_mul (__rcp_v, \
                                                       _mm_andnot_ps (_mm_cmpeq_ps (__rcp_v, __zero), \
                                                                      __s)); \
    graphene_simd4f_mul (__s, graphene_simd4f_sub (__two, __m)); \
  }))
303
/* Per-lane reciprocal square root of `v`.
 *
 * Takes the _mm_rsqrt_ps() estimate `s` and refines it once as
 * (s * 0.5) * (3 - s * (v * s)).
 *
 * The argument is captured once in a local instead of being expanded twice.
 */
# define graphene_simd4f_rsqrt(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __rsqrt_v = (v); \
    const graphene_simd4f_t __half = graphene_simd4f_init (0.5f, 0.5f, 0.5f, 0.5f); \
    const graphene_simd4f_t __three = graphene_simd4f_init (3.0f, 3.0f, 3.0f, 3.0f); \
    graphene_simd4f_t __s = _mm_rsqrt_ps (__rsqrt_v); \
    graphene_simd4f_mul (graphene_simd4f_mul (__s, __half), \
                         graphene_simd4f_sub (__three, \
                                              graphene_simd4f_mul (__s, graphene_simd4f_mul (__rsqrt_v, __s)))); \
  }))
313
/* Cross product of the X, Y, Z lanes of `a` and `b`.
 *
 * Computed as a.yzx * b.zxy - a.zxy * b.yzx; every shuffle keeps lane 3 in
 * place, so the fourth lane of the result is a.w * b.w - a.w * b.w.
 *
 * Both arguments are captured once in locals (matching the GCC vector
 * implementation) rather than being expanded four times each.
 */
# define graphene_simd4f_cross3(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __cross_a = (a); \
    const graphene_simd4f_t __cross_b = (b); \
    const graphene_simd4f_t __a_yzx = _mm_shuffle_ps (__cross_a, __cross_a, _MM_SHUFFLE (3, 0, 2, 1)); \
    const graphene_simd4f_t __a_zxy = _mm_shuffle_ps (__cross_a, __cross_a, _MM_SHUFFLE (3, 1, 0, 2)); \
    const graphene_simd4f_t __b_yzx = _mm_shuffle_ps (__cross_b, __cross_b, _MM_SHUFFLE (3, 0, 2, 1)); \
    const graphene_simd4f_t __b_zxy = _mm_shuffle_ps (__cross_b, __cross_b, _MM_SHUFFLE (3, 1, 0, 2)); \
    (graphene_simd4f_t) _mm_sub_ps (_mm_mul_ps (__a_yzx, __b_zxy), _mm_mul_ps (__a_zxy, __b_yzx)); \
  }))
322
323# if defined(GRAPHENE_USE_SSE4_1)
324# define graphene_simd4f_dot3(a,b) \
325 (__extension__ ({ \
326 (graphene_simd4f_t) _mm_dp_ps ((a), (b), 0x7f); \
327 }))
328# else
329# define graphene_simd4f_dot3(a,b) \
330 (__extension__ ({ \
331 const unsigned int __mask_bits[] GRAPHENE_ALIGN16 = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; \
332 const graphene_simd4f_t __mask = _mm_load_ps ((const float *) __mask_bits); \
333 const graphene_simd4f_t __m = _mm_mul_ps ((a), (b)); \
334 const graphene_simd4f_t __s0 = _mm_and_ps (__m, __mask); \
335 const graphene_simd4f_t __s1 = _mm_add_ps (__s0, _mm_movehl_ps (__s0, __s0)); \
336 const graphene_simd4f_t __s2 = _mm_add_ss (__s1, _mm_shuffle_ps (__s1, __s1, 1)); \
337 (graphene_simd4f_t) _mm_shuffle_ps (__s2, __s2, 0); \
338 }))
339# endif
340
341# define graphene_simd4f_dot3_scalar(a,b) \
342 (__extension__ ({ \
343 float __res; \
344 _mm_store_ss (&__res, graphene_simd4f_dot3 (a, b)); \
345 __res; \
346 }))
347
348# define graphene_simd4f_min(a,b) \
349 (__extension__ ({ \
350 (graphene_simd4f_t) _mm_min_ps ((a), (b)); \
351 }))
352
353# define graphene_simd4f_max(a,b) \
354 (__extension__ ({ \
355 (graphene_simd4f_t) _mm_max_ps ((a), (b)); \
356 }))
357
358# define graphene_simd4f_shuffle_wxyz(v) \
359 (__extension__ ({ \
360 (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (2, 1, 0, 3)); \
361 }))
362
363# define graphene_simd4f_shuffle_zwxy(v) \
364 (__extension__ ({ \
365 (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (1, 0, 3, 2)); \
366 }))
367
368# define graphene_simd4f_shuffle_yzwx(v) \
369 (__extension__ ({ \
370 (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (0, 3, 2, 1)); \
371 }))
372
373# define graphene_simd4f_zero_w(v) \
374 (__extension__ ({ \
375 graphene_simd4f_t __s = _mm_unpackhi_ps ((v), _mm_setzero_ps ()); \
376 (graphene_simd4f_t) _mm_movelh_ps ((v), __s); \
377 }))
378
379# define graphene_simd4f_zero_zw(v) \
380 (__extension__ ({ \
381 (graphene_simd4f_t) _mm_movelh_ps ((v), _mm_setzero_ps ()); \
382 }))
383
384# define graphene_simd4f_merge_w(s,v) \
385 (__extension__ ({ \
386 graphene_simd4f_t __s = _mm_unpackhi_ps ((s), _mm_set1_ps ((v))); \
387 (graphene_simd4f_t) _mm_movelh_ps ((s), __s); \
388 }))
389
390# define graphene_simd4f_merge_high(a,b) \
391 (__extension__ ({ \
392 (graphene_simd4f_t) _mm_movehl_ps ((b), (a)); \
393 }))
394
395# define graphene_simd4f_merge_low(a,b) \
396 (__extension__ ({ \
397 (graphene_simd4f_t) _mm_movelh_ps ((a), (b)); \
398 }))
399
400typedef GRAPHENE_ALIGN16 union {
401 unsigned int ui[4];
402 float f[4];
403} graphene_simd4f_uif_t;
404
405# define graphene_simd4f_flip_sign_0101(v) \
406 (__extension__ ({ \
407 const graphene_simd4f_uif_t __pnpn = { { \
408 0x00000000, \
409 0x80000000, \
410 0x00000000, \
411 0x80000000 \
412 } }; \
413 (graphene_simd4f_t) _mm_xor_ps ((v), _mm_load_ps (__pnpn.f)); \
414 }))
415
416# define graphene_simd4f_flip_sign_1010(v) \
417 (__extension__ ({ \
418 const graphene_simd4f_uif_t __npnp = { { \
419 0x80000000, \
420 0x00000000, \
421 0x80000000, \
422 0x00000000, \
423 } }; \
424 (graphene_simd4f_t) _mm_xor_ps ((v), _mm_load_ps (__npnp.f)); \
425 }))
426
427# define graphene_simd4f_cmp_eq(a,b) \
428 (__extension__ ({ \
429 __m128i __res = (__m128i) _mm_cmpneq_ps ((a), (b)); \
430 (bool) (_mm_movemask_epi8 (__res) == 0); \
431 }))
432
433# define graphene_simd4f_cmp_neq(a,b) \
434 (__extension__ ({ \
435 __m128i __res = (__m128i) _mm_cmpneq_ps ((a), (b)); \
436 (bool) (_mm_movemask_epi8 (__res) != 0); \
437 }))
438
439# define graphene_simd4f_cmp_lt(a,b) \
440 (__extension__ ({ \
441 __m128i __res = (__m128i) _mm_cmplt_ps ((a), (b)); \
442 (bool) (_mm_movemask_epi8 (__res) == 0xffff); \
443 }))
444
445# define graphene_simd4f_cmp_le(a,b) \
446 (__extension__ ({ \
447 __m128i __res = (__m128i) _mm_cmple_ps ((a), (b)); \
448 (bool) (_mm_movemask_epi8 (__res) == 0xffff); \
449 }))
450
451# define graphene_simd4f_cmp_ge(a,b) \
452 (__extension__ ({ \
453 __m128i __res = (__m128i) _mm_cmpge_ps ((a), (b)); \
454 (bool) (_mm_movemask_epi8 (__res) == 0xffff); \
455 }))
456
457# define graphene_simd4f_cmp_gt(a,b) \
458 (__extension__ ({ \
459 __m128i __res = (__m128i) _mm_cmpgt_ps ((a), (b)); \
460 (bool) (_mm_movemask_epi8 (__res) == 0xffff); \
461 }))
462
463# define graphene_simd4f_neg(s) \
464 (__extension__ ({ \
465 const graphene_simd4f_uif_t __mask = { { \
466 0x80000000, \
467 0x80000000, \
468 0x80000000, \
469 0x80000000, \
470 } }; \
471 (graphene_simd4f_t) _mm_xor_ps ((s), _mm_load_ps (__mask.f)); \
472 }))
473
474/* On MSVC, we use static inlines */
475# elif defined (_MSC_VER) /* Visual Studio SSE intrinsics */
476
477/* Use static inline to inline all these functions */
478
479#define graphene_simd4f_init(x,y,z,w) _simd4f_init(x,y,z,w)
480
481static inline graphene_simd4f_t
482_simd4f_init (float x, float y, float z, float w)
483{
484 graphene_simd4f_t __s = { x, y, z, w };
485 return __s;
486}
487
488#define graphene_simd4f_init_zero() \
489 _mm_setzero_ps()
490
491#define graphene_simd4f_init_4f(v) \
492 _mm_loadu_ps(v)
493
/* Builds a vector from the first three floats at `v`, with the fourth lane set
 * to zero. The argument is parenthesised so that pointer expressions such as
 * `p + 1` index correctly; it is still expanded three times, so it must be
 * free of side effects.
 */
#define graphene_simd4f_init_3f(v) \
  graphene_simd4f_init ((v)[0], (v)[1], (v)[2], 0.f)
496
/* Builds a vector from the first two floats at `v`, with the third and fourth
 * lanes set to zero. The argument is parenthesised so that pointer expressions
 * such as `p + 1` index correctly; it is still expanded twice, so it must be
 * free of side effects.
 */
#define graphene_simd4f_init_2f(v) \
  graphene_simd4f_init ((v)[0], (v)[1], 0.f, 0.f)
499
500#define graphene_simd4f_dup_4f(s,v) \
501 _mm_storeu_ps (v, s)
502
/* Copies the first three lanes of `s` into the float buffer `v`. Both
 * arguments are parenthesised, matching the GCC variant of this macro; `s`
 * must be an lvalue because its address is taken.
 */
#define graphene_simd4f_dup_3f(s,v) \
  memcpy ((v), &(s), sizeof (float) * 3)
505
/* Copies the first two lanes of `s` into the float buffer `v`. Both arguments
 * are parenthesised, matching the GCC variant of this macro; `s` must be an
 * lvalue because its address is taken.
 */
#define graphene_simd4f_dup_2f(s,v) \
  memcpy ((v), &(s), sizeof (float) * 2)
508
509#define graphene_simd4f_get(s,i) _simd4f_get_xyzw(s, i)
510#define graphene_simd4f_get_x(s) _simd4f_get_xyzw(s, 0)
511#define graphene_simd4f_get_y(s) _simd4f_get_xyzw(s, 1)
512#define graphene_simd4f_get_z(s) _simd4f_get_xyzw(s, 2)
513#define graphene_simd4f_get_w(s) _simd4f_get_xyzw(s, 3)
514
515static inline float
516_simd4f_get_xyzw (graphene_simd4f_t s, int mode)
517{
518 /* mode: get_x=0
519 get_y=1
520 get_z=2
521 get_w=3 */
522
523 graphene_simd4f_union_t u;
524 u.s = s;
525 return u.f[mode];
526}
527
528#define graphene_simd4f_splat(v) \
529 _mm_set1_ps (v)
530
531#define graphene_simd4f_splat_x(v) \
532 _mm_shuffle_ps (v, v, _MM_SHUFFLE (0, 0, 0, 0))
533
534#define graphene_simd4f_splat_y(v) \
535 _mm_shuffle_ps (v, v, _MM_SHUFFLE (1, 1, 1, 1))
536
537#define graphene_simd4f_splat_z(v) \
538 _mm_shuffle_ps (v, v, _MM_SHUFFLE (2, 2, 2, 2))
539
540#define graphene_simd4f_splat_w(v) \
541 _mm_shuffle_ps (v, v, _MM_SHUFFLE (3, 3, 3, 3))
542
543#define graphene_simd4f_add(a,b) \
544 _mm_add_ps (a, b)
545
546#define graphene_simd4f_sub(a,b) \
547 _mm_sub_ps (a, b)
548
549#define graphene_simd4f_mul(a,b) \
550 _mm_mul_ps (a, b)
551
552#define graphene_simd4f_div(a,b) \
553 _mm_div_ps (a, b)
554
555#define graphene_simd4f_sqrt(v) \
556 _mm_sqrt_ps (v)
557
558#define graphene_simd4f_reciprocal(v) _simd4f_reciprocal(v)
559
560static inline graphene_simd4f_t
561_simd4f_reciprocal(const graphene_simd4f_t v)
562{
563 const graphene_simd4f_t __zero = graphene_simd4f_init (0.0f, 0.0f, 0.0f, 0.0f);
564 const graphene_simd4f_t __two = graphene_simd4f_init (2.0f, 2.0f, 2.0f, 2.0f);
565 const graphene_simd4f_t __s = _mm_rcp_ps (v);
567 _mm_andnot_ps (_mm_cmpeq_ps (v, __zero),
568 __s));
569 return graphene_simd4f_mul (__s, graphene_simd4f_sub (__two, __m));
570}
571
572#define graphene_simd4f_rsqrt(v) _simd4f_rsqrt(v)
573
574static inline graphene_simd4f_t
575_simd4f_rsqrt(const graphene_simd4f_t v)
576{
577 const graphene_simd4f_t __half = graphene_simd4f_init (0.5f, 0.5f, 0.5f, 0.5f);
578 const graphene_simd4f_t __three = graphene_simd4f_init (3.0f, 3.0f, 3.0f, 3.0f);
579 graphene_simd4f_t __s = _mm_rsqrt_ps (v);
580 return graphene_simd4f_mul (graphene_simd4f_mul (__s, __half),
581 graphene_simd4f_sub (__three,
583}
584
585#define graphene_simd4f_cross3(a,b) \
586 _simd4f_cross3(a,b)
587
588static inline graphene_simd4f_t
589_simd4f_cross3 (const graphene_simd4f_t a,
590 const graphene_simd4f_t b)
591{
592 const graphene_simd4f_t __a_yzx = _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 0, 2, 1));
593 const graphene_simd4f_t __a_zxy = _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 1, 0, 2));
594 const graphene_simd4f_t __b_yzx = _mm_shuffle_ps (b, b, _MM_SHUFFLE (3, 0, 2, 1));
595 const graphene_simd4f_t __b_zxy = _mm_shuffle_ps (b, b, _MM_SHUFFLE (3, 1, 0, 2));
596
597 return _mm_sub_ps (_mm_mul_ps (__a_yzx, __b_zxy), _mm_mul_ps (__a_zxy, __b_yzx));
598}
599
600#define graphene_simd4f_dot3(a,b) \
601 _simd4f_dot3(a,b)
602
603static inline graphene_simd4f_t
604_simd4f_dot3 (const graphene_simd4f_t a,
605 const graphene_simd4f_t b)
606{
607#if defined(GRAPHENE_USE_SSE4_1)
608 return _mm_dp_ps (a, b, 0x7f);
609#else
610 GRAPHENE_ALIGN16 const unsigned int __mask_bits[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };
611 const graphene_simd4f_t __mask = _mm_load_ps ((const float *) __mask_bits);
612 const graphene_simd4f_t __m = _mm_mul_ps ((a), (b));
613 const graphene_simd4f_t __s0 = _mm_and_ps (__m, __mask);
614 const graphene_simd4f_t __s1 = _mm_add_ps (__s0, _mm_movehl_ps (__s0, __s0));
615 const graphene_simd4f_t __s2 = _mm_add_ss (__s1, _mm_shuffle_ps (__s1, __s1, 1));
616
617 return _mm_shuffle_ps (__s2, __s2, 0);
618#endif
619}
620
621#define graphene_simd4f_dot3_scalar(a,b) \
622 _simd4f_dot3_scalar(a,b)
623
624static inline float
625_simd4f_dot3_scalar (const graphene_simd4f_t a,
626 const graphene_simd4f_t b)
627{
628 float __res;
629 _mm_store_ss (&__res, graphene_simd4f_dot3 (a, b));
630 return __res;
631}
632
633#define graphene_simd4f_min(a,b) \
634 _mm_min_ps (a, b)
635
636#define graphene_simd4f_max(a,b) \
637 _mm_max_ps (a, b)
638
639
640#define graphene_simd4f_shuffle_wxyz(v) \
641 _mm_shuffle_ps (v, v, _MM_SHUFFLE (2, 1, 0, 3))
642
643#define graphene_simd4f_shuffle_zwxy(v) \
644 _mm_shuffle_ps (v, v, _MM_SHUFFLE (1, 0, 3, 2))
645
646#define graphene_simd4f_shuffle_yzwx(v) \
647 _mm_shuffle_ps (v, v, _MM_SHUFFLE (0, 3, 2, 1))
648
649#define graphene_simd4f_zero_w(v) \
650 _mm_movelh_ps (v, _mm_unpackhi_ps (v, _mm_setzero_ps ()))
651
652#define graphene_simd4f_zero_zw(v) \
653 _mm_movelh_ps (v, _mm_setzero_ps ())
654
655#define graphene_simd4f_merge_w(s,v) \
656 _mm_movelh_ps (s, _mm_unpackhi_ps (s, _mm_set1_ps (v)))
657
658#define graphene_simd4f_merge_high(a,b) \
659 _mm_movehl_ps (b, a)
660
661#define graphene_simd4f_merge_low(a,b) \
662 _mm_movelh_ps (a, b)
663
664typedef GRAPHENE_ALIGN16 union {
665 unsigned int ui[4];
666 float f[4];
667} graphene_simd4f_uif_t;
668
669#define graphene_simd4f_flip_sign_0101(v) _simd4f_flip_sign_0101(v)
670
671static inline graphene_simd4f_t
672_simd4f_flip_sign_0101 (const graphene_simd4f_t v)
673{
674 const graphene_simd4f_uif_t __pnpn = { {
675 0x00000000,
676 0x80000000,
677 0x00000000,
678 0x80000000
679 } };
680
681 return _mm_xor_ps (v, _mm_load_ps (__pnpn.f));
682}
683
684#define graphene_simd4f_flip_sign_1010(v) _simd4f_flip_sign_1010(v)
685
686static inline graphene_simd4f_t
687_simd4f_flip_sign_1010(const graphene_simd4f_t v)
688{
689 const graphene_simd4f_uif_t __npnp = { {
690 0x80000000,
691 0x00000000,
692 0x80000000,
693 0x00000000,
694 } };
695
696 return _mm_xor_ps (v, _mm_load_ps (__npnp.f));
697}
698
699#define graphene_simd4f_cmp_eq(a,b) _simd4f_cmp_eq(a,b)
700
701static inline bool
702_simd4f_cmp_eq (const graphene_simd4f_t a,
703 const graphene_simd4f_t b)
704{
705 __m128i __res = _mm_castps_si128 (_mm_cmpneq_ps (a, b));
706 return (_mm_movemask_epi8 (__res) == 0);
707}
708
709#define graphene_simd4f_cmp_neq(a,b) _simd4f_cmp_neq(a,b)
710
711static inline bool
712_simd4f_cmp_neq (const graphene_simd4f_t a,
713 const graphene_simd4f_t b)
714{
715 __m128i __res = _mm_castps_si128 (_mm_cmpneq_ps (a, b));
716 return (_mm_movemask_epi8 (__res) != 0);
717}
718
719#define graphene_simd4f_cmp_lt(a,b) _simd4f_cmp_lt(a,b)
720
721static inline bool
722_simd4f_cmp_lt (const graphene_simd4f_t a,
723 const graphene_simd4f_t b)
724{
725 __m128i __res = _mm_castps_si128 (_mm_cmplt_ps (a, b));
726 return (_mm_movemask_epi8 (__res) == 0xffff);
727}
728
729#define graphene_simd4f_cmp_le(a,b) _simd4f_cmp_le(a,b)
730
731static inline bool
732_simd4f_cmp_le (const graphene_simd4f_t a,
733 const graphene_simd4f_t b)
734{
735 __m128i __res = _mm_castps_si128 (_mm_cmple_ps (a, b));
736 return (_mm_movemask_epi8 (__res) == 0xffff);
737}
738
739#define graphene_simd4f_cmp_ge(a,b) _simd4f_cmp_ge(a,b)
740
741static inline bool
742_simd4f_cmp_ge (const graphene_simd4f_t a,
743 const graphene_simd4f_t b)
744{
745 __m128i __res = _mm_castps_si128 (_mm_cmpge_ps (a, b));
746 return (_mm_movemask_epi8 (__res) == 0xffff);
747}
748
749#define graphene_simd4f_cmp_gt(a,b) _simd4f_cmp_gt(a,b)
750
751static inline bool
752_simd4f_cmp_gt (const graphene_simd4f_t a,
753 const graphene_simd4f_t b)
754{
755 __m128i __res = _mm_castps_si128 (_mm_cmpgt_ps (a, b));
756 return (_mm_movemask_epi8 (__res) == 0xffff);
757}
758
759#define graphene_simd4f_neg(s) _simd4f_neg(s)
760
761static inline graphene_simd4f_t
762_simd4f_neg (const graphene_simd4f_t s)
763{
764 const graphene_simd4f_uif_t __mask = { {
765 0x80000000,
766 0x80000000,
767 0x80000000,
768 0x80000000,
769 } };
770
771 return _mm_xor_ps (s, _mm_load_ps (__mask.f));
772}
773
774#else /* SSE intrinsics-not GCC or Visual Studio */
775
776# error "Need GCC-compatible or Visual Studio compiler for SSE extensions."
777
778/* Use static inline to inline all these functions */
779
780# endif /* !__GNUC__ && !_MSC_VER */
781
782#elif !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_GCC)
783
784/* GCC vector intrinsic implementation of SIMD 4f */
785
786typedef int graphene_simd4i_t __attribute__((vector_size (16)));
787
788# define graphene_simd4f_init(x,y,z,w) \
789 (__extension__ ({ \
790 (graphene_simd4f_t) { (x), (y), (z), (w) }; \
791 }))
792
793# define graphene_simd4f_init_zero() \
794 (__extension__ ({ \
795 (graphene_simd4f_t) { 0.f, 0.f, 0.f, 0.f }; \
796 }))
797
/* Builds a vector from the four floats found at `v`. The argument is
 * evaluated exactly once, so expressions with side effects are safe.
 */
# define graphene_simd4f_init_4f(v) \
  (__extension__ ({ \
    const float *__init4f_v = (v); \
    (graphene_simd4f_t) { __init4f_v[0], __init4f_v[1], __init4f_v[2], __init4f_v[3] }; \
  }))
802
/* Builds a vector from the first three floats found at `v`; the fourth lane
 * is set to zero. The argument is evaluated exactly once.
 */
# define graphene_simd4f_init_3f(v) \
  (__extension__ ({ \
    const float *__init3f_v = (v); \
    (graphene_simd4f_t) { __init3f_v[0], __init3f_v[1], __init3f_v[2], 0.f }; \
  }))
807
/* Builds a vector from the first two floats found at `v`; the third and
 * fourth lanes are set to zero. The argument is evaluated exactly once.
 */
# define graphene_simd4f_init_2f(v) \
  (__extension__ ({ \
    const float *__init2f_v = (v); \
    (graphene_simd4f_t) { __init2f_v[0], __init2f_v[1], 0.f, 0.f }; \
  }))
812
813# define graphene_simd4f_dup_4f(s,v) \
814 (__extension__ ({ \
815 memcpy ((v), &(s), sizeof (float) * 4); \
816 }))
817
818# define graphene_simd4f_dup_3f(s,v) \
819 (__extension__ ({ \
820 memcpy ((v), &(s), sizeof (float) * 3); \
821 }))
822
823# define graphene_simd4f_dup_2f(s,v) \
824 (__extension__ ({ \
825 memcpy ((v), &(s), sizeof (float) * 2); \
826 }))
827
828# define graphene_simd4f_get(s,i) (__extension__ ({ (float) (s)[(i)]; }))
829# define graphene_simd4f_get_x(s) graphene_simd4f_get ((s), 0)
830# define graphene_simd4f_get_y(s) graphene_simd4f_get ((s), 1)
831# define graphene_simd4f_get_z(s) graphene_simd4f_get ((s), 2)
832# define graphene_simd4f_get_w(s) graphene_simd4f_get ((s), 3)
833
/* Builds a vector with all four lanes set to the scalar `v`. The argument is
 * evaluated exactly once, so a call such as graphene_simd4f_splat (next ())
 * invokes next() a single time.
 */
# define graphene_simd4f_splat(v) \
  (__extension__ ({ \
    const float __splat_v = (v); \
    (graphene_simd4f_t) { __splat_v, __splat_v, __splat_v, __splat_v }; \
  }))
838
839# define graphene_simd4f_splat_x(v) \
840 (__extension__ ({ \
841 float __val = graphene_simd4f_get_x ((v)); \
842 (graphene_simd4f_t) { __val, __val, __val, __val }; \
843 }))
844
845# define graphene_simd4f_splat_y(v) \
846 (__extension__ ({ \
847 float __val = graphene_simd4f_get_y ((v)); \
848 (graphene_simd4f_t) { __val, __val, __val, __val }; \
849 }))
850
851# define graphene_simd4f_splat_z(v) \
852 (__extension__ ({ \
853 float __val = graphene_simd4f_get_z ((v)); \
854 (graphene_simd4f_t) { __val, __val, __val, __val }; \
855 }))
856
857# define graphene_simd4f_splat_w(v) \
858 (__extension__ ({ \
859 float __val = graphene_simd4f_get_w ((v)); \
860 (graphene_simd4f_t) { __val, __val, __val, __val }; \
861 }))
862
/* Per-lane reciprocal of `v`.
 *
 * A lane whose magnitude is greater than FLT_EPSILON yields 1 / lane; any
 * other lane yields an infinity carrying the sign of that lane.
 *
 * The argument is captured once in a local instead of being expanded for
 * every lane access.
 */
# define graphene_simd4f_reciprocal(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __rcp_v = (v); \
    (graphene_simd4f_t) { \
      fabsf (__rcp_v[0]) > FLT_EPSILON ? 1.f / __rcp_v[0] : copysignf (INFINITY, __rcp_v[0]), \
      fabsf (__rcp_v[1]) > FLT_EPSILON ? 1.f / __rcp_v[1] : copysignf (INFINITY, __rcp_v[1]), \
      fabsf (__rcp_v[2]) > FLT_EPSILON ? 1.f / __rcp_v[2] : copysignf (INFINITY, __rcp_v[2]), \
      fabsf (__rcp_v[3]) > FLT_EPSILON ? 1.f / __rcp_v[3] : copysignf (INFINITY, __rcp_v[3]), \
    }; \
  }))
872
/* Per-lane square root of `v`, computed with sqrtf(). The argument is
 * captured once in a local instead of being expanded for every lane.
 */
# define graphene_simd4f_sqrt(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __sqrt_v = (v); \
    (graphene_simd4f_t) { \
      sqrtf (__sqrt_v[0]), \
      sqrtf (__sqrt_v[1]), \
      sqrtf (__sqrt_v[2]), \
      sqrtf (__sqrt_v[3]), \
    }; \
  }))
882
/* Per-lane reciprocal square root of `v`.
 *
 * A lane that compares unequal to zero yields 1 / sqrtf (lane); a zero lane
 * yields 0. The exact float comparison is intentional, hence the local
 * suppression of -Wfloat-equal.
 *
 * The argument is captured once in a local instead of being expanded for
 * every lane access.
 */
# define graphene_simd4f_rsqrt(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __rsqrt_v = (v); \
    _Pragma ("GCC diagnostic push") \
    _Pragma ("GCC diagnostic ignored \"-Wfloat-equal\"") \
    const graphene_simd4f_t __val = (graphene_simd4f_t) { \
      __rsqrt_v[0] != 0.f ? 1.f / sqrtf (__rsqrt_v[0]) : 0.f, \
      __rsqrt_v[1] != 0.f ? 1.f / sqrtf (__rsqrt_v[1]) : 0.f, \
      __rsqrt_v[2] != 0.f ? 1.f / sqrtf (__rsqrt_v[2]) : 0.f, \
      __rsqrt_v[3] != 0.f ? 1.f / sqrtf (__rsqrt_v[3]) : 0.f, \
    }; \
    _Pragma ("GCC diagnostic pop") \
    __val; \
  }))
896
897# define graphene_simd4f_add(a,b) (__extension__ ({ (graphene_simd4f_t) ((a) + (b)); }))
898# define graphene_simd4f_sub(a,b) (__extension__ ({ (graphene_simd4f_t) ((a) - (b)); }))
899# define graphene_simd4f_mul(a,b) (__extension__ ({ (graphene_simd4f_t) ((a) * (b)); }))
900# define graphene_simd4f_div(a,b) (__extension__ ({ (graphene_simd4f_t) ((a) / (b)); }))
901
902# define graphene_simd4f_cross3(a,b) \
903 (__extension__ ({ \
904 const graphene_simd4f_t __cross_a = (a); \
905 const graphene_simd4f_t __cross_b = (b); \
906 graphene_simd4f_init (__cross_a[1] * __cross_b[2] - __cross_a[2] * __cross_b[1], \
907 __cross_a[2] * __cross_b[0] - __cross_a[0] * __cross_b[2], \
908 __cross_a[0] * __cross_b[1] - __cross_a[1] * __cross_b[0], \
909 0.f); \
910 }))
911
912# define graphene_simd4f_dot3(a,b) \
913 (__extension__ ({ \
914 const graphene_simd4f_t __dot_a = (a); \
915 const graphene_simd4f_t __dot_b = (b); \
916 const float __res = __dot_a[0] * __dot_b[0] + __dot_a[1] * __dot_b[1] + __dot_a[2] * __dot_b[2]; \
917 graphene_simd4f_init (__res, __res, __res, __res); \
918 }))
919
920# define graphene_simd4f_dot3_scalar(a,b) \
921 (__extension__ ({ \
922 graphene_simd4f_get_x (graphene_simd4f_dot3 (a, b)); \
923 }))
924
925# define graphene_simd4f_min(a,b) \
926 (__extension__ ({ \
927 const graphene_simd4f_t __a = (a); \
928 const graphene_simd4f_t __b = (b); \
929 graphene_simd4f_init (__a[0] < __b[0] ? __a[0] : __b[0], \
930 __a[1] < __b[1] ? __a[1] : __b[1], \
931 __a[2] < __b[2] ? __a[2] : __b[2], \
932 __a[3] < __b[3] ? __a[3] : __b[3]); \
933 }))
934
935# define graphene_simd4f_max(a,b) \
936 (__extension__ ({ \
937 const graphene_simd4f_t __a = (a); \
938 const graphene_simd4f_t __b = (b); \
939 graphene_simd4f_init (__a[0] > __b[0] ? __a[0] : __b[0], \
940 __a[1] > __b[1] ? __a[1] : __b[1], \
941 __a[2] > __b[2] ? __a[2] : __b[2], \
942 __a[3] > __b[3] ? __a[3] : __b[3]); \
943 }))
944
945# define graphene_simd4f_shuffle_wxyz(v) \
946 (__extension__ ({ \
947 const graphene_simd4i_t __mask = { 3, 0, 1, 2 }; \
948 (graphene_simd4f_t) __builtin_shuffle ((v), __mask); \
949 }))
950
951# define graphene_simd4f_shuffle_zwxy(v) \
952 (__extension__ ({ \
953 const graphene_simd4i_t __mask = { 2, 3, 0, 1 }; \
954 (graphene_simd4f_t) __builtin_shuffle ((v), __mask); \
955 }))
956
957# define graphene_simd4f_shuffle_yzwx(v) \
958 (__extension__ ({ \
959 const graphene_simd4i_t __mask = { 1, 2, 3, 0 }; \
960 (graphene_simd4f_t) __builtin_shuffle ((v), __mask); \
961 }))
962
963# define graphene_simd4f_zero_w(v) \
964 (__extension__ ({ \
965 const graphene_simd4i_t __mask = { 0, 1, 2, 4 }; \
966 (graphene_simd4f_t) __builtin_shuffle ((v), graphene_simd4f_init_zero (), __mask); \
967 }))
968
969# define graphene_simd4f_zero_zw(v) \
970 (__extension__ ({ \
971 const graphene_simd4i_t __mask = { 0, 1, 4, 4 }; \
972 (graphene_simd4f_t) __builtin_shuffle ((v), graphene_simd4f_init_zero (), __mask); \
973 }))
974
975# define graphene_simd4f_merge_w(s,v) \
976 (__extension__ ({ \
977 const graphene_simd4i_t __mask = { 0, 1, 2, 4 }; \
978 (graphene_simd4f_t) __builtin_shuffle ((s), graphene_simd4f_splat ((v)), __mask); \
979 }))
980
981# define graphene_simd4f_merge_high(a,b) \
982 (__extension__ ({ \
983 const graphene_simd4i_t __mask = { 2, 3, 6, 7 }; \
984 (graphene_simd4f_t) __builtin_shuffle ((a), (b), __mask); \
985 }))
986
987# define graphene_simd4f_merge_low(a,b) \
988 (__extension__ ({ \
989 const graphene_simd4i_t __mask = { 0, 1, 4, 5 }; \
990 (graphene_simd4f_t) __builtin_shuffle ((a), (b), __mask); \
991 }))
992
993# define graphene_simd4f_flip_sign_0101(v) \
994 (__extension__ ({ \
995 const graphene_simd4f_t __v = (v); \
996 graphene_simd4f_init (__v[0], -__v[1], __v[2], -__v[3]); \
997 }))
998
999# define graphene_simd4f_flip_sign_1010(v) \
1000 (__extension__ ({ \
1001 const graphene_simd4f_t __v = (v); \
1002 graphene_simd4f_init (-__v[0], __v[1], -__v[2], __v[3]); \
1003 }))
1004
1005# define graphene_simd4f_cmp_eq(a,b) \
1006 (__extension__ ({ \
1007 _Pragma ("GCC diagnostic push") \
1008 _Pragma ("GCC diagnostic ignored \"-Wfloat-equal\"") \
1009 const graphene_simd4i_t __res = (a) == (b); \
1010 const bool __val = (bool) (__res[0] != 0 && \
1011 __res[1] != 0 && \
1012 __res[2] != 0 && \
1013 __res[3] != 0); \
1014 _Pragma ("GCC diagnostic pop") \
1015 __val; \
1016 }))
1017
1018# define graphene_simd4f_cmp_neq(a,b) (!graphene_simd4f_cmp_eq (a,b))
1019
1020# define graphene_simd4f_cmp_lt(a,b) \
1021 (__extension__ ({ \
1022 _Pragma ("GCC diagnostic push") \
1023 _Pragma ("GCC diagnostic ignored \"-Wfloat-equal\"") \
1024 const graphene_simd4i_t __res = (a) < (b); \
1025 const bool __val = (bool) (__res[0] != 0 && \
1026 __res[1] != 0 && \
1027 __res[2] != 0 && \
1028 __res[3] != 0); \
1029 _Pragma ("GCC diagnostic pop") \
1030 __val; \
1031 }))
1032
1033# define graphene_simd4f_cmp_le(a,b) \
1034 (__extension__ ({ \
1035 _Pragma ("GCC diagnostic push") \
1036 _Pragma ("GCC diagnostic ignored \"-Wfloat-equal\"") \
1037 const graphene_simd4i_t __res = (a) <= (b); \
1038 const bool __val = (bool) (__res[0] != 0 && \
1039 __res[1] != 0 && \
1040 __res[2] != 0 && \
1041 __res[3] != 0); \
1042 _Pragma ("GCC diagnostic pop") \
1043 __val; \
1044 }))
1045
1046# define graphene_simd4f_cmp_ge(a,b) \
1047 (__extension__ ({ \
1048 _Pragma ("GCC diagnostic push") \
1049 _Pragma ("GCC diagnostic ignored \"-Wfloat-equal\"") \
1050 const graphene_simd4i_t __res = (a) >= (b); \
1051 const bool __val = (bool) (__res[0] != 0 && \
1052 __res[1] != 0 && \
1053 __res[2] != 0 && \
1054 __res[3] != 0); \
1055 _Pragma ("GCC diagnostic pop") \
1056 __val; \
1057 }))
1058
1059# define graphene_simd4f_cmp_gt(a,b) \
1060 (__extension__ ({ \
1061 _Pragma ("GCC diagnostic push") \
1062 _Pragma ("GCC diagnostic ignored \"-Wfloat-equal\"") \
1063 const graphene_simd4i_t __res = (a) > (b); \
1064 const bool __val = (bool) (__res[0] != 0 && \
1065 __res[1] != 0 && \
1066 __res[2] != 0 && \
1067 __res[3] != 0); \
1068 _Pragma ("GCC diagnostic pop") \
1069 __val; \
1070 }))
1071
1072# define graphene_simd4f_neg(s) \
1073 (__extension__ ({ \
1074 const graphene_simd4f_t __s = (s); \
1075 const graphene_simd4f_t __minus_one = graphene_simd4f_splat (-1.f); \
1076 graphene_simd4f_mul (__s, __minus_one); \
1077 }))
1078
1079#elif !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_ARM_NEON)
1080
1081/* ARM Neon implementation of SIMD4f */
1082
1083/* Union type used for single lane reading without memcpy */
1084typedef union {
1086 float f[4];
1087} graphene_simd4f_union_t;
1088
1089/* NEON has optimised 2-lanes vectors we can use */
1090typedef float32x2_t graphene_simd2f_t;
1091
1092#ifdef __GNUC__
1093# define graphene_simd4f_init(x,y,z,w) \
1094 (__extension__ ({ \
1095 const float32_t __v[4] = { (x), (y), (z), (w) }; \
1096 (graphene_simd4f_t) vld1q_f32 (__v); \
1097 }))
1098
1099# define graphene_simd4f_init_zero() \
1100 (__extension__ ({ \
1101 (graphene_simd4f_t) vdupq_n_f32 (0.f); \
1102 }))
1103
1104# define graphene_simd4f_init_4f(v) \
1105 (__extension__ ({ \
1106 const float32_t *__v32 = (const float32_t *) (v); \
1107 (graphene_simd4f_t) vld1q_f32 (__v32); \
1108 }))
1109
/* Builds a vector from the first three floats found at `v`; the fourth lane
 * is set to zero. The argument is captured once in a local pointer, so
 * pointer expressions such as `p + 1` index correctly and side effects
 * happen a single time.
 */
# define graphene_simd4f_init_3f(v) \
  (__extension__ ({ \
    const float *__init3f_v = (v); \
    graphene_simd4f_init (__init3f_v[0], __init3f_v[1], __init3f_v[2], 0.f); \
  }))
1114
1115# define graphene_simd4f_init_2f(v) \
1116 (__extension__ ({ \
1117 const float32_t *__v32 = (const float32_t *) (v); \
1118 const graphene_simd2f_t __low = vld1_f32 (__v32); \
1119 const float32_t __zero = 0; \
1120 const graphene_simd2f_t __high = vld1_dup_f32 (&__zero); \
1121 (graphene_simd4f_t) vcombine_f32 (__low, __high); \
1122 }))
1123
1124# define graphene_simd4f_dup_4f(s,v) \
1125 (__extension__ ({ \
1126 vst1q_f32 ((float32_t *) (v), (s)); \
1127 }))
1128
/* Stores the first three lanes of @s into the float array at @v.
 *
 * @s is bound to a local first, so it is evaluated once instead of
 * once per stored lane.
 */
# define graphene_simd4f_dup_3f(s,v) \
  (__extension__ ({ \
    const graphene_simd4f_t __dup3_src = (s); \
    float *__dup3_dst = (v); \
    vst1q_lane_f32 (__dup3_dst++, __dup3_src, 0); \
    vst1q_lane_f32 (__dup3_dst++, __dup3_src, 1); \
    vst1q_lane_f32 (__dup3_dst, __dup3_src, 2); \
  }))
1136
/* Stores the low half (first two lanes) of @s into the float array at @v. */
# define graphene_simd4f_dup_2f(s,v) \
  (__extension__ ({ \
    vst1_f32 ((float32_t *) (v), vget_low_f32 ((s))); \
  }))
1142
/* Reads lane @i of the vector @s as a float. */
# define graphene_simd4f_get(s,i) \
  ((float) vgetq_lane_f32 ((s), (i)))

/* Returns a vector with all four lanes set to the scalar @v. */
# define graphene_simd4f_splat(v) \
  ((graphene_simd4f_t) vdupq_n_f32 ((v)))
1152
/* splat_{x,y,z,w}: read one lane of @s and broadcast it to all four lanes. */
# define graphene_simd4f_splat_x(s) \
  (graphene_simd4f_splat (graphene_simd4f_get_x ((s))))

# define graphene_simd4f_splat_y(s) \
  (graphene_simd4f_splat (graphene_simd4f_get_y ((s))))

# define graphene_simd4f_splat_z(s) \
  (graphene_simd4f_splat (graphene_simd4f_get_z ((s))))

# define graphene_simd4f_splat_w(s) \
  (graphene_simd4f_splat (graphene_simd4f_get_w ((s))))
1172
/* Per-lane reciprocal of @s: starts from the vrecpeq_f32() estimate and
 * refines it with two vrecpsq_f32() steps.
 *
 * @s is bound to a local first, so the argument is evaluated once
 * instead of three times.
 */
# define graphene_simd4f_reciprocal(s) \
  (__extension__ ({ \
    const graphene_simd4f_t __recp_in = (s); \
    graphene_simd4f_t __recp_est = vrecpeq_f32 (__recp_in); \
    __recp_est = vmulq_f32 (vrecpsq_f32 (__recp_est, __recp_in), __recp_est); \
    (graphene_simd4f_t) vmulq_f32 (vrecpsq_f32 (__recp_est, __recp_in), __recp_est); \
  }))
1179
/* Per-lane addition: @a + @b. */
# define graphene_simd4f_add(a,b) \
  ((graphene_simd4f_t) vaddq_f32 ((a), (b)))

/* Per-lane subtraction: @a - @b. */
# define graphene_simd4f_sub(a,b) \
  ((graphene_simd4f_t) vsubq_f32 ((a), (b)))

/* Per-lane multiplication: @a * @b. */
# define graphene_simd4f_mul(a,b) \
  ((graphene_simd4f_t) vmulq_f32 ((a), (b)))
1194
/* Per-lane division, computed as @a multiplied by the reciprocal of @b. */
# define graphene_simd4f_div(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __div_rcp = graphene_simd4f_reciprocal ((b)); \
    (graphene_simd4f_t) vmulq_f32 ((a), __div_rcp); \
  }))
1200
/* One refinement step for a reciprocal square root @estimate of @v:
 * returns estimate * vrsqrtsq_f32 (estimate * v, estimate).
 */
# define _simd4f_rsqrt_iter(v,estimate) \
  (__extension__ ({ \
    const graphene_simd4f_t __iter_scaled = vmulq_f32 ((estimate), (v)); \
    const graphene_simd4f_t __iter_step = vrsqrtsq_f32 (__iter_scaled, (estimate)); \
    (graphene_simd4f_t) vmulq_f32 ((estimate), __iter_step); \
  }))
1206
/* Per-lane reciprocal square root of @s: the vrsqrteq_f32() estimate is
 * refined with three _simd4f_rsqrt_iter() steps.
 *
 * @s is bound to a local first, so the argument is evaluated once
 * instead of four times.
 */
# define graphene_simd4f_rsqrt(s) \
  (__extension__ ({ \
    const graphene_simd4f_t __rsqrt_in = (s); \
    graphene_simd4f_t __estimate = vrsqrteq_f32 (__rsqrt_in); \
    __estimate = _simd4f_rsqrt_iter (__rsqrt_in, __estimate); \
    __estimate = _simd4f_rsqrt_iter (__rsqrt_in, __estimate); \
    _simd4f_rsqrt_iter (__rsqrt_in, __estimate); \
  }))
1214
/* Per-lane square root of @s, computed as the reciprocal of the
 * reciprocal square root.
 *
 * The result is ANDed with vtstq_u32 (bits, bits), which is all-ones only
 * for lanes whose bit pattern has at least one bit set: a lane whose bits
 * are all zero therefore yields an all-zero result lane.
 *
 * @s is bound to a local first, so the argument is evaluated once
 * instead of twice.
 */
# define graphene_simd4f_sqrt(s) \
  (__extension__ ({ \
    const graphene_simd4f_t __sqrt_in = (s); \
    graphene_simd4f_t __rsq = graphene_simd4f_rsqrt (__sqrt_in); \
    graphene_simd4f_t __rrsq = graphene_simd4f_reciprocal (__rsq); \
    uint32x4_t __tmp = vreinterpretq_u32_f32 (__sqrt_in); \
    (graphene_simd4f_t) vreinterpretq_f32_u32 (vandq_u32 (vtstq_u32 (__tmp, __tmp), vreinterpretq_u32_f32 (__rrsq))); \
  }))
1222
/* Cross product of the first three lanes of @a and @b; the fourth lane
 * of the result is cleared by the final mask.
 *
 * The subtraction yields the components rotated as (z, x, y), so the
 * same yzx rotation applied to the operands is applied once more to the
 * difference before masking.
 */
# define graphene_simd4f_cross3(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __lhs = (a), __rhs = (b); \
    const uint32_t __xyz_bits[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; \
    const int32x4_t __xyz_mask = vld1q_s32 ((const int32_t *) __xyz_bits); \
    const graphene_simd2f_t __lhs_lo = vget_low_f32 (__lhs); \
    const graphene_simd2f_t __rhs_lo = vget_low_f32 (__rhs); \
    const graphene_simd4f_t __lhs_yzx = vcombine_f32 (vext_f32 (__lhs_lo, vget_high_f32 (__lhs), 1), __lhs_lo); \
    const graphene_simd4f_t __rhs_yzx = vcombine_f32 (vext_f32 (__rhs_lo, vget_high_f32 (__rhs), 1), __rhs_lo); \
    graphene_simd4f_t __diff = graphene_simd4f_sub (graphene_simd4f_mul (__rhs_yzx, __lhs), \
                                                    graphene_simd4f_mul (__lhs_yzx, __rhs)); \
    const graphene_simd2f_t __diff_lo = vget_low_f32 (__diff); \
    __diff = vcombine_f32 (vext_f32 (__diff_lo, vget_high_f32 (__diff), 1), __diff_lo); \
    (graphene_simd4f_t) vandq_s32 ((int32x4_t) __diff, __xyz_mask); \
  }))
1238
/* Dot product of the first three lanes, broadcast to all four lanes. */
# define graphene_simd4f_dot3(a,b) \
  (graphene_simd4f_splat (graphene_simd4f_dot3_scalar (a, b)))

/* Dot product of the first three lanes as a float: the pairwise add
 * produces x + y, to which the z product (lane 0 of the high half) is
 * added.
 */
# define graphene_simd4f_dot3_scalar(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __dot_prod = graphene_simd4f_mul (a, b); \
    const graphene_simd2f_t __dot_lo = vget_low_f32 (__dot_prod); \
    const graphene_simd2f_t __dot_xy = vpadd_f32 (__dot_lo, __dot_lo); \
    (float) vget_lane_f32 (vadd_f32 (__dot_xy, vget_high_f32 (__dot_prod)), 0); \
  }))
1250
/* Per-lane minimum of @a and @b. */
# define graphene_simd4f_min(a,b) \
  ((graphene_simd4f_t) vminq_f32 ((a), (b)))

/* Per-lane maximum of @a and @b. */
# define graphene_simd4f_max(a,b) \
  ((graphene_simd4f_t) vmaxq_f32 (a, b))
1260
/* Lane shuffles: the vector is copied into a union, and a new vector is
 * built from its lanes in the order spelled out by the macro name.
 */
# define graphene_simd4f_shuffle_wxyz(v) \
  (__extension__ ({ \
    graphene_simd4f_union_t __shuf = { (v) }; \
    graphene_simd4f_init (__shuf.f[3], __shuf.f[0], __shuf.f[1], __shuf.f[2]); \
  }))

# define graphene_simd4f_shuffle_zwxy(v) \
  (__extension__ ({ \
    graphene_simd4f_union_t __shuf = { (v) }; \
    graphene_simd4f_init (__shuf.f[2], __shuf.f[3], __shuf.f[0], __shuf.f[1]); \
  }))

# define graphene_simd4f_shuffle_yzwx(v) \
  (__extension__ ({ \
    graphene_simd4f_union_t __shuf = { (v) }; \
    graphene_simd4f_init (__shuf.f[1], __shuf.f[2], __shuf.f[3], __shuf.f[0]); \
  }))
1278
/* Copy of @v with the fourth lane set to 0. */
# define graphene_simd4f_zero_w(v) \
  (__extension__ ({ \
    graphene_simd4f_union_t __keep = { (v) }; \
    graphene_simd4f_init (__keep.f[0], __keep.f[1], __keep.f[2], 0.f); \
  }))

/* Copy of @v with the third and fourth lanes set to 0. */
# define graphene_simd4f_zero_zw(v) \
  (__extension__ ({ \
    graphene_simd4f_union_t __keep = { (v) }; \
    graphene_simd4f_init (__keep.f[0], __keep.f[1], 0.f, 0.f); \
  }))

/* Copy of @s with the fourth lane replaced by the scalar @v. */
# define graphene_simd4f_merge_w(s,v) \
  (__extension__ ({ \
    graphene_simd4f_union_t __keep = { (s) }; \
    graphene_simd4f_init (__keep.f[0], __keep.f[1], __keep.f[2], (v)); \
  }))
1296
/* Builds (a.z, a.w, b.z, b.w) from the high halves of @a and @b. */
# define graphene_simd4f_merge_high(a,b) \
  (__extension__ ({ \
    graphene_simd4f_union_t __first = { (a) }; \
    graphene_simd4f_union_t __second = { (b) }; \
    graphene_simd4f_init (__first.f[2], __first.f[3], __second.f[2], __second.f[3]); \
  }))

/* Builds (a.x, a.y, b.x, b.y) from the low halves of @a and @b. */
# define graphene_simd4f_merge_low(a,b) \
  (__extension__ ({ \
    graphene_simd4f_union_t __first = { (a) }; \
    graphene_simd4f_union_t __second = { (b) }; \
    graphene_simd4f_init (__first.f[0], __first.f[1], __second.f[0], __second.f[1]); \
  }))
1310
/* Flips the sign bit of the second and fourth lanes of @s by XOR-ing
 * them with 0x80000000; the first and third lanes are left untouched.
 *
 * The mask is declared as uint32_t, the element type vld1q_u32() loads,
 * matching the mask array used by graphene_simd4f_cross3().
 */
# define graphene_simd4f_flip_sign_0101(s) \
  (__extension__ ({ \
    const uint32_t __upnpn[4] = { \
      0x00000000u, \
      0x80000000u, \
      0x00000000u, \
      0x80000000u \
    }; \
    const uint32x4_t __pnpn = vld1q_u32 (__upnpn); \
    (graphene_simd4f_t) vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __pnpn)); \
  }))

/* Flips the sign bit of the first and third lanes of @s by XOR-ing
 * them with 0x80000000; the second and fourth lanes are left untouched.
 */
# define graphene_simd4f_flip_sign_1010(s) \
  (__extension__ ({ \
    const uint32_t __unpnp[4] = { \
      0x80000000u, \
      0x00000000u, \
      0x80000000u, \
      0x00000000u \
    }; \
    const uint32x4_t __npnp = vld1q_u32 (__unpnp); \
    (graphene_simd4f_t) vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __npnp)); \
  }))
1334
/* true only when every lane of @a compares equal to the matching lane
 * of @b, i.e. when no lane of the vceqq_f32() mask is zero.
 */
# define graphene_simd4f_cmp_eq(a,b) \
  (__extension__ ({ \
    const uint32x4_t __eq = vceqq_f32 ((a), (b)); \
    (bool) !(vgetq_lane_u32 (__eq, 0) == 0 || \
             vgetq_lane_u32 (__eq, 1) == 0 || \
             vgetq_lane_u32 (__eq, 2) == 0 || \
             vgetq_lane_u32 (__eq, 3) == 0); \
  }))

/* true as soon as one lane of @a differs from the matching lane of @b,
 * i.e. when at least one lane of the vceqq_f32() mask is zero.
 */
# define graphene_simd4f_cmp_neq(a,b) \
  (__extension__ ({ \
    const uint32x4_t __eq = vceqq_f32 ((a), (b)); \
    (bool) !(vgetq_lane_u32 (__eq, 0) != 0 && \
             vgetq_lane_u32 (__eq, 1) != 0 && \
             vgetq_lane_u32 (__eq, 2) != 0 && \
             vgetq_lane_u32 (__eq, 3) != 0); \
  }))
1352
/* Ordered comparisons: each macro is true only when the relation holds
 * for all four lanes, i.e. when no lane of the comparison mask is zero.
 */

/* @a < @b on every lane. */
# define graphene_simd4f_cmp_lt(a,b) \
  (__extension__ ({ \
    const uint32x4_t __lt = vcltq_f32 ((a), (b)); \
    (bool) !(vgetq_lane_u32 (__lt, 0) == 0 || \
             vgetq_lane_u32 (__lt, 1) == 0 || \
             vgetq_lane_u32 (__lt, 2) == 0 || \
             vgetq_lane_u32 (__lt, 3) == 0); \
  }))

/* @a <= @b on every lane. */
# define graphene_simd4f_cmp_le(a,b) \
  (__extension__ ({ \
    const uint32x4_t __le = vcleq_f32 ((a), (b)); \
    (bool) !(vgetq_lane_u32 (__le, 0) == 0 || \
             vgetq_lane_u32 (__le, 1) == 0 || \
             vgetq_lane_u32 (__le, 2) == 0 || \
             vgetq_lane_u32 (__le, 3) == 0); \
  }))

/* @a >= @b on every lane. */
# define graphene_simd4f_cmp_ge(a,b) \
  (__extension__ ({ \
    const uint32x4_t __ge = vcgeq_f32 ((a), (b)); \
    (bool) !(vgetq_lane_u32 (__ge, 0) == 0 || \
             vgetq_lane_u32 (__ge, 1) == 0 || \
             vgetq_lane_u32 (__ge, 2) == 0 || \
             vgetq_lane_u32 (__ge, 3) == 0); \
  }))

/* @a > @b on every lane. */
# define graphene_simd4f_cmp_gt(a,b) \
  (__extension__ ({ \
    const uint32x4_t __gt = vcgtq_f32 ((a), (b)); \
    (bool) !(vgetq_lane_u32 (__gt, 0) == 0 || \
             vgetq_lane_u32 (__gt, 1) == 0 || \
             vgetq_lane_u32 (__gt, 2) == 0 || \
             vgetq_lane_u32 (__gt, 3) == 0); \
  }))
1388
/* Negates every lane of @s by XOR-ing its sign bit (0x80000000).
 *
 * The mask is built with vdupq_n_u32(), which avoids loading an
 * `unsigned int` array through vld1q_u32()'s `const uint32_t *` parameter.
 */
# define graphene_simd4f_neg(s) \
  (__extension__ ({ \
    const uint32x4_t __neg_mask = vdupq_n_u32 (0x80000000u); \
    (graphene_simd4f_t) vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __neg_mask)); \
  }))
1400
1401#elif defined _MSC_VER /* Visual Studio ARM */
1402
1403# define graphene_simd4f_init(x,y,z,w) _simd4f_init(x,y,z,w)
1404static inline graphene_simd4f_t
1405_simd4f_init (float x, float y, float z, float w)
1406{
1407 const float32_t __v[4] = { (x), (y), (z), (w) };
1408 return vld1q_f32 (__v);
1409}
1410
/* Vector with all four lanes set to 0. */
# define graphene_simd4f_init_zero() (vdupq_n_f32 (0.f))

/* Loads four consecutive floats starting at @v. */
# define graphene_simd4f_init_4f(v) (vld1q_f32 (v))
1414
1415# define graphene_simd4f_init_3f(v) graphene_simd4f_init (v[0], v[1], v[2], 0.f)
1416
1417# define graphene_simd4f_init_2f(v) _simd4f_init_2f(v)
1418static inline graphene_simd4f_t
1419_simd4f_init_2f (const float *v)
1420{
1421 const float32_t *__v32 = (const float32_t *) (v);
1422 const graphene_simd2f_t __low = vld1_f32 (__v32);
1423 const float32_t __zero = 0;
1424 const graphene_simd2f_t __high = vld1_dup_f32 (&__zero);
1425 return vcombine_f32 (__low, __high);
1426}
1427
1428# define graphene_simd4f_dup_4f(s,v) vst1q_f32 ((float32_t *) (v), (s))
1429
1430# define graphene_simd4f_dup_3f(s,v) _simd4f_dup_3f(s,v)
1431static inline
1432void _simd4f_dup_3f (const graphene_simd4f_t s,
1433 float *v)
1434{
1435 float *__v = (v);
1436 vst1q_lane_f32 (__v++, (s), 0);
1437 vst1q_lane_f32 (__v++, (s), 1);
1438 vst1q_lane_f32 (__v, (s), 2);
1439}
1440
/* Stores the low half (first two lanes) of @s into the float array at @v. */
# define graphene_simd4f_dup_2f(s,v) (vst1_f32 (v, vget_low_f32 (s)))

/* Reads lane @i of @s. */
# define graphene_simd4f_get(s,i) (vgetq_lane_f32 ((s), (i)))

/* Vector with all four lanes set to the scalar @v. */
# define graphene_simd4f_splat(v) (vdupq_n_f32 ((v)))

/* splat_{x,y,z,w}: read one lane of @s and broadcast it to all lanes. */
# define graphene_simd4f_splat_x(s) (graphene_simd4f_splat (graphene_simd4f_get_x ((s))))

# define graphene_simd4f_splat_y(s) (graphene_simd4f_splat (graphene_simd4f_get_y ((s))))

# define graphene_simd4f_splat_z(s) (graphene_simd4f_splat (graphene_simd4f_get_z ((s))))

# define graphene_simd4f_splat_w(s) (graphene_simd4f_splat (graphene_simd4f_get_w ((s))))
1454
1455# define graphene_simd4f_reciprocal(s) _simd4f_reciprocal(s)
1456static inline graphene_simd4f_t
1457_simd4f_reciprocal (const graphene_simd4f_t s)
1458{
1459 graphene_simd4f_t __est = vrecpeq_f32 ((s));
1460 __est = vmulq_f32 (vrecpsq_f32 (__est, (s)), __est);
1461 return vmulq_f32 (vrecpsq_f32 (__est, (s)), __est);
1462}
1463
/* Per-lane arithmetic; division multiplies @a by the reciprocal of @b. */
# define graphene_simd4f_add(a,b) (vaddq_f32 ((a), (b)))

# define graphene_simd4f_sub(a,b) (vsubq_f32 ((a), (b)))

# define graphene_simd4f_mul(a,b) (vmulq_f32 ((a), (b)))

# define graphene_simd4f_div(a,b) (vmulq_f32 (a, graphene_simd4f_reciprocal (b)))
1471
1472static inline graphene_simd4f_t
1473_simd4f_rsqrt_iter (const graphene_simd4f_t v,
1474 const graphene_simd4f_t estimate)
1475{
1476 const graphene_simd4f_t __est1 = vmulq_f32 ((estimate), (v));
1477 return vmulq_f32 ((estimate), vrsqrtsq_f32 (__est1, (estimate)));
1478}
1479
1480# define graphene_simd4f_rsqrt(s) _simd4f_rsqrt(s)
1481static inline graphene_simd4f_t
1482_simd4f_rsqrt (const graphene_simd4f_t s)
1483{
1484 graphene_simd4f_t __estimate = vrsqrteq_f32 ((s));
1485 __estimate = _simd4f_rsqrt_iter ((s), __estimate);
1486 __estimate = _simd4f_rsqrt_iter ((s), __estimate);
1487 return _simd4f_rsqrt_iter ((s), __estimate);
1488}
1489
1490# define graphene_simd4f_sqrt(s) _simd4f_sqrt(s)
1491static inline graphene_simd4f_t
1492_simd4f_sqrt (const graphene_simd4f_t s)
1493{
1496 uint32x4_t __tmp = vreinterpretq_u32_f32 ((s)); \
1497 return vreinterpretq_f32_u32 (vandq_u32 (vtstq_u32 (__tmp, __tmp), vreinterpretq_u32_f32 (__rrsq)));
1498}
1499
1500# define graphene_simd4f_cross3(a,b) _simd4f_cross3(a,b)
1501static inline graphene_simd4f_t
1502_simd4f_cross3 (const graphene_simd4f_t a,
1503 const graphene_simd4f_t b)
1504{
1505 const uint32_t __mask_bits[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };
1506 const int32x4_t __mask = vld1q_s32 ((const int32_t *) __mask_bits);
1507 const graphene_simd4f_t __a = (a), __b = (b);
1508 const graphene_simd2f_t __a_low = vget_low_f32 (__a);
1509 const graphene_simd2f_t __b_low = vget_low_f32 (__b);
1510 const graphene_simd4f_t __a_yzx = vcombine_f32 (vext_f32 (__a_low, vget_high_f32 (__a), 1), __a_low);
1511 const graphene_simd4f_t __b_yzx = vcombine_f32 (vext_f32 (__b_low, vget_high_f32 (__b), 1), __b_low);
1513 graphene_simd4f_mul (__a_yzx, __b));
1514 graphene_simd2f_t __s3_low = vget_low_f32 (__s3);
1515 __s3 = vcombine_f32 (vext_f32 (__s3_low, vget_high_f32 (__s3), 1), __s3_low);
1516 return vandq_s32 (__s3, __mask);
1517}
1518
1519# define graphene_simd4f_dot3(a,b) graphene_simd4f_splat (graphene_simd4f_dot3_scalar (a, b))
1520
1521# define graphene_simd4f_dot3_scalar(a,b) _simd4f_dot3_scalar(a,b)
1522static inline float
1523_simd4f_dot3_scalar (const graphene_simd4f_t a,
1524 const graphene_simd4f_t b)
1525{
1526 const graphene_simd4f_t __m = graphene_simd4f_mul (a, b);
1527 const graphene_simd2f_t __s1 = vpadd_f32 (vget_low_f32 (__m), vget_low_f32 (__m));
1528 return vget_lane_f32 (vadd_f32 (__s1, vget_high_f32 (__m)), 0);
1529}
1530
/* Per-lane minimum and maximum of @a and @b. */
# define graphene_simd4f_min(a,b) (vminq_f32 ((a), (b)))

# define graphene_simd4f_max(a,b) (vmaxq_f32 (a, b))
1534
1535# define graphene_simd4f_shuffle_wxyz(v) _simd4f_shuffle_wxyz(v)
1536static inline graphene_simd4f_t
1537_simd4f_shuffle_wxyz (const graphene_simd4f_t v)
1538{
1539 graphene_simd4f_union_t __u = { (v) };
1540 return graphene_simd4f_init (__u.f[3], __u.f[0], __u.f[1], __u.f[2]);
1541}
1542
1543# define graphene_simd4f_shuffle_zwxy(v) _simd4f_shuffle_zwxy(v)
1544static inline graphene_simd4f_t
1545_simd4f_shuffle_zwxy (const graphene_simd4f_t v)
1546{
1547 graphene_simd4f_union_t __u = { (v) };
1548 return graphene_simd4f_init (__u.f[2], __u.f[3], __u.f[0], __u.f[1]);
1549}
1550
1551# define graphene_simd4f_shuffle_yzwx(v) _simd4f_shuffle_yzwx(v)
1552static inline graphene_simd4f_t
1553_simd4f_shuffle_yzwx (const graphene_simd4f_t v)
1554{
1555 graphene_simd4f_union_t __u = { (v) };
1556 return graphene_simd4f_init (__u.f[1], __u.f[2], __u.f[3], __u.f[0]);
1557}
1558
1559# define graphene_simd4f_zero_w(v) _simd4f_zero_w(v)
1560static inline graphene_simd4f_t
1561_simd4f_zero_w (const graphene_simd4f_t v)
1562{
1563 graphene_simd4f_union_t __u = { (v) };
1564 return graphene_simd4f_init (__u.f[0], __u.f[1], __u.f[2], 0.f);
1565}
1566
1567# define graphene_simd4f_zero_zw(v) _simd4f_zero_zw(v)
1568static inline graphene_simd4f_t
1569_simd4f_zero_zw (const graphene_simd4f_t v)
1570{
1571 graphene_simd4f_union_t __u = { (v) };
1572 return graphene_simd4f_init (__u.f[0], __u.f[1], 0.f, 0.f);
1573}
1574
1575# define graphene_simd4f_merge_w(s,v) _simd4f_merge_w(s,v)
1576static inline graphene_simd4f_t
1577_simd4f_merge_w (const graphene_simd4f_t s,
1578 float v)
1579{
1580 graphene_simd4f_union_t __u = { (s) };
1581 return graphene_simd4f_init (__u.f[0], __u.f[1], __u.f[2], (v));
1582}
1583
1584# define graphene_simd4f_merge_high(a,b) _simd4f_merge_high(a,b)
1585static inline graphene_simd4f_t
1586_simd4f_merge_high (const graphene_simd4f_t a,
1587 const graphene_simd4f_t b)
1588{
1589 graphene_simd4f_union_t __u_a = { (a) };
1590 graphene_simd4f_union_t __u_b = { (b) };
1591 return graphene_simd4f_init (__u_a.f[2], __u_a.f[3], __u_b.f[2], __u_b.f[3]);
1592}
1593
1594# define graphene_simd4f_merge_low(a,b) _simd4f_merge_low(a,b)
1595static inline graphene_simd4f_t
1596_simd4f_merge_low (const graphene_simd4f_t a,
1597 const graphene_simd4f_t b)
1598{
1599 graphene_simd4f_union_t __u_a = { (a) };
1600 graphene_simd4f_union_t __u_b = { (b) };
1601 return graphene_simd4f_init (__u_a.f[0], __u_a.f[1], __u_b.f[0], __u_b.f[1]);
1602}
1603
1604
1605# define graphene_simd4f_flip_sign_0101(s) _simd4f_flip_sign_0101(s)
1606static inline graphene_simd4f_t
1607_simd4f_flip_sign_0101 (const graphene_simd4f_t s)
1608{
1609 const unsigned int __upnpn[4] = {
1610 0x00000000,
1611 0x80000000,
1612 0x00000000,
1613 0x80000000
1614 };
1615 const uint32x4_t __pnpn = vld1q_u32 (__upnpn);
1616 return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __pnpn));
1617}
1618
1619# define graphene_simd4f_flip_sign_1010(s) _simd4f_flip_sign_1010(s)
1620static inline graphene_simd4f_t
1621_simd4f_flip_sign_1010 (const graphene_simd4f_t s)
1622{
1623 const unsigned int __unpnp[4] = {
1624 0x80000000,
1625 0x00000000,
1626 0x80000000,
1627 0x00000000
1628 };
1629
1630 const uint32x4_t __npnp = vld1q_u32 (__unpnp);
1631 return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __npnp));
1632}
1633
1634# define graphene_simd4f_cmp_eq(a,b) _simd4f_cmp_eq(a,b)
1635static inline bool
1636_simd4f_cmp_eq (const graphene_simd4f_t a,
1637 const graphene_simd4f_t b)
1638{
1639 const uint32x4_t __mask = vceqq_f32 ((a), (b));
1640 return (vgetq_lane_u32 (__mask, 0) != 0 &&
1641 vgetq_lane_u32 (__mask, 1) != 0 &&
1642 vgetq_lane_u32 (__mask, 2) != 0 &&
1643 vgetq_lane_u32 (__mask, 3) != 0);
1644}
1645
1646# define graphene_simd4f_cmp_neq(a,b) _simd4f_cmp_neq(a,b)
1647static inline bool
1648_simd4f_cmp_neq (const graphene_simd4f_t a,
1649 const graphene_simd4f_t b)
1650{
1651 const uint32x4_t __mask = vceqq_f32 ((a), (b));
1652 return (vgetq_lane_u32 (__mask, 0) == 0 ||
1653 vgetq_lane_u32 (__mask, 1) == 0 ||
1654 vgetq_lane_u32 (__mask, 2) == 0 ||
1655 vgetq_lane_u32 (__mask, 3) == 0);
1656}
1657
1658# define graphene_simd4f_cmp_lt(a,b) _simd4f_cmp_lt(a,b)
1659static inline bool
1660_simd4f_cmp_lt (const graphene_simd4f_t a,
1661 const graphene_simd4f_t b)
1662{
1663 const uint32x4_t __mask = vcltq_f32 ((a), (b));
1664 return (vgetq_lane_u32 (__mask, 0) != 0 &&
1665 vgetq_lane_u32 (__mask, 1) != 0 &&
1666 vgetq_lane_u32 (__mask, 2) != 0 &&
1667 vgetq_lane_u32 (__mask, 3) != 0);
1668}
1669
1670# define graphene_simd4f_cmp_le(a,b) _simd4f_cmp_le(a,b)
1671static inline bool
1672_simd4f_cmp_le (const graphene_simd4f_t a,
1673 const graphene_simd4f_t b)
1674{
1675 const uint32x4_t __mask = vcleq_f32 ((a), (b));
1676 return (vgetq_lane_u32 (__mask, 0) != 0 &&
1677 vgetq_lane_u32 (__mask, 1) != 0 &&
1678 vgetq_lane_u32 (__mask, 2) != 0 &&
1679 vgetq_lane_u32 (__mask, 3) != 0);
1680}
1681
1682# define graphene_simd4f_cmp_ge(a,b) _simd4f_cmp_ge(a,b)
1683static inline bool
1684_simd4f_cmp_ge (const graphene_simd4f_t a,
1685 const graphene_simd4f_t b)
1686{
1687 const uint32x4_t __mask = vcgeq_f32 ((a), (b));
1688 return (vgetq_lane_u32 (__mask, 0) != 0 &&
1689 vgetq_lane_u32 (__mask, 1) != 0 &&
1690 vgetq_lane_u32 (__mask, 2) != 0 &&
1691 vgetq_lane_u32 (__mask, 3) != 0);
1692}
1693
1694# define graphene_simd4f_cmp_gt(a,b) _simd4f_cmp_gt(a,b)
1695static inline bool
1696_simd4f_cmp_gt (const graphene_simd4f_t a,
1697 const graphene_simd4f_t b)
1698{
1699 const uint32x4_t __mask = vcgtq_f32 ((a), (b));
1700 return (vgetq_lane_u32 (__mask, 0) != 0 &&
1701 vgetq_lane_u32 (__mask, 1) != 0 &&
1702 vgetq_lane_u32 (__mask, 2) != 0 &&
1703 vgetq_lane_u32 (__mask, 3) != 0);
1704}
1705
1706# define graphene_simd4f_neg(s) _simd4f_neg(s)
1707static inline graphene_simd4f_t
1708_simd4f_neg (const graphene_simd4f_t s)
1709{
1710 const unsigned int __umask[4] = {
1711 0x80000000,
1712 0x80000000,
1713 0x80000000,
1714 0x80000000
1715 };
1716 const uint32x4_t __mask = vld1q_u32 (__umask);
1717 return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __mask));
1718}
1719
1720#else /* ARM NEON intrinsics-not GCC or Visual Studio */
1721
1722# error "Need GCC-compatible or Visual Studio compiler for ARM NEON extensions."
1723
1724/* Use static inline to inline all these functions */
1725
1726# endif /* !__GNUC__ && !_MSC_VER */
1727
1728/* macros that are not compiler-dependent */
/* Named lane accessors: x, y, z and w are lanes 0 to 3. */
# define graphene_simd4f_get_x(s)       (graphene_simd4f_get (s, 0))
# define graphene_simd4f_get_y(s)       (graphene_simd4f_get (s, 1))
# define graphene_simd4f_get_z(s)       (graphene_simd4f_get (s, 2))
# define graphene_simd4f_get_w(s)       (graphene_simd4f_get (s, 3))
1733
1734#elif defined(__GI_SCANNER__) || defined(GRAPHENE_USE_SCALAR)
1735
1736/* Fallback implementation using scalar types */
1737
/* Scalar fallback: every macro forwards to the function of the same
 * name (a macro is not re-expanded inside its own expansion), adding
 * parentheses and pointer casts around the arguments.
 */
#define graphene_simd4f_init(x,y,z,w)   (graphene_simd4f_init ((x), (y), (z), (w)))
#define graphene_simd4f_init_zero()     (graphene_simd4f_init_zero ())
#define graphene_simd4f_init_4f(v)      (graphene_simd4f_init_4f ((const float *) (v)))
#define graphene_simd4f_init_3f(v)      (graphene_simd4f_init_3f ((const float *) (v)))
#define graphene_simd4f_init_2f(v)      (graphene_simd4f_init_2f ((const float *) (v)))
#define graphene_simd4f_dup_4f(s,v)     (graphene_simd4f_dup_4f ((s), (float *) (v)))
#define graphene_simd4f_dup_3f(s,v)     (graphene_simd4f_dup_3f ((s), (float *) (v)))
#define graphene_simd4f_dup_2f(s,v)     (graphene_simd4f_dup_2f ((s), (float *) (v)))
#define graphene_simd4f_get(s,i)        (graphene_simd4f_get ((s), (i)))
#define graphene_simd4f_get_x(s)        (graphene_simd4f_get_x ((s)))
#define graphene_simd4f_get_y(s)        (graphene_simd4f_get_y ((s)))
#define graphene_simd4f_get_z(s)        (graphene_simd4f_get_z ((s)))
#define graphene_simd4f_get_w(s)        (graphene_simd4f_get_w ((s)))
/* Scalar fallback, splat and arithmetic wrappers: each macro forwards
 * to the function of the same name with parenthesised arguments.
 */
#define graphene_simd4f_splat(v)        (graphene_simd4f_splat ((v)))
#define graphene_simd4f_splat_x(s)      (graphene_simd4f_splat_x ((s)))
#define graphene_simd4f_splat_y(s)      (graphene_simd4f_splat_y ((s)))
#define graphene_simd4f_splat_z(s)      (graphene_simd4f_splat_z ((s)))
#define graphene_simd4f_splat_w(s)      (graphene_simd4f_splat_w ((s)))
#define graphene_simd4f_add(a,b)        (graphene_simd4f_add ((a), (b)))
#define graphene_simd4f_sub(a,b)        (graphene_simd4f_sub ((a), (b)))
#define graphene_simd4f_mul(a,b)        (graphene_simd4f_mul ((a), (b)))
#define graphene_simd4f_div(a,b)        (graphene_simd4f_div ((a), (b)))
#define graphene_simd4f_sqrt(s)         (graphene_simd4f_sqrt ((s)))
#define graphene_simd4f_rsqrt(s)        (graphene_simd4f_rsqrt ((s)))
#define graphene_simd4f_reciprocal(s)   (graphene_simd4f_reciprocal ((s)))
/* Scalar fallback, vector products, min/max, shuffles and merges: each
 * macro forwards to the function of the same name with parenthesised
 * arguments.
 */
#define graphene_simd4f_cross3(a,b)             (graphene_simd4f_cross3 ((a), (b)))
#define graphene_simd4f_dot3(a,b)               (graphene_simd4f_dot3 ((a), (b)))
#define graphene_simd4f_dot3_scalar(a,b)        (graphene_simd4f_dot3_scalar ((a), (b)))
#define graphene_simd4f_min(a,b)                (graphene_simd4f_min ((a), (b)))
#define graphene_simd4f_max(a,b)                (graphene_simd4f_max ((a), (b)))
#define graphene_simd4f_shuffle_wxyz(s)         (graphene_simd4f_shuffle_wxyz ((s)))
#define graphene_simd4f_shuffle_zwxy(s)         (graphene_simd4f_shuffle_zwxy ((s)))
#define graphene_simd4f_shuffle_yzwx(s)         (graphene_simd4f_shuffle_yzwx ((s)))
#define graphene_simd4f_flip_sign_0101(s)       (graphene_simd4f_flip_sign_0101 ((s)))
#define graphene_simd4f_flip_sign_1010(s)       (graphene_simd4f_flip_sign_1010 ((s)))
#define graphene_simd4f_zero_w(v)               (graphene_simd4f_zero_w ((v)))
#define graphene_simd4f_zero_zw(v)              (graphene_simd4f_zero_zw ((v)))
#define graphene_simd4f_merge_w(s,v)            (graphene_simd4f_merge_w ((s), (v)))
#define graphene_simd4f_merge_high(a,b)         (graphene_simd4f_merge_high ((a), (b)))
#define graphene_simd4f_merge_low(a,b)          (graphene_simd4f_merge_low ((a), (b)))
/* Scalar fallback, comparisons and negation: each macro forwards to the
 * function of the same name with parenthesised arguments.
 */
#define graphene_simd4f_cmp_eq(a,b)     (graphene_simd4f_cmp_eq ((a), (b)))
#define graphene_simd4f_cmp_neq(a,b)    (graphene_simd4f_cmp_neq ((a), (b)))
#define graphene_simd4f_cmp_lt(a,b)     (graphene_simd4f_cmp_lt ((a), (b)))
#define graphene_simd4f_cmp_le(a,b)     (graphene_simd4f_cmp_le ((a), (b)))
#define graphene_simd4f_cmp_ge(a,b)     (graphene_simd4f_cmp_ge ((a), (b)))
#define graphene_simd4f_cmp_gt(a,b)     (graphene_simd4f_cmp_gt ((a), (b)))
#define graphene_simd4f_neg(s)          (graphene_simd4f_neg ((s)))
1832
1833#else
1834# error "Unsupported simd4f implementation."
1835#endif
1836
1837/* Generic operations, inlined */
1838
1839/**
1840 * graphene_simd4f_madd:
1841 * @m1: a #graphene_simd4f_t
1842 * @m2: a #graphene_simd4f_t
1843 * @a: a #graphene_simd4f_t
1844 *
1845 * Adds @a to the product of @m1 and @m2.
1846 *
1847 * Returns: the result vector
1848 *
1849 * Since: 1.0
1850 */
1851static inline graphene_simd4f_t
1853 const graphene_simd4f_t m2,
1854 const graphene_simd4f_t a)
1855{
1856 return graphene_simd4f_add (graphene_simd4f_mul (m1, m2), a);
1857}
1858
1859/**
1860 * graphene_simd4f_sum:
1861 * @v: a #graphene_simd4f_t
1862 *
1863 * Sums all components of the given vector.
1864 *
1865 * Returns: a vector with all components set to be the
1866 * sum of the passed #graphene_simd4f_t
1867 *
1868 * Since: 1.0
1869 */
1870static inline graphene_simd4f_t
1881
1882/**
1883 * graphene_simd4f_sum_scalar:
1884 * @v: a #graphene_simd4f_t
1885 *
1886 * Sums all the components of the given vector.
1887 *
1888 * Returns: a scalar value with the sum of the components
1889 * of the given #graphene_simd4f_t
1890 *
1891 * Since: 1.0
1892 */
1893static inline float
1898
1899/**
1900 * graphene_simd4f_dot4:
1901 * @a: a #graphene_simd4f_t
1902 * @b: a #graphene_simd4f_t
1903 *
1904 * Computes the dot product of all the components of the two
1905 * given #graphene_simd4f_t.
1906 *
1907 * Returns: a vector whose components are all set to be the
1908 * dot product of the components of the two operands
1909 *
1910 * Since: 1.0
1911 */
1912static inline graphene_simd4f_t
1918
1919/**
1920 * graphene_simd4f_dot2:
1921 * @a: a #graphene_simd4f_t
1922 * @b: a #graphene_simd4f_t
1923 *
1924 * Computes the dot product of the first two components of the
1925 * two given #graphene_simd4f_t.
1926 *
1927 * Returns: a vector whose components are all set to the
1928 * dot product of the components of the two operands
1929 *
1930 * Since: 1.0
1931 */
1932static inline graphene_simd4f_t
1934 const graphene_simd4f_t b)
1935{
1939
1940 return graphene_simd4f_add (x, y);
1941}
1942
1943/**
1944 * graphene_simd4f_length4:
1945 * @v: a #graphene_simd4f_t
1946 *
1947 * Computes the length of the given #graphene_simd4f_t vector,
1948 * using all four of its components.
1949 *
1950 * Returns: the length vector
1951 *
1952 * Since: 1.0
1953 */
1954static inline graphene_simd4f_t
1959
1960/**
1961 * graphene_simd4f_length3:
1962 * @v: a #graphene_simd4f_t
1963 *
1964 * Computes the length of the given #graphene_simd4f_t vector,
1965 * using the first three of its components.
1966 *
1967 * Returns: the length vector
1968 *
1969 * Since: 1.0
1970 */
1971static inline graphene_simd4f_t
1976
1977/**
1978 * graphene_simd4f_length2:
1979 * @v: a #graphene_simd4f_t
1980 *
1981 * Computes the length of the given #graphene_simd4f_t vector,
1982 * using the first two of its components.
1983 *
1984 * Returns: the length vector
1985 *
1986 * Since: 1.0
1987 */
1988static inline graphene_simd4f_t
1993
1994/**
1995 * graphene_simd4f_normalize4:
1996 * @v: a #graphene_simd4f_t
1997 *
1998 * Computes the normalization of the given #graphene_simd4f_t vector,
1999 * using all of its components.
2000 *
2001 * Returns: the normalized vector
2002 *
2003 * Since: 1.0
2004 */
2005static inline graphene_simd4f_t
2011
2012/**
2013 * graphene_simd4f_normalize3:
2014 * @v: a #graphene_simd4f_t
2015 *
2016 * Computes the normalization of the given #graphene_simd4f_t vector,
2017 * using the first three of its components.
2018 *
2019 * Returns: the normalized vector
2020 *
2021 * Since: 1.0
2022 */
2023static inline graphene_simd4f_t
2029
2030/**
2031 * graphene_simd4f_normalize2:
2032 * @v: a #graphene_simd4f_t
2033 *
2034 * Computes the normalization of the given #graphene_simd4f_t vector,
2035 * using the first two of its components.
2036 *
2037 * Returns: the normalized vector
2038 *
2039 * Since: 1.0
2040 */
2041static inline graphene_simd4f_t
2047
2048/**
2049 * graphene_simd4f_is_zero4:
2050 * @v: a #graphene_simd4f_t
2051 *
2052 * Checks whether the given #graphene_simd4f_t has all its components
2053 * set to 0.
2054 *
2055 * Returns: `true` if all the vector components are zero
2056 *
2057 * Since: 1.0
2058 */
2059static inline bool
2065
2066/**
2067 * graphene_simd4f_is_zero3:
2068 * @v: a #graphene_simd4f_t
2069 *
2070 * Checks whether the given #graphene_simd4f_t has the first three of
2071 * its components set to 0.
2072 *
2073 * Returns: `true` if the vector's components are zero
2074 *
2075 * Since: 1.0
2076 */
2077static inline bool
2079{
2080 return fabsf (graphene_simd4f_get_x (v)) <= FLT_EPSILON &&
2081 fabsf (graphene_simd4f_get_y (v)) <= FLT_EPSILON &&
2082 fabsf (graphene_simd4f_get_z (v)) <= FLT_EPSILON;
2083}
2084
2085/**
2086 * graphene_simd4f_is_zero2:
2087 * @v: a #graphene_simd4f_t
2088 *
2089 * Checks whether the given #graphene_simd4f_t has the first two of
2090 * its components set to 0.
2091 *
2092 * Returns: `true` if the vector's components are zero
2093 *
2094 * Since: 1.0
2095 */
2096static inline bool
2098{
2099 return fabsf (graphene_simd4f_get_x (v)) <= FLT_EPSILON &&
2100 fabsf (graphene_simd4f_get_y (v)) <= FLT_EPSILON;
2101}
2102
2103/**
2104 * graphene_simd4f_interpolate:
2105 * @a: a #graphene_simd4f_t
2106 * @b: a #graphene_simd4f_t
2107 * @f: the interpolation factor
2108 *
2109 * Linearly interpolates all components of the two given
2110 * #graphene_simd4f_t vectors using the given factor @f.
2111 *
2112 * Returns: the interpolated vector
2113 *
2114 * Since: 1.0
2115 */
2116static inline graphene_simd4f_t
2118 const graphene_simd4f_t b,
2119 float f)
2120{
2123
2124 return graphene_simd4f_add (graphene_simd4f_mul (one_minus_f, a),
2126}
2127
2128/**
2129 * graphene_simd4f_clamp:
2130 * @v: a #graphene_simd4f_t
2131 * @min: the lower boundary
2132 * @max: the upper boundary
2133 *
2134 * Ensures that all components of the vector @v are within
2135 * the components of the @min and @max boundaries.
2136 *
2137 * Returns: the clamped vector
2138 *
2139 * Since: 1.2
2140 */
2141static inline graphene_simd4f_t
2143 const graphene_simd4f_t min,
2144 const graphene_simd4f_t max)
2145{
2146 const graphene_simd4f_t tmp = graphene_simd4f_max (min, v);
2147
2148 return graphene_simd4f_min (tmp, max);
2149}
2150
2151/**
2152 * graphene_simd4f_clamp_scalar:
2153 * @v: a #graphene_simd4f_t
2154 * @min: the lower boundary
2155 * @max: the upper boundary
2156 *
2157 * Ensures that all components of the vector @v are within
2158 * the @min and @max boundary scalar values.
2159 *
2160 * Returns: the clamped vector
2161 *
2162 * Since: 1.2
2163 */
2164static inline graphene_simd4f_t
2166 float min,
2167 float max)
2168{
2169 return graphene_simd4f_clamp (v,
2171 graphene_simd4f_splat (max));
2172}
2173
2174/**
2175 * graphene_simd4f_min_val:
2176 * @v: a #graphene_simd4f_t
2177 *
2178 * Computes the minimum value of all the channels in the given vector.
2179 *
2180 * Returns: a vector whose components are all set to the
2181 * minimum value in the original vector
2182 *
2183 * Since: 1.4
2184 */
2185static inline graphene_simd4f_t
2195
2196/**
2197 * graphene_simd4f_max_val:
2198 * @v: a #graphene_simd4f_t
2199 *
2200 * Computes the maximum value of all the channels in the given vector.
2201 *
2202 * Returns: a vector whose components are all set to the
2203 * maximum value in the original vector
2204 *
2205 * Since: 1.4
2206 */
2207static inline graphene_simd4f_t
2217
#define GRAPHENE_ALIGN16
#define GRAPHENE_END_DECLS
#define GRAPHENE_BEGIN_DECLS
static bool graphene_simd4f_is_zero4(const graphene_simd4f_t v)
#define graphene_simd4f_merge_low(a, b)
#define graphene_simd4f_get(s, i)
#define graphene_simd4f_get_y(s)
#define graphene_simd4f_shuffle_wxyz(s)
#define graphene_simd4f_rsqrt(s)
#define graphene_simd4f_cmp_eq(a, b)
static graphene_simd4f_t graphene_simd4f_normalize3(const graphene_simd4f_t v)
#define graphene_simd4f_dup_3f(s, v)
#define graphene_simd4f_zero_w(v)
#define graphene_simd4f_neg(s)
#define graphene_simd4f_mul(a, b)
#define graphene_simd4f_shuffle_zwxy(s)
static graphene_simd4f_t graphene_simd4f_clamp_scalar(const graphene_simd4f_t v, float min, float max)
static graphene_simd4f_t graphene_simd4f_normalize4(const graphene_simd4f_t v)
#define graphene_simd4f_merge_w(s, v)
#define graphene_simd4f_cmp_neq(a, b)
static float graphene_simd4f_sum_scalar(const graphene_simd4f_t v)
#define graphene_simd4f_cmp_ge(a, b)
static graphene_simd4f_t graphene_simd4f_dot2(const graphene_simd4f_t a, const graphene_simd4f_t b)
#define graphene_simd4f_init_zero()
#define graphene_simd4f_max(a, b)
static graphene_simd4f_t graphene_simd4f_interpolate(const graphene_simd4f_t a, const graphene_simd4f_t b, float f)
#define graphene_simd4f_sqrt(s)
#define graphene_simd4f_splat_z(s)
#define graphene_simd4f_cmp_le(a, b)
#define graphene_simd4f_init_2f(v)
#define graphene_simd4f_flip_sign_1010(s)
#define graphene_simd4f_cmp_gt(a, b)
static graphene_simd4f_t graphene_simd4f_dot4(const graphene_simd4f_t a, const graphene_simd4f_t b)
#define graphene_simd4f_dup_4f(s, v)
#define graphene_simd4f_splat_y(s)
#define graphene_simd4f_dot3_scalar(a, b)
#define graphene_simd4f_zero_zw(v)
static graphene_simd4f_t graphene_simd4f_normalize2(const graphene_simd4f_t v)
static bool graphene_simd4f_is_zero2(const graphene_simd4f_t v)
#define graphene_simd4f_merge_high(a, b)
static graphene_simd4f_t graphene_simd4f_max_val(const graphene_simd4f_t v)
static graphene_simd4f_t graphene_simd4f_clamp(const graphene_simd4f_t v, const graphene_simd4f_t min, const graphene_simd4f_t max)
#define graphene_simd4f_get_x(s)
#define graphene_simd4f_cmp_lt(a, b)
#define graphene_simd4f_init_4f(v)
#define graphene_simd4f_get_z(s)
#define graphene_simd4f_splat_x(s)
static graphene_simd4f_t graphene_simd4f_sum(const graphene_simd4f_t v)
#define graphene_simd4f_init_3f(v)
static bool graphene_simd4f_is_zero3(const graphene_simd4f_t v)
static graphene_simd4f_t graphene_simd4f_length4(const graphene_simd4f_t v)
#define graphene_simd4f_dot3(a, b)
#define graphene_simd4f_init(x, y, z, w)
#define graphene_simd4f_splat(v)
static graphene_simd4f_t graphene_simd4f_madd(const graphene_simd4f_t m1, const graphene_simd4f_t m2, const graphene_simd4f_t a)
#define graphene_simd4f_flip_sign_0101(s)
#define graphene_simd4f_get_w(s)
#define graphene_simd4f_min(a, b)
static graphene_simd4f_t graphene_simd4f_length2(const graphene_simd4f_t v)
#define graphene_simd4f_reciprocal(s)
#define graphene_simd4f_div(a, b)
#define graphene_simd4f_shuffle_yzwx(s)
#define graphene_simd4f_sub(a, b)
static graphene_simd4f_t graphene_simd4f_length3(const graphene_simd4f_t v)
#define graphene_simd4f_splat_w(s)
static graphene_simd4f_t graphene_simd4f_min_val(const graphene_simd4f_t v)
#define graphene_simd4f_dup_2f(s, v)
#define graphene_simd4f_cross3(a, b)
#define graphene_simd4f_add(a, b)
#define GRAPHENE_AVAILABLE_IN_1_4
#define GRAPHENE_AVAILABLE_IN_1_2
#define GRAPHENE_AVAILABLE_IN_1_0
CURL_EXTERN CURLMcode curl_socket_t s
Definition multi.h:318