28#if !defined(GRAPHENE_H_INSIDE) && !defined(GRAPHENE_COMPILATION)
29#error "Only graphene.h can be included directly."
177#if !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_SSE)
185} graphene_simd4f_union_t;
188# if defined(__GNUC__)
/* Builds a vector from four scalars, in lane order x, y, z, w. */
192# define graphene_simd4f_init(x,y,z,w) \
194 (graphene_simd4f_t) { (x), (y), (z), (w) }; \
/* Yields a vector with all lanes zero, using _mm_setzero_ps (). */
197# define graphene_simd4f_init_zero() \
199 (graphene_simd4f_t) _mm_setzero_ps(); \
/* Loads four floats from v with _mm_loadu_ps (), so v need not be 16-byte aligned. */
202# define graphene_simd4f_init_4f(v) \
204 (graphene_simd4f_t) _mm_loadu_ps (v); \
/* Builds a vector from v[0], v[1], v[2]; the w lane is set to 0. */
207# define graphene_simd4f_init_3f(v) \
209 (graphene_simd4f_t) { (v)[0], (v)[1], (v)[2], 0.f }; \
/* Builds a vector from v[0], v[1]; the z and w lanes are set to 0. */
212# define graphene_simd4f_init_2f(v) \
214 (graphene_simd4f_t) { (v)[0], (v)[1], 0.f, 0.f }; \
/* Stores all four lanes of s into the float array v (unaligned store). */
217# define graphene_simd4f_dup_4f(s,v) \
219 _mm_storeu_ps ((v), (s)); \
/* Copies the first three floats of s into v; s must be an lvalue since its address is taken. */
222# define graphene_simd4f_dup_3f(s,v) \
224 memcpy ((v), &(s), sizeof (float) * 3); \
/* Copies the first two floats of s into v; s must be an lvalue since its address is taken. */
227# define graphene_simd4f_dup_2f(s,v) \
229 memcpy ((v), &(s), sizeof (float) * 2); \
/* Reads lane i of s by copying s into a graphene_simd4f_union_t and indexing its float view. */
232# define graphene_simd4f_get(s,i) \
234 graphene_simd4f_union_t __u = { (s) }; \
235 (float) __u.f[(i)]; \
/* Per-lane accessors: x, y, z, w map to lane indices 0, 1, 2, 3. */
238# define graphene_simd4f_get_x(s) graphene_simd4f_get (s, 0)
239# define graphene_simd4f_get_y(s) graphene_simd4f_get (s, 1)
240# define graphene_simd4f_get_z(s) graphene_simd4f_get (s, 2)
241# define graphene_simd4f_get_w(s) graphene_simd4f_get (s, 3)
/* Broadcasts the scalar v to all four lanes. */
243# define graphene_simd4f_splat(v) \
245 (graphene_simd4f_t) _mm_set1_ps ((v)); \
/* Broadcasts lane 0 (x) of the vector v to all four lanes. */
248# define graphene_simd4f_splat_x(v) \
250 (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (0, 0, 0, 0)); \
/* Broadcasts lane 1 (y) of the vector v to all four lanes. */
253# define graphene_simd4f_splat_y(v) \
255 (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (1, 1, 1, 1)); \
/* Broadcasts lane 2 (z) of the vector v to all four lanes. */
258# define graphene_simd4f_splat_z(v) \
260 (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (2, 2, 2, 2)); \
/* Broadcasts lane 3 (w) of the vector v to all four lanes. */
263# define graphene_simd4f_splat_w(v) \
265 (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (3, 3, 3, 3)); \
/* Lane-wise a + b. */
268# define graphene_simd4f_add(a,b) \
270 (graphene_simd4f_t) _mm_add_ps ((a), (b)); \
/* Lane-wise a - b. */
273# define graphene_simd4f_sub(a,b) \
275 (graphene_simd4f_t) _mm_sub_ps ((a), (b)); \
/* Lane-wise a * b. */
278# define graphene_simd4f_mul(a,b) \
280 (graphene_simd4f_t) _mm_mul_ps ((a), (b)); \
/* Lane-wise a / b. */
283# define graphene_simd4f_div(a,b) \
285 (graphene_simd4f_t) _mm_div_ps ((a), (b)); \
/* Lane-wise square root of v. */
288# define graphene_simd4f_sqrt(v) \
290 (graphene_simd4f_t) _mm_sqrt_ps ((v)); \
/* Lane-wise reciprocal of v: starts from the _mm_rcp_ps () estimate __s and
 * refines it as __s * (2 - __m), where __m is v multiplied by an operand that
 * is masked to zero in lanes where v == 0.
 * NOTE(review): the masked operand is presumably __s (one Newton-Raphson
 * step) -- confirm against the complete macro. */
293# define graphene_simd4f_reciprocal(v) \
295 const graphene_simd4f_t __zero = graphene_simd4f_init (0.0f, 0.0f, 0.0f, 0.0f); \
296 const graphene_simd4f_t __two = graphene_simd4f_init (2.0f, 2.0f, 2.0f, 2.0f); \
297 const graphene_simd4f_t __s = _mm_rcp_ps ((v)); \
298 const graphene_simd4f_t __m = graphene_simd4f_mul ((v), \
299 _mm_andnot_ps (_mm_cmpeq_ps ((v), __zero), \
301 graphene_simd4f_mul (__s, graphene_simd4f_sub (__two, __m)); \
/* Lane-wise reciprocal square root of v: takes the _mm_rsqrt_ps () estimate
 * __s and refines it once as (__s * 0.5) * (3 - __s * (v * __s)). */
304# define graphene_simd4f_rsqrt(v) \
306 const graphene_simd4f_t __half = graphene_simd4f_init (0.5f, 0.5f, 0.5f, 0.5f); \
307 const graphene_simd4f_t __three = graphene_simd4f_init (3.0f, 3.0f, 3.0f, 3.0f); \
308 graphene_simd4f_t __s = _mm_rsqrt_ps ((v)); \
309 graphene_simd4f_mul (graphene_simd4f_mul (__s, __half), \
310 graphene_simd4f_sub (__three, \
311 graphene_simd4f_mul (__s, graphene_simd4f_mul ((v), __s)))); \
/* Cross product of the x, y, z lanes of a and b, computed as
 * a.yzx * b.zxy - a.zxy * b.yzx; both shuffles leave the w lane in place. */
314# define graphene_simd4f_cross3(a,b) \
316 const graphene_simd4f_t __a_yzx = _mm_shuffle_ps ((a), (a), _MM_SHUFFLE (3, 0, 2, 1)); \
317 const graphene_simd4f_t __a_zxy = _mm_shuffle_ps ((a), (a), _MM_SHUFFLE (3, 1, 0, 2)); \
318 const graphene_simd4f_t __b_yzx = _mm_shuffle_ps ((b), (b), _MM_SHUFFLE (3, 0, 2, 1)); \
319 const graphene_simd4f_t __b_zxy = _mm_shuffle_ps ((b), (b), _MM_SHUFFLE (3, 1, 0, 2)); \
320 (graphene_simd4f_t) _mm_sub_ps (_mm_mul_ps (__a_yzx, __b_zxy), _mm_mul_ps (__a_zxy, __b_yzx)); \
/* Three-component dot product, broadcast to every lane. With SSE4.1 a single
 * _mm_dp_ps () is used: mask 0x7f multiplies lanes 0-2 and writes the sum to
 * all four output lanes. */
323# if defined(GRAPHENE_USE_SSE4_1)
324# define graphene_simd4f_dot3(a,b) \
326 (graphene_simd4f_t) _mm_dp_ps ((a), (b), 0x7f); \
/* Variant without _mm_dp_ps (): multiply, clear the w lane with a bit mask,
 * add the lanes horizontally into lane 0, then broadcast lane 0. */
329# define graphene_simd4f_dot3(a,b) \
331 const unsigned int __mask_bits[] GRAPHENE_ALIGN16 = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; \
332 const graphene_simd4f_t __mask = _mm_load_ps ((const float *) __mask_bits); \
333 const graphene_simd4f_t __m = _mm_mul_ps ((a), (b)); \
334 const graphene_simd4f_t __s0 = _mm_and_ps (__m, __mask); \
335 const graphene_simd4f_t __s1 = _mm_add_ps (__s0, _mm_movehl_ps (__s0, __s0)); \
336 const graphene_simd4f_t __s2 = _mm_add_ss (__s1, _mm_shuffle_ps (__s1, __s1, 1)); \
337 (graphene_simd4f_t) _mm_shuffle_ps (__s2, __s2, 0); \
/* Scalar form of dot3: stores the low lane of graphene_simd4f_dot3 (a, b) into __res. */
341# define graphene_simd4f_dot3_scalar(a,b) \
344 _mm_store_ss (&__res, graphene_simd4f_dot3 (a, b)); \
/* Lane-wise minimum of a and b. */
348# define graphene_simd4f_min(a,b) \
350 (graphene_simd4f_t) _mm_min_ps ((a), (b)); \
/* Lane-wise maximum of a and b. */
353# define graphene_simd4f_max(a,b) \
355 (graphene_simd4f_t) _mm_max_ps ((a), (b)); \
/* Rotates the lanes of v so the result is (w, x, y, z). */
358# define graphene_simd4f_shuffle_wxyz(v) \
360 (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (2, 1, 0, 3)); \
/* Swaps the low and high halves of v so the result is (z, w, x, y). */
363# define graphene_simd4f_shuffle_zwxy(v) \
365 (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (1, 0, 3, 2)); \
/* Rotates the lanes of v so the result is (y, z, w, x). */
368# define graphene_simd4f_shuffle_yzwx(v) \
370 (graphene_simd4f_t) _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (0, 3, 2, 1)); \
/* Returns (v.x, v.y, v.z, 0): unpackhi interleaves (z, 0, w, 0), then movelh
 * appends its low half (z, 0) to the low half of v. */
373# define graphene_simd4f_zero_w(v) \
375 graphene_simd4f_t __s = _mm_unpackhi_ps ((v), _mm_setzero_ps ()); \
376 (graphene_simd4f_t) _mm_movelh_ps ((v), __s); \
/* Returns (v.x, v.y, 0, 0). */
379# define graphene_simd4f_zero_zw(v) \
381 (graphene_simd4f_t) _mm_movelh_ps ((v), _mm_setzero_ps ()); \
/* Returns (s.x, s.y, s.z, v): replaces the w lane of vector s with scalar v. */
384# define graphene_simd4f_merge_w(s,v) \
386 graphene_simd4f_t __s = _mm_unpackhi_ps ((s), _mm_set1_ps ((v))); \
387 (graphene_simd4f_t) _mm_movelh_ps ((s), __s); \
/* Returns (a.z, a.w, b.z, b.w); note the swapped operand order required by _mm_movehl_ps (). */
390# define graphene_simd4f_merge_high(a,b) \
392 (graphene_simd4f_t) _mm_movehl_ps ((b), (a)); \
/* Returns (a.x, a.y, b.x, b.y). */
395# define graphene_simd4f_merge_low(a,b) \
397 (graphene_simd4f_t) _mm_movelh_ps ((a), (b)); \
403} graphene_simd4f_uif_t;
/* XORs v with the bit pattern stored in the constant __pnpn.
 * NOTE(review): the pattern is presumably the sign bit in lanes 1 and 3 --
 * confirm against the __pnpn initializer. */
405# define graphene_simd4f_flip_sign_0101(v) \
407 const graphene_simd4f_uif_t __pnpn = { { \
413 (graphene_simd4f_t) _mm_xor_ps ((v), _mm_load_ps (__pnpn.f)); \
/* XORs v with the bit pattern stored in the constant __npnp.
 * NOTE(review): the pattern is presumably the sign bit in lanes 0 and 2 --
 * confirm against the __npnp initializer. */
416# define graphene_simd4f_flip_sign_1010(v) \
418 const graphene_simd4f_uif_t __npnp = { { \
424 (graphene_simd4f_t) _mm_xor_ps ((v), _mm_load_ps (__npnp.f)); \
/* True when no lane of a differs from b: the "not equal" mask must be all zero. */
427# define graphene_simd4f_cmp_eq(a,b) \
429 __m128i __res = (__m128i) _mm_cmpneq_ps ((a), (b)); \
430 (bool) (_mm_movemask_epi8 (__res) == 0); \
/* True when at least one lane of a differs from b. */
433# define graphene_simd4f_cmp_neq(a,b) \
435 __m128i __res = (__m128i) _mm_cmpneq_ps ((a), (b)); \
436 (bool) (_mm_movemask_epi8 (__res) != 0); \
/* True only when a < b holds in all four lanes (all 16 byte-mask bits set). */
439# define graphene_simd4f_cmp_lt(a,b) \
441 __m128i __res = (__m128i) _mm_cmplt_ps ((a), (b)); \
442 (bool) (_mm_movemask_epi8 (__res) == 0xffff); \
/* True only when a <= b holds in all four lanes. */
445# define graphene_simd4f_cmp_le(a,b) \
447 __m128i __res = (__m128i) _mm_cmple_ps ((a), (b)); \
448 (bool) (_mm_movemask_epi8 (__res) == 0xffff); \
/* True only when a >= b holds in all four lanes. */
451# define graphene_simd4f_cmp_ge(a,b) \
453 __m128i __res = (__m128i) _mm_cmpge_ps ((a), (b)); \
454 (bool) (_mm_movemask_epi8 (__res) == 0xffff); \
/* True only when a > b holds in all four lanes. */
457# define graphene_simd4f_cmp_gt(a,b) \
459 __m128i __res = (__m128i) _mm_cmpgt_ps ((a), (b)); \
460 (bool) (_mm_movemask_epi8 (__res) == 0xffff); \
/* XORs s with the bit pattern stored in the constant __mask.
 * NOTE(review): the pattern is presumably the sign bit of every lane
 * (negation) -- confirm against the __mask initializer. */
463# define graphene_simd4f_neg(s) \
465 const graphene_simd4f_uif_t __mask = { { \
471 (graphene_simd4f_t) _mm_xor_ps ((s), _mm_load_ps (__mask.f)); \
475# elif defined (_MSC_VER)
/* Forwards to the helper function _simd4f_init () instead of using a compound literal. */
479#define graphene_simd4f_init(x,y,z,w) _simd4f_init(x,y,z,w)
482_simd4f_init (
float x,
float y,
float z,
float w)
488#define graphene_simd4f_init_zero() \
491#define graphene_simd4f_init_4f(v) \
/* Builds a vector from v[0], v[1] and v[2]; the w lane is set to 0.
 * The parameter is parenthesized so that an argument such as `p + 1` is
 * indexed as `(p + 1)[0]` rather than being mis-parsed as `p + 1[0]`. */
#define graphene_simd4f_init_3f(v) \
  graphene_simd4f_init ((v)[0], (v)[1], (v)[2], 0.f)
/* Builds a vector from v[0] and v[1]; the z and w lanes are set to 0.
 * The parameter is parenthesized so that an argument such as `p + 1` is
 * indexed as `(p + 1)[0]` rather than being mis-parsed as `p + 1[0]`. */
#define graphene_simd4f_init_2f(v) \
  graphene_simd4f_init ((v)[0], (v)[1], 0.f, 0.f)
500#define graphene_simd4f_dup_4f(s,v) \
/* Copies the first three floats of s into v; s must be an lvalue since its address is taken. */
503#define graphene_simd4f_dup_3f(s,v) \
504 memcpy (v, &s, sizeof (float) * 3)
/* Copies the first two floats of s into v; s must be an lvalue since its address is taken. */
506#define graphene_simd4f_dup_2f(s,v) \
507 memcpy (v, &s, sizeof (float) * 2)
/* Lane accessors: all forward to the helper _simd4f_get_xyzw () with the lane
 * index (x, y, z, w map to 0, 1, 2, 3). */
509#define graphene_simd4f_get(s,i) _simd4f_get_xyzw(s, i)
510#define graphene_simd4f_get_x(s) _simd4f_get_xyzw(s, 0)
511#define graphene_simd4f_get_y(s) _simd4f_get_xyzw(s, 1)
512#define graphene_simd4f_get_z(s) _simd4f_get_xyzw(s, 2)
513#define graphene_simd4f_get_w(s) _simd4f_get_xyzw(s, 3)
523 graphene_simd4f_union_t u;
528#define graphene_simd4f_splat(v) \
/* Broadcasts lane 0 (x) of the vector v to all four lanes. */
531#define graphene_simd4f_splat_x(v) \
532 _mm_shuffle_ps (v, v, _MM_SHUFFLE (0, 0, 0, 0))
/* Broadcasts lane 1 (y) of the vector v to all four lanes. */
534#define graphene_simd4f_splat_y(v) \
535 _mm_shuffle_ps (v, v, _MM_SHUFFLE (1, 1, 1, 1))
/* Broadcasts lane 2 (z) of the vector v to all four lanes. */
537#define graphene_simd4f_splat_z(v) \
538 _mm_shuffle_ps (v, v, _MM_SHUFFLE (2, 2, 2, 2))
/* Broadcasts lane 3 (w) of the vector v to all four lanes. */
540#define graphene_simd4f_splat_w(v) \
541 _mm_shuffle_ps (v, v, _MM_SHUFFLE (3, 3, 3, 3))
543#define graphene_simd4f_add(a,b) \
546#define graphene_simd4f_sub(a,b) \
549#define graphene_simd4f_mul(a,b) \
552#define graphene_simd4f_div(a,b) \
555#define graphene_simd4f_sqrt(v) \
/* Forwards to the helper function _simd4f_reciprocal (). */
558#define graphene_simd4f_reciprocal(v) _simd4f_reciprocal(v)
567 _mm_andnot_ps (_mm_cmpeq_ps (v, __zero),
/* Forwards to the helper function _simd4f_rsqrt (). */
572#define graphene_simd4f_rsqrt(v) _simd4f_rsqrt(v)
/* Cross product of the x, y, z lanes of a and b; the helper's final statement
 * returns a.yzx * b.zxy - a.zxy * b.yzx. */
585#define graphene_simd4f_cross3(a,b) \
594 const graphene_simd4f_t __b_yzx = _mm_shuffle_ps (b, b, _MM_SHUFFLE (3, 0, 2, 1));
595 const graphene_simd4f_t __b_zxy = _mm_shuffle_ps (b, b, _MM_SHUFFLE (3, 1, 0, 2));
597 return _mm_sub_ps (_mm_mul_ps (__a_yzx, __b_zxy), _mm_mul_ps (__a_zxy, __b_yzx));
/* Three-component dot product broadcast to every lane. With SSE4.1 it is a
 * single _mm_dp_ps () with mask 0x7f (multiply lanes 0-2, write all lanes);
 * the other path builds a mask that clears w, sums into lane 0 and
 * broadcasts lane 0. */
600#define graphene_simd4f_dot3(a,b) \
607#if defined(GRAPHENE_USE_SSE4_1)
608 return _mm_dp_ps (
a, b, 0x7f);
610 GRAPHENE_ALIGN16 const unsigned int __mask_bits[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };
615 const graphene_simd4f_t __s2 = _mm_add_ss (__s1, _mm_shuffle_ps (__s1, __s1, 1));
617 return _mm_shuffle_ps (__s2, __s2, 0);
/* Forwards to the helper function _simd4f_dot3_scalar (). */
621#define graphene_simd4f_dot3_scalar(a,b) \
622 _simd4f_dot3_scalar(a,b)
633#define graphene_simd4f_min(a,b) \
636#define graphene_simd4f_max(a,b) \
/* Rotates the lanes of v so the result is (w, x, y, z). */
640#define graphene_simd4f_shuffle_wxyz(v) \
641 _mm_shuffle_ps (v, v, _MM_SHUFFLE (2, 1, 0, 3))
/* Swaps the low and high halves of v so the result is (z, w, x, y). */
643#define graphene_simd4f_shuffle_zwxy(v) \
644 _mm_shuffle_ps (v, v, _MM_SHUFFLE (1, 0, 3, 2))
/* Rotates the lanes of v so the result is (y, z, w, x). */
646#define graphene_simd4f_shuffle_yzwx(v) \
647 _mm_shuffle_ps (v, v, _MM_SHUFFLE (0, 3, 2, 1))
/* Returns (v.x, v.y, v.z, 0). The argument v is expanded twice. */
649#define graphene_simd4f_zero_w(v) \
650 _mm_movelh_ps (v, _mm_unpackhi_ps (v, _mm_setzero_ps ()))
/* Returns (v.x, v.y, 0, 0). */
652#define graphene_simd4f_zero_zw(v) \
653 _mm_movelh_ps (v, _mm_setzero_ps ())
/* Returns (s.x, s.y, s.z, v): replaces the w lane of vector s with scalar v.
 * The argument s is expanded twice. */
655#define graphene_simd4f_merge_w(s,v) \
656 _mm_movelh_ps (s, _mm_unpackhi_ps (s, _mm_set1_ps (v)))
658#define graphene_simd4f_merge_high(a,b) \
661#define graphene_simd4f_merge_low(a,b) \
667} graphene_simd4f_uif_t;
/* Forwards to _simd4f_flip_sign_0101 (), which XORs v with the constant __pnpn.
 * NOTE(review): presumably the sign bits of lanes 1 and 3 -- confirm against
 * the __pnpn initializer. */
669#define graphene_simd4f_flip_sign_0101(v) _simd4f_flip_sign_0101(v)
674 const graphene_simd4f_uif_t __pnpn = { {
681 return _mm_xor_ps (v, _mm_load_ps (__pnpn.f));
/* Forwards to _simd4f_flip_sign_1010 (), which XORs v with the constant __npnp.
 * NOTE(review): presumably the sign bits of lanes 0 and 2 -- confirm against
 * the __npnp initializer. */
684#define graphene_simd4f_flip_sign_1010(v) _simd4f_flip_sign_1010(v)
689 const graphene_simd4f_uif_t __npnp = { {
696 return _mm_xor_ps (v, _mm_load_ps (__npnp.f));
/* True when no lane of a differs from b: the "not equal" mask must be all zero. */
699#define graphene_simd4f_cmp_eq(a,b) _simd4f_cmp_eq(a,b)
705 __m128i __res = _mm_castps_si128 (_mm_cmpneq_ps (
a, b));
706 return (_mm_movemask_epi8 (__res) == 0);
/* True when at least one lane of a differs from b. */
709#define graphene_simd4f_cmp_neq(a,b) _simd4f_cmp_neq(a,b)
715 __m128i __res = _mm_castps_si128 (_mm_cmpneq_ps (
a, b));
716 return (_mm_movemask_epi8 (__res) != 0);
/* True only when a < b holds in all four lanes (all 16 byte-mask bits set). */
719#define graphene_simd4f_cmp_lt(a,b) _simd4f_cmp_lt(a,b)
725 __m128i __res = _mm_castps_si128 (_mm_cmplt_ps (
a, b));
726 return (_mm_movemask_epi8 (__res) == 0xffff);
/* True only when a <= b holds in all four lanes. */
729#define graphene_simd4f_cmp_le(a,b) _simd4f_cmp_le(a,b)
735 __m128i __res = _mm_castps_si128 (_mm_cmple_ps (
a, b));
736 return (_mm_movemask_epi8 (__res) == 0xffff);
/* True only when a >= b holds in all four lanes. */
739#define graphene_simd4f_cmp_ge(a,b) _simd4f_cmp_ge(a,b)
745 __m128i __res = _mm_castps_si128 (_mm_cmpge_ps (
a, b));
746 return (_mm_movemask_epi8 (__res) == 0xffff);
/* True only when a > b holds in all four lanes. */
749#define graphene_simd4f_cmp_gt(a,b) _simd4f_cmp_gt(a,b)
755 __m128i __res = _mm_castps_si128 (_mm_cmpgt_ps (
a, b));
756 return (_mm_movemask_epi8 (__res) == 0xffff);
/* Forwards to _simd4f_neg (), which XORs s with the constant __mask.
 * NOTE(review): presumably the sign bit of every lane (negation) -- confirm
 * against the __mask initializer. */
759#define graphene_simd4f_neg(s) _simd4f_neg(s)
764 const graphene_simd4f_uif_t __mask = { {
771 return _mm_xor_ps (
s, _mm_load_ps (__mask.f));
776# error "Need GCC-compatible or Visual Studio compiler for SSE extensions."
782#elif !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_GCC)
/* 16-byte vector of int (GCC vector extension); used below as the shuffle-mask
 * type and as the result type of lane-wise comparisons. */
786typedef int graphene_simd4i_t __attribute__((vector_size (16)));
/* Builds a vector from four scalars, in lane order x, y, z, w. */
788# define graphene_simd4f_init(x,y,z,w) \
790 (graphene_simd4f_t) { (x), (y), (z), (w) }; \
/* Yields a vector with all lanes set to 0.f. */
793# define graphene_simd4f_init_zero() \
795 (graphene_simd4f_t) { 0.f, 0.f, 0.f, 0.f }; \
/* Builds a vector from v[0] .. v[3]. */
798# define graphene_simd4f_init_4f(v) \
800 (graphene_simd4f_t) { (v)[0], (v)[1], (v)[2], (v)[3] }; \
/* Builds a vector from v[0], v[1], v[2]; the w lane is set to 0. */
803# define graphene_simd4f_init_3f(v) \
805 (graphene_simd4f_t) { (v)[0], (v)[1], (v)[2], 0.f }; \
/* Builds a vector from v[0], v[1]; the z and w lanes are set to 0. */
808# define graphene_simd4f_init_2f(v) \
810 (graphene_simd4f_t) { (v)[0], (v)[1], 0.f, 0.f }; \
/* Copies all four floats of s into v; s must be an lvalue since its address is taken. */
813# define graphene_simd4f_dup_4f(s,v) \
815 memcpy ((v), &(s), sizeof (float) * 4); \
/* Copies the first three floats of s into v. */
818# define graphene_simd4f_dup_3f(s,v) \
820 memcpy ((v), &(s), sizeof (float) * 3); \
/* Copies the first two floats of s into v. */
823# define graphene_simd4f_dup_2f(s,v) \
825 memcpy ((v), &(s), sizeof (float) * 2); \
/* Reads lane i of s by subscripting the vector directly. */
828# define graphene_simd4f_get(s,i) (__extension__ ({ (float) (s)[(i)]; }))
/* Per-lane accessors: x, y, z, w map to lane indices 0, 1, 2, 3. */
829# define graphene_simd4f_get_x(s) graphene_simd4f_get ((s), 0)
830# define graphene_simd4f_get_y(s) graphene_simd4f_get ((s), 1)
831# define graphene_simd4f_get_z(s) graphene_simd4f_get ((s), 2)
832# define graphene_simd4f_get_w(s) graphene_simd4f_get ((s), 3)
/* Broadcasts the scalar v to all four lanes; v is expanded four times. */
834# define graphene_simd4f_splat(v) \
836 (graphene_simd4f_t) { (v), (v), (v), (v) }; \
/* Broadcasts lane x of the vector v to all four lanes. */
839# define graphene_simd4f_splat_x(v) \
841 float __val = graphene_simd4f_get_x ((v)); \
842 (graphene_simd4f_t) { __val, __val, __val, __val }; \
/* Broadcasts lane y of the vector v to all four lanes. */
845# define graphene_simd4f_splat_y(v) \
847 float __val = graphene_simd4f_get_y ((v)); \
848 (graphene_simd4f_t) { __val, __val, __val, __val }; \
/* Broadcasts lane z of the vector v to all four lanes. */
851# define graphene_simd4f_splat_z(v) \
853 float __val = graphene_simd4f_get_z ((v)); \
854 (graphene_simd4f_t) { __val, __val, __val, __val }; \
/* Broadcasts lane w of the vector v to all four lanes. */
857# define graphene_simd4f_splat_w(v) \
859 float __val = graphene_simd4f_get_w ((v)); \
860 (graphene_simd4f_t) { __val, __val, __val, __val }; \
/* Lane-wise reciprocal: 1 / v[i] when |v[i]| > FLT_EPSILON, otherwise an
 * infinity carrying the sign of v[i]. */
863# define graphene_simd4f_reciprocal(v) \
865 (graphene_simd4f_t) { \
866 fabsf ((v)[0]) > FLT_EPSILON ? 1.f / (v)[0] : copysignf (INFINITY, (v)[0]), \
867 fabsf ((v)[1]) > FLT_EPSILON ? 1.f / (v)[1] : copysignf (INFINITY, (v)[1]), \
868 fabsf ((v)[2]) > FLT_EPSILON ? 1.f / (v)[2] : copysignf (INFINITY, (v)[2]), \
869 fabsf ((v)[3]) > FLT_EPSILON ? 1.f / (v)[3] : copysignf (INFINITY, (v)[3]), \
873# define graphene_simd4f_sqrt(v) \
875 (graphene_simd4f_t) { \
/* Lane-wise reciprocal square root: 1 / sqrtf (v[i]) for non-zero lanes and
 * 0 for lanes equal to zero. The pragmas silence -Wfloat-equal around the
 * exact comparisons against 0.f. */
883# define graphene_simd4f_rsqrt(v) \
885 _Pragma ("GCC diagnostic push") \
886 _Pragma ("GCC diagnostic ignored \"-Wfloat-equal\"") \
887 const graphene_simd4f_t __val = (graphene_simd4f_t) { \
888 (v)[0] != 0.f ? 1.f / sqrtf ((v)[0]) : 0.f, \
889 (v)[1] != 0.f ? 1.f / sqrtf ((v)[1]) : 0.f, \
890 (v)[2] != 0.f ? 1.f / sqrtf ((v)[2]) : 0.f, \
891 (v)[3] != 0.f ? 1.f / sqrtf ((v)[3]) : 0.f, \
893 _Pragma ("GCC diagnostic pop") \
/* Lane-wise arithmetic using the compiler's built-in vector operators. */
897# define graphene_simd4f_add(a,b) (__extension__ ({ (graphene_simd4f_t) ((a) + (b)); }))
898# define graphene_simd4f_sub(a,b) (__extension__ ({ (graphene_simd4f_t) ((a) - (b)); }))
899# define graphene_simd4f_mul(a,b) (__extension__ ({ (graphene_simd4f_t) ((a) * (b)); }))
900# define graphene_simd4f_div(a,b) (__extension__ ({ (graphene_simd4f_t) ((a) / (b)); }))
/* Cross product of the x, y, z lanes of a and b, written out per component.
 * Each argument is copied to a local first, so it is evaluated once. */
902# define graphene_simd4f_cross3(a,b) \
904 const graphene_simd4f_t __cross_a = (a); \
905 const graphene_simd4f_t __cross_b = (b); \
906 graphene_simd4f_init (__cross_a[1] * __cross_b[2] - __cross_a[2] * __cross_b[1], \
907 __cross_a[2] * __cross_b[0] - __cross_a[0] * __cross_b[2], \
908 __cross_a[0] * __cross_b[1] - __cross_a[1] * __cross_b[0], \
/* Three-component dot product: sums the products of lanes 0-2 and broadcasts
 * the result to all four lanes. */
912# define graphene_simd4f_dot3(a,b) \
914 const graphene_simd4f_t __dot_a = (a); \
915 const graphene_simd4f_t __dot_b = (b); \
916 const float __res = __dot_a[0] * __dot_b[0] + __dot_a[1] * __dot_b[1] + __dot_a[2] * __dot_b[2]; \
917 graphene_simd4f_init (__res, __res, __res, __res); \
/* Scalar form of dot3: the x lane of graphene_simd4f_dot3 (a, b). */
920# define graphene_simd4f_dot3_scalar(a,b) \
922 graphene_simd4f_get_x (graphene_simd4f_dot3 (a, b)); \
/* Lane-wise minimum; when a lane of a is not less than b, the lane of b is chosen. */
925# define graphene_simd4f_min(a,b) \
927 const graphene_simd4f_t __a = (a); \
928 const graphene_simd4f_t __b = (b); \
929 graphene_simd4f_init (__a[0] < __b[0] ? __a[0] : __b[0], \
930 __a[1] < __b[1] ? __a[1] : __b[1], \
931 __a[2] < __b[2] ? __a[2] : __b[2], \
932 __a[3] < __b[3] ? __a[3] : __b[3]); \
/* Lane-wise maximum; when a lane of a is not greater than b, the lane of b is chosen. */
935# define graphene_simd4f_max(a,b) \
937 const graphene_simd4f_t __a = (a); \
938 const graphene_simd4f_t __b = (b); \
939 graphene_simd4f_init (__a[0] > __b[0] ? __a[0] : __b[0], \
940 __a[1] > __b[1] ? __a[1] : __b[1], \
941 __a[2] > __b[2] ? __a[2] : __b[2], \
942 __a[3] > __b[3] ? __a[3] : __b[3]); \
/* Rotates the lanes of v so the result is (w, x, y, z). */
945# define graphene_simd4f_shuffle_wxyz(v) \
947 const graphene_simd4i_t __mask = { 3, 0, 1, 2 }; \
948 (graphene_simd4f_t) __builtin_shuffle ((v), __mask); \
/* Swaps the low and high halves of v so the result is (z, w, x, y). */
951# define graphene_simd4f_shuffle_zwxy(v) \
953 const graphene_simd4i_t __mask = { 2, 3, 0, 1 }; \
954 (graphene_simd4f_t) __builtin_shuffle ((v), __mask); \
/* Rotates the lanes of v so the result is (y, z, w, x). */
957# define graphene_simd4f_shuffle_yzwx(v) \
959 const graphene_simd4i_t __mask = { 1, 2, 3, 0 }; \
960 (graphene_simd4f_t) __builtin_shuffle ((v), __mask); \
/* Two-input shuffles: mask indices 0-3 pick lanes of the first vector and
 * 4-7 pick lanes of the second.
 * zero_w returns (v.x, v.y, v.z, 0). */
963# define graphene_simd4f_zero_w(v) \
965 const graphene_simd4i_t __mask = { 0, 1, 2, 4 }; \
966 (graphene_simd4f_t) __builtin_shuffle ((v), graphene_simd4f_init_zero (), __mask); \
/* Returns (v.x, v.y, 0, 0). */
969# define graphene_simd4f_zero_zw(v) \
971 const graphene_simd4i_t __mask = { 0, 1, 4, 4 }; \
972 (graphene_simd4f_t) __builtin_shuffle ((v), graphene_simd4f_init_zero (), __mask); \
/* Returns (s.x, s.y, s.z, v): replaces the w lane of vector s with scalar v. */
975# define graphene_simd4f_merge_w(s,v) \
977 const graphene_simd4i_t __mask = { 0, 1, 2, 4 }; \
978 (graphene_simd4f_t) __builtin_shuffle ((s), graphene_simd4f_splat ((v)), __mask); \
/* Returns (a.z, a.w, b.z, b.w). */
981# define graphene_simd4f_merge_high(a,b) \
983 const graphene_simd4i_t __mask = { 2, 3, 6, 7 }; \
984 (graphene_simd4f_t) __builtin_shuffle ((a), (b), __mask); \
/* Returns (a.x, a.y, b.x, b.y). */
987# define graphene_simd4f_merge_low(a,b) \
989 const graphene_simd4i_t __mask = { 0, 1, 4, 5 }; \
990 (graphene_simd4f_t) __builtin_shuffle ((a), (b), __mask); \
/* Negates lanes 1 and 3 (y, w) and keeps lanes 0 and 2. */
993# define graphene_simd4f_flip_sign_0101(v) \
995 const graphene_simd4f_t __v = (v); \
996 graphene_simd4f_init (__v[0], -__v[1], __v[2], -__v[3]); \
/* Negates lanes 0 and 2 (x, z) and keeps lanes 1 and 3. */
999# define graphene_simd4f_flip_sign_1010(v) \
1001 const graphene_simd4f_t __v = (v); \
1002 graphene_simd4f_init (-__v[0], __v[1], -__v[2], __v[3]); \
/* Comparison predicates: each performs a lane-wise vector comparison into a
 * graphene_simd4i_t and reduces it to a bool with a chain of `!= 0` tests
 * beginning at lane 0. The pragmas silence -Wfloat-equal around the
 * comparison.
 * NOTE(review): the chain presumably ANDs all four lanes -- confirm against
 * the complete macros. */
1005# define graphene_simd4f_cmp_eq(a,b) \
1007 _Pragma ("GCC diagnostic push") \
1008 _Pragma ("GCC diagnostic ignored \"-Wfloat-equal\"") \
1009 const graphene_simd4i_t __res = (a) == (b); \
1010 const bool __val = (bool) (__res[0] != 0 && \
1014 _Pragma ("GCC diagnostic pop") \
/* Logical negation of graphene_simd4f_cmp_eq (). */
1018# define graphene_simd4f_cmp_neq(a,b) (!graphene_simd4f_cmp_eq (a,b))
/* Lane-wise a < b. */
1020# define graphene_simd4f_cmp_lt(a,b) \
1022 _Pragma ("GCC diagnostic push") \
1023 _Pragma ("GCC diagnostic ignored \"-Wfloat-equal\"") \
1024 const graphene_simd4i_t __res = (a) < (b); \
1025 const bool __val = (bool) (__res[0] != 0 && \
1029 _Pragma ("GCC diagnostic pop") \
/* Lane-wise a <= b. */
1033# define graphene_simd4f_cmp_le(a,b) \
1035 _Pragma ("GCC diagnostic push") \
1036 _Pragma ("GCC diagnostic ignored \"-Wfloat-equal\"") \
1037 const graphene_simd4i_t __res = (a) <= (b); \
1038 const bool __val = (bool) (__res[0] != 0 && \
1042 _Pragma ("GCC diagnostic pop") \
/* Lane-wise a >= b. */
1046# define graphene_simd4f_cmp_ge(a,b) \
1048 _Pragma ("GCC diagnostic push") \
1049 _Pragma ("GCC diagnostic ignored \"-Wfloat-equal\"") \
1050 const graphene_simd4i_t __res = (a) >= (b); \
1051 const bool __val = (bool) (__res[0] != 0 && \
1055 _Pragma ("GCC diagnostic pop") \
/* Lane-wise a > b. */
1059# define graphene_simd4f_cmp_gt(a,b) \
1061 _Pragma ("GCC diagnostic push") \
1062 _Pragma ("GCC diagnostic ignored \"-Wfloat-equal\"") \
1063 const graphene_simd4i_t __res = (a) > (b); \
1064 const bool __val = (bool) (__res[0] != 0 && \
1068 _Pragma ("GCC diagnostic pop") \
/* Negates every lane by multiplying s with a vector of -1.f. */
1072# define graphene_simd4f_neg(s) \
1074 const graphene_simd4f_t __s = (s); \
1075 const graphene_simd4f_t __minus_one = graphene_simd4f_splat (-1.f); \
1076 graphene_simd4f_mul (__s, __minus_one); \
1079#elif !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_ARM_NEON)
1087} graphene_simd4f_union_t;
/* Two-lane float vector; used below for the low/high halves of a four-lane vector. */
1090typedef float32x2_t graphene_simd2f_t;
/* Builds a vector by placing x, y, z, w in a local array and loading it. */
1093# define graphene_simd4f_init(x,y,z,w) \
1095 const float32_t __v[4] = { (x), (y), (z), (w) }; \
1096 (graphene_simd4f_t) vld1q_f32 (__v); \
/* Yields a vector with every lane set to 0.f. */
1099# define graphene_simd4f_init_zero() \
1101 (graphene_simd4f_t) vdupq_n_f32 (0.f); \
/* Loads four floats from v. */
1104# define graphene_simd4f_init_4f(v) \
1106 const float32_t *__v32 = (const float32_t *) (v); \
1107 (graphene_simd4f_t) vld1q_f32 (__v32); \
/* Builds a vector from v[0], v[1], v[2]; the w lane is set to 0.
 * NOTE(review): v is not parenthesized here, so an argument like `p + 1`
 * would expand to `p + 1[0]`. */
1110# define graphene_simd4f_init_3f(v) \
1112 graphene_simd4f_init (v[0], v[1], v[2], 0.f); \
/* Loads v[0], v[1] into the low half and fills the high half with zeros. */
1115# define graphene_simd4f_init_2f(v) \
1117 const float32_t *__v32 = (const float32_t *) (v); \
1118 const graphene_simd2f_t __low = vld1_f32 (__v32); \
1119 const float32_t __zero = 0; \
1120 const graphene_simd2f_t __high = vld1_dup_f32 (&__zero); \
1121 (graphene_simd4f_t) vcombine_f32 (__low, __high); \
/* Stores all four lanes of s into v. */
1124# define graphene_simd4f_dup_4f(s,v) \
1126 vst1q_f32 ((float32_t *) (v), (s)); \
/* Stores lanes 0, 1 and 2 of s one at a time through the advancing pointer __v. */
1129# define graphene_simd4f_dup_3f(s,v) \
1132 vst1q_lane_f32 (__v++, (s), 0); \
1133 vst1q_lane_f32 (__v++, (s), 1); \
1134 vst1q_lane_f32 (__v, (s), 2); \
/* Stores the low half (lanes 0 and 1) of s into v. */
1137# define graphene_simd4f_dup_2f(s,v) \
1139 const graphene_simd2f_t __low = vget_low_f32 ((s)); \
1140 vst1_f32 ((float32_t *) (v), __low); \
/* Reads lane i of s with vgetq_lane_f32 (). */
1143# define graphene_simd4f_get(s,i) \
1145 (float) vgetq_lane_f32 ((s), (i)); \
/* Broadcasts the scalar v to all four lanes. */
1148# define graphene_simd4f_splat(v) \
1150 (graphene_simd4f_t) vdupq_n_f32 ((v)); \
/* Broadcast a single lane of the vector s: extract it as a scalar, then splat it. */
1153# define graphene_simd4f_splat_x(s) \
1155 graphene_simd4f_splat (graphene_simd4f_get_x ((s))); \
1158# define graphene_simd4f_splat_y(s) \
1160 graphene_simd4f_splat (graphene_simd4f_get_y ((s))); \
1163# define graphene_simd4f_splat_z(s) \
1165 graphene_simd4f_splat (graphene_simd4f_get_z ((s))); \
1168# define graphene_simd4f_splat_w(s) \
1170 graphene_simd4f_splat (graphene_simd4f_get_w ((s))); \
/* Lane-wise reciprocal: starts from the vrecpeq_f32 () estimate and applies
 * two refinement steps, each multiplying the estimate by vrecpsq_f32 (). */
1173# define graphene_simd4f_reciprocal(s) \
1175 graphene_simd4f_t __est = vrecpeq_f32 ((s)); \
1176 __est = vmulq_f32 (vrecpsq_f32 (__est, (s)), __est); \
1177 (graphene_simd4f_t) vmulq_f32 (vrecpsq_f32 (__est, (s)), __est); \
/* Lane-wise a + b. */
1180# define graphene_simd4f_add(a,b) \
1182 (graphene_simd4f_t) vaddq_f32 ((a), (b)); \
/* Lane-wise a - b. */
1185# define graphene_simd4f_sub(a,b) \
1187 (graphene_simd4f_t) vsubq_f32 ((a), (b)); \
/* Lane-wise a * b. */
1190# define graphene_simd4f_mul(a,b) \
1192 (graphene_simd4f_t) vmulq_f32 ((a), (b)); \
/* Lane-wise division implemented as a * reciprocal (b), so its accuracy is
 * that of graphene_simd4f_reciprocal (). */
1195# define graphene_simd4f_div(a,b) \
1197 graphene_simd4f_t __rec = graphene_simd4f_reciprocal ((b)); \
1198 (graphene_simd4f_t) vmulq_f32 ((a), __rec); \
/* One refinement step for a reciprocal-square-root estimate:
 * estimate * vrsqrtsq_f32 (estimate * v, estimate). */
1201# define _simd4f_rsqrt_iter(v,estimate) \
1203 const graphene_simd4f_t __est1 = vmulq_f32 ((estimate), (v)); \
1204 (graphene_simd4f_t) vmulq_f32 ((estimate), vrsqrtsq_f32 (__est1, (estimate))); \
/* Lane-wise reciprocal square root: vrsqrteq_f32 () estimate followed by
 * three refinement steps. */
1207# define graphene_simd4f_rsqrt(s) \
1209 graphene_simd4f_t __estimate = vrsqrteq_f32 ((s)); \
1210 __estimate = _simd4f_rsqrt_iter ((s), __estimate); \
1211 __estimate = _simd4f_rsqrt_iter ((s), __estimate); \
1212 _simd4f_rsqrt_iter ((s), __estimate); \
/* Lane-wise square root computed as reciprocal (rsqrt (s)). The final AND
 * with vtstq_u32 (__tmp, __tmp) forces the result to 0 in lanes whose input
 * bit pattern is all zero. */
1215# define graphene_simd4f_sqrt(s) \
1217 graphene_simd4f_t __rsq = graphene_simd4f_rsqrt ((s)); \
1218 graphene_simd4f_t __rrsq = graphene_simd4f_reciprocal (__rsq); \
1219 uint32x4_t __tmp = vreinterpretq_u32_f32 ((s)); \
1220 (graphene_simd4f_t) vreinterpretq_f32_u32 (vandq_u32 (vtstq_u32 (__tmp, __tmp), vreinterpretq_u32_f32 (__rrsq))); \
/* Cross product of the x, y, z lanes of a and b. Computes
 * b.yzx * a - a.yzx * b, rotates that result by one lane, and finally clears
 * the w lane with a bit mask. */
1223# define graphene_simd4f_cross3(a,b) \
1225 const uint32_t __mask_bits[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; \
1226 const int32x4_t __mask = vld1q_s32 ((const int32_t *) __mask_bits); \
1227 const graphene_simd4f_t __a = (a), __b = (b); \
1228 const graphene_simd2f_t __a_low = vget_low_f32 (__a); \
1229 const graphene_simd2f_t __b_low = vget_low_f32 (__b); \
1230 const graphene_simd4f_t __a_yzx = vcombine_f32 (vext_f32 (__a_low, vget_high_f32 (__a), 1), __a_low); \
1231 const graphene_simd4f_t __b_yzx = vcombine_f32 (vext_f32 (__b_low, vget_high_f32 (__b), 1), __b_low); \
1232 graphene_simd4f_t __s3 = graphene_simd4f_sub (graphene_simd4f_mul (__b_yzx, __a), \
1233 graphene_simd4f_mul (__a_yzx, __b)); \
1234 graphene_simd2f_t __s3_low = vget_low_f32 (__s3); \
1235 __s3 = vcombine_f32 (vext_f32 (__s3_low, vget_high_f32 (__s3), 1), __s3_low); \
1236 (graphene_simd4f_t) vandq_s32 ((int32x4_t) __s3, __mask); \
/* Three-component dot product broadcast to every lane. */
1239# define graphene_simd4f_dot3(a,b) \
1241 graphene_simd4f_splat (graphene_simd4f_dot3_scalar (a, b)); \
/* Three-component dot product as a scalar: the pairwise add gives m0 + m1,
 * adding the high half contributes m2, and lane 0 holds the sum. */
1244# define graphene_simd4f_dot3_scalar(a,b) \
1246 const graphene_simd4f_t __m = graphene_simd4f_mul (a, b); \
1247 const graphene_simd2f_t __s1 = vpadd_f32 (vget_low_f32 (__m), vget_low_f32 (__m)); \
1248 (float) vget_lane_f32 (vadd_f32 (__s1, vget_high_f32 (__m)), 0); \
/* Lane-wise minimum of a and b. */
1251# define graphene_simd4f_min(a,b) \
1253 (graphene_simd4f_t) vminq_f32 ((a), (b)); \
/* Lane-wise maximum of a and b. */
1256# define graphene_simd4f_max(a,b) \
1258 (graphene_simd4f_t) vmaxq_f32 (a, b); \
/* Lane permutations done through the float view of graphene_simd4f_union_t.
 * shuffle_wxyz returns (w, x, y, z). */
1261# define graphene_simd4f_shuffle_wxyz(v) \
1263 graphene_simd4f_union_t __u = { (v) }; \
1264 graphene_simd4f_init (__u.f[3], __u.f[0], __u.f[1], __u.f[2]); \
/* Returns (z, w, x, y). */
1267# define graphene_simd4f_shuffle_zwxy(v) \
1269 graphene_simd4f_union_t __u = { (v) }; \
1270 graphene_simd4f_init (__u.f[2], __u.f[3], __u.f[0], __u.f[1]); \
/* Returns (y, z, w, x). */
1273# define graphene_simd4f_shuffle_yzwx(v) \
1275 graphene_simd4f_union_t __u = { (v) }; \
1276 graphene_simd4f_init (__u.f[1], __u.f[2], __u.f[3], __u.f[0]); \
/* Returns (v.x, v.y, v.z, 0). */
1279# define graphene_simd4f_zero_w(v) \
1281 graphene_simd4f_union_t __u = { (v) }; \
1282 graphene_simd4f_init (__u.f[0], __u.f[1], __u.f[2], 0.f); \
/* Returns (v.x, v.y, 0, 0). */
1285# define graphene_simd4f_zero_zw(v) \
1287 graphene_simd4f_union_t __u = { (v) }; \
1288 graphene_simd4f_init (__u.f[0], __u.f[1], 0.f, 0.f); \
/* Returns (s.x, s.y, s.z, v): replaces the w lane of vector s with scalar v. */
1291# define graphene_simd4f_merge_w(s,v) \
1293 graphene_simd4f_union_t __u = { (s) }; \
1294 graphene_simd4f_init (__u.f[0], __u.f[1], __u.f[2], (v)); \
/* Returns (a.z, a.w, b.z, b.w). */
1297# define graphene_simd4f_merge_high(a,b) \
1299 graphene_simd4f_union_t __u_a = { (a) }; \
1300 graphene_simd4f_union_t __u_b = { (b) }; \
1301 graphene_simd4f_init (__u_a.f[2], __u_a.f[3], __u_b.f[2], __u_b.f[3]); \
/* Returns (a.x, a.y, b.x, b.y). */
1304# define graphene_simd4f_merge_low(a,b) \
1306 graphene_simd4f_union_t __u_a = { (a) }; \
1307 graphene_simd4f_union_t __u_b = { (b) }; \
1308 graphene_simd4f_init (__u_a.f[0], __u_a.f[1], __u_b.f[0], __u_b.f[1]); \
/* XORs the bits of s with the four-word pattern loaded from __upnpn.
 * NOTE(review): presumably the sign bits of lanes 1 and 3 -- confirm against
 * the __upnpn initializer. */
1311# define graphene_simd4f_flip_sign_0101(s) \
1313 const unsigned int __upnpn[4] = { \
1319 const uint32x4_t __pnpn = vld1q_u32 (__upnpn); \
1320 (graphene_simd4f_t) vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __pnpn)); \
/* XORs the bits of s with the four-word pattern loaded from __unpnp.
 * NOTE(review): presumably the sign bits of lanes 0 and 2 -- confirm against
 * the __unpnp initializer. */
1323# define graphene_simd4f_flip_sign_1010(s) \
1325 const unsigned int __unpnp[4] = { \
1331 const uint32x4_t __npnp = vld1q_u32 (__unpnp); \
1332 (graphene_simd4f_t) vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __npnp)); \
/* True only when a == b holds in all four lanes. */
1335# define graphene_simd4f_cmp_eq(a,b) \
1337 const uint32x4_t __mask = vceqq_f32 ((a), (b)); \
1338 (bool) (vgetq_lane_u32 (__mask, 0) != 0 && \
1339 vgetq_lane_u32 (__mask, 1) != 0 && \
1340 vgetq_lane_u32 (__mask, 2) != 0 && \
1341 vgetq_lane_u32 (__mask, 3) != 0); \
/* True when at least one lane of the equality mask is zero, i.e. some lane differs. */
1344# define graphene_simd4f_cmp_neq(a,b) \
1346 const uint32x4_t __mask = vceqq_f32 ((a), (b)); \
1347 (bool) (vgetq_lane_u32 (__mask, 0) == 0 || \
1348 vgetq_lane_u32 (__mask, 1) == 0 || \
1349 vgetq_lane_u32 (__mask, 2) == 0 || \
1350 vgetq_lane_u32 (__mask, 3) == 0); \
/* True only when a < b holds in all four lanes. */
1353# define graphene_simd4f_cmp_lt(a,b) \
1355 const uint32x4_t __mask = vcltq_f32 ((a), (b)); \
1356 (bool) (vgetq_lane_u32 (__mask, 0) != 0 && \
1357 vgetq_lane_u32 (__mask, 1) != 0 && \
1358 vgetq_lane_u32 (__mask, 2) != 0 && \
1359 vgetq_lane_u32 (__mask, 3) != 0); \
/* True only when a <= b holds in all four lanes. */
1362# define graphene_simd4f_cmp_le(a,b) \
1364 const uint32x4_t __mask = vcleq_f32 ((a), (b)); \
1365 (bool) (vgetq_lane_u32 (__mask, 0) != 0 && \
1366 vgetq_lane_u32 (__mask, 1) != 0 && \
1367 vgetq_lane_u32 (__mask, 2) != 0 && \
1368 vgetq_lane_u32 (__mask, 3) != 0); \
/* True only when a >= b holds in all four lanes. */
1371# define graphene_simd4f_cmp_ge(a,b) \
1373 const uint32x4_t __mask = vcgeq_f32 ((a), (b)); \
1374 (bool) (vgetq_lane_u32 (__mask, 0) != 0 && \
1375 vgetq_lane_u32 (__mask, 1) != 0 && \
1376 vgetq_lane_u32 (__mask, 2) != 0 && \
1377 vgetq_lane_u32 (__mask, 3) != 0); \
/* True only when a > b holds in all four lanes. */
1380# define graphene_simd4f_cmp_gt(a,b) \
1382 const uint32x4_t __mask = vcgtq_f32 ((a), (b)); \
1383 (bool) (vgetq_lane_u32 (__mask, 0) != 0 && \
1384 vgetq_lane_u32 (__mask, 1) != 0 && \
1385 vgetq_lane_u32 (__mask, 2) != 0 && \
1386 vgetq_lane_u32 (__mask, 3) != 0); \
/* XORs the bits of s with the four-word pattern loaded from __umask.
 * NOTE(review): presumably the sign bit of every lane (negation) -- confirm
 * against the __umask initializer. */
1389# define graphene_simd4f_neg(s) \
1391 const unsigned int __umask[4] = { \
1397 const uint32x4_t __mask = vld1q_u32 (__umask); \
1398 (graphene_simd4f_t) vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __mask)); \
1401#elif defined _MSC_VER
/* Forwards to the helper _simd4f_init (), which places x, y, z, w in a local
 * array and loads it with vld1q_f32 (). */
1403# define graphene_simd4f_init(x,y,z,w) _simd4f_init(x,y,z,w)
1405_simd4f_init (
float x,
float y,
float z,
float w)
1407 const float32_t __v[4] = { (x), (y), (z), (w) };
1408 return vld1q_f32 (__v);
/* Yields a vector with every lane set to 0.f. */
1411# define graphene_simd4f_init_zero() vdupq_n_f32 (0.f)
/* Loads four floats from v. */
1413# define graphene_simd4f_init_4f(v) vld1q_f32 (v)
/* Builds a vector from v[0], v[1] and v[2]; the w lane is set to 0.
 * The parameter is parenthesized so that an argument such as `p + 1` is
 * indexed as `(p + 1)[0]` rather than being mis-parsed as `p + 1[0]`. */
# define graphene_simd4f_init_3f(v) graphene_simd4f_init ((v)[0], (v)[1], (v)[2], 0.f)
/* Forwards to the helper _simd4f_init_2f (), which loads v[0], v[1] into the
 * low half and fills the high half with zeros. */
1417# define graphene_simd4f_init_2f(v) _simd4f_init_2f(v)
1419_simd4f_init_2f (
const float *v)
1421 const float32_t *__v32 = (
const float32_t *) (v);
1422 const graphene_simd2f_t __low = vld1_f32 (__v32);
1423 const float32_t __zero = 0;
1424 const graphene_simd2f_t __high = vld1_dup_f32 (&__zero);
1425 return vcombine_f32 (__low, __high);
/* Stores all four lanes of s into v. */
1428# define graphene_simd4f_dup_4f(s,v) vst1q_f32 ((float32_t *) (v), (s))
/* Forwards to the helper _simd4f_dup_3f (), which stores lanes 0, 1 and 2 of
 * s one at a time through the advancing pointer __v. */
1430# define graphene_simd4f_dup_3f(s,v) _simd4f_dup_3f(s,v)
1436 vst1q_lane_f32 (__v++, (
s), 0);
1437 vst1q_lane_f32 (__v++, (
s), 1);
1438 vst1q_lane_f32 (__v, (
s), 2);
/* Stores the low half (lanes 0 and 1) of s into v. */
1441# define graphene_simd4f_dup_2f(s,v) vst1_f32 (v, vget_low_f32 (s))
/* Reads lane i of s with vgetq_lane_f32 (). */
1443# define graphene_simd4f_get(s,i) vgetq_lane_f32 ((s), (i))
/* Broadcasts the scalar v to all four lanes. */
1445# define graphene_simd4f_splat(v) vdupq_n_f32 ((v))
/* Broadcast a single lane of the vector s: extract it as a scalar, then splat it. */
1447# define graphene_simd4f_splat_x(s) graphene_simd4f_splat (graphene_simd4f_get_x ((s)))
1449# define graphene_simd4f_splat_y(s) graphene_simd4f_splat (graphene_simd4f_get_y ((s)))
1451# define graphene_simd4f_splat_z(s) graphene_simd4f_splat (graphene_simd4f_get_z ((s)))
1453# define graphene_simd4f_splat_w(s) graphene_simd4f_splat (graphene_simd4f_get_w ((s)))
/* Forwards to the helper _simd4f_reciprocal (); the statements shown apply
 * two refinement steps, each multiplying the estimate __est by
 * vrecpsq_f32 (__est, s). */
1455# define graphene_simd4f_reciprocal(s) _simd4f_reciprocal(s)
1460 __est = vmulq_f32 (vrecpsq_f32 (__est, (
s)), __est);
1461 return vmulq_f32 (vrecpsq_f32 (__est, (
s)), __est);
/* Lane-wise a + b. */
1464# define graphene_simd4f_add(a,b) vaddq_f32 ((a), (b))
/* Lane-wise a - b. */
1466# define graphene_simd4f_sub(a,b) vsubq_f32 ((a), (b))
/* Lane-wise a * b. */
1468# define graphene_simd4f_mul(a,b) vmulq_f32 ((a), (b))
/* Lane-wise division implemented as a * reciprocal (b), so its accuracy is
 * that of graphene_simd4f_reciprocal (). */
1470# define graphene_simd4f_div(a,b) vmulq_f32 (a, graphene_simd4f_reciprocal (b))
/* Final statement of the refinement helper: estimate * vrsqrtsq_f32 (__est1, estimate). */
1477 return vmulq_f32 ((estimate), vrsqrtsq_f32 (__est1, (estimate)));
/* Forwards to the helper _simd4f_rsqrt (); the statements shown apply three
 * refinement steps via _simd4f_rsqrt_iter (). */
1480# define graphene_simd4f_rsqrt(s) _simd4f_rsqrt(s)
1485 __estimate = _simd4f_rsqrt_iter ((
s), __estimate);
1486 __estimate = _simd4f_rsqrt_iter ((
s), __estimate);
1487 return _simd4f_rsqrt_iter ((
s), __estimate);
/* Forwards to the helper _simd4f_sqrt (). The returned value is __rrsq ANDed
 * with vtstq_u32 (__tmp, __tmp), which forces the result to 0 in lanes whose
 * input bit pattern is all zero. */
1490# define graphene_simd4f_sqrt(s) _simd4f_sqrt(s)
1496 uint32x4_t __tmp = vreinterpretq_u32_f32 ((
s)); \
1497 return vreinterpretq_f32_u32 (vandq_u32 (vtstq_u32 (__tmp, __tmp), vreinterpretq_u32_f32 (__rrsq)));
/* Forwards to the helper _simd4f_cross3 (). The statements shown build the
 * yzx rotations of both inputs, rotate the intermediate __s3 by one lane and
 * clear its w lane with a bit mask.
 * NOTE(review): __s3 is passed to vandq_s32 () without an (int32x4_t) cast
 * here -- confirm the types against the complete helper. */
1500# define graphene_simd4f_cross3(a,b) _simd4f_cross3(a,b)
1505 const uint32_t __mask_bits[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };
1506 const int32x4_t __mask = vld1q_s32 ((
const int32_t *) __mask_bits);
1508 const graphene_simd2f_t __a_low = vget_low_f32 (__a);
1509 const graphene_simd2f_t __b_low = vget_low_f32 (__b);
1510 const graphene_simd4f_t __a_yzx = vcombine_f32 (vext_f32 (__a_low, vget_high_f32 (__a), 1), __a_low);
1511 const graphene_simd4f_t __b_yzx = vcombine_f32 (vext_f32 (__b_low, vget_high_f32 (__b), 1), __b_low);
1514 graphene_simd2f_t __s3_low = vget_low_f32 (__s3);
1515 __s3 = vcombine_f32 (vext_f32 (__s3_low, vget_high_f32 (__s3), 1), __s3_low);
1516 return vandq_s32 (__s3, __mask);
/* Three-component dot product broadcast to all four lanes: the scalar result
 * of graphene_simd4f_dot3_scalar is splatted. */
1519# define graphene_simd4f_dot3(a,b) graphene_simd4f_splat (graphene_simd4f_dot3_scalar (a, b))
/* Forwards to the _simd4f_dot3_scalar helper.  The visible tail pairwise-adds
 * the low half of __m (lane 0 + lane 1), adds lane 2 from the high half, and
 * returns lane 0 of that sum; lane 3 of __m does not reach the result.
 * NOTE(review): __m is computed on a line not shown here (presumably the
 * lane-wise product of a and b) -- confirm. */
1521# define graphene_simd4f_dot3_scalar(a,b) _simd4f_dot3_scalar(a,b)
1527 const graphene_simd2f_t __s1 = vpadd_f32 (vget_low_f32 (__m), vget_low_f32 (__m));
1528 return vget_lane_f32 (vadd_f32 (__s1, vget_high_f32 (__m)), 0);
/* Lane-wise minimum and maximum of two 4-float vectors.  Both macros
 * parenthesize their arguments the same way. */
1531# define graphene_simd4f_min(a,b) vminq_f32 ((a), (b))
1533# define graphene_simd4f_max(a,b) vmaxq_f32 ((a), (b))
/* Lane shuffles and lane-zeroing macros.  Each one forwards to a _simd4f_*
 * helper of the matching name; the only line of each helper visible here
 * copies v into a graphene_simd4f_union_t named __u.
 * NOTE(review): the statements that rearrange or clear the lanes are not
 * shown in this excerpt, so the exact lane order produced by each helper
 * must be checked against the full file. */
1535# define graphene_simd4f_shuffle_wxyz(v) _simd4f_shuffle_wxyz(v)
1539 graphene_simd4f_union_t __u = { (v) };
1543# define graphene_simd4f_shuffle_zwxy(v) _simd4f_shuffle_zwxy(v)
1547 graphene_simd4f_union_t __u = { (v) };
1551# define graphene_simd4f_shuffle_yzwx(v) _simd4f_shuffle_yzwx(v)
1555 graphene_simd4f_union_t __u = { (v) };
1559# define graphene_simd4f_zero_w(v) _simd4f_zero_w(v)
1563 graphene_simd4f_union_t __u = { (v) };
1567# define graphene_simd4f_zero_zw(v) _simd4f_zero_zw(v)
1571 graphene_simd4f_union_t __u = { (v) };
/* Merge macros.  Each forwards to a _simd4f_merge_* helper; the visible lines
 * only show the inputs being copied into graphene_simd4f_union_t locals
 * (__u for merge_w, __u_a and __u_b for merge_high / merge_low).
 * NOTE(review): the statements that pick lanes from those unions are not
 * shown in this excerpt. */
1575# define graphene_simd4f_merge_w(s,v) _simd4f_merge_w(s,v)
1580 graphene_simd4f_union_t __u = { (
s) };
1584# define graphene_simd4f_merge_high(a,b) _simd4f_merge_high(a,b)
1589 graphene_simd4f_union_t __u_a = { (
a) };
1590 graphene_simd4f_union_t __u_b = { (b) };
1594# define graphene_simd4f_merge_low(a,b) _simd4f_merge_low(a,b)
1599 graphene_simd4f_union_t __u_a = { (
a) };
1600 graphene_simd4f_union_t __u_b = { (b) };
/* Forwards to the _simd4f_flip_sign_0101 helper.  The helper loads the
 * four-word table __upnpn into __pnpn and XORs it with the bit pattern of s,
 * so exactly the bits set in the table are toggled in the result.
 * NOTE(review): the table's contents are not shown here; presumably the sign
 * bit in alternating lanes -- confirm which lanes. */
1605# define graphene_simd4f_flip_sign_0101(s) _simd4f_flip_sign_0101(s)
1609 const unsigned int __upnpn[4] = {
1615 const uint32x4_t __pnpn = vld1q_u32 (__upnpn);
1616 return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((
s)), __pnpn));
/* Forwards to the _simd4f_flip_sign_1010 helper.  The helper loads the
 * four-word table __unpnp into __npnp and XORs it with the bit pattern of s,
 * so exactly the bits set in the table are toggled in the result.
 * NOTE(review): the table's contents are not shown here; presumably the sign
 * bit in alternating lanes -- confirm which lanes. */
1619# define graphene_simd4f_flip_sign_1010(s) _simd4f_flip_sign_1010(s)
1623 const unsigned int __unpnp[4] = {
1630 const uint32x4_t __npnp = vld1q_u32 (__unpnp);
1631 return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((
s)), __npnp));
/* Forwards to the _simd4f_cmp_eq helper: true only when every lane of the
 * vceqq_f32 (a, b) mask is non-zero, i.e. all four lanes compare equal. */
1634# define graphene_simd4f_cmp_eq(a,b) _simd4f_cmp_eq(a,b)
1639 const uint32x4_t __mask = vceqq_f32 ((
a), (b));
1640 return (vgetq_lane_u32 (__mask, 0) != 0 &&
1641 vgetq_lane_u32 (__mask, 1) != 0 &&
1642 vgetq_lane_u32 (__mask, 2) != 0 &&
1643 vgetq_lane_u32 (__mask, 3) != 0);
/* Forwards to the _simd4f_cmp_neq helper: true when at least one lane of the
 * vceqq_f32 (a, b) mask is zero, i.e. some lane of a differs from b.  This is
 * the exact negation of the all-lanes-equal test. */
1646# define graphene_simd4f_cmp_neq(a,b) _simd4f_cmp_neq(a,b)
1651 const uint32x4_t __mask = vceqq_f32 ((
a), (b));
1652 return (vgetq_lane_u32 (__mask, 0) == 0 ||
1653 vgetq_lane_u32 (__mask, 1) == 0 ||
1654 vgetq_lane_u32 (__mask, 2) == 0 ||
1655 vgetq_lane_u32 (__mask, 3) == 0);
/* Forwards to the _simd4f_cmp_lt helper: true only when every lane of the
 * vcltq_f32 (a, b) mask is non-zero, i.e. a < b holds in all four lanes. */
1658# define graphene_simd4f_cmp_lt(a,b) _simd4f_cmp_lt(a,b)
1663 const uint32x4_t __mask = vcltq_f32 ((
a), (b));
1664 return (vgetq_lane_u32 (__mask, 0) != 0 &&
1665 vgetq_lane_u32 (__mask, 1) != 0 &&
1666 vgetq_lane_u32 (__mask, 2) != 0 &&
1667 vgetq_lane_u32 (__mask, 3) != 0);
/* Forwards to the _simd4f_cmp_le helper: true only when every lane of the
 * vcleq_f32 (a, b) mask is non-zero, i.e. a <= b holds in all four lanes. */
1670# define graphene_simd4f_cmp_le(a,b) _simd4f_cmp_le(a,b)
1675 const uint32x4_t __mask = vcleq_f32 ((
a), (b));
1676 return (vgetq_lane_u32 (__mask, 0) != 0 &&
1677 vgetq_lane_u32 (__mask, 1) != 0 &&
1678 vgetq_lane_u32 (__mask, 2) != 0 &&
1679 vgetq_lane_u32 (__mask, 3) != 0);
/* Forwards to the _simd4f_cmp_ge helper: true only when every lane of the
 * vcgeq_f32 (a, b) mask is non-zero, i.e. a >= b holds in all four lanes. */
1682# define graphene_simd4f_cmp_ge(a,b) _simd4f_cmp_ge(a,b)
1687 const uint32x4_t __mask = vcgeq_f32 ((
a), (b));
1688 return (vgetq_lane_u32 (__mask, 0) != 0 &&
1689 vgetq_lane_u32 (__mask, 1) != 0 &&
1690 vgetq_lane_u32 (__mask, 2) != 0 &&
1691 vgetq_lane_u32 (__mask, 3) != 0);
/* Forwards to the _simd4f_cmp_gt helper: true only when every lane of the
 * vcgtq_f32 (a, b) mask is non-zero, i.e. a > b holds in all four lanes. */
1694# define graphene_simd4f_cmp_gt(a,b) _simd4f_cmp_gt(a,b)
1699 const uint32x4_t __mask = vcgtq_f32 ((
a), (b));
1700 return (vgetq_lane_u32 (__mask, 0) != 0 &&
1701 vgetq_lane_u32 (__mask, 1) != 0 &&
1702 vgetq_lane_u32 (__mask, 2) != 0 &&
1703 vgetq_lane_u32 (__mask, 3) != 0);
/* Forwards to the _simd4f_neg helper.  The helper loads the four-word table
 * __umask into __mask and XORs it with the bit pattern of s, so exactly the
 * bits set in the table are toggled in the result.
 * NOTE(review): the table's contents are not shown here; presumably the sign
 * bit of every lane -- confirm. */
1706# define graphene_simd4f_neg(s) _simd4f_neg(s)
1710 const unsigned int __umask[4] = {
1716 const uint32x4_t __mask = vld1q_u32 (__umask);
1717 return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((
s)), __mask));
1722# error "Need GCC-compatible or Visual Studio compiler for ARM NEON extensions."
/* Named lane accessors: x, y, z and w are lanes 0, 1, 2 and 3 of
 * graphene_simd4f_get. */
1729# define graphene_simd4f_get_x(s) graphene_simd4f_get (s, 0)
1730# define graphene_simd4f_get_y(s) graphene_simd4f_get (s, 1)
1731# define graphene_simd4f_get_z(s) graphene_simd4f_get (s, 2)
1732# define graphene_simd4f_get_w(s) graphene_simd4f_get (s, 3)
1734#elif defined(__GI_SCANNER__) || defined(GRAPHENE_USE_SCALAR)
/* Initializers for the branch selected by __GI_SCANNER__ or
 * GRAPHENE_USE_SCALAR.  Each macro expands to a call to the function of the
 * same name; a function-like macro is not expanded again inside its own
 * replacement, so the inner name refers to the function.  Array arguments are
 * cast to const float *.
 * NOTE(review): the function declarations are not visible in this excerpt. */
1738#define graphene_simd4f_init(x,y,z,w) \
1739 (graphene_simd4f_init ((x), (y), (z), (w)))
1740#define graphene_simd4f_init_zero() \
1741 (graphene_simd4f_init_zero ())
1742#define graphene_simd4f_init_4f(v) \
1743 (graphene_simd4f_init_4f ((const float *) (v)))
1744#define graphene_simd4f_init_3f(v) \
1745 (graphene_simd4f_init_3f ((const float *) (v)))
1746#define graphene_simd4f_init_2f(v) \
1747 (graphene_simd4f_init_2f ((const float *) (v)))
/* Store macros: each expands to a call to the same-named function, passing s
 * through and casting the destination v to float *. */
1748#define graphene_simd4f_dup_4f(s,v) \
1749 (graphene_simd4f_dup_4f ((s), (float *) (v)))
1750#define graphene_simd4f_dup_3f(s,v) \
1751 (graphene_simd4f_dup_3f ((s), (float *) (v)))
1752#define graphene_simd4f_dup_2f(s,v) \
1753 (graphene_simd4f_dup_2f ((s), (float *) (v)))
/* Lane accessors: each expands to a call to the same-named function with its
 * arguments parenthesized.  Unlike an index-based accessor, get_x..get_w here
 * call their own dedicated functions rather than graphene_simd4f_get. */
1754#define graphene_simd4f_get(s,i) \
1755 (graphene_simd4f_get ((s), (i)))
1756#define graphene_simd4f_get_x(s) \
1757 (graphene_simd4f_get_x ((s)))
1758#define graphene_simd4f_get_y(s) \
1759 (graphene_simd4f_get_y ((s)))
1760#define graphene_simd4f_get_z(s) \
1761 (graphene_simd4f_get_z ((s)))
1762#define graphene_simd4f_get_w(s) \
1763 (graphene_simd4f_get_w ((s)))
/* Splat macros: each expands to a call to the same-named function with its
 * argument parenthesized. */
1764#define graphene_simd4f_splat(v) \
1765 (graphene_simd4f_splat ((v)))
1766#define graphene_simd4f_splat_x(s) \
1767 (graphene_simd4f_splat_x ((s)))
1768#define graphene_simd4f_splat_y(s) \
1769 (graphene_simd4f_splat_y ((s)))
1770#define graphene_simd4f_splat_z(s) \
1771 (graphene_simd4f_splat_z ((s)))
1772#define graphene_simd4f_splat_w(s) \
1773 (graphene_simd4f_splat_w ((s)))
/* Arithmetic macros (add, sub, mul, div, sqrt, rsqrt, reciprocal): each
 * expands to a call to the same-named function with parenthesized
 * arguments. */
1774#define graphene_simd4f_add(a,b) \
1775 (graphene_simd4f_add ((a), (b)))
1776#define graphene_simd4f_sub(a,b) \
1777 (graphene_simd4f_sub ((a), (b)))
1778#define graphene_simd4f_mul(a,b) \
1779 (graphene_simd4f_mul ((a), (b)))
1780#define graphene_simd4f_div(a,b) \
1781 (graphene_simd4f_div ((a), (b)))
1782#define graphene_simd4f_sqrt(s) \
1783 (graphene_simd4f_sqrt ((s)))
1784#define graphene_simd4f_rsqrt(s) \
1785 (graphene_simd4f_rsqrt ((s)))
1786#define graphene_simd4f_reciprocal(s) \
1787 (graphene_simd4f_reciprocal ((s)))
/* Vector-product macros (cross3, dot3, dot3_scalar): each expands to a call
 * to the same-named function with parenthesized arguments. */
1788#define graphene_simd4f_cross3(a,b) \
1789 (graphene_simd4f_cross3 ((a), (b)))
1790#define graphene_simd4f_dot3(a,b) \
1791 (graphene_simd4f_dot3 ((a), (b)))
1792#define graphene_simd4f_dot3_scalar(a,b) \
1793 (graphene_simd4f_dot3_scalar ((a), (b)))
/* min / max macros: each expands to a call to the same-named function with
 * parenthesized arguments. */
1794#define graphene_simd4f_min(a,b) \
1795 (graphene_simd4f_min ((a), (b)))
1796#define graphene_simd4f_max(a,b) \
1797 (graphene_simd4f_max ((a), (b)))
/* Lane-rearranging macros (shuffles, sign flips, lane zeroing, merges): each
 * expands to a call to the same-named function with parenthesized
 * arguments. */
1798#define graphene_simd4f_shuffle_wxyz(s) \
1799 (graphene_simd4f_shuffle_wxyz ((s)))
1800#define graphene_simd4f_shuffle_zwxy(s) \
1801 (graphene_simd4f_shuffle_zwxy ((s)))
1802#define graphene_simd4f_shuffle_yzwx(s) \
1803 (graphene_simd4f_shuffle_yzwx ((s)))
1804#define graphene_simd4f_flip_sign_0101(s) \
1805 (graphene_simd4f_flip_sign_0101 ((s)))
1806#define graphene_simd4f_flip_sign_1010(s) \
1807 (graphene_simd4f_flip_sign_1010 ((s)))
1808#define graphene_simd4f_zero_w(v) \
1809 (graphene_simd4f_zero_w ((v)))
1810#define graphene_simd4f_zero_zw(v) \
1811 (graphene_simd4f_zero_zw ((v)))
1812#define graphene_simd4f_merge_w(s,v) \
1813 (graphene_simd4f_merge_w ((s), (v)))
1814#define graphene_simd4f_merge_high(a,b) \
1815 (graphene_simd4f_merge_high ((a), (b)))
1816#define graphene_simd4f_merge_low(a,b) \
1817 (graphene_simd4f_merge_low ((a), (b)))
/* Comparison macros (eq, neq, lt, le, ge, gt): each expands to a call to the
 * same-named function with parenthesized arguments. */
1818#define graphene_simd4f_cmp_eq(a,b) \
1819 (graphene_simd4f_cmp_eq ((a), (b)))
1820#define graphene_simd4f_cmp_neq(a,b) \
1821 (graphene_simd4f_cmp_neq ((a), (b)))
1822#define graphene_simd4f_cmp_lt(a,b) \
1823 (graphene_simd4f_cmp_lt ((a), (b)))
1824#define graphene_simd4f_cmp_le(a,b) \
1825 (graphene_simd4f_cmp_le ((a), (b)))
1826#define graphene_simd4f_cmp_ge(a,b) \
1827 (graphene_simd4f_cmp_ge ((a), (b)))
1828#define graphene_simd4f_cmp_gt(a,b) \
1829 (graphene_simd4f_cmp_gt ((a), (b)))
/* Negation macro: expands to a call to the same-named function with its
 * argument parenthesized. */
1830#define graphene_simd4f_neg(s) \
1831 (graphene_simd4f_neg ((s)))
1834# error "Unsupported simd4f implementation."
#define GRAPHENE_END_DECLS
#define GRAPHENE_BEGIN_DECLS
static bool graphene_simd4f_is_zero4(const graphene_simd4f_t v)
#define graphene_simd4f_merge_low(a, b)
#define graphene_simd4f_get(s, i)
#define graphene_simd4f_get_y(s)
#define graphene_simd4f_shuffle_wxyz(s)
#define graphene_simd4f_rsqrt(s)
#define graphene_simd4f_cmp_eq(a, b)
static graphene_simd4f_t graphene_simd4f_normalize3(const graphene_simd4f_t v)
#define graphene_simd4f_dup_3f(s, v)
#define graphene_simd4f_zero_w(v)
#define graphene_simd4f_neg(s)
#define graphene_simd4f_mul(a, b)
#define graphene_simd4f_shuffle_zwxy(s)
static graphene_simd4f_t graphene_simd4f_clamp_scalar(const graphene_simd4f_t v, float min, float max)
static graphene_simd4f_t graphene_simd4f_normalize4(const graphene_simd4f_t v)
#define graphene_simd4f_merge_w(s, v)
#define graphene_simd4f_cmp_neq(a, b)
static float graphene_simd4f_sum_scalar(const graphene_simd4f_t v)
#define graphene_simd4f_cmp_ge(a, b)
static graphene_simd4f_t graphene_simd4f_dot2(const graphene_simd4f_t a, const graphene_simd4f_t b)
#define graphene_simd4f_init_zero()
#define graphene_simd4f_max(a, b)
static graphene_simd4f_t graphene_simd4f_interpolate(const graphene_simd4f_t a, const graphene_simd4f_t b, float f)
#define graphene_simd4f_sqrt(s)
#define graphene_simd4f_splat_z(s)
#define graphene_simd4f_cmp_le(a, b)
#define graphene_simd4f_init_2f(v)
#define graphene_simd4f_flip_sign_1010(s)
#define graphene_simd4f_cmp_gt(a, b)
static graphene_simd4f_t graphene_simd4f_dot4(const graphene_simd4f_t a, const graphene_simd4f_t b)
#define graphene_simd4f_dup_4f(s, v)
#define graphene_simd4f_splat_y(s)
#define graphene_simd4f_dot3_scalar(a, b)
#define graphene_simd4f_zero_zw(v)
static graphene_simd4f_t graphene_simd4f_normalize2(const graphene_simd4f_t v)
static bool graphene_simd4f_is_zero2(const graphene_simd4f_t v)
#define graphene_simd4f_merge_high(a, b)
static graphene_simd4f_t graphene_simd4f_max_val(const graphene_simd4f_t v)
static graphene_simd4f_t graphene_simd4f_clamp(const graphene_simd4f_t v, const graphene_simd4f_t min, const graphene_simd4f_t max)
#define graphene_simd4f_get_x(s)
#define graphene_simd4f_cmp_lt(a, b)
#define graphene_simd4f_init_4f(v)
#define graphene_simd4f_get_z(s)
#define graphene_simd4f_splat_x(s)
static graphene_simd4f_t graphene_simd4f_sum(const graphene_simd4f_t v)
#define graphene_simd4f_init_3f(v)
static bool graphene_simd4f_is_zero3(const graphene_simd4f_t v)
static graphene_simd4f_t graphene_simd4f_length4(const graphene_simd4f_t v)
#define graphene_simd4f_dot3(a, b)
#define graphene_simd4f_init(x, y, z, w)
#define graphene_simd4f_splat(v)
static graphene_simd4f_t graphene_simd4f_madd(const graphene_simd4f_t m1, const graphene_simd4f_t m2, const graphene_simd4f_t a)
#define graphene_simd4f_flip_sign_0101(s)
#define graphene_simd4f_get_w(s)
#define graphene_simd4f_min(a, b)
static graphene_simd4f_t graphene_simd4f_length2(const graphene_simd4f_t v)
#define graphene_simd4f_reciprocal(s)
#define graphene_simd4f_div(a, b)
#define graphene_simd4f_shuffle_yzwx(s)
#define graphene_simd4f_sub(a, b)
static graphene_simd4f_t graphene_simd4f_length3(const graphene_simd4f_t v)
#define graphene_simd4f_splat_w(s)
static graphene_simd4f_t graphene_simd4f_min_val(const graphene_simd4f_t v)
#define graphene_simd4f_dup_2f(s, v)
#define graphene_simd4f_cross3(a, b)
#define graphene_simd4f_add(a, b)
#define GRAPHENE_AVAILABLE_IN_1_4
#define GRAPHENE_AVAILABLE_IN_1_2
#define GRAPHENE_AVAILABLE_IN_1_0
CURL_EXTERN CURLMcode curl_socket_t s