From 48e271ebb49eac1dd305a58d68c4d0d98a767aff Mon Sep 17 00:00:00 2001
From: Michael Zuckerman <Michael.zuckerman@intel.com>
Date: Tue, 7 Jun 2016 14:00:20 +0000
Subject: [PATCH] [clang][AVX512][Intrinsics] Adding intrinsics
 reduce_[round]_{ss|sd} to clang

Differential Revision: http://reviews.llvm.org/D21014

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@272012 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/clang/Basic/BuiltinsX86.def |  2 +
 lib/Headers/avx512dqintrin.h        | 72 +++++++++++++++++++++++++++++
 test/CodeGen/avx512dq-builtins.c    | 72 +++++++++++++++++++++++++++++
 3 files changed, 146 insertions(+)

diff --git a/include/clang/Basic/BuiltinsX86.def b/include/clang/Basic/BuiltinsX86.def
index 7f95c3f1f21..68f68e26efe 100644
--- a/include/clang/Basic/BuiltinsX86.def
+++ b/include/clang/Basic/BuiltinsX86.def
@@ -1531,6 +1531,8 @@ TARGET_BUILTIN(__builtin_ia32_reducepd128_mask, "V2dV2dIiV2dUc", "", "avx512vl,a
 TARGET_BUILTIN(__builtin_ia32_reducepd256_mask, "V4dV4dIiV4dUc", "", "avx512vl,avx512dq")
 TARGET_BUILTIN(__builtin_ia32_reduceps128_mask, "V4fV4fIiV4fUc", "", "avx512vl,avx512dq")
 TARGET_BUILTIN(__builtin_ia32_reduceps256_mask, "V8fV8fIiV8fUc", "", "avx512vl,avx512dq")
+TARGET_BUILTIN(__builtin_ia32_reducesd_mask, "V2dV2dV2dV2dUcIiIi", "", "avx512vl,avx512dq")
+TARGET_BUILTIN(__builtin_ia32_reducess_mask, "V4fV4fV4fV4fUcIiIi", "", "avx512vl,avx512dq")
 TARGET_BUILTIN(__builtin_ia32_pmaddubsw128_mask, "V8sV16cV16cV8sUc", "", "avx512vl,avx512bw")
 TARGET_BUILTIN(__builtin_ia32_pmaddubsw256_mask, "V16sV32cV32cV16sUs", "", "avx512vl,avx512bw")
 TARGET_BUILTIN(__builtin_ia32_pmaddwd128_mask, "V4iV8sV8sV4iUc", "", "avx512vl,avx512bw")
diff --git a/lib/Headers/avx512dqintrin.h b/lib/Headers/avx512dqintrin.h
index e3ceffabb89..981688420ec 100644
--- a/lib/Headers/avx512dqintrin.h
+++ b/lib/Headers/avx512dqintrin.h
@@ -851,6 +851,78 @@ _mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) {
                                             (__v16sf)_mm512_setzero_ps(), \
                                             (__mmask16)(U), (int)(R)); })
 
+#define _mm_reduce_ss(A, B, C) __extension__ ({ \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+                                       (__v4sf)_mm_setzero_ps(), \
+                                       (__mmask8)-1, \
+                                       (int)(C), _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_reduce_ss(W, U, A, B, C) __extension__ ({ \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+                                       (__v4sf)(__m128)(W), \
+                                       (__mmask8)(U), \
+                                       (int)(C), _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_reduce_ss(U, A, B, C) __extension__ ({ \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+                                       (__v4sf)_mm_setzero_ps(), \
+                                       (__mmask8)(U), \
+                                       (int)(C), _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_reduce_round_ss(A, B, C, R) __extension__ ({ \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+                                       (__v4sf)_mm_setzero_ps(), \
+                                       (__mmask8)-1, \
+                                       (int)(C), (int)(R)); })
+
+#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) __extension__ ({ \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+                                       (__v4sf)(__m128)(W), \
+                                       (__mmask8)(U), \
+                                       (int)(C), (int)(R)); })
+
+#define _mm_maskz_reduce_round_ss(U, A, B, C, R) __extension__ ({ \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+                                       (__v4sf)_mm_setzero_ps(), \
+                                       (__mmask8)(U), \
+                                       (int)(C), (int)(R)); })
+
+#define _mm_reduce_sd(A, B, C) __extension__ ({ \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+                                        (__v2df)_mm_setzero_pd(), \
+                                        (__mmask8)-1, \
+                                        (int)(C), _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_reduce_sd(W, U, A, B, C) __extension__ ({ \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+                                        (__v2df)(__m128d)(W), \
+                                        (__mmask8)(U), \
+                                        (int)(C), _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_reduce_sd(U, A, B, C) __extension__ ({ \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+                                        (__v2df)_mm_setzero_pd(), \
+                                        (__mmask8)(U), \
+                                        (int)(C), _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_reduce_round_sd(A, B, C, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+                                        (__v2df)_mm_setzero_pd(), \
+                                        (__mmask8)-1, \
+                                        (int)(C), (int)(R)); })
+
+#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+                                        (__v2df)(__m128d)(W), \
+                                        (__mmask8)(U), \
+                                        (int)(C), (int)(R)); })
+
+#define _mm_maskz_reduce_round_sd(U, A, B, C, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+                                        (__v2df)_mm_setzero_pd(), \
+                                        (__mmask8)(U), \
+                                        (int)(C), (int)(R)); })
+
 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_movepi32_mask (__m512i __A) {
diff --git a/test/CodeGen/avx512dq-builtins.c b/test/CodeGen/avx512dq-builtins.c
index f9bc504a5e0..c1dea8c69a3 100644
--- a/test/CodeGen/avx512dq-builtins.c
+++ b/test/CodeGen/avx512dq-builtins.c
@@ -743,6 +743,78 @@ __m512 test_mm512_maskz_reduce_round_ps(__mmask16 __U, __m512 __A) {
   return _mm512_maskz_reduce_round_ps(__U, __A, 4, 8);
 }
 
+__m128 test_mm_reduce_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_reduce_ss
+  // CHECK: @llvm.x86.avx512.mask.reduce.ss
+  return _mm_reduce_ss(__A, __B, 4);
+}
+
+__m128 test_mm_mask_reduce_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_reduce_ss
+  // CHECK: @llvm.x86.avx512.mask.reduce.ss
+  return _mm_mask_reduce_ss(__W, __U, __A, __B, 4);
+}
+
+__m128 test_mm_maskz_reduce_ss(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_reduce_ss
+  // CHECK: @llvm.x86.avx512.mask.reduce.ss
+  return _mm_maskz_reduce_ss(__U, __A, __B, 4);
+}
+
+__m128 test_mm_reduce_round_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_reduce_round_ss
+  // CHECK: @llvm.x86.avx512.mask.reduce.ss
+  return _mm_reduce_round_ss(__A, __B, 4, 8);
+}
+
+__m128 test_mm_mask_reduce_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_reduce_round_ss
+  // CHECK: @llvm.x86.avx512.mask.reduce.ss
+  return _mm_mask_reduce_round_ss(__W, __U, __A, __B, 4, 8);
+}
+
+__m128 test_mm_maskz_reduce_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_reduce_round_ss
+  // CHECK: @llvm.x86.avx512.mask.reduce.ss
+  return _mm_maskz_reduce_round_ss(__U, __A, __B, 4, 8);
+}
+
+__m128d test_mm_reduce_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_reduce_sd
+  // CHECK: @llvm.x86.avx512.mask.reduce.sd
+  return _mm_reduce_sd(__A, __B, 4);
+}
+
+__m128d test_mm_mask_reduce_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_reduce_sd
+  // CHECK: @llvm.x86.avx512.mask.reduce.sd
+  return _mm_mask_reduce_sd(__W, __U, __A, __B, 4);
+}
+
+__m128d test_mm_maskz_reduce_sd(__mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_reduce_sd
+  // CHECK: @llvm.x86.avx512.mask.reduce.sd
+  return _mm_maskz_reduce_sd(__U, __A, __B, 4);
+}
+
+__m128d test_mm_reduce_round_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_reduce_round_sd
+  // CHECK: @llvm.x86.avx512.mask.reduce.sd
+  return _mm_reduce_round_sd(__A, __B, 4, 8);
+}
+
+__m128d test_mm_mask_reduce_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_reduce_round_sd
+  // CHECK: @llvm.x86.avx512.mask.reduce.sd
+  return _mm_mask_reduce_round_sd(__W, __U, __A, __B, 4, 8);
+}
+
+__m128d test_mm_maskz_reduce_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_reduce_round_sd
+  // CHECK: @llvm.x86.avx512.mask.reduce.sd
+  return _mm_maskz_reduce_round_sd(__U, __A, __B, 4, 8);
+}
+
 __mmask16 test_mm512_movepi32_mask(__m512i __A) {
   // CHECK-LABEL: @test_mm512_movepi32_mask
   // CHECK: @llvm.x86.avx512.cvtd2mask.512
--
GitLab
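
Editor's note: a minimal usage sketch of the intrinsics this patch adds, not part of the patch itself. The `4`/`8` immediates mirror the constants used in the tests above (per Intel's VREDUCE documentation, imm8 = 4 selects the current MXCSR rounding mode with zero preserved fraction bits, and 8 is _MM_FROUND_NO_EXC). It assumes a toolchain built with this patch, compiled with e.g. `clang -mavx512vl -mavx512dq example.c`.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128 a = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);  /* upper lanes pass through from a */
  __m128 b = _mm_set_ps(0.0f, 0.0f, 0.0f, 3.75f); /* only the low lane of b is reduced */

  /* With imm8 = 4 the low lane becomes b0 minus b0 rounded to 0
     fraction bits, i.e. 3.75 - 4.0 = -0.25 under the default
     round-to-nearest mode; lanes 1..3 are copied from a. */
  __m128 r = _mm_reduce_ss(a, b, 4);

  /* The round variant takes an explicit rounding override as the
     extra immediate, here _MM_FROUND_NO_EXC (8) as in the tests. */
  __m128d ad = _mm_set_sd(0.0);
  __m128d bd = _mm_set_sd(2.5);
  __m128d rd = _mm_reduce_round_sd(ad, bd, 4, 8);

  float lo;
  _mm_store_ss(&lo, r);
  printf("reduce_ss low lane: %f, reduce_round_sd low lane: %f\n",
         lo, _mm_cvtsd_f64(rd));
  return 0;
}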