Summary of this patch (text ahead of the first file header is not part of
any hunk):

  * lib/CodeGen/CGBuiltin.cpp, in CodeGenFunction::EmitARMBuiltinExpr:
    after the three operands are bitcast to Ty, the call to F now passes
    them as (Ops[1], Ops[2], Ops[0]) where it used to pass
    (Ops[0], Ops[1], Ops[2]). Per the comment added in the hunk, the NEON
    intrinsic puts the accumulator first, unlike the LLVM fma. The case
    label that owns this branch lies outside the visible hunk context.

  * test/CodeGen/arm-neon-fma.c (new file): runs clang for
    thumbv7-none-linux-gnueabihf and uses FileCheck to require that
    vfma_f32(accum, lhs, rhs) and vfmaq_f32(accum, lhs, rhs) produce calls
    to @llvm.fma.v2f32 / @llvm.fma.v4f32 with the operands in the order
    (%lhs, %rhs, %accum).

NOTE(review): the leading indentation of the hunk lines below was rebuilt
by convention; confirm it against the tree, or apply with whitespace
tolerance, before relying on it.

diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp
index 48ed65e3c7a3213167fa64385deeac9a675d3869..6678ebe71cf2d4cf5cf6911f4d0ecd5254abd283 100644
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -2132,7 +2132,9 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
-    return Builder.CreateCall3(F, Ops[0], Ops[1], Ops[2]);
+
+    // NEON intrinsic puts accumulator first, unlike the LLVM fma.
+    return Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]);
   }
   case ARM::BI__builtin_neon_vpadal_v:
   case ARM::BI__builtin_neon_vpadalq_v: {
diff --git a/test/CodeGen/arm-neon-fma.c b/test/CodeGen/arm-neon-fma.c
new file mode 100644
index 0000000000000000000000000000000000000000..7511fe16c6bdb5ccd2e11bf1021f74ff91948118
--- /dev/null
+++ b/test/CodeGen/arm-neon-fma.c
@@ -0,0 +1,16 @@
+// REQUIRES: arm-registered-target
+// RUN: %clang -target thumbv7-none-linux-gnueabihf \
+// RUN: -mcpu=cortex-a8 -mfloat-abi=hard \
+// RUN: -O3 -S -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_neon.h>
+
+float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs) {
+  return vfma_f32(accum, lhs, rhs);
+// CHECK: call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %rhs, <2 x float> %accum)
+}
+
+float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs) {
+  return vfmaq_f32(accum, lhs, rhs);
+// CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %rhs, <4 x float> %accum)
+}