diff --git a/include/clang/Basic/BuiltinsNVPTX.def b/include/clang/Basic/BuiltinsNVPTX.def
index b784bade0a39cac183a109458283a1052f1bf092..bb02c2aa8b240c93958703c4639ff55f7b4a52f0 100644
--- a/include/clang/Basic/BuiltinsNVPTX.def
+++ b/include/clang/Basic/BuiltinsNVPTX.def
@@ -402,6 +402,19 @@ BUILTIN(__nvvm_bar0_popc, "ii", "")
 BUILTIN(__nvvm_bar0_and, "ii", "")
 BUILTIN(__nvvm_bar0_or, "ii", "")
 
+// Shuffle
+//
+// Type strings use the Builtins.def encoding: the first character is the
+// return type, the rest are parameter types ('i' = int, 'f' = float).
+BUILTIN(__builtin_ptx_shfl_down_i32, "iiii", "")
+BUILTIN(__builtin_ptx_shfl_down_f32, "ffii", "")
+BUILTIN(__builtin_ptx_shfl_up_i32, "iiii", "")
+BUILTIN(__builtin_ptx_shfl_up_f32, "ffii", "")
+BUILTIN(__builtin_ptx_shfl_bfly_i32, "iiii", "")
+BUILTIN(__builtin_ptx_shfl_bfly_f32, "ffii", "")
+BUILTIN(__builtin_ptx_shfl_idx_i32, "iiii", "")
+BUILTIN(__builtin_ptx_shfl_idx_f32, "ffii", "")
+
 // Membar
 
 BUILTIN(__nvvm_membar_cta, "v", "")
diff --git a/lib/Headers/__clang_cuda_intrinsics.h b/lib/Headers/__clang_cuda_intrinsics.h
index 4ca2940d24a346072ea9eb1bf95a0da1f5087197..de5171720ce5f8abbb1cedf7d8d7a52c5eff3657 100644
--- a/lib/Headers/__clang_cuda_intrinsics.h
+++ b/lib/Headers/__clang_cuda_intrinsics.h
@@ -26,6 +26,81 @@
 #error "This file is for CUDA compilation only."
 #endif
 
+// sm_30 intrinsics: __shfl, __shfl_{up,down,xor}.
+
+// Prevent the vanilla sm_30 intrinsics headers from being included.
+#define __SM_30_INTRINSICS_H__
+#define __SM_30_INTRINSICS_HPP__
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+#pragma push_macro("__MAKE_SHUFFLES")
+// Defines one __shfl-style function as a set of overloads: int, float,
+// unsigned int, long long, unsigned long long, and double.  The 64-bit
+// overloads shuffle their argument as two 32-bit halves, using memcpy
+// (rather than pointer casts) to avoid strict-aliasing violations.
+#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask)    \
+  inline __device__ int __FnName(int __in, int __offset,                       \
+                                 int __width = warpSize) {                     \
+    return __IntIntrinsic(__in, __offset,                                      \
+                          ((warpSize - __width) << 8) | (__Mask));             \
+  }                                                                            \
+  inline __device__ float __FnName(float __in, int __offset,                   \
+                                   int __width = warpSize) {                   \
+    return __FloatIntrinsic(__in, __offset,                                    \
+                            ((warpSize - __width) << 8) | (__Mask));           \
+  }                                                                            \
+  inline __device__ unsigned int __FnName(unsigned int __in, int __offset,     \
+                                          int __width = warpSize) {            \
+    return static_cast<unsigned int>(                                          \
+        ::__FnName(static_cast<int>(__in), __offset, __width));                \
+  }                                                                            \
+  inline __device__ long long __FnName(long long __in, int __offset,           \
+                                       int __width = warpSize) {               \
+    struct __Bits {                                                            \
+      int __a, __b;                                                            \
+    };                                                                         \
+    _Static_assert(sizeof(__in) == sizeof(__Bits));                            \
+    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
+    __Bits __tmp;                                                              \
+    memcpy(&__tmp, &__in, sizeof(__in));                                       \
+    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width);                      \
+    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width);                      \
+    long long __out;                                                           \
+    memcpy(&__out, &__tmp, sizeof(__tmp));                                     \
+    return __out;                                                              \
+  }                                                                            \
+  inline __device__ unsigned long long __FnName(                               \
+      unsigned long long __in, int __offset, int __width = warpSize) {         \
+    return static_cast<unsigned long long>(                                    \
+        ::__FnName(static_cast<long long>(__in), __offset, __width));          \
+  }                                                                            \
+  inline __device__ double __FnName(double __in, int __offset,                 \
+                                    int __width = warpSize) {                  \
+    long long __tmp;                                                           \
+    _Static_assert(sizeof(__tmp) == sizeof(__in));                             \
+    memcpy(&__tmp, &__in, sizeof(__in));                                       \
+    __tmp = ::__FnName(__tmp, __offset, __width);                              \
+    double __out;                                                              \
+    memcpy(&__out, &__tmp, sizeof(__out));                                     \
+    return __out;                                                              \
+  }
+
+__MAKE_SHUFFLES(__shfl, __builtin_ptx_shfl_idx_i32, __builtin_ptx_shfl_idx_f32,
+                0x1f);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SHUFFLES(__shfl_up, __builtin_ptx_shfl_up_i32, __builtin_ptx_shfl_up_f32,
+                0);
+__MAKE_SHUFFLES(__shfl_down, __builtin_ptx_shfl_down_i32,
+                __builtin_ptx_shfl_down_f32, 0x1f);
+__MAKE_SHUFFLES(__shfl_xor, __builtin_ptx_shfl_bfly_i32,
+                __builtin_ptx_shfl_bfly_f32, 0x1f);
+
+#pragma pop_macro("__MAKE_SHUFFLES")
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
 // sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
 
 // Prevent the vanilla sm_32 intrinsics header from being included.
diff --git a/lib/Headers/__clang_cuda_runtime_wrapper.h b/lib/Headers/__clang_cuda_runtime_wrapper.h
index bce9d72af354f1e180e63d9a446c4ea4cb7c1554..6445f9b76b8f9cd3948d15b746a00d144664e998 100644
--- a/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ b/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -198,13 +198,14 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
 #include "sm_20_atomic_functions.hpp"
 #include "sm_20_intrinsics.hpp"
 #include "sm_32_atomic_functions.hpp"
-// sm_30_intrinsics.h has declarations that use default argument, so
-// we have to include it and it will in turn include .hpp
-#include "sm_30_intrinsics.h"
 
-// Don't include sm_32_intrinsics.h.  That header defines __ldg using inline
-// asm, but we want to define it using builtins, because we can't use the
-// [addr+imm] addressing mode if we use the inline asm in the header.
+// Don't include sm_30_intrinsics.h and sm_32_intrinsics.h.  These define the
+// __shfl and __ldg intrinsics using inline (volatile) asm, but we want to
+// define them using builtins so that the optimizer can reason about and across
+// these instructions.  In particular, using intrinsics for ldg gets us the
+// [addr+imm] addressing mode, which, although it doesn't actually exist in the
+// hardware, seems to generate faster machine code because ptxas can more easily
+// reason about our code.
 
 #undef __MATH_FUNCTIONS_HPP__