diff --git a/include/clang/Sema/Sema.h b/include/clang/Sema/Sema.h
index 7514373897e2ca290213993635f942fbb2e13f0b..766bfa1e12f62c6f21ec38209ca4a73a1b8fd9a0 100644
--- a/include/clang/Sema/Sema.h
+++ b/include/clang/Sema/Sema.h
@@ -8794,12 +8794,18 @@ public:
 
   CUDAFunctionTarget IdentifyCUDATarget(const FunctionDecl *D);
 
+  // CUDA function call preference. Must be ordered numerically from
+  // worst to best.
   enum CUDAFunctionPreference {
     CFP_Never,      // Invalid caller/callee combination.
-    CFP_LastResort, // Lowest priority. Only in effect if
+    CFP_WrongSide,  // Calls from host-device to host or device
+                    // function that do not match current compilation
+                    // mode. Only in effect if
                     // LangOpts.CUDADisableTargetCallChecks is true.
-    CFP_Fallback,   // Low priority caller/callee combination
-    CFP_Best,       // Preferred caller/callee combination
+    CFP_HostDevice, // Any calls to host/device functions.
+    CFP_SameSide,   // Calls from host-device to host or device
+                    // function matching current compilation mode.
+    CFP_Native,     // host-to-host or device-to-device calls.
   };
 
   /// Identifies relative preference of a given Caller/Callee
diff --git a/lib/Sema/SemaCUDA.cpp b/lib/Sema/SemaCUDA.cpp
index 84fccd5ef59f6245b0505bd05cd1ffa623c166b2..4e59a0a0aaa422a33239c5d148ef02d1cef2c2be 100644
--- a/lib/Sema/SemaCUDA.cpp
+++ b/lib/Sema/SemaCUDA.cpp
@@ -68,26 +68,26 @@ Sema::CUDAFunctionTarget Sema::IdentifyCUDATarget(const FunctionDecl *D) {
 // Ph - preference in host mode
 // Pd - preference in device mode
 // H  - handled in (x)
-// Preferences: b-best, f-fallback, l-last resort, n-never.
+// Preferences: N:native, HD:host-device, SS:same side, WS:wrong side, --:never.
 //
-// | F  | T  | Ph | Pd |  H  |
-// |----+----+----+----+-----+
-// | d  | d  | b  | b  | (b) |
-// | d  | g  | n  | n  | (a) |
-// | d  | h  | l  | l  | (e) |
-// | d  | hd | f  | f  | (c) |
-// | g  | d  | b  | b  | (b) |
-// | g  | g  | n  | n  | (a) |
-// | g  | h  | l  | l  | (e) |
-// | g  | hd | f  | f  | (c) |
-// | h  | d  | l  | l  | (e) |
-// | h  | g  | b  | b  | (b) |
-// | h  | h  | b  | b  | (b) |
-// | h  | hd | f  | f  | (c) |
-// | hd | d  | l  | f  | (d) |
-// | hd | g  | f  | n  |(d/a)|
-// | hd | h  | f  | l  | (d) |
-// | hd | hd | b  | b  | (b) |
+// | F  | T  | Ph  | Pd  |  H  |
+// |----+----+-----+-----+-----+
+// | d  | d  | N   | N   | (c) |
+// | d  | g  | --  | --  | (a) |
+// | d  | h  | --  | --  | (e) |
+// | d  | hd | HD  | HD  | (b) |
+// | g  | d  | N   | N   | (c) |
+// | g  | g  | --  | --  | (a) |
+// | g  | h  | --  | --  | (e) |
+// | g  | hd | HD  | HD  | (b) |
+// | h  | d  | --  | --  | (e) |
+// | h  | g  | N   | N   | (c) |
+// | h  | h  | N   | N   | (c) |
+// | h  | hd | HD  | HD  | (b) |
+// | hd | d  | WS  | SS  | (d) |
+// | hd | g  | SS  | --  |(d/a)|
+// | hd | h  | SS  | WS  | (d) |
+// | hd | hd | HD  | HD  | (b) |
 
 Sema::CUDAFunctionPreference
 Sema::IdentifyCUDAPreference(const FunctionDecl *Caller,
@@ -112,39 +112,38 @@ Sema::IdentifyCUDAPreference(const FunctionDecl *Caller,
        (CallerTarget == CFT_HostDevice && getLangOpts().CUDAIsDevice)))
     return CFP_Never;
 
-  // (b) Best case scenarios
+  // (b) Calling HostDevice is OK for everyone.
+  if (CalleeTarget == CFT_HostDevice)
+    return CFP_HostDevice;
+
+  // (c) Best case scenarios
   if (CalleeTarget == CallerTarget ||
       (CallerTarget == CFT_Host && CalleeTarget == CFT_Global) ||
       (CallerTarget == CFT_Global && CalleeTarget == CFT_Device))
-    return CFP_Best;
-
-  // (c) Calling HostDevice is OK as a fallback that works for everyone.
-  if (CalleeTarget == CFT_HostDevice)
-    return CFP_Fallback;
-
-  // Figure out what should be returned 'last resort' cases. Normally
-  // those would not be allowed, but we'll consider them if
-  // CUDADisableTargetCallChecks is true.
-  CUDAFunctionPreference QuestionableResult =
-      getLangOpts().CUDADisableTargetCallChecks ? CFP_LastResort : CFP_Never;
+    return CFP_Native;
 
   // (d) HostDevice behavior depends on compilation mode.
   if (CallerTarget == CFT_HostDevice) {
-    // Calling a function that matches compilation mode is OK.
-    // Calling a function from the other side is frowned upon.
-    if (getLangOpts().CUDAIsDevice)
-      return CalleeTarget == CFT_Device ? CFP_Fallback : QuestionableResult;
-    else
-      return (CalleeTarget == CFT_Host || CalleeTarget == CFT_Global)
-                 ? CFP_Fallback
-                 : QuestionableResult;
+    // It's OK to call a compilation-mode matching function from an HD one.
+    if ((getLangOpts().CUDAIsDevice && CalleeTarget == CFT_Device) ||
+        (!getLangOpts().CUDAIsDevice &&
+         (CalleeTarget == CFT_Host || CalleeTarget == CFT_Global)))
+      return CFP_SameSide;
+
+    // We'll allow calls to non-mode-matching functions if target call
+    // checks are disabled. This is needed to avoid complaining about
+    // HD->H calls when we compile for device side and vice versa.
+    if (getLangOpts().CUDADisableTargetCallChecks)
+      return CFP_WrongSide;
+
+    return CFP_Never;
   }
 
   // (e) Calling across device/host boundary is not something you should do.
   if ((CallerTarget == CFT_Host && CalleeTarget == CFT_Device) ||
       (CallerTarget == CFT_Device && CalleeTarget == CFT_Host) ||
       (CallerTarget == CFT_Global && CalleeTarget == CFT_Host))
-    return QuestionableResult;
+    return CFP_Never;
 
   llvm_unreachable("All cases should've been handled by now.");
 }
diff --git a/lib/Sema/SemaOverload.cpp b/lib/Sema/SemaOverload.cpp
index d71b307d15b7588f0298f48895439070d8bd4d50..f190872f0aac1341c84cbcff0e8714862a18eb27 100644
--- a/lib/Sema/SemaOverload.cpp
+++ b/lib/Sema/SemaOverload.cpp
@@ -8722,14 +8722,44 @@ OverloadingResult
 OverloadCandidateSet::BestViableFunction(Sema &S, SourceLocation Loc,
                                          iterator &Best,
                                          bool UserDefinedConversion) {
+  llvm::SmallVector<OverloadCandidate *, 16> Candidates;
+  std::transform(begin(), end(), std::back_inserter(Candidates),
+                 [](OverloadCandidate &Cand) { return &Cand; });
+
+  // [CUDA] HD->H or HD->D calls are technically not allowed by CUDA
+  // but accepted by both clang and NVCC. However during a particular
+  // compilation mode only one call variant is viable. We need to
+  // exclude non-viable overload candidates from consideration based
+  // only on their host/device attributes. Specifically, if one
+  // candidate call is WrongSide and the other is SameSide, we ignore
+  // the WrongSide candidate.
+  if (S.getLangOpts().CUDA && S.getLangOpts().CUDATargetOverloads) {
+    const FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext);
+    bool ContainsSameSideCandidate =
+        llvm::any_of(Candidates, [&](OverloadCandidate *Cand) {
+          return Cand->Function &&
+                 S.IdentifyCUDAPreference(Caller, Cand->Function) ==
+                     Sema::CFP_SameSide;
+        });
+    if (ContainsSameSideCandidate) {
+      auto IsWrongSideCandidate = [&](OverloadCandidate *Cand) {
+        return Cand->Function &&
+               S.IdentifyCUDAPreference(Caller, Cand->Function) ==
+                   Sema::CFP_WrongSide;
+      };
+      Candidates.erase(std::remove_if(Candidates.begin(), Candidates.end(),
+                                      IsWrongSideCandidate),
+                       Candidates.end());
+    }
+  }
+
   // Find the best viable function.
   Best = end();
-  for (iterator Cand = begin(); Cand != end(); ++Cand) {
+  for (auto *Cand : Candidates)
     if (Cand->Viable)
       if (Best == end() || isBetterOverloadCandidate(S, *Cand, *Best, Loc,
                                                      UserDefinedConversion))
         Best = Cand;
-  }
 
   // If we didn't find any viable functions, abort.
   if (Best == end())
@@ -8739,7 +8769,7 @@ OverloadCandidateSet::BestViableFunction(Sema &S, SourceLocation Loc,
 
   // Make sure that this function is better than every other viable
   // function. If not, we have an ambiguity.
-  for (iterator Cand = begin(); Cand != end(); ++Cand) {
+  for (auto *Cand : Candidates) {
     if (Cand->Viable &&
         Cand != Best &&
         !isBetterOverloadCandidate(S, *Best, *Cand, Loc,
diff --git a/test/CodeGenCUDA/function-overload.cu b/test/CodeGenCUDA/function-overload.cu
index a12ef82773a240a73557be9de260b5975ce79aad..3f8f930106ff3346f87c038e49136d50ab883000 100644
--- a/test/CodeGenCUDA/function-overload.cu
+++ b/test/CodeGenCUDA/function-overload.cu
@@ -7,7 +7,8 @@
 // RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-HOST %s
 // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device \
 // RUN:     -fcuda-target-overloads -emit-llvm -o - %s \
-// RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE %s
+// RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE \
+// RUN:       -check-prefix=CHECK-DEVICE-STRICT %s
 
 // Check target overloads handling with disabled call target checks.
 // RUN: %clang_cc1 -DNOCHECKS -triple x86_64-unknown-linux-gnu -emit-llvm \
@@ -77,12 +78,112 @@ extern "C" __host__ int ch(void) {return 13;}
 extern "C" __host__ __device__ int chd(void) {return 14;}
 // CHECK-BOTH:     ret i32 14
 
+// HD functions are sometimes allowed to call H or D functions -- this
+// is an artifact of the source-to-source splitting performed by nvcc
+// that we need to mimic. During device mode compilation in nvcc, host
+// functions aren't present at all, so don't participate in
+// overloading. But in clang, H and D functions are present in both
+// compilation modes. Clang normally uses the target attribute as a
+// tiebreaker between overloads with otherwise identical priority, but
+// in order to match nvcc's behavior, we sometimes need to wholly
+// discard overloads that would not be present during compilation
+// under nvcc.
+
+template <typename T> T template_vs_function(T arg) { return 15; }
+__device__ float template_vs_function(float arg) { return 16; }
+
+// Here we expect to call the templated function during host
+// compilation, even if -fcuda-disable-target-call-checks is passed,
+// and even though C++ overload rules prefer the non-templated
+// function.
+// CHECK-BOTH-LABEL: define void @_Z5hd_tfv()
+__host__ __device__ void hd_tf(void) {
+  template_vs_function(1.0f);
+  // CHECK-HOST: call float @_Z20template_vs_functionIfET_S0_(float
+  // CHECK-DEVICE: call float @_Z20template_vs_functionf(float
+  template_vs_function(2.0);
+  // CHECK-HOST: call double @_Z20template_vs_functionIdET_S0_(double
+  // CHECK-DEVICE: call float @_Z20template_vs_functionf(float
+}
+
+// Calls from __host__ and __device__ functions should always call the
+// overloaded function that matches their mode.
+// CHECK-HOST-LABEL: define void @_Z4h_tfv()
+__host__ void h_tf() {
+  template_vs_function(1.0f);
+  // CHECK-HOST: call float @_Z20template_vs_functionIfET_S0_(float
+  template_vs_function(2.0);
+  // CHECK-HOST: call double @_Z20template_vs_functionIdET_S0_(double
+}
+
+// CHECK-DEVICE-LABEL: define void @_Z4d_tfv()
+__device__ void d_tf() {
+  template_vs_function(1.0f);
+  // CHECK-DEVICE: call float @_Z20template_vs_functionf(float
+  template_vs_function(2.0);
+  // CHECK-DEVICE: call float @_Z20template_vs_functionf(float
+}
+
+// In case we have a mix of HD and H-only or D-only candidates in the
+// overload set, normal C++ overload resolution rules apply first.
+template <typename T> T template_vs_hd_function(T arg) { return 15; }
+__host__ __device__ float template_vs_hd_function(float arg) { return 16; }
+
+// CHECK-BOTH-LABEL: define void @_Z7hd_thdfv()
+__host__ __device__ void hd_thdf() {
+  template_vs_hd_function(1.0f);
+  // CHECK-HOST: call float @_Z23template_vs_hd_functionf(float
+  // CHECK-DEVICE: call float @_Z23template_vs_hd_functionf(float
+  template_vs_hd_function(1);
+  // CHECK-HOST: call i32 @_Z23template_vs_hd_functionIiET_S0_(i32
+  // CHECK-DEVICE-STRICT: call float @_Z23template_vs_hd_functionf(float
+  // CHECK-DEVICE-NC: call i32 @_Z23template_vs_hd_functionIiET_S0_(i32
+}
+
+// CHECK-HOST-LABEL: define void @_Z6h_thdfv()
+__host__ void h_thdf() {
+  template_vs_hd_function(1.0f);
+  // CHECK-HOST: call float @_Z23template_vs_hd_functionf(float
+  template_vs_hd_function(1);
+  // CHECK-HOST: call i32 @_Z23template_vs_hd_functionIiET_S0_(i32
+}
+
+// CHECK-DEVICE-LABEL: define void @_Z6d_thdfv()
+__device__ void d_thdf() {
+  template_vs_hd_function(1.0f);
+  // CHECK-DEVICE: call float @_Z23template_vs_hd_functionf(float
+  template_vs_hd_function(1);
+  // Host-only function template is not callable with strict call checks,
+  // so for device side HD function will be the only choice.
+  // CHECK-DEVICE: call float @_Z23template_vs_hd_functionf(float
+}
+
+// Check that overloads still work the same way on both host and
+// device side when the overload set contains only functions from one
+// side of compilation.
+__device__ float device_only_function(int arg) { return 17; }
+__device__ float device_only_function(float arg) { return 18; }
+
+__host__ float host_only_function(int arg) { return 19; }
+__host__ float host_only_function(float arg) { return 20; }
+
+// CHECK-BOTH-LABEL: define void @_Z6hd_dofv()
+__host__ __device__ void hd_dof() {
+#ifdef NOCHECKS
+  device_only_function(1.0f);
+  // CHECK-BOTH-NC: call float @_Z20device_only_functionf(float
+  device_only_function(1);
+  // CHECK-BOTH-NC: call float @_Z20device_only_functioni(i32
+  host_only_function(1.0f);
+  // CHECK-BOTH-NC: call float @_Z18host_only_functionf(float
+  host_only_function(1);
+  // CHECK-BOTH-NC: call float @_Z18host_only_functioni(i32
+#endif
+}
+
+
 // CHECK-HOST-LABEL: define void @_Z5hostfv()
 __host__ void hostf(void) {
-#if defined (NOCHECKS)
-  fp_t dp = d;   // CHECK-HOST-NC: store {{.*}} @_Z1dv, {{.*}} %dp,
-  fp_t cdp = cd; // CHECK-HOST-NC: store {{.*}} @cd, {{.*}} %cdp,
-#endif
   fp_t hp = h; // CHECK-HOST: store {{.*}} @_Z1hv, {{.*}} %hp,
   fp_t chp = ch; // CHECK-HOST: store {{.*}} @ch, {{.*}} %chp,
   fp_t dhp = dh; // CHECK-HOST: store {{.*}} @_Z2dhv, {{.*}} %dhp,
@@ -91,10 +192,6 @@ __host__ void hostf(void) {
   fp_t chdp = chd; // CHECK-HOST: store {{.*}} @chd, {{.*}} %chdp,
   gp_t gp = g; // CHECK-HOST: store {{.*}} @_Z1gv, {{.*}} %gp,
 
-#if defined (NOCHECKS)
-  d();     // CHECK-HOST-NC: call i32 @_Z1dv()
-  cd();    // CHECK-HOST-NC: call i32 @cd()
-#endif
   h();     // CHECK-HOST: call i32 @_Z1hv()
   ch();    // CHECK-HOST: call i32 @ch()
   dh();    // CHECK-HOST: call i32 @_Z2dhv()
@@ -106,10 +203,6 @@ __host__ void hostf(void) {
 __device__ void devicef(void) {
   fp_t dp = d;   // CHECK-DEVICE: store {{.*}} @_Z1dv, {{.*}} %dp,
   fp_t cdp = cd; // CHECK-DEVICE: store {{.*}} @cd, {{.*}} %cdp,
-#if defined (NOCHECKS)
-  fp_t hp = h; // CHECK-DEVICE-NC: store {{.*}} @_Z1hv, {{.*}} %hp,
-  fp_t chp = ch; // CHECK-DEVICE-NC: store {{.*}} @ch, {{.*}} %chp,
-#endif
   fp_t dhp = dh; // CHECK-DEVICE: store {{.*}} @_Z2dhv, {{.*}} %dhp,
   fp_t cdhp = cdh; // CHECK-DEVICE: store {{.*}} @cdh, {{.*}} %cdhp,
   fp_t hdp = hd; // CHECK-DEVICE: store {{.*}} @_Z2hdv, {{.*}} %hdp,
@@ -117,10 +210,6 @@ __device__ void devicef(void) {
 
   d();     // CHECK-DEVICE: call i32 @_Z1dv()
   cd();    // CHECK-DEVICE: call i32 @cd()
-#if defined (NOCHECKS)
-  h();     // CHECK-DEVICE-NC: call i32 @_Z1hv()
-  ch();    // CHECK-DEVICE-NC: call i32 @ch()
-#endif
   dh();    // CHECK-DEVICE: call i32 @_Z2dhv()
   cdh();   // CHECK-DEVICE: call i32 @cdh()
 }
diff --git a/test/SemaCUDA/function-overload.cu b/test/SemaCUDA/function-overload.cu
index bd3fb508bfab8fc8c51d7ee121747139d8d2732f..bc9fe2a1ebb66418ceee6a898ee252fba998a6fd 100644
--- a/test/SemaCUDA/function-overload.cu
+++ b/test/SemaCUDA/function-overload.cu
@@ -70,13 +70,11 @@ extern "C" __host__ int ch(void) {return 11;}
 
 __host__ void hostf(void) {
   fp_t dp = d;
-  fp_t cdp = cd;
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{reference to __device__ function 'd' in __host__ function}}
+  // expected-error@-1 {{reference to __device__ function 'd' in __host__ function}}
   // expected-note@65 {{'d' declared here}}
-  // expected-error@-4 {{reference to __device__ function 'cd' in __host__ function}}
+  fp_t cdp = cd;
+  // expected-error@-1 {{reference to __device__ function 'cd' in __host__ function}}
   // expected-note@68 {{'cd' declared here}}
-#endif
   fp_t hp = h;
   fp_t chp = ch;
   fp_t dhp = dh;
@@ -84,13 +82,11 @@ __host__ void hostf(void) {
   gp_t gp = g;
 
   d();
-  cd();
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{no matching function for call to 'd'}}
+  // expected-error@-1 {{no matching function for call to 'd'}}
   // expected-note@65 {{candidate function not viable: call to __device__ function from __host__ function}}
-  // expected-error@-4 {{no matching function for call to 'cd'}}
+  cd();
+  // expected-error@-1 {{no matching function for call to 'cd'}}
   // expected-note@68 {{candidate function not viable: call to __device__ function from __host__ function}}
-#endif
   h();
   ch();
   dh();
@@ -104,13 +100,11 @@ __device__ void devicef(void) {
   fp_t dp = d;
   fp_t cdp = cd;
   fp_t hp = h;
-  fp_t chp = ch;
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{reference to __host__ function 'h' in __device__ function}}
+  // expected-error@-1 {{reference to __host__ function 'h' in __device__ function}}
   // expected-note@66 {{'h' declared here}}
-  // expected-error@-4 {{reference to __host__ function 'ch' in __device__ function}}
+  fp_t chp = ch;
+  // expected-error@-1 {{reference to __host__ function 'ch' in __device__ function}}
   // expected-note@69 {{'ch' declared here}}
-#endif
   fp_t dhp = dh;
   fp_t cdhp = cdh;
   gp_t gp = g; // expected-error {{reference to __global__ function 'g' in __device__ function}}
@@ -118,14 +112,10 @@ __device__ void devicef(void) {
 
   d();
   cd();
-  h();
-  ch();
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{no matching function for call to 'h'}}
+  h(); // expected-error {{no matching function for call to 'h'}}
   // expected-note@66 {{candidate function not viable: call to __host__ function from __device__ function}}
-  // expected-error@-4 {{no matching function for call to 'ch'}}
+  ch(); // expected-error {{no matching function for call to 'ch'}}
   // expected-note@69 {{candidate function not viable: call to __host__ function from __device__ function}}
-#endif
   dh();
   cdh();
   g(); // expected-error {{no matching function for call to 'g'}}
@@ -138,28 +128,25 @@ __global__ void globalf(void) {
   fp_t dp = d;
   fp_t cdp = cd;
   fp_t hp = h;
-  fp_t chp = ch;
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{reference to __host__ function 'h' in __global__ function}}
+  // expected-error@-1 {{reference to __host__ function 'h' in __global__ function}}
   // expected-note@66 {{'h' declared here}}
-  // expected-error@-4 {{reference to __host__ function 'ch' in __global__ function}}
+  fp_t chp = ch;
+  // expected-error@-1 {{reference to __host__ function 'ch' in __global__ function}}
   // expected-note@69 {{'ch' declared here}}
-#endif
   fp_t dhp = dh;
   fp_t cdhp = cdh;
-  gp_t gp = g; // expected-error {{reference to __global__ function 'g' in __global__ function}}
-               // expected-note@67 {{'g' declared here}}
+  gp_t gp = g;
+  // expected-error@-1 {{reference to __global__ function 'g' in __global__ function}}
+  // expected-note@67 {{'g' declared here}}
 
   d();
   cd();
   h();
-  ch();
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{no matching function for call to 'h'}}
+  // expected-error@-1 {{no matching function for call to 'h'}}
   // expected-note@66 {{candidate function not viable: call to __host__ function from __global__ function}}
-  // expected-error@-4 {{no matching function for call to 'ch'}}
+  ch();
+  // expected-error@-1 {{no matching function for call to 'ch'}}
   // expected-note@69 {{candidate function not viable: call to __host__ function from __global__ function}}
-#endif
   dh();
   cdh();
   g(); // expected-error {{no matching function for call to 'g'}}