diff --git a/cmake/modules/FindTBB.cmake b/cmake/modules/FindTBB.cmake
index a95f0e4d37a59afa85910d00cb650678b1707110..be1402d866dbeba115c2cac534439858be731d41 100644
--- a/cmake/modules/FindTBB.cmake
+++ b/cmake/modules/FindTBB.cmake
@@ -307,6 +307,8 @@ endif()
 # set variable for config.h
 set(HAVE_TBB ${TBB_FOUND})
 
+# provide include_sys_dir
+include(XtCompilerSupport)
 # perform DUNE-specific setup tasks
 if(TBB_FOUND)
   set(TBB_CACHE_ALIGNED_ALLOCATOR_ALIGNMENT 128)
@@ -320,6 +322,9 @@ if(TBB_FOUND)
                               ${TBB_INCLUDE_DIRS}
                               LIBRARIES
                               ${TBB_LIBRARIES})
+  foreach(_idir ${TBB_INCLUDE_DIRS})
+    include_sys_dir(${_idir})
+  endforeach()
 endif()
 
 # function for adding TBB flags to a list of targets
diff --git a/cmake/modules/XtCompilerSupport.cmake b/cmake/modules/XtCompilerSupport.cmake
index ec509c42fa0a2f076cc2c4907f39335a007a4292..8991f937255e42cdef409a726e24211fa9ba20d1 100644
--- a/cmake/modules/XtCompilerSupport.cmake
+++ b/cmake/modules/XtCompilerSupport.cmake
@@ -30,7 +30,7 @@ macro(INCLUDE_SYS_DIR)
       if(${ARG} MATCHES "/usr/include")
         message(AUTHOR_WARNING "-isystem not supported for ${ARG}")
       else()
-        add_definitions("-isystem ${_idir}")
+        add_definitions("-isystem ${ARG}")
       endif()
     else(IS_DIRECTORY ${ARG})
       message(STATUS "Include directory ${ARG} does not exist")
diff --git a/dune/xt/common/disable_warnings.hh b/dune/xt/common/disable_warnings.hh
index 19145378e8caa5a9fca741f7836cae3c3f23d52c..ac0bc850dd588ef023d374f5813753af07ef4df8 100644
--- a/dune/xt/common/disable_warnings.hh
+++ b/dune/xt/common/disable_warnings.hh
@@ -23,7 +23,6 @@
 #pragma GCC diagnostic ignored "-Wfloat-equal"
 #pragma GCC diagnostic ignored "-Wignored-qualifiers"
 #pragma GCC diagnostic ignored "-Wlogical-not-parentheses"
-#pragma GCC diagnostic ignored "-Wlogical-op"
 #pragma GCC diagnostic ignored "-Wlogical-op-parentheses"
 #pragma GCC diagnostic ignored "-Wmismatched-tags"
 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
diff --git a/dune/xt/common/lpsolve.cc b/dune/xt/common/lpsolve.cc
index 5cbd7f5060254aeced08d2994aedad0bcd1993b5..64748b52dcdcdae118cce4cea741da9eb71a156b 100644
--- a/dune/xt/common/lpsolve.cc
+++ b/dune/xt/common/lpsolve.cc
@@ -37,6 +37,11 @@ namespace lp_solve {
 
 
 #if HAVE_LPSOLVE
+LinearProgram::LinearProgram() // default construction: no lp_solve model is created
+  : lp_(nullptr) // NOTE(review): confirm that ~LinearProgram() and users of data() tolerate a null lp_
+{
+}
+
 LinearProgram::LinearProgram(int rows, int cols)
   : lp_(::make_lp(rows, cols))
 {
@@ -55,6 +60,11 @@ lprec* LinearProgram::data()
 }
 
 #else // HAVE_LPSOLVE
+LinearProgram::LinearProgram() // variant for builds without lp_solve: construction always throws
+{
+  DUNE_THROW(Exceptions::dependency_missing, "You are missing lp_solve, check available() first!");
+}
+
 LinearProgram::LinearProgram(int /*rows*/, int /*cols*/)
 {
   DUNE_THROW(Exceptions::dependency_missing, "You are missing lp_solve, check available() first!");
diff --git a/dune/xt/common/lpsolve.hh b/dune/xt/common/lpsolve.hh
index b81b593132b28d248b3506bb164ae57e7033173f..3879018a06e5386b8c450899dff2dc691bc1d8a5 100644
--- a/dune/xt/common/lpsolve.hh
+++ b/dune/xt/common/lpsolve.hh
@@ -28,6 +28,7 @@ namespace lp_solve {
 
 struct LinearProgram
 {
+  LinearProgram();
   LinearProgram(int rows, int cols);
   ~LinearProgram();
 
diff --git a/dune/xt/common/parallel/threadstorage.hh b/dune/xt/common/parallel/threadstorage.hh
index 1dca7cc97f95dd84f0b29231f9d9efb543fe93b3..f3d733285c27c35fe1b5ef045d27648ffd906c71 100644
--- a/dune/xt/common/parallel/threadstorage.hh
+++ b/dune/xt/common/parallel/threadstorage.hh
@@ -11,57 +11,303 @@
 #ifndef DUNE_XT_COMMON_PARALLEL_THREADSTORAGE_HH
 #define DUNE_XT_COMMON_PARALLEL_THREADSTORAGE_HH
 
-#include <deque>
+#if HAVE_TBB
+// Hack to fix compilation with clang as tbb does not detect C++11 feature correctly for clang. Recent versions of TBB
+// allow to set the macro TBB_USE_GLIBCXX_VERSION to the proper version of libstdc++ to fix this issue, see
+// https://www.threadingbuildingblocks.org/docs/help/reference/appendices/known_issues/linux_os.html. For older versions
+// we need the hack below.
+#include <tbb/tbb_config.h>
+#undef __TBB_CPP11_RVALUE_REF_PRESENT
+#undef __TBB_CPP11_VARIADIC_TEMPLATES_PRESENT
+#undef __TBB_CPP11_DECLTYPE_PRESENT
+#undef __TBB_CPP11_LAMBDAS_PRESENT
+#define __TBB_CPP11_RVALUE_REF_PRESENT 1
+#define __TBB_CPP11_VARIADIC_TEMPLATES_PRESENT 1
+#define __TBB_CPP11_DECLTYPE_PRESENT 1
+#define __TBB_CPP11_LAMBDAS_PRESENT 1
+#include <tbb/enumerable_thread_specific.h>
+#endif
+
 #include <algorithm>
+#include <array>
+#include <functional>
+#include <list>
 #include <numeric>
 #include <type_traits>
-#include <numeric>
-#include <list>
-#include <functional>
+#include <utility>
 
+// TODO: the following includes can be removed when UnsafePerThreadValue is removed
+#include <deque>
+#include <memory>
 #include <boost/noncopyable.hpp>
-
-#include <dune/xt/common/type_traits.hh>
-#include <dune/xt/common/memory.hh>
 #include <dune/xt/common/parallel/threadmanager.hh>
 
 namespace Dune {
 namespace XT {
 namespace Common {
+namespace internal {
+
+
+#if HAVE_TBB
+//! Wrapper used if TBB is available: the values are stored in a tbb::enumerable_thread_specific.
+template <class ValueImp>
+class EnumerableThreadSpecificWrapper
+{
+  // enumerable_thread_specific does not compile with ConstValueType as template param, so the const is removed
+  using BackendType = typename tbb::enumerable_thread_specific<std::remove_const_t<ValueImp>>;
+
+public:
+  using ValueType = ValueImp;
+  using ConstValueType = std::add_const_t<ValueType>;
+  using iterator = typename BackendType::iterator;
+  using const_iterator = typename BackendType::const_iterator;
+  //! Forwards \param ctor_args unchanged to the constructor of the TBB container
+  template <class... InitTypes>
+  explicit EnumerableThreadSpecificWrapper(InitTypes&&... ctor_args)
+    : values_(std::forward<InitTypes>(ctor_args)...)
+  {
+  }
+  //! \return the result of local() of the TBB container
+  ValueType& local()
+  {
+    return values_.local();
+  }
+
+  // tbb does not provide a const version of local (as elements may be inserted when a new thread accesses values_), so
+  // values_ has to be mutable
+  const ValueType& local() const
+  {
+    return values_.local();
+  }
+  //! Iterators over the elements currently stored in the TBB container
+  typename BackendType::iterator begin()
+  {
+    return values_.begin();
+  }
+
+  typename BackendType::iterator end()
+  {
+    return values_.end();
+  }
+
+  typename BackendType::const_iterator begin() const
+  {
+    return values_.begin();
+  }
+
+  typename BackendType::const_iterator end() const
+  {
+    return values_.end();
+  }
+  //! \return the result of combine(op) of the TBB container
+  template <class BinaryOperation>
+  ValueType combine(BinaryOperation op) const
+  {
+    return values_.combine(op);
+  }
+
+private:
+  mutable BackendType values_;
+}; // class EnumerableThreadSpecificWrapper<ValueImp>
+
+#else // HAVE_TBB
+//! Fallback used if TBB is not available: stores exactly one value in a std::array of size 1.
+template <class ValueImp>
+class EnumerableThreadSpecificWrapper
+{
+  using BackendType = std::array<std::remove_const_t<ValueImp>, 1>;
+
+public:
+  using ValueType = ValueImp;
+  using ConstValueType = std::add_const_t<ValueType>;
+  using iterator = typename BackendType::iterator;
+  using const_iterator = typename BackendType::const_iterator;
+
+  //! Initialization by copy construction of ValueType
+  explicit EnumerableThreadSpecificWrapper(ConstValueType& value)
+    : values_{value}
+  {
+  }
+
+  //! Initialization by in-place construction ValueType with \param ctor_args
+  template <class... InitTypes>
+  explicit EnumerableThreadSpecificWrapper(InitTypes&&... ctor_args)
+    : values_{ValueType(std::forward<InitTypes>(ctor_args)...)}
+  {
+  }
+  //! \return the single stored value
+  ValueType& local()
+  {
+    return values_[0];
+  }
+
+  const ValueType& local() const
+  {
+    return values_[0];
+  }
+  //! Iterators over the one-element array
+  iterator begin()
+  {
+    return values_.begin();
+  }
+
+  iterator end()
+  {
+    return values_.end();
+  }
+
+  const_iterator begin() const
+  {
+    return values_.begin();
+  }
+
+  const_iterator end() const
+  {
+    return values_.end();
+  }
+  //! \return a copy of the single stored value, the operation is not used as there is nothing to combine
+  template <class BinaryOperation>
+  ValueType combine(BinaryOperation /*op*/) const
+  {
+    return values_[0];
+  }
+
+private:
+  BackendType values_;
+}; // class EnumerableThreadSpecificWrapper<ValueImp>
+
+#endif // HAVE_TBB
+
+
+} // namespace internal
+
 
 /** Automatic Storage of non-static, N thread-local values
  **/
 template <class ValueImp>
-class PerThreadValue : public boost::noncopyable
+class PerThreadValue
+{
+  using ContainerType = internal::EnumerableThreadSpecificWrapper<ValueImp>;
+
+public:
+  using ValueType = typename ContainerType::ValueType;
+  using ConstValueType = typename ContainerType::ConstValueType;
+
+  //! Initialization by copy construction of ValueType
+  explicit PerThreadValue(ConstValueType& value)
+    : values_(value)
+  {
+  }
+
+  //! Initialization by in-place construction ValueType with \param ctor_args
+  template <class... InitTypes>
+  explicit PerThreadValue(InitTypes&&... ctor_args)
+    : values_(std::forward<InitTypes>(ctor_args)...)
+  {
+  }
+  //! Implicit conversion to a copy of values_.local()
+  operator ValueType() const
+  {
+    return values_.local();
+  }
+  //! Access to values_.local(), by reference (operator*) or by pointer (operator->)
+  ValueType& operator*()
+  {
+    return values_.local();
+  }
+
+  ConstValueType& operator*() const
+  {
+    return values_.local();
+  }
+
+  ValueType* operator->()
+  {
+    return &values_.local();
+  }
+
+  ConstValueType* operator->() const
+  {
+    return &values_.local();
+  }
+  //! Folds all stored values into \param init using \param op (std::accumulate), yields init if nothing is stored
+  template <class BinaryOperation>
+  ValueType accumulate(ValueType init, BinaryOperation op) const
+  {
+    return std::accumulate(values_.begin(), values_.end(), init, op); // combine(op) mishandles the empty case
+  }
+  //! \return accumulate() with std::plus, starting from ValueType(0)
+  ValueType sum() const
+  {
+    return accumulate(ValueType(0), std::plus<ValueType>());
+  }
+  //! Iterators over the stored values
+  typename ContainerType::iterator begin()
+  {
+    return values_.begin();
+  }
+  typename ContainerType::iterator end()
+  {
+    return values_.end();
+  }
+
+  typename ContainerType::const_iterator begin() const
+  {
+    return values_.begin();
+  }
+  typename ContainerType::const_iterator end() const
+  {
+    return values_.end();
+  }
+
+private:
+  ContainerType values_;
+}; // class PerThreadValue<ValueImp>
+
+
+/**
+ * Previous implementation of PerThreadValue. This implementation suffers from the fact that it is not possible (or
+ * at least we did not find a way yet) to set a hard upper limit on the number of threads TBB uses. Setting max_threads
+ * via tbb::task_scheduler_init apparently only sets a soft limit on the number of threads. In addition, even if TBB
+ * uses only N threads at a time, it might be possible that a thread is destroyed and later in the program another
+ * thread with a different id replaces it, which will then get a number greater than or equal to N in our
+ * implementation (see ThreadManager::thread()). This occasionally leads to segfaults.
+ * We keep this implementation around as it is currently used by TimingData (see dune/xt/common/timings.hh) and the
+ * new implementation can't replace it in that context, as the new implementation based on
+ * tbb::enumerable_thread_specific lazily initializes the values in each thread.
+ * \todo Either fix TimingData and remove this class or fix this class.
+ **/
+template <class ValueImp>
+class UnsafePerThreadValue : public boost::noncopyable
 {
 public:
   typedef ValueImp ValueType;
   typedef typename std::conditional<std::is_const<ValueImp>::value, ValueImp, const ValueImp>::type ConstValueType;
 
 private:
-  typedef PerThreadValue<ValueImp> ThisType;
+  typedef UnsafePerThreadValue<ValueImp> ThisType;
   typedef std::deque<std::unique_ptr<ValueType>> ContainerType;
 
 public:
   //! Initialization by copy construction of ValueType
-  explicit PerThreadValue(ConstValueType& value)
+  explicit UnsafePerThreadValue(ConstValueType& value)
     : values_(threadManager().max_threads())
   {
-    std::generate(values_.begin(), values_.end(), [=]() { return Common::make_unique<ValueType>(value); });
+    std::generate(values_.begin(), values_.end(), [=]() { return std::make_unique<ValueType>(value); });
   }
 
   //! Initialization by in-place construction ValueType with \param ctor_args
   template <class... InitTypes>
-  explicit PerThreadValue(InitTypes&&... ctor_args)
+  explicit UnsafePerThreadValue(InitTypes&&... ctor_args)
     : values_(threadManager().max_threads())
   {
     for (auto&& val : values_)
-      val = Common::make_unique<ValueType>(ctor_args...);
+      val = std::make_unique<ValueType>(ctor_args...);
   }
 
   ThisType& operator=(ConstValueType&& value)
   {
-    std::generate(values_.begin(), values_.end(), [=]() { return Common::make_unique<ValueType>(value); });
+    std::generate(values_.begin(), values_.end(), [=]() { return std::make_unique<ValueType>(value); });
     return *this;
   }
 
@@ -128,7 +371,8 @@ public:
 
 private:
   ContainerType values_;
-};
+}; // class UnsafePerThreadValue<...>
+
 
 template <class Imp, typename Result, class Reduction = std::plus<Result>>
 class ThreadResultPropagator
@@ -167,6 +411,8 @@ private:
   Imp* imp_;
   std::list<Imp*> copies_;
 };
+
+
 } // namespace Common
 } // namespace XT
 } // namespace Dune
diff --git a/dune/xt/common/test/parallel.cc b/dune/xt/common/test/parallel.cc
index 65dd8261ec4425c4b0eef061c626bd35de55e766..27c5bcc451275d46420877a6fc32f9765aa8d958 100644
--- a/dune/xt/common/test/parallel.cc
+++ b/dune/xt/common/test/parallel.cc
@@ -11,11 +11,11 @@
 
 #include <dune/xt/common/test/main.hxx>
 
-#include <string>
-#include <memory>
 #include <array>
-#include <initializer_list>
+#include <thread>
+#include <type_traits>
 #include <vector>
+
 #include <dune/xt/common/parallel/threadmanager.hh>
 #include <dune/xt/common/parallel/threadstorage.hh>
 #include <dune/xt/common/parallel/helper.hh>
@@ -80,7 +80,7 @@ TYPED_TEST(ThreadValueTest, All)
     typename PTVType::ValueType value(1);
     PTVType foo(value);
     check_eq(foo, value);
-    foo = typename PTVType::ValueType(1);
+    foo = PTVType(1);
     check_eq(foo, value);
     const auto new_value = *foo;
     const PTVType bar(*foo);
@@ -89,12 +89,22 @@ TYPED_TEST(ThreadValueTest, All)
   {
     typename PTVType::ValueType zero(0);
     PTVType foo(zero);
+    size_t num_threads = Dune::XT::Common::threadManager().max_threads();
+    std::vector<std::thread> threads(num_threads);
+    for (size_t ii = 0; ii < num_threads; ++ii)
+      threads[ii] = std::thread([&foo, &zero]() { EXPECT_EQ(*foo, zero); });
+    for (size_t ii = 0; ii < num_threads; ++ii)
+      threads[ii].join();
     auto sum = foo.accumulate(0, std::plus<typename PTVType::ValueType>());
-    EXPECT_EQ(Dune::XT::Common::threadManager().max_threads() * zero, sum);
+    EXPECT_EQ(num_threads * zero, sum);
     typename PTVType::ValueType one = 1;
     PTVType bar(one);
+    for (size_t ii = 0; ii < num_threads; ++ii)
+      threads[ii] = std::thread([&bar, &one]() { EXPECT_EQ(*bar, one); });
+    for (size_t ii = 0; ii < num_threads; ++ii)
+      threads[ii].join();
     sum = bar.accumulate(0, std::plus<typename PTVType::ValueType>());
-    EXPECT_EQ(Dune::XT::Common::threadManager().max_threads() * one, sum);
+    EXPECT_EQ(num_threads * one, sum);
   }
 }
 
diff --git a/dune/xt/common/timings.cc b/dune/xt/common/timings.cc
index b42f5b5635171ec9b676d67fb14840e6a247f818..ecde894b7a2e6e90337e059fabd8f3ff9c30fc9d 100644
--- a/dune/xt/common/timings.cc
+++ b/dune/xt/common/timings.cc
@@ -39,6 +39,7 @@
 #include <dune/xt/common/filesystem.hh>
 #include <dune/xt/common/logging.hh>
 #include <dune/xt/common/parallel/threadmanager.hh>
+#include <dune/xt/common/parallel/threadstorage.hh>
 
 #include <map>
 #include <string>
@@ -97,7 +98,6 @@ void Timings::start(std::string section_name)
   if (section != known_timers_map_.end()) {
     if (section->second.first) // timer currently running
       return;
-
     section->second.first = true; // set active, start with new
     section->second.second = TimingData(section_name);
   } else {
diff --git a/dune/xt/common/timings.hh b/dune/xt/common/timings.hh
index 2af4ad4ce3a930704cd75072516e78d6498589db..87220c07189303dcc2fd58a0c79847af8e2fdaf2 100644
--- a/dune/xt/common/timings.hh
+++ b/dune/xt/common/timings.hh
@@ -82,7 +82,7 @@ class Timings
 private:
   Timings();
 
-  typedef std::map<std::string, std::pair<std::atomic<bool>, PerThreadValue<TimingData>>> KnownTimersMap;
+  typedef std::map<std::string, std::pair<std::atomic<bool>, UnsafePerThreadValue<TimingData>>> KnownTimersMap;
   //! section name -> seconds
   typedef std::map<std::string, TimingData::DeltaType> DeltaMap;