From 4c3ccdfbbe8bdd28ec2ef2509672c8077370d2cb Mon Sep 17 00:00:00 2001
From: Felix Schindler <felix.schindler@wwu.de>
Date: Tue, 21 Nov 2017 12:47:28 +0100
Subject: [PATCH] [pybind11] add state of
 https://github.com/pybind/pybind11.git v2.2.1

---
 pybind11/.appveyor.yml                        |   66 +
 pybind11/.readthedocs.yml                     |    3 +
 pybind11/.travis.yml                          |  212 +++
 pybind11/CMakeLists.txt                       |   87 +-
 pybind11/ISSUE_TEMPLATE.md                    |   17 +
 pybind11/MANIFEST.in                          |    2 +-
 pybind11/README.md                            |    9 +-
 pybind11/docs/Doxyfile                        |   20 +
 pybind11/docs/advanced/cast/chrono.rst        |    4 +-
 pybind11/docs/advanced/cast/custom.rst        |    6 +
 pybind11/docs/advanced/cast/eigen.rst         |  316 +++-
 pybind11/docs/advanced/cast/functional.rst    |    6 +-
 pybind11/docs/advanced/cast/index.rst         |    1 +
 pybind11/docs/advanced/cast/overview.rst      |   17 +
 pybind11/docs/advanced/cast/stl.rst           |  121 +-
 pybind11/docs/advanced/cast/strings.rst       |  303 +++
 pybind11/docs/advanced/classes.rst            |  538 +++++-
 pybind11/docs/advanced/embedding.rst          |  261 +++
 pybind11/docs/advanced/functions.rst          |  243 ++-
 pybind11/docs/advanced/misc.rst               |   93 +-
 pybind11/docs/advanced/pycpp/numpy.rst        |  156 +-
 pybind11/docs/advanced/pycpp/object.rst       |   80 +-
 pybind11/docs/advanced/pycpp/utilities.rst    |  103 +-
 pybind11/docs/advanced/smart_ptrs.rst         |   27 +-
 pybind11/docs/basics.rst                      |   58 +-
 pybind11/docs/benchmark.py                    |    4 +-
 pybind11/docs/benchmark.rst                   |    4 +-
 pybind11/docs/changelog.rst                   |  454 +++++
 pybind11/docs/classes.rst                     |   81 +-
 pybind11/docs/compiling.rst                   |  124 +-
 pybind11/docs/conf.py                         |   34 +-
 pybind11/docs/faq.rst                         |   64 +-
 pybind11/docs/index.rst                       |    2 +
 pybind11/docs/reference.rst                   |  241 +--
 pybind11/docs/release.rst                     |    7 +-
 pybind11/docs/requirements.txt                |    1 +
 pybind11/docs/upgrade.rst                     |  404 ++++
 pybind11/include/pybind11/attr.h              |  217 ++-
 pybind11/include/pybind11/buffer_info.h       |  108 ++
 pybind11/include/pybind11/cast.h              | 1648 +++++++++++------
 pybind11/include/pybind11/chrono.h            |   12 +-
 pybind11/include/pybind11/common.h            |  619 +------
 pybind11/include/pybind11/complex.h           |   24 +-
 pybind11/include/pybind11/detail/class.h      |  606 ++++++
 pybind11/include/pybind11/detail/common.h     |  800 ++++++++
 .../include/pybind11/{ => detail}/descr.h     |   20 +-
 pybind11/include/pybind11/detail/init.h       |  325 ++++
 pybind11/include/pybind11/detail/internals.h  |  247 +++
 .../include/pybind11/{ => detail}/typeid.h    |    6 +-
 pybind11/include/pybind11/eigen.h             |  581 ++++--
 pybind11/include/pybind11/embed.h             |  194 ++
 pybind11/include/pybind11/eval.h              |   37 +-
 pybind11/include/pybind11/functional.h        |   36 +-
 pybind11/include/pybind11/iostream.h          |  200 ++
 pybind11/include/pybind11/numpy.h             |  927 +++++++---
 pybind11/include/pybind11/operators.h         |   52 +-
 pybind11/include/pybind11/options.h           |    6 +-
 pybind11/include/pybind11/pybind11.h          | 1308 +++++++------
 pybind11/include/pybind11/pytypes.h           |  637 +++++--
 pybind11/include/pybind11/stl.h               |  157 +-
 pybind11/include/pybind11/stl_bind.h          |  162 +-
 pybind11/pybind11/__main__.py                 |   37 +
 pybind11/pybind11/_version.py                 |    2 +-
 pybind11/setup.py                             |   61 +-
 pybind11/tests/CMakeLists.txt                 |  242 ++-
 pybind11/tests/conftest.py                    |   10 +-
 pybind11/tests/constructor_stats.h            |    6 +-
 pybind11/tests/local_bindings.h               |   64 +
 pybind11/tests/object.h                       |    4 +-
 .../tests/pybind11_cross_module_tests.cpp     |  123 ++
 pybind11/tests/pybind11_tests.cpp             |   62 +-
 pybind11/tests/pybind11_tests.h               |   59 +-
 pybind11/tests/pytest.ini                     |   15 +
 pybind11/tests/test_alias_initialization.cpp  |   62 -
 pybind11/tests/test_alias_initialization.py   |   80 -
 pybind11/tests/test_buffers.cpp               |  196 +-
 pybind11/tests/test_buffers.py                |   59 +-
 pybind11/tests/test_builtin_casters.cpp       |  156 ++
 pybind11/tests/test_builtin_casters.py        |  322 ++++
 pybind11/tests/test_call_policies.cpp         |   98 +
 pybind11/tests/test_call_policies.py          |  187 ++
 pybind11/tests/test_callbacks.cpp             |  150 +-
 pybind11/tests/test_callbacks.py              |   81 +-
 pybind11/tests/test_chrono.cpp                |   78 +-
 pybind11/tests/test_chrono.py                 |   43 +-
 pybind11/tests/test_class.cpp                 |  357 ++++
 pybind11/tests/test_class.py                  |  235 +++
 pybind11/tests/test_class_args.cpp            |   68 -
 pybind11/tests/test_class_args.py             |    8 -
 .../tests/test_cmake_build/CMakeLists.txt     |   58 +
 pybind11/tests/test_cmake_build/embed.cpp     |   21 +
 .../installed_embed/CMakeLists.txt            |   15 +
 .../installed_target/CMakeLists.txt           |    4 +
 pybind11/tests/test_cmake_build/main.cpp      |    6 +-
 .../subdirectory_embed/CMakeLists.txt         |   25 +
 .../tests/test_constants_and_functions.cpp    |   25 +-
 .../tests/test_constants_and_functions.py     |   38 +-
 pybind11/tests/test_copy_move.cpp             |  213 +++
 pybind11/tests/test_copy_move.py              |  112 ++
 pybind11/tests/test_copy_move_policies.cpp    |   41 -
 pybind11/tests/test_copy_move_policies.py     |   15 -
 pybind11/tests/test_docstring_options.cpp     |   26 +-
 pybind11/tests/test_docstring_options.py      |   36 +-
 pybind11/tests/test_eigen.cpp                 |  335 +++-
 pybind11/tests/test_eigen.py                  |  676 ++++++-
 pybind11/tests/test_embed/CMakeLists.txt      |   34 +
 pybind11/tests/test_embed/catch.cpp           |   16 +
 .../tests/test_embed/test_interpreter.cpp     |  269 +++
 pybind11/tests/test_embed/test_interpreter.py |    9 +
 pybind11/tests/test_enum.cpp                  |   73 +-
 pybind11/tests/test_enum.py                   |  119 +-
 pybind11/tests/test_eval.cpp                  |   28 +-
 pybind11/tests/test_eval.py                   |   16 +-
 pybind11/tests/test_exceptions.cpp            |   69 +-
 pybind11/tests/test_exceptions.py             |  130 +-
 pybind11/tests/test_factory_constructors.cpp  |  337 ++++
 pybind11/tests/test_factory_constructors.py   |  459 +++++
 pybind11/tests/test_inheritance.cpp           |  100 -
 pybind11/tests/test_inheritance.py            |   55 -
 pybind11/tests/test_iostream.cpp              |   73 +
 pybind11/tests/test_iostream.py               |  203 ++
 pybind11/tests/test_issues.cpp                |  401 ----
 pybind11/tests/test_issues.py                 |  251 ---
 pybind11/tests/test_keep_alive.cpp            |   40 -
 pybind11/tests/test_keep_alive.py             |   97 -
 pybind11/tests/test_kwargs_and_defaults.cpp   |   81 +-
 pybind11/tests/test_kwargs_and_defaults.py    |  122 +-
 pybind11/tests/test_local_bindings.cpp        |  101 +
 pybind11/tests/test_local_bindings.py         |  226 +++
 .../tests/test_methods_and_attributes.cpp     |  396 +++-
 pybind11/tests/test_methods_and_attributes.py |  367 +++-
 pybind11/tests/test_modules.cpp               |  104 +-
 pybind11/tests/test_modules.py                |   42 +-
 pybind11/tests/test_multiple_inheritance.cpp  |  229 ++-
 pybind11/tests/test_multiple_inheritance.py   |  317 +++-
 pybind11/tests/test_numpy_array.cpp           |  224 ++-
 pybind11/tests/test_numpy_array.py            |  376 ++--
 pybind11/tests/test_numpy_dtypes.cpp          |  396 ++--
 pybind11/tests/test_numpy_dtypes.py           |  279 +--
 pybind11/tests/test_numpy_vectorize.cpp       |   62 +-
 pybind11/tests/test_numpy_vectorize.py        |  152 +-
 pybind11/tests/test_opaque_types.cpp          |   17 +-
 pybind11/tests/test_opaque_types.py           |   37 +-
 pybind11/tests/test_operator_overloading.cpp  |  102 +-
 pybind11/tests/test_operator_overloading.py   |   80 +-
 pybind11/tests/test_pickling.cpp              |   97 +-
 pybind11/tests/test_pickling.py               |   17 +-
 pybind11/tests/test_python_types.cpp          |  429 -----
 pybind11/tests/test_python_types.py           |  412 -----
 pybind11/tests/test_pytypes.cpp               |  272 +++
 pybind11/tests/test_pytypes.py                |  240 +++
 .../tests/test_sequences_and_iterators.cpp    |  405 ++--
 .../tests/test_sequences_and_iterators.py     |  116 +-
 pybind11/tests/test_smart_ptr.cpp             |  374 ++--
 pybind11/tests/test_smart_ptr.py              |  139 +-
 pybind11/tests/test_stl.cpp                   |  238 +++
 pybind11/tests/test_stl.py                    |  200 ++
 pybind11/tests/test_stl_binders.cpp           |   49 +-
 pybind11/tests/test_stl_binders.py            |  155 +-
 pybind11/tests/test_virtual_functions.cpp     |  191 +-
 pybind11/tests/test_virtual_functions.py      |  270 ++-
 pybind11/tools/FindCatch.cmake                |   57 +
 pybind11/tools/FindPythonLibsNew.cmake        |    8 +-
 pybind11/tools/check-style.sh                 |  109 +-
 pybind11/tools/clang/cindex.py                |  133 ++
 pybind11/tools/mkdoc.py                       |   45 +-
 pybind11/tools/pybind11Config.cmake.in        |   26 +-
 pybind11/tools/pybind11Tools.cmake            |  163 +-
 168 files changed, 20794 insertions(+), 7449 deletions(-)
 create mode 100644 pybind11/.appveyor.yml
 create mode 100644 pybind11/.readthedocs.yml
 create mode 100644 pybind11/.travis.yml
 create mode 100644 pybind11/ISSUE_TEMPLATE.md
 create mode 100644 pybind11/docs/Doxyfile
 create mode 100644 pybind11/docs/advanced/cast/strings.rst
 create mode 100644 pybind11/docs/advanced/embedding.rst
 create mode 100644 pybind11/docs/requirements.txt
 create mode 100644 pybind11/docs/upgrade.rst
 create mode 100644 pybind11/include/pybind11/buffer_info.h
 create mode 100644 pybind11/include/pybind11/detail/class.h
 create mode 100644 pybind11/include/pybind11/detail/common.h
 rename pybind11/include/pybind11/{ => detail}/descr.h (92%)
 create mode 100644 pybind11/include/pybind11/detail/init.h
 create mode 100644 pybind11/include/pybind11/detail/internals.h
 rename pybind11/include/pybind11/{ => detail}/typeid.h (89%)
 create mode 100644 pybind11/include/pybind11/embed.h
 create mode 100644 pybind11/include/pybind11/iostream.h
 create mode 100644 pybind11/pybind11/__main__.py
 create mode 100644 pybind11/tests/local_bindings.h
 create mode 100644 pybind11/tests/pybind11_cross_module_tests.cpp
 create mode 100644 pybind11/tests/pytest.ini
 delete mode 100644 pybind11/tests/test_alias_initialization.cpp
 delete mode 100644 pybind11/tests/test_alias_initialization.py
 create mode 100644 pybind11/tests/test_builtin_casters.cpp
 create mode 100644 pybind11/tests/test_builtin_casters.py
 create mode 100644 pybind11/tests/test_call_policies.cpp
 create mode 100644 pybind11/tests/test_call_policies.py
 create mode 100644 pybind11/tests/test_class.cpp
 create mode 100644 pybind11/tests/test_class.py
 delete mode 100644 pybind11/tests/test_class_args.cpp
 delete mode 100644 pybind11/tests/test_class_args.py
 create mode 100644 pybind11/tests/test_cmake_build/CMakeLists.txt
 create mode 100644 pybind11/tests/test_cmake_build/embed.cpp
 create mode 100644 pybind11/tests/test_cmake_build/installed_embed/CMakeLists.txt
 create mode 100644 pybind11/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt
 create mode 100644 pybind11/tests/test_copy_move.cpp
 create mode 100644 pybind11/tests/test_copy_move.py
 delete mode 100644 pybind11/tests/test_copy_move_policies.cpp
 delete mode 100644 pybind11/tests/test_copy_move_policies.py
 create mode 100644 pybind11/tests/test_embed/CMakeLists.txt
 create mode 100644 pybind11/tests/test_embed/catch.cpp
 create mode 100644 pybind11/tests/test_embed/test_interpreter.cpp
 create mode 100644 pybind11/tests/test_embed/test_interpreter.py
 create mode 100644 pybind11/tests/test_factory_constructors.cpp
 create mode 100644 pybind11/tests/test_factory_constructors.py
 delete mode 100644 pybind11/tests/test_inheritance.cpp
 delete mode 100644 pybind11/tests/test_inheritance.py
 create mode 100644 pybind11/tests/test_iostream.cpp
 create mode 100644 pybind11/tests/test_iostream.py
 delete mode 100644 pybind11/tests/test_issues.cpp
 delete mode 100644 pybind11/tests/test_issues.py
 delete mode 100644 pybind11/tests/test_keep_alive.cpp
 delete mode 100644 pybind11/tests/test_keep_alive.py
 create mode 100644 pybind11/tests/test_local_bindings.cpp
 create mode 100644 pybind11/tests/test_local_bindings.py
 delete mode 100644 pybind11/tests/test_python_types.cpp
 delete mode 100644 pybind11/tests/test_python_types.py
 create mode 100644 pybind11/tests/test_pytypes.cpp
 create mode 100644 pybind11/tests/test_pytypes.py
 create mode 100644 pybind11/tests/test_stl.cpp
 create mode 100644 pybind11/tests/test_stl.py
 create mode 100644 pybind11/tools/FindCatch.cmake

diff --git a/pybind11/.appveyor.yml b/pybind11/.appveyor.yml
new file mode 100644
index 000000000..b150f1014
--- /dev/null
+++ b/pybind11/.appveyor.yml
@@ -0,0 +1,66 @@
+version: 1.0.{build}
+image:
+- Visual Studio 2017
+- Visual Studio 2015
+test: off
+build:
+  parallel: true
+platform:
+- x64
+- x86
+environment:
+  matrix:
+  - PYTHON: 36
+    CPP: 14
+    CONFIG: Debug
+  - PYTHON: 27
+    CPP: 14
+    CONFIG: Debug
+  - CONDA: 36
+    CPP: latest
+    CONFIG: Release
+matrix:
+  exclude:
+    - image: Visual Studio 2015
+      platform: x86
+    - image: Visual Studio 2015
+      CPP: latest
+    - image: Visual Studio 2017
+      CPP: latest
+      platform: x86
+install:
+- ps: |
+    if ($env:PLATFORM -eq "x64") { $env:CMAKE_ARCH = "x64" }
+    if ($env:APPVEYOR_JOB_NAME -like "*Visual Studio 2017*") {
+      $env:CMAKE_GENERATOR = "Visual Studio 15 2017"
+      $env:CMAKE_INCLUDE_PATH = "C:\Libraries\boost_1_64_0"
+    } else {
+      $env:CMAKE_GENERATOR = "Visual Studio 14 2015"
+    }
+    if ($env:PYTHON) {
+      if ($env:PLATFORM -eq "x64") { $env:PYTHON = "$env:PYTHON-x64" }
+      $env:PATH = "C:\Python$env:PYTHON\;C:\Python$env:PYTHON\Scripts\;$env:PATH"
+      pip install --disable-pip-version-check --user --upgrade pip wheel
+      pip install pytest numpy
+    } elseif ($env:CONDA) {
+      if ($env:CONDA -eq "27") { $env:CONDA = "" }
+      if ($env:PLATFORM -eq "x64") { $env:CONDA = "$env:CONDA-x64" }
+      $env:PATH = "C:\Miniconda$env:CONDA\;C:\Miniconda$env:CONDA\Scripts\;$env:PATH"
+      $env:PYTHONHOME = "C:\Miniconda$env:CONDA"
+      conda install -y -q pytest numpy scipy
+    }
+- ps: |
+    Start-FileDownload 'http://bitbucket.org/eigen/eigen/get/3.3.3.zip'
+    7z x 3.3.3.zip -y > $null
+    $env:CMAKE_INCLUDE_PATH = "eigen-eigen-67e894c6cd8f;$env:CMAKE_INCLUDE_PATH"
+build_script:
+- cmake -G "%CMAKE_GENERATOR%" -A "%CMAKE_ARCH%"
+    -DPYBIND11_CPP_STANDARD=/std:c++%CPP%
+    -DPYBIND11_WERROR=ON
+    -DDOWNLOAD_CATCH=ON
+    -DCMAKE_SUPPRESS_REGENERATION=1
+- set MSBuildLogger="C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
+- cmake --build . --config %CONFIG% --target pytest -- /m /v:m /logger:%MSBuildLogger%
+- cmake --build . --config %CONFIG% --target cpptest -- /m /v:m /logger:%MSBuildLogger%
+- if "%CPP%"=="latest" (cmake --build . --config %CONFIG% --target test_cmake_build -- /m /v:m /logger:%MSBuildLogger%)
+on_failure: if exist "tests\test_cmake_build" type tests\test_cmake_build\*.log*
diff --git a/pybind11/.readthedocs.yml b/pybind11/.readthedocs.yml
new file mode 100644
index 000000000..c9c61617c
--- /dev/null
+++ b/pybind11/.readthedocs.yml
@@ -0,0 +1,3 @@
+python:
+  version: 3
+requirements_file: docs/requirements.txt
diff --git a/pybind11/.travis.yml b/pybind11/.travis.yml
new file mode 100644
index 000000000..2853ac7ad
--- /dev/null
+++ b/pybind11/.travis.yml
@@ -0,0 +1,212 @@
+language: cpp
+dist: trusty
+sudo: false
+matrix:
+  include:
+  # This config does a few things:
+  # - Checks C++ and Python code styles (check-style.sh and flake8).
+  # - Makes sure sphinx can build the docs without any errors or warnings.
+  # - Tests setup.py sdist and install (all header files should be present).
+  # - Makes sure that everything still works without optional deps (numpy/scipy/eigen) and
+  #   also tests the automatic discovery functions in CMake (Python version, C++ standard).
+  - os: linux
+    env: STYLE DOCS PIP
+    cache: false
+    before_install:
+    - pyenv global $(pyenv whence 2to3)  # activate all python versions
+    - PY_CMD=python3
+    - $PY_CMD -m pip install --user --upgrade pip wheel
+    install:
+    - $PY_CMD -m pip install --user --upgrade sphinx sphinx_rtd_theme breathe flake8 pep8-naming pytest
+    - curl -fsSL ftp://ftp.stack.nl/pub/users/dimitri/doxygen-1.8.12.linux.bin.tar.gz | tar xz
+    - export PATH="$PWD/doxygen-1.8.12/bin:$PATH"
+    script:
+    - tools/check-style.sh
+    - flake8
+    - $PY_CMD -m sphinx -W -b html docs docs/.build
+    - |
+      # Make sure setup.py distributes and installs all the headers
+      $PY_CMD setup.py sdist
+      $PY_CMD -m pip install --user -U ./dist/*
+      installed=$($PY_CMD -c "import pybind11; print(pybind11.get_include(True) + '/pybind11')")
+      diff -rq $installed ./include/pybind11
+    - |
+      # Barebones build
+      cmake -DCMAKE_BUILD_TYPE=Debug -DPYBIND11_WERROR=ON -DDOWNLOAD_CATCH=ON
+      make pytest -j 2
+      make cpptest -j 2
+  # The following are regular test configurations, including optional dependencies.
+  # With regard to each other they differ in Python version, C++ standard and compiler.
+  - os: linux
+    env: PYTHON=2.7 CPP=11 GCC=4.8
+    addons:
+      apt:
+        packages: [cmake=2.\*, cmake-data=2.\*]
+  - os: linux
+    env: PYTHON=3.6 CPP=11 GCC=4.8
+    addons:
+      apt:
+        sources: [deadsnakes]
+        packages: [python3.6-dev, python3.6-venv, cmake=2.\*, cmake-data=2.\*]
+  - sudo: true
+    services: docker
+    env: PYTHON=2.7 CPP=14 GCC=6 CMAKE=1
+  - sudo: true
+    services: docker
+    env: PYTHON=3.5 CPP=14 GCC=6 DEBUG=1
+  - sudo: true
+    services: docker
+    env: PYTHON=3.6 CPP=17 GCC=7
+  - os: linux
+    env: PYTHON=3.6 CPP=17 CLANG=5.0
+    addons:
+      apt:
+        sources: [deadsnakes, llvm-toolchain-trusty-5.0, ubuntu-toolchain-r-test]
+        packages: [python3.6-dev, python3.6-venv, clang-5.0, llvm-5.0-dev, lld-5.0]
+  - os: osx
+    osx_image: xcode7.3
+    env: PYTHON=2.7 CPP=14 CLANG CMAKE=1
+  - os: osx
+    osx_image: xcode8.3
+    env: PYTHON=3.6 CPP=14 CLANG DEBUG=1
+  # Test a PyPy 2.7 build
+  - os: linux
+    env: PYPY=5.8 PYTHON=2.7 CPP=11 GCC=4.8
+    addons:
+      apt:
+        packages: [libblas-dev, liblapack-dev, gfortran]
+  # Builds in 32-bit mode and tests against the CMake-installed version
+  - sudo: true
+    services: docker
+    env: ARCH=i386 PYTHON=3.5 CPP=14 GCC=6 INSTALL=1
+    script:
+      - |
+        $SCRIPT_RUN_PREFIX sh -c "set -e
+        cmake ${CMAKE_EXTRA_ARGS} -DPYBIND11_INSTALL=1 -DPYBIND11_TEST=0
+        make install
+        cp -a tests /pybind11-tests
+        mkdir /build-tests && cd /build-tests
+        cmake ../pybind11-tests ${CMAKE_EXTRA_ARGS} -DPYBIND11_WERROR=ON
+        make pytest -j 2"
+cache:
+  directories:
+  - $HOME/.local/bin
+  - $HOME/.local/lib
+  - $HOME/.local/include
+  - $HOME/Library/Python
+before_install:
+- |
+  # Configure build variables
+  if [ "$TRAVIS_OS_NAME" = "linux" ]; then
+    if [ -n "$CLANG" ]; then
+      export CXX=clang++-$CLANG CC=clang-$CLANG
+      COMPILER_PACKAGES="clang-$CLANG llvm-$CLANG-dev"
+    else
+      if [ -z "$GCC" ]; then GCC=4.8
+      else COMPILER_PACKAGES=g++-$GCC
+      fi
+      export CXX=g++-$GCC CC=gcc-$GCC
+    fi
+    if [ "$GCC" = "6" ]; then DOCKER=${ARCH:+$ARCH/}debian:stretch
+    elif [ "$GCC" = "7" ]; then DOCKER=debian:buster
+    fi
+  elif [ "$TRAVIS_OS_NAME" = "osx" ]; then
+    export CXX=clang++ CC=clang;
+  fi
+  if [ -n "$CPP" ]; then CPP=-std=c++$CPP; fi
+  if [ "${PYTHON:0:1}" = "3" ]; then PY=3; fi
+  if [ -n "$DEBUG" ]; then CMAKE_EXTRA_ARGS="${CMAKE_EXTRA_ARGS} -DCMAKE_BUILD_TYPE=Debug"; fi
+- |
+  # Initialize environment
+  set -e
+  if [ -n "$DOCKER" ]; then
+    docker pull $DOCKER
+
+    containerid=$(docker run --detach --tty \
+      --volume="$PWD":/pybind11 --workdir=/pybind11 \
+      --env="CC=$CC" --env="CXX=$CXX" --env="DEBIAN_FRONTEND=$DEBIAN_FRONTEND" \
+      --env=GCC_COLORS=\  \
+      $DOCKER)
+    SCRIPT_RUN_PREFIX="docker exec --tty $containerid"
+    $SCRIPT_RUN_PREFIX sh -c 'for s in 0 15; do sleep $s; apt-get update && apt-get -qy dist-upgrade && break; done'
+  else
+    if [ "$PYPY" = "5.8" ]; then
+      curl -fSL https://bitbucket.org/pypy/pypy/downloads/pypy2-v5.8.0-linux64.tar.bz2 | tar xj
+      PY_CMD=$(echo `pwd`/pypy2-v5.8.0-linux64/bin/pypy)
+      CMAKE_EXTRA_ARGS="${CMAKE_EXTRA_ARGS} -DPYTHON_EXECUTABLE:FILEPATH=$PY_CMD"
+    else
+      PY_CMD=python$PYTHON
+      if [ "$TRAVIS_OS_NAME" = "osx" ]; then
+        if [ "$PY" = "3" ]; then
+          brew install python$PY;
+        else
+          curl -fsSL https://bootstrap.pypa.io/get-pip.py | $PY_CMD - --user
+        fi
+      fi
+    fi
+    if [ "$PY" = 3 ] || [ -n "$PYPY" ]; then
+      $PY_CMD -m ensurepip --user
+    fi
+    $PY_CMD -m pip install --user --upgrade pip wheel
+  fi
+  set +e
+install:
+- |
+  # Install dependencies
+  set -e
+  if [ -n "$DOCKER" ]; then
+    if [ -n "$DEBUG" ]; then
+      PY_DEBUG="python$PYTHON-dbg python$PY-scipy-dbg"
+      CMAKE_EXTRA_ARGS="${CMAKE_EXTRA_ARGS} -DPYTHON_EXECUTABLE=/usr/bin/python${PYTHON}dm"
+    fi
+    $SCRIPT_RUN_PREFIX sh -c "for s in 0 15; do sleep \$s; \
+      apt-get -qy --no-install-recommends install \
+        $PY_DEBUG python$PYTHON-dev python$PY-pytest python$PY-scipy \
+        libeigen3-dev libboost-dev cmake make ${COMPILER_PACKAGES} && break; done"
+  else
+
+    if [ "$CLANG" = "5.0" ]; then
+      if ! [ -d ~/.local/include/c++/v1 ]; then
+        # Neither debian nor llvm provide a libc++ 5.0 deb; luckily it's fairly quick
+        # to build, install (and cache), so do it ourselves:
+        git clone --depth=1 https://github.com/llvm-mirror/llvm.git llvm-source
+        git clone https://github.com/llvm-mirror/libcxx.git llvm-source/projects/libcxx -b release_50
+        git clone https://github.com/llvm-mirror/libcxxabi.git llvm-source/projects/libcxxabi -b release_50
+        mkdir llvm-build && cd llvm-build
+        # Building llvm requires a newer cmake than is provided by the trusty container:
+        CMAKE_VER=cmake-3.8.0-Linux-x86_64
+        curl https://cmake.org/files/v3.8/$CMAKE_VER.tar.gz | tar xz
+        ./$CMAKE_VER/bin/cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=~/.local ../llvm-source
+        make -j2 install-cxxabi install-cxx
+        cp -a include/c++/v1/*cxxabi*.h ~/.local/include/c++/v1
+        cd ..
+      fi
+      export CXXFLAGS="-isystem $HOME/.local/include/c++/v1 -stdlib=libc++"
+      export LDFLAGS="-L$HOME/.local/lib -fuse-ld=lld-$CLANG"
+      export LD_LIBRARY_PATH="$HOME/.local/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+      if [ "$CPP" = "-std=c++17" ]; then CPP="-std=c++1z"; fi
+    fi
+
+    export NPY_NUM_BUILD_JOBS=2
+    echo "Installing pytest, numpy, scipy..."
+    ${PYPY:+travis_wait 30} $PY_CMD -m pip install --user --upgrade pytest numpy scipy \
+        ${PYPY:+--extra-index-url https://imaginary.ca/trusty-pypi}
+    echo "done."
+
+    wget -q -O eigen.tar.gz https://bitbucket.org/eigen/eigen/get/3.3.3.tar.gz
+    tar xzf eigen.tar.gz
+    export CMAKE_INCLUDE_PATH="${CMAKE_INCLUDE_PATH:+$CMAKE_INCLUDE_PATH:}$PWD/eigen-eigen-67e894c6cd8f"
+  fi
+  set +e
+script:
+- $SCRIPT_RUN_PREFIX cmake ${CMAKE_EXTRA_ARGS}
+    -DPYBIND11_PYTHON_VERSION=$PYTHON
+    -DPYBIND11_CPP_STANDARD=$CPP
+    -DPYBIND11_WERROR=${WERROR:-ON}
+    -DDOWNLOAD_CATCH=ON
+- $SCRIPT_RUN_PREFIX make pytest -j 2
+- $SCRIPT_RUN_PREFIX make cpptest -j 2
+- if [ -n "$CMAKE" ]; then $SCRIPT_RUN_PREFIX make test_cmake_build; fi
+after_failure: cat tests/test_cmake_build/*.log*
+after_script:
+- if [ -n "$DOCKER" ]; then docker stop "$containerid"; docker rm "$containerid"; fi
diff --git a/pybind11/CMakeLists.txt b/pybind11/CMakeLists.txt
index 341f845e4..4280ba742 100644
--- a/pybind11/CMakeLists.txt
+++ b/pybind11/CMakeLists.txt
@@ -12,7 +12,12 @@ if (POLICY CMP0048)
   cmake_policy(SET CMP0048 NEW)
 endif()
 
-project(pybind11)
+# CMake versions < 3.4.0 do not support try_compile/pthread checks without C as active language.
+if(CMAKE_VERSION VERSION_LESS 3.4.0)
+  project(pybind11)
+else()
+  project(pybind11 CXX)
+endif()
 
 # Check if pybind11 is being used directly or via add_subdirectory
 set(PYBIND11_MASTER_PROJECT OFF)
@@ -22,7 +27,6 @@ endif()
 
 option(PYBIND11_INSTALL "Install pybind11 header files?" ${PYBIND11_MASTER_PROJECT})
 option(PYBIND11_TEST    "Build pybind11 test suite?"     ${PYBIND11_MASTER_PROJECT})
-option(PYBIND11_WERROR  "Report all warnings as errors"  OFF)
 
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/tools")
 
@@ -35,32 +39,23 @@ set(PYTHON_LIBRARIES ${PYTHON_LIBRARIES} CACHE INTERNAL "")
 set(PYTHON_MODULE_PREFIX ${PYTHON_MODULE_PREFIX} CACHE INTERNAL "")
 set(PYTHON_MODULE_EXTENSION ${PYTHON_MODULE_EXTENSION} CACHE INTERNAL "")
 
-# Compile with compiler warnings turned on
-function(pybind11_enable_warnings target_name)
-  if(MSVC)
-    target_compile_options(${target_name} PRIVATE /W4)
-  else()
-    target_compile_options(${target_name} PRIVATE -Wall -Wextra -Wconversion)
-  endif()
-
-  if(PYBIND11_WERROR)
-    if(MSVC)
-      target_compile_options(${target_name} PRIVATE /WX)
-    else()
-      target_compile_options(${target_name} PRIVATE -Werror)
-    endif()
-  endif()
-endfunction()
-
+# NB: when adding a header don't forget to also add it to setup.py
 set(PYBIND11_HEADERS
+  include/pybind11/detail/class.h
+  include/pybind11/detail/common.h
+  include/pybind11/detail/descr.h
+  include/pybind11/detail/init.h
+  include/pybind11/detail/internals.h
+  include/pybind11/detail/typeid.h
   include/pybind11/attr.h
+  include/pybind11/buffer_info.h
   include/pybind11/cast.h
   include/pybind11/chrono.h
   include/pybind11/common.h
   include/pybind11/complex.h
-  include/pybind11/descr.h
   include/pybind11/options.h
   include/pybind11/eigen.h
+  include/pybind11/embed.h
   include/pybind11/eval.h
   include/pybind11/functional.h
   include/pybind11/numpy.h
@@ -69,7 +64,6 @@ set(PYBIND11_HEADERS
   include/pybind11/pytypes.h
   include/pybind11/stl.h
   include/pybind11/stl_bind.h
-  include/pybind11/typeid.h
 )
 string(REPLACE "include/" "${CMAKE_CURRENT_SOURCE_DIR}/include/"
        PYBIND11_HEADERS "${PYBIND11_HEADERS}")
@@ -82,7 +76,7 @@ include(GNUInstallDirs)
 include(CMakePackageConfigHelpers)
 
 # extract project version from source
-file(STRINGS "${PYBIND11_INCLUDE_DIR}/pybind11/common.h" pybind11_version_defines
+file(STRINGS "${PYBIND11_INCLUDE_DIR}/pybind11/detail/common.h" pybind11_version_defines
      REGEX "#define PYBIND11_VERSION_(MAJOR|MINOR|PATCH) ")
 foreach(ver ${pybind11_version_defines})
   if (ver MATCHES "#define PYBIND11_VERSION_(MAJOR|MINOR|PATCH) +([^ ]+)$")
@@ -92,34 +86,53 @@ endforeach()
 set(${PROJECT_NAME}_VERSION ${PYBIND11_VERSION_MAJOR}.${PYBIND11_VERSION_MINOR}.${PYBIND11_VERSION_PATCH})
 message(STATUS "pybind11 v${${PROJECT_NAME}_VERSION}")
 
+option (USE_PYTHON_INCLUDE_DIR "Install pybind11 headers in Python include directory instead of default installation prefix" OFF)
+if (USE_PYTHON_INCLUDE_DIR)
+    file(RELATIVE_PATH CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX} ${PYTHON_INCLUDE_DIRS})
+endif()
+
 if(NOT (CMAKE_VERSION VERSION_LESS 3.0))  # CMake >= 3.0
   # Build an interface library target:
+  add_library(pybind11 INTERFACE)
+  add_library(pybind11::pybind11 ALIAS pybind11)  # to match exported target
+  target_include_directories(pybind11 INTERFACE $<BUILD_INTERFACE:${PYBIND11_INCLUDE_DIR}>
+                                                $<BUILD_INTERFACE:${PYTHON_INCLUDE_DIRS}>
+                                                $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+  target_compile_options(pybind11 INTERFACE $<BUILD_INTERFACE:${PYBIND11_CPP_STANDARD}>)
+
   add_library(module INTERFACE)
-  target_include_directories(module INTERFACE $<BUILD_INTERFACE:${PYBIND11_INCLUDE_DIR}>
-                                              $<BUILD_INTERFACE:${PYTHON_INCLUDE_DIRS}>
-                                              $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+  add_library(pybind11::module ALIAS module)
+  if(NOT MSVC)
+    target_compile_options(module INTERFACE -fvisibility=hidden)
+  endif()
+  target_link_libraries(module INTERFACE pybind11::pybind11)
   if(WIN32 OR CYGWIN)
     target_link_libraries(module INTERFACE $<BUILD_INTERFACE:${PYTHON_LIBRARIES}>)
   elseif(APPLE)
     target_link_libraries(module INTERFACE "-undefined dynamic_lookup")
   endif()
-  target_compile_options(module INTERFACE $<BUILD_INTERFACE:${PYBIND11_CPP_STANDARD}>)
 
-  add_library(pybind11::module ALIAS module)  # to match exported target
+  add_library(embed INTERFACE)
+  add_library(pybind11::embed ALIAS embed)
+  target_link_libraries(embed INTERFACE pybind11::pybind11 $<BUILD_INTERFACE:${PYTHON_LIBRARIES}>)
 endif()
 
 if (PYBIND11_INSTALL)
-  install(FILES ${PYBIND11_HEADERS}
-          DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/pybind11)
+  install(DIRECTORY ${PYBIND11_INCLUDE_DIR}/pybind11 DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
   # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
   set(PYBIND11_CMAKECONFIG_INSTALL_DIR "share/cmake/${PROJECT_NAME}" CACHE STRING "install path for pybind11Config.cmake")
 
   configure_package_config_file(tools/${PROJECT_NAME}Config.cmake.in
                                 "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake"
                                 INSTALL_DESTINATION ${PYBIND11_CMAKECONFIG_INSTALL_DIR})
+  # Remove CMAKE_SIZEOF_VOID_P from ConfigVersion.cmake since the library does
+  # not depend on architecture specific settings or libraries.
+  set(_PYBIND11_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})
+  unset(CMAKE_SIZEOF_VOID_P)
   write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake
                                    VERSION ${${PROJECT_NAME}_VERSION}
                                    COMPATIBILITY AnyNewerVersion)
+  set(CMAKE_SIZEOF_VOID_P ${_PYBIND11_CMAKE_SIZEOF_VOID_P})
   install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake
                 ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake
                 tools/FindPythonLibsNew.cmake
@@ -127,10 +140,16 @@ if (PYBIND11_INSTALL)
           DESTINATION ${PYBIND11_CMAKECONFIG_INSTALL_DIR})
 
   if(NOT (CMAKE_VERSION VERSION_LESS 3.0))
-    install(TARGETS module
-            EXPORT "${PROJECT_NAME}Targets")
-    install(EXPORT "${PROJECT_NAME}Targets"
-            NAMESPACE "${PROJECT_NAME}::"
-            DESTINATION ${PYBIND11_CMAKECONFIG_INSTALL_DIR})
+    if(NOT PYBIND11_EXPORT_NAME)
+      set(PYBIND11_EXPORT_NAME "${PROJECT_NAME}Targets")
+    endif()
+
+    install(TARGETS pybind11 module embed
+            EXPORT "${PYBIND11_EXPORT_NAME}")
+    if(PYBIND11_MASTER_PROJECT)
+      install(EXPORT "${PYBIND11_EXPORT_NAME}"
+              NAMESPACE "${PROJECT_NAME}::"
+              DESTINATION ${PYBIND11_CMAKECONFIG_INSTALL_DIR})
+    endif()
   endif()
 endif()
diff --git a/pybind11/ISSUE_TEMPLATE.md b/pybind11/ISSUE_TEMPLATE.md
new file mode 100644
index 000000000..75df39981
--- /dev/null
+++ b/pybind11/ISSUE_TEMPLATE.md
@@ -0,0 +1,17 @@
+Make sure you've completed the following steps before submitting your issue -- thank you!
+
+1. Check if your question has already been answered in the [FAQ](http://pybind11.readthedocs.io/en/latest/faq.html) section.
+2. Make sure you've read the [documentation](http://pybind11.readthedocs.io/en/latest/). Your issue may be addressed there.
+3. If those resources didn't help and you only have a short question (not a bug report), consider asking in the [Gitter chat room](https://gitter.im/pybind/Lobby).
+4. If you have a genuine bug report or a more complex question which is not answered in the previous items (or not suitable for chat), please fill in the details below.
+5. Include a self-contained and minimal piece of code that reproduces the problem. If that's not possible, try to make the description as clear as possible.
+
+*After reading, remove this checklist and the template text in parentheses below.*
+
+## Issue description
+
+(Provide a short description, state the expected behavior and what actually happens.)
+
+## Reproducible example code
+
+(The code should be minimal, have no external dependencies, isolate the function(s) that cause breakage. Submit matched and complete C++ and Python snippets that can be easily compiled and run to diagnose the issue.)
diff --git a/pybind11/MANIFEST.in b/pybind11/MANIFEST.in
index aa51d0110..6e57baeee 100644
--- a/pybind11/MANIFEST.in
+++ b/pybind11/MANIFEST.in
@@ -1,2 +1,2 @@
-include include/pybind11/*.h
+recursive-include include/pybind11 *.h
 include LICENSE README.md CONTRIBUTING.md
diff --git a/pybind11/README.md b/pybind11/README.md
index 60ac871d0..447788240 100644
--- a/pybind11/README.md
+++ b/pybind11/README.md
@@ -96,9 +96,9 @@ In addition to the core functionality, pybind11 provides some extra goodies:
 
 ## Supported compilers
 
-1. Clang/LLVM (any non-ancient version with C++11 support)
+1. Clang/LLVM 3.3 or newer (for Apple Xcode's clang, this is 5.0.0 or newer)
 2. GCC 4.8 or newer
-3. Microsoft Visual Studio 2015 or newer
+3. Microsoft Visual Studio 2015 Update 3 or newer
 4. Intel C++ compiler 16 or newer (15 with a [workaround](https://github.com/pybind/pybind11/issues/276))
 5. Cygwin/GCC (tested on 2.5.1)
 
@@ -118,8 +118,9 @@ Dean Moldovan,
 Ben Pritchard,
 Jason Rhinelander,
 Boris Schäling,
-Pim Schellart, and
-Ivan Smirnov.
+Pim Schellart,
+Ivan Smirnov, and
+Patrick Stewart.
 
 ### License
 
diff --git a/pybind11/docs/Doxyfile b/pybind11/docs/Doxyfile
new file mode 100644
index 000000000..1b9d1297c
--- /dev/null
+++ b/pybind11/docs/Doxyfile
@@ -0,0 +1,20 @@
+PROJECT_NAME           = pybind11
+INPUT                  = ../include/pybind11/
+RECURSIVE              = YES
+
+GENERATE_HTML          = NO
+GENERATE_LATEX         = NO
+GENERATE_XML           = YES
+XML_OUTPUT             = .build/doxygenxml
+XML_PROGRAMLISTING     = YES
+
+MACRO_EXPANSION        = YES
+EXPAND_ONLY_PREDEF     = YES
+EXPAND_AS_DEFINED      = PYBIND11_RUNTIME_EXCEPTION
+
+ALIASES                = "rst=\verbatim embed:rst"
+ALIASES               += "endrst=\endverbatim"
+
+QUIET                  = YES
+WARNINGS               = YES
+WARN_IF_UNDOCUMENTED   = NO
diff --git a/pybind11/docs/advanced/cast/chrono.rst b/pybind11/docs/advanced/cast/chrono.rst
index 6d4a5ee55..8c6b3d7e5 100644
--- a/pybind11/docs/advanced/cast/chrono.rst
+++ b/pybind11/docs/advanced/cast/chrono.rst
@@ -4,8 +4,8 @@ Chrono
 When including the additional header file :file:`pybind11/chrono.h` conversions
 from C++11 chrono datatypes to python datetime objects are automatically enabled.
 This header also enables conversions of python floats (often from sources such
-as `time.monotonic()`, `time.perf_counter()` and `time.process_time()`) into
-durations.
+as ``time.monotonic()``, ``time.perf_counter()`` and ``time.process_time()``)
+into durations.
 
 An overview of clocks in C++11
 ------------------------------
diff --git a/pybind11/docs/advanced/cast/custom.rst b/pybind11/docs/advanced/cast/custom.rst
index c854e7fcd..e4f99ac5b 100644
--- a/pybind11/docs/advanced/cast/custom.rst
+++ b/pybind11/docs/advanced/cast/custom.rst
@@ -78,6 +78,12 @@ type is explicitly allowed.
         };
     }} // namespace pybind11::detail
 
+.. note::
+
+    A ``type_caster<T>`` defined with ``PYBIND11_TYPE_CASTER(T, ...)`` requires
+    that ``T`` is default-constructible (``value`` is first default constructed
+    and then ``load()`` assigns to it).
+
 .. warning::
 
     When using custom type casters, it's important to declare them consistently
diff --git a/pybind11/docs/advanced/cast/eigen.rst b/pybind11/docs/advanced/cast/eigen.rst
index b83ca9af9..acdb51de6 100644
--- a/pybind11/docs/advanced/cast/eigen.rst
+++ b/pybind11/docs/advanced/cast/eigen.rst
@@ -1,48 +1,308 @@
 Eigen
-=====
+#####
 
 `Eigen <http://eigen.tuxfamily.org>`_ is C++ header-based library for dense and
 sparse linear algebra. Due to its popularity and widespread adoption, pybind11
-provides transparent conversion support between Eigen and Scientific Python linear
-algebra data types.
+provides transparent conversion and limited mapping support between Eigen and
+Scientific Python linear algebra data types.
 
-Specifically, when including the optional header file :file:`pybind11/eigen.h`,
-pybind11 will automatically and transparently convert
+To enable the built-in Eigen support you must include the optional header file
+:file:`pybind11/eigen.h`.
 
-1. Static and dynamic Eigen dense vectors and matrices to instances of
-   ``numpy.ndarray`` (and vice versa).
+Pass-by-value
+=============
 
-2. Returned matrix expressions such as blocks (including columns or rows) and
-   diagonals will be converted to ``numpy.ndarray`` of the expression
-   values.
+When binding a function with ordinary Eigen dense object arguments (for
+example, ``Eigen::MatrixXd``), pybind11 will accept any input value that is
+already (or convertible to) a ``numpy.ndarray`` with dimensions compatible with
+the Eigen type, copy its values into a temporary Eigen variable of the
+appropriate type, then call the function with this temporary variable.
 
-3. Returned matrix-like objects such as Eigen::DiagonalMatrix or
-   Eigen::SelfAdjointView will be converted to ``numpy.ndarray`` containing the
-   expressed value.
+Sparse matrices are similarly copied to or from
+``scipy.sparse.csr_matrix``/``scipy.sparse.csc_matrix`` objects.
 
-4. Eigen sparse vectors and matrices to instances of
-   ``scipy.sparse.csr_matrix``/``scipy.sparse.csc_matrix`` (and vice versa).
+Pass-by-reference
+=================
 
-This makes it possible to bind most kinds of functions that rely on these types.
-One major caveat are functions that take Eigen matrices *by reference* and modify
-them somehow, in which case the information won't be propagated to the caller.
+One major limitation of the above is that every data conversion implicitly
+involves a copy, which can be both expensive (for large matrices) and disallows
+binding functions that change their (Matrix) arguments.  Pybind11 allows you to
+work around this by using Eigen's ``Eigen::Ref<MatrixType>`` class much as you
+would when writing a function taking a generic type in Eigen itself (subject to
+some limitations discussed below).
+
+When calling a bound function accepting a ``Eigen::Ref<const MatrixType>``
+type, pybind11 will attempt to avoid copying by using an ``Eigen::Map`` object
+that maps into the source ``numpy.ndarray`` data: this requires both that the
+data types are the same (e.g. ``dtype='float64'`` and ``MatrixType::Scalar`` is
+``double``); and that the storage is layout compatible.  The latter limitation
+is discussed in detail in the section below, and requires careful
+consideration: by default, numpy matrices and eigen matrices are *not* storage
+compatible.
+
+If the numpy matrix cannot be used as is (either because its types differ, e.g.
+passing an array of integers to an Eigen parameter requiring doubles, or
+because the storage is incompatible), pybind11 makes a temporary copy and
+passes the copy instead.
+
+When a bound function parameter is instead ``Eigen::Ref<MatrixType>`` (note the
+lack of ``const``), pybind11 will only allow the function to be called if it
+can be mapped *and* if the numpy array is writeable (that is
+``a.flags.writeable`` is true).  Any access (including modification) made to
+the passed variable will be transparently carried out directly on the
+``numpy.ndarray``.
+
+This means you can write code such as the following and have it work as
+expected:
 
 .. code-block:: cpp
 
-    /* The Python bindings of these functions won't replicate
-       the intended effect of modifying the function arguments */
-    void scale_by_2(Eigen::Vector3f &v) {
+    void scale_by_2(Eigen::Ref<Eigen::VectorXd> v) {
         v *= 2;
     }
-    void scale_by_2(Eigen::Ref<Eigen::MatrixXd> &v) {
-        v *= 2;
+
+Note, however, that you will likely run into limitations due to numpy and
+Eigen's different default storage order for data; see the below section on
+:ref:`storage_orders` for details on how to bind code that won't run into such
+limitations.
+
+.. note::
+
+    Passing by reference is not supported for sparse types.
+
+Returning values to Python
+==========================
+
+When returning an ordinary dense Eigen matrix type to numpy (e.g.
+``Eigen::MatrixXd`` or ``Eigen::RowVectorXf``) pybind11 keeps the matrix and
+returns a numpy array that directly references the Eigen matrix: no copy of the
+data is performed.  The numpy array will have ``array.flags.owndata`` set to
+``False`` to indicate that it does not own the data, and the lifetime of the
+stored Eigen matrix will be tied to the returned ``array``.
+
+If you bind a function with a non-reference, ``const`` return type (e.g.
+``const Eigen::MatrixXd``), the same thing happens except that pybind11 also
+sets the numpy array's ``writeable`` flag to false.
+
+If you return an lvalue reference or pointer, the usual pybind11 rules apply,
+as dictated by the binding function's return value policy (see the
+documentation on :ref:`return_value_policies` for full details).  That means,
+without an explicit return value policy, lvalue references will be copied and
+pointers will be managed by pybind11.  In order to avoid copying, you should
+explicitly specify an appropriate return value policy, as in the following
+example:
+
+.. code-block:: cpp
+
+    class MyClass {
+        Eigen::MatrixXd big_mat = Eigen::MatrixXd::Zero(10000, 10000);
+    public:
+        Eigen::MatrixXd &getMatrix() { return big_mat; }
+        const Eigen::MatrixXd &viewMatrix() { return big_mat; }
+    };
+
+    // Later, in binding code:
+    py::class_<MyClass>(m, "MyClass")
+        .def(py::init<>())
+        .def("copy_matrix", &MyClass::getMatrix) // Makes a copy!
+        .def("get_matrix", &MyClass::getMatrix, py::return_value_policy::reference_internal)
+        .def("view_matrix", &MyClass::viewMatrix, py::return_value_policy::reference_internal)
+        ;
+
+.. code-block:: python
+
+    a = MyClass()
+    m = a.get_matrix()   # flags.writeable = True,  flags.owndata = False
+    v = a.view_matrix()  # flags.writeable = False, flags.owndata = False
+    c = a.copy_matrix()  # flags.writeable = True,  flags.owndata = True
+    # m[5,6] and v[5,6] refer to the same element, c[5,6] does not.
+
+Note in this example that ``py::return_value_policy::reference_internal`` is
+used to tie the life of the MyClass object to the life of the returned arrays.
+
+You may also return an ``Eigen::Ref``, ``Eigen::Map`` or other map-like Eigen
+object (for example, the return value of ``matrix.block()`` and related
+methods) that map into a dense Eigen type.  When doing so, the default
+behaviour of pybind11 is to simply reference the returned data: you must take
+care to ensure that this data remains valid!  You may ask pybind11 to
+explicitly *copy* such a return value by using the
+``py::return_value_policy::copy`` policy when binding the function.  You may
+also use ``py::return_value_policy::reference_internal`` or a
+``py::keep_alive`` to ensure the data stays valid as long as the returned numpy
+array does.
+
+When returning such a reference or map, pybind11 additionally respects the
+readonly-status of the returned value, marking the numpy array as non-writeable
+if the reference or map was itself read-only.
+
+.. note::
+
+    Sparse types are always copied when returned.
+
+.. _storage_orders:
+
+Storage orders
+==============
+
+Passing arguments via ``Eigen::Ref`` has some limitations that you must be
+aware of in order to effectively pass matrices by reference.  First and
+foremost is that the default ``Eigen::Ref<MatrixType>`` class requires
+contiguous storage along columns (for column-major types, the default in Eigen)
+or rows if ``MatrixType`` is specifically an ``Eigen::RowMajor`` storage type.
+The former, Eigen's default, is incompatible with ``numpy``'s default row-major
+storage, and so you will not be able to pass numpy arrays to Eigen by reference
+without making one of two changes.
+
+(Note that this does not apply to vectors (or column or row matrices): for such
+types the "row-major" and "column-major" distinction is meaningless).
+
+The first approach is to change the use of ``Eigen::Ref<MatrixType>`` to the
+more general ``Eigen::Ref<MatrixType, 0, Eigen::Stride<Eigen::Dynamic,
+Eigen::Dynamic>>`` (or similar type with a fully dynamic stride type in the
+third template argument).  Since this is a rather cumbersome type, pybind11
+provides a ``py::EigenDRef<MatrixType>`` type alias for your convenience (along
+with EigenDMap for the equivalent Map, and EigenDStride for just the stride
+type).
+
+This type allows Eigen to map into any arbitrary storage order.  This is not
+the default in Eigen for performance reasons: contiguous storage allows
+vectorization that cannot be done when storage is not known to be contiguous at
+compile time.  The default ``Eigen::Ref`` stride type allows non-contiguous
+storage along the outer dimension (that is, the rows of a column-major matrix
+or columns of a row-major matrix), but not along the inner dimension.
+
+This type, however, has the added benefit of also being able to map numpy array
+slices.  For example, the following (contrived) example uses Eigen with a numpy
+slice to multiply by 2 all coefficients that are both on even rows (0, 2, 4,
+...) and in columns 2, 5, or 8:
+
+.. code-block:: cpp
+
+    m.def("scale", [](py::EigenDRef<Eigen::MatrixXd> m, double c) { m *= c; });
+
+.. code-block:: python
+
+    # myarray = np.array(...)
+    scale(myarray[0::2, 2:9:3], 2)
+
+The second approach to avoid copying is more intrusive: rearranging the
+underlying data types to not run into the non-contiguous storage problem in the
+first place.  In particular, that means using matrices with ``Eigen::RowMajor``
+storage, where appropriate, such as:
+
+.. code-block:: cpp
+
+    using RowMatrixXd = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    // Use RowMatrixXd instead of MatrixXd
+
+Now bound functions accepting ``Eigen::Ref<RowMatrixXd>`` arguments will be
+callable with numpy's (default) arrays without involving a copy.
+
+You can, alternatively, change the storage order that numpy arrays use by
+adding the ``order='F'`` option when creating an array:
+
+.. code-block:: python
+
+    myarray = np.array(source, order='F')
+
+Such an object will be passable to a bound function accepting an
+``Eigen::Ref<MatrixXd>`` (or similar column-major Eigen type).
+
+One major caveat with this approach, however, is that it is not entirely as
+easy as simply flipping all Eigen or numpy usage from one to the other: some
+operations may alter the storage order of a numpy array.  For example, ``a2 =
+array.transpose()`` results in ``a2`` being a view of ``array`` that references
+the same data, but in the opposite storage order!
+
+While this approach allows fully optimized vectorized calculations in Eigen, it
+cannot be used with array slices, unlike the first approach.
+
+When *returning* a matrix to Python (either a regular matrix, a reference via
+``Eigen::Ref<>``, or a map/block into a matrix), no special storage
+consideration is required: the created numpy array will have the required
+stride that allows numpy to properly interpret the array, whatever its storage
+order.
+
+Failing rather than copying
+===========================
+
+The default behaviour when binding ``Eigen::Ref<const MatrixType>`` eigen
+references is to copy matrix values when passed a numpy array that does not
+conform to the element type of ``MatrixType`` or does not have a compatible
+stride layout.  If you want to explicitly avoid copying in such a case, you
+should bind arguments using the ``py::arg().noconvert()`` annotation (as
+described in the :ref:`nonconverting_arguments` documentation).
+
+The following code shows an example of arguments that don't allow data
+copying to take place:
+
+.. code-block:: cpp
+
+    // The method and function to be bound:
+    class MyClass {
+        // ...
+        double some_method(const Eigen::Ref<const MatrixXd> &matrix) { /* ... */ }
+    };
+    float some_function(const Eigen::Ref<const MatrixXf> &big,
+                        const Eigen::Ref<const MatrixXf> &small) {
+        // ...
     }
 
-To see why this is, refer to the section on :ref:`opaque` (although that
-section specifically covers STL data types, the underlying issue is the same).
-The :ref:`numpy` sections discuss an efficient alternative for exposing the
-underlying native Eigen types as opaque objects in a way that still integrates
-with NumPy and SciPy.
+    // The associated binding code:
+    using namespace pybind11::literals; // for "arg"_a
+    py::class_<MyClass>(m, "MyClass")
+        // ... other class definitions
+        .def("some_method", &MyClass::some_method, py::arg().noconvert());
+
+    m.def("some_function", &some_function,
+        "big"_a.noconvert(), // <- Don't allow copying for this arg
+        "small"_a            // <- This one can be copied if needed
+    );
+
+With the above binding code, attempting to call the ``some_method(m)``
+method on a ``MyClass`` object, or attempting to call ``some_function(m, m2)``
+will raise a ``RuntimeError`` rather than making a temporary copy of the array.
+It will, however, allow the ``m2`` argument to be copied into a temporary if
+necessary.
+
+Note that explicitly specifying ``.noconvert()`` is not required for *mutable*
+Eigen references (e.g. ``Eigen::Ref<MatrixXd>`` without ``const`` on the
+``MatrixXd``): mutable references will never be called with a temporary copy.
+
+Vectors versus column/row matrices
+==================================
+
+Eigen and numpy have fundamentally different notions of a vector.  In Eigen, a
+vector is simply a matrix with the number of columns or rows set to 1 at
+compile time (for a column vector or row vector, respectively).  Numpy, in
+contrast, has comparable 2-dimensional 1xN and Nx1 arrays, but *also* has
+1-dimensional arrays of size N.
+
+When passing a 2-dimensional 1xN or Nx1 array to Eigen, the Eigen type must
+have matching dimensions: That is, you cannot pass a 2-dimensional Nx1 numpy
+array to an Eigen value expecting a row vector, or a 1xN numpy array as a
+column vector argument.
+
+On the other hand, pybind11 allows you to pass 1-dimensional arrays of length N
+as Eigen parameters.  If the Eigen type can hold a column vector of length N it
+will be passed as such a column vector.  If not, but the Eigen type constraints
+will accept a row vector, it will be passed as a row vector.  (The column
+vector takes precedence when both are supported, for example, when passing a
+1D numpy array to a MatrixXd argument).  Note that the type need not be
+explicitly a vector: it is permitted to pass a 1D numpy array of size 5 to an
+Eigen ``Matrix<double, Dynamic, 5>``: you would end up with a 1x5 Eigen matrix.
+Passing the same to an ``Eigen::MatrixXd`` would result in a 5x1 Eigen matrix.
+
+When returning an eigen vector to numpy, the conversion is ambiguous: a row
+vector of length 4 could be returned as either a 1D array of length 4, or as a
+2D array of size 1x4.  When encountering such a situation, pybind11 compromises
+by considering the returned Eigen type: if it is a compile-time vector--that
+is, the type has either the number of rows or columns set to 1 at compile
+time--pybind11 converts to a 1D numpy array when returning the value.  For
+instances that are a vector only at run-time (e.g. ``MatrixXd``,
+``Matrix<float, Dynamic, 4>``), pybind11 returns the vector as a 2D array to
+numpy.  If this isn't what you want, you can use ``array.reshape(...)`` to get
+a view of the same data in the desired dimensions.
 
 .. seealso::
 
diff --git a/pybind11/docs/advanced/cast/functional.rst b/pybind11/docs/advanced/cast/functional.rst
index 5d0a01d13..d9b460575 100644
--- a/pybind11/docs/advanced/cast/functional.rst
+++ b/pybind11/docs/advanced/cast/functional.rst
@@ -56,14 +56,10 @@ trivial to generate binding code for all of these functions.
 
     #include <pybind11/functional.h>
 
-    PYBIND11_PLUGIN(example) {
-        py::module m("example", "pybind11 example plugin");
-
+    PYBIND11_MODULE(example, m) {
         m.def("func_arg", &func_arg);
         m.def("func_ret", &func_ret);
         m.def("func_cpp", &func_cpp);
-
-        return m.ptr();
     }
 
 The following interactive session shows how to call them from Python.
diff --git a/pybind11/docs/advanced/cast/index.rst b/pybind11/docs/advanced/cast/index.rst
index 36586af5c..54c10570b 100644
--- a/pybind11/docs/advanced/cast/index.rst
+++ b/pybind11/docs/advanced/cast/index.rst
@@ -33,6 +33,7 @@ the last case of the above list.
    :maxdepth: 1
 
    overview
+   strings
    stl
    functional
    chrono
diff --git a/pybind11/docs/advanced/cast/overview.rst b/pybind11/docs/advanced/cast/overview.rst
index ab37b90be..2ac7d3009 100644
--- a/pybind11/docs/advanced/cast/overview.rst
+++ b/pybind11/docs/advanced/cast/overview.rst
@@ -94,16 +94,31 @@ as arguments and return values, refer to the section on binding :ref:`classes`.
 +------------------------------------+---------------------------+-------------------------------+
 | ``char``                           | Character literal         | :file:`pybind11/pybind11.h`   |
 +------------------------------------+---------------------------+-------------------------------+
+| ``char16_t``                       | UTF-16 character literal  | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``char32_t``                       | UTF-32 character literal  | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
 | ``wchar_t``                        | Wide character literal    | :file:`pybind11/pybind11.h`   |
 +------------------------------------+---------------------------+-------------------------------+
 | ``const char *``                   | UTF-8 string literal      | :file:`pybind11/pybind11.h`   |
 +------------------------------------+---------------------------+-------------------------------+
+| ``const char16_t *``               | UTF-16 string literal     | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``const char32_t *``               | UTF-32 string literal     | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
 | ``const wchar_t *``                | Wide string literal       | :file:`pybind11/pybind11.h`   |
 +------------------------------------+---------------------------+-------------------------------+
 | ``std::string``                    | STL dynamic UTF-8 string  | :file:`pybind11/pybind11.h`   |
 +------------------------------------+---------------------------+-------------------------------+
+| ``std::u16string``                 | STL dynamic UTF-16 string | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::u32string``                 | STL dynamic UTF-32 string | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
 | ``std::wstring``                   | STL dynamic wide string   | :file:`pybind11/pybind11.h`   |
 +------------------------------------+---------------------------+-------------------------------+
+| ``std::string_view``,              | STL C++17 string views    | :file:`pybind11/pybind11.h`   |
+| ``std::u16string_view``, etc.      |                           |                               |
++------------------------------------+---------------------------+-------------------------------+
 | ``std::pair<T1, T2>``              | Pair of two custom types  | :file:`pybind11/pybind11.h`   |
 +------------------------------------+---------------------------+-------------------------------+
 | ``std::tuple<...>``                | Arbitrary tuple of types  | :file:`pybind11/pybind11.h`   |
@@ -132,6 +147,8 @@ as arguments and return values, refer to the section on binding :ref:`classes`.
 +------------------------------------+---------------------------+-------------------------------+
 | ``std::experimental::optional<T>`` | STL optional type (exp.)  | :file:`pybind11/stl.h`        |
 +------------------------------------+---------------------------+-------------------------------+
+| ``std::variant<...>``              | Type-safe union (C++17)   | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
 | ``std::function<...>``             | STL polymorphic function  | :file:`pybind11/functional.h` |
 +------------------------------------+---------------------------+-------------------------------+
 | ``std::chrono::duration<...>``     | STL time duration         | :file:`pybind11/chrono.h`     |
diff --git a/pybind11/docs/advanced/cast/stl.rst b/pybind11/docs/advanced/cast/stl.rst
index bbd23732b..3f30c0290 100644
--- a/pybind11/docs/advanced/cast/stl.rst
+++ b/pybind11/docs/advanced/cast/stl.rst
@@ -5,10 +5,12 @@ Automatic conversion
 ====================
 
 When including the additional header file :file:`pybind11/stl.h`, conversions
-between ``std::vector<>``, ``std::list<>``, ``std::set<>``, and ``std::map<>``
-and the Python ``list``, ``set`` and ``dict`` data structures are automatically
-enabled. The types ``std::pair<>`` and ``std::tuple<>`` are already supported
-out of the box with just the core :file:`pybind11/pybind11.h` header.
+between ``std::vector<>``/``std::list<>``/``std::array<>``,
+``std::set<>``/``std::unordered_set<>``, and
+``std::map<>``/``std::unordered_map<>`` and the Python ``list``, ``set`` and
+``dict`` data structures are automatically enabled. The types ``std::pair<>``
+and ``std::tuple<>`` are already supported out of the box with just the core
+:file:`pybind11/pybind11.h` header.
 
 The major downside of these implicit conversions is that containers must be
 converted (i.e. copied) on every Python->C++ and C++->Python transition, which
@@ -21,9 +23,62 @@ next sections for more details and alternative approaches that avoid this.
 
 .. seealso::
 
-    The file :file:`tests/test_python_types.cpp` contains a complete
+    The file :file:`tests/test_stl.cpp` contains a complete
     example that demonstrates how to pass STL data types in more detail.
 
+.. _cpp17_container_casters:
+
+C++17 library containers
+========================
+
+The :file:`pybind11/stl.h` header also includes support for ``std::optional<>``
+and ``std::variant<>``. These require a C++17 compiler and standard library.
+In C++14 mode, ``std::experimental::optional<>`` is supported if available.
+
+Various versions of these containers also exist for C++11 (e.g. in Boost).
+pybind11 provides an easy way to specialize the ``type_caster`` for such
+types:
+
+.. code-block:: cpp
+
+    // `boost::optional` as an example -- can be any `std::optional`-like container
+    namespace pybind11 { namespace detail {
+        template <typename T>
+        struct type_caster<boost::optional<T>> : optional_caster<boost::optional<T>> {};
+    }}
+
+The above should be placed in a header file and included in all translation units
+where automatic conversion is needed. Similarly, a specialization can be provided
+for custom variant types:
+
+.. code-block:: cpp
+
+    // `boost::variant` as an example -- can be any `std::variant`-like container
+    namespace pybind11 { namespace detail {
+        template <typename... Ts>
+        struct type_caster<boost::variant<Ts...>> : variant_caster<boost::variant<Ts...>> {};
+
+        // Specifies the function used to visit the variant -- `apply_visitor` instead of `visit`
+        template <>
+        struct visit_helper<boost::variant> {
+            template <typename... Args>
+            static auto call(Args &&...args) -> decltype(boost::apply_visitor(args...)) {
+                return boost::apply_visitor(args...);
+            }
+        };
+    }} // namespace pybind11::detail
+
+The ``visit_helper`` specialization is not required if your ``name::variant`` provides
+a ``name::visit()`` function. For any other function name, the specialization must be
+included to tell pybind11 how to visit the variant.
+
+.. note::
+
+    pybind11 only supports the modern implementation of ``boost::variant``
+    which makes use of variadic templates. This requires Boost 1.56 or newer.
+    Additionally, on Windows, MSVC 2017 is required because ``boost::variant``
+    falls back to the old non-variadic implementation on MSVC 2015.
+
 .. _opaque:
 
 Making opaque types
@@ -72,7 +127,7 @@ functions:
     /* ... binding code ... */
 
     py::class_<MyClass>(m, "MyClass")
-        .def(py::init<>)
+        .def(py::init<>())
         .def_readwrite("contents", &MyClass::contents);
 
 In this case, properties can be read and written in their entirety. However, an
@@ -103,10 +158,10 @@ the declaration
 before any binding code (e.g. invocations to ``class_::def()``, etc.). This
 macro must be specified at the top level (and outside of any namespaces), since
 it instantiates a partial template overload. If your binding code consists of
-multiple compilation units, it must be present in every file preceding any
-usage of ``std::vector<int>``. Opaque types must also have a corresponding
-``class_`` declaration to associate them with a name in Python, and to define a
-set of available operations, e.g.:
+multiple compilation units, it must be present in every file (typically via a
+common header) preceding any usage of ``std::vector<int>``. Opaque types must
+also have a corresponding ``class_`` declaration to associate them with a name
+in Python, and to define a set of available operations, e.g.:
 
 .. code-block:: cpp
 
@@ -120,6 +175,20 @@ set of available operations, e.g.:
         }, py::keep_alive<0, 1>()) /* Keep vector alive while iterator is used */
         // ....
 
+Please take a look at the :ref:`macro_notes` before using the
+``PYBIND11_MAKE_OPAQUE`` macro.
+
+.. seealso::
+
+    The file :file:`tests/test_opaque_types.cpp` contains a complete
+    example that demonstrates how to create and expose opaque types using
+    pybind11 in more detail.
+
+.. _stl_bind:
+
+Binding STL containers
+======================
+
 The ability to expose STL containers as native Python objects is a fairly
 common request, hence pybind11 also provides an optional header file named
 :file:`pybind11/stl_bind.h` that does exactly this. The mapped containers try
@@ -141,14 +210,34 @@ The following example showcases usage of :file:`pybind11/stl_bind.h`:
     py::bind_vector<std::vector<int>>(m, "VectorInt");
     py::bind_map<std::map<std::string, double>>(m, "MapStringDouble");
 
-Please take a look at the :ref:`macro_notes` before using the
-``PYBIND11_MAKE_OPAQUE`` macro.
+When binding STL containers pybind11 considers the types of the container's
+elements to decide whether the container should be confined to the local module
+(via the :ref:`module_local` feature).  If the container element types are
+anything other than already-bound custom types bound without
+``py::module_local()`` the container binding will have ``py::module_local()``
+applied.  This includes converting types such as numeric types, strings, Eigen
+types; and types that have not yet been bound at the time of the stl container
+binding.  This module-local binding is designed to avoid potential conflicts
+between module bindings (for example, from two separate modules each attempting
+to bind ``std::vector<int>`` as a python type).
+
+It is possible to override this behavior to force a definition to be either
+module-local or global.  To do so, you can pass the attributes
+``py::module_local()`` (to make the binding module-local) or
+``py::module_local(false)`` (to make the binding global) into the
+``py::bind_vector`` or ``py::bind_map`` arguments:
 
-.. seealso::
+.. code-block:: cpp
 
-    The file :file:`tests/test_opaque_types.cpp` contains a complete
-    example that demonstrates how to create and expose opaque types using
-    pybind11 in more detail.
+    py::bind_vector<std::vector<int>>(m, "VectorInt", py::module_local(false));
+
+Note, however, that such a global binding would make it impossible to load this
+module at the same time as any other pybind module that also attempts to bind
+the same container type (``std::vector<int>`` in the above example).
+
+See :ref:`module_local` for more details on module-local bindings.
+
+.. seealso::
 
     The file :file:`tests/test_stl_binders.cpp` shows how to use the
     convenience STL container wrappers.
diff --git a/pybind11/docs/advanced/cast/strings.rst b/pybind11/docs/advanced/cast/strings.rst
new file mode 100644
index 000000000..2cdbade3a
--- /dev/null
+++ b/pybind11/docs/advanced/cast/strings.rst
@@ -0,0 +1,303 @@
+Strings, bytes and Unicode conversions
+######################################
+
+.. note::
+
+    This section discusses string handling in terms of Python 3 strings. For
+    Python 2.7, replace all occurrences of ``str`` with ``unicode`` and
+    ``bytes`` with ``str``.  Python 2.7 users may find it best to use ``from
+    __future__ import unicode_literals`` to avoid unintentionally using ``str``
+    instead of ``unicode``.
+
+Passing Python strings to C++
+=============================
+
+When a Python ``str`` is passed from Python to a C++ function that accepts
+``std::string`` or ``char *`` as arguments, pybind11 will encode the Python
+string to UTF-8. All Python ``str`` can be encoded in UTF-8, so this operation
+does not fail.
+
+The C++ language is encoding agnostic. It is the responsibility of the
+programmer to track encodings. It's often easiest to simply `use UTF-8
+everywhere <http://utf8everywhere.org/>`_.
+
+.. code-block:: c++
+
+    m.def("utf8_test",
+        [](const std::string &s) {
+            cout << "utf-8 is icing on the cake.\n";
+            cout << s;
+        }
+    );
+    m.def("utf8_charptr",
+        [](const char *s) {
+            cout << "My favorite food is\n";
+            cout << s;
+        }
+    );
+
+.. code-block:: python
+
+    >>> utf8_test('🎂')
+    utf-8 is icing on the cake.
+    🎂
+
+    >>> utf8_charptr('🍕')
+    My favorite food is
+    🍕
+
+.. note::
+
+    Some terminal emulators do not support UTF-8 or emoji fonts and may not
+    display the example above correctly.
+
+The results are the same whether the C++ function accepts arguments by value or
+reference, and whether or not ``const`` is used.
+
+Passing bytes to C++
+--------------------
+
+A Python ``bytes`` object will be passed to C++ functions that accept
+``std::string`` or ``char*`` *without* conversion.
+
+
+Returning C++ strings to Python
+===============================
+
+When a C++ function returns a ``std::string`` or ``char*`` to a Python caller,
+**pybind11 will assume that the string is valid UTF-8** and will decode it to a
+native Python ``str``, using the same API as Python uses to perform
+``bytes.decode('utf-8')``. If this implicit conversion fails, pybind11 will
+raise a ``UnicodeDecodeError``.
+
+.. code-block:: c++
+
+    m.def("std_string_return",
+        []() {
+            return std::string("This string needs to be UTF-8 encoded");
+        }
+    );
+
+.. code-block:: python
+
+    >>> isinstance(example.std_string_return(), str)
+    True
+
+
+Because UTF-8 is inclusive of pure ASCII, there is never any issue with
+returning a pure ASCII string to Python. If there is any possibility that the
+string is not pure ASCII, it is necessary to ensure the encoding is valid
+UTF-8.
+
+.. warning::
+
+    Implicit conversion assumes that a returned ``char *`` is null-terminated.
+    If there is no null terminator a buffer overrun will occur.
+
+Explicit conversions
+--------------------
+
+If some C++ code constructs a ``std::string`` that is not a UTF-8 string, one
+can perform an explicit conversion and return a ``py::str`` object. Explicit
+conversion has the same overhead as implicit conversion.
+
+.. code-block:: c++
+
+    // This uses the Python C API to convert Latin-1 to Unicode
+    m.def("str_output",
+        []() {
+            std::string s = "Send your r\xe9sum\xe9 to Alice in HR"; // Latin-1
+            py::str py_s = PyUnicode_DecodeLatin1(s.data(), s.length());
+            return py_s;
+        }
+    );
+
+.. code-block:: python
+
+    >>> str_output()
+    'Send your résumé to Alice in HR'
+
+The `Python C API
+<https://docs.python.org/3/c-api/unicode.html#built-in-codecs>`_ provides
+several built-in codecs.
+
+
+One could also use a third party encoding library such as libiconv to transcode
+to UTF-8.
+
+Return C++ strings without conversion
+-------------------------------------
+
+If the data in a C++ ``std::string`` does not represent text and should be
+returned to Python as ``bytes``, then one can return the data as a
+``py::bytes`` object.
+
+.. code-block:: c++
+
+    m.def("return_bytes",
+        []() {
+            std::string s("\xba\xd0\xba\xd0");  // Not valid UTF-8
+            return py::bytes(s);  // Return the data without transcoding
+        }
+    );
+
+.. code-block:: python
+
+    >>> example.return_bytes()
+    b'\xba\xd0\xba\xd0'
+
+
+Note the asymmetry: pybind11 will convert ``bytes`` to ``std::string`` without
+encoding, but cannot convert ``std::string`` back to ``bytes`` implicitly.
+
+.. code-block:: c++
+
+    m.def("asymmetry",
+        [](std::string s) {  // Accepts str or bytes from Python
+            return s;  // Looks harmless, but implicitly converts to str
+        }
+    );
+
+.. code-block:: python
+
+    >>> isinstance(example.asymmetry(b"have some bytes"), str)
+    True
+
+    >>> example.asymmetry(b"\xba\xd0\xba\xd0")  # invalid utf-8 as bytes
+    UnicodeDecodeError: 'utf-8' codec can't decode byte 0xba in position 0: invalid start byte
+
+
+Wide character strings
+======================
+
+When a Python ``str`` is passed to a C++ function expecting ``std::wstring``,
+``wchar_t*``, ``std::u16string`` or ``std::u32string``, the ``str`` will be
+encoded to UTF-16 or UTF-32 depending on how the C++ compiler implements each
+type, in the platform's native endianness. When strings of these types are
+returned, they are assumed to contain valid UTF-16 or UTF-32, and will be
+decoded to Python ``str``.
+
+.. code-block:: c++
+
+    #define UNICODE
+    #include <windows.h>
+
+    m.def("set_window_text",
+        [](HWND hwnd, std::wstring s) {
+            // Call SetWindowText with null-terminated UTF-16 string
+            ::SetWindowText(hwnd, s.c_str());
+        }
+    );
+    m.def("get_window_text",
+        [](HWND hwnd) {
+            const int buffer_size = ::GetWindowTextLength(hwnd) + 1;
+            auto buffer = std::make_unique< wchar_t[] >(buffer_size);
+
+            ::GetWindowText(hwnd, buffer.get(), buffer_size);
+
+            std::wstring text(buffer.get());
+
+            // wstring will be converted to Python str
+            return text;
+        }
+    );
+
+.. warning::
+
+    Wide character strings may not work as described on Python 2.7 or Python
+    3.3 compiled with ``--enable-unicode=ucs2``.
+
+Strings in multibyte encodings such as Shift-JIS must be transcoded to
+UTF-8/16/32 before being returned to Python.
+
+
+Character literals
+==================
+
+C++ functions that accept character literals as input will receive the first
+character of a Python ``str`` as their input. If the string is longer than one
+Unicode character, trailing characters will be ignored.
+
+When a character literal is returned from C++ (such as a ``char`` or a
+``wchar_t``), it will be converted to a ``str`` that represents the single
+character.
+
+.. code-block:: c++
+
+    m.def("pass_char", [](char c) { return c; });
+    m.def("pass_wchar", [](wchar_t w) { return w; });
+
+.. code-block:: python
+
+    >>> example.pass_char('A')
+    'A'
+
+While C++ will cast integers to character types (``char c = 0x65;``), pybind11
+does not convert Python integers to characters implicitly. The Python function
+``chr()`` can be used to convert integers to characters.
+
+.. code-block:: python
+
+    >>> example.pass_char(0x65)
+    TypeError
+
+    >>> example.pass_char(chr(0x65))
+    'e'
+
+If the desire is to work with an 8-bit integer, use ``int8_t`` or ``uint8_t``
+as the argument type.
+
+Grapheme clusters
+-----------------
+
+A single grapheme may be represented by two or more Unicode characters. For
+example 'é' is usually represented as U+00E9 but can also be expressed as the
+combining character sequence U+0065 U+0301 (that is, the letter 'e' followed by
+a combining acute accent). The combining character will be lost if the
+two-character sequence is passed as an argument, even though it renders as a
+single grapheme.
+
+.. code-block:: python
+
+    >>> example.pass_wchar('é')
+    'é'
+
+    >>> combining_e_acute = 'e' + '\u0301'
+
+    >>> combining_e_acute
+    'é'
+
+    >>> combining_e_acute == 'é'
+    False
+
+    >>> example.pass_wchar(combining_e_acute)
+    'e'
+
+Normalizing combining characters before passing the character literal to C++
+may resolve *some* of these issues:
+
+.. code-block:: python
+
+    >>> example.pass_wchar(unicodedata.normalize('NFC', combining_e_acute))
+    'é'
+
+In some languages (Thai for example), there are `graphemes that cannot be
+expressed as a single Unicode code point
+<http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries>`_, so there is
+no way to capture them in a C++ character type.
+
+
+C++17 string views
+==================
+
+C++17 string views are automatically supported when compiling in C++17 mode.
+They follow the same rules for encoding and decoding as the corresponding STL
+string type (for example, a ``std::u16string_view`` argument will be passed
+UTF-16-encoded data, and a returned ``std::string_view`` will be decoded as
+UTF-8).
+
+References
+==========
+
+* `The Absolute Minimum Every Software Developer Absolutely, Positively Must Know About Unicode and Character Sets (No Excuses!) <https://www.joelonsoftware.com/2003/10/08/the-absolute-minimum-every-software-developer-absolutely-positively-must-know-about-unicode-and-character-sets-no-excuses/>`_
+* `C++ - Using STL Strings at Win32 API Boundaries <https://msdn.microsoft.com/en-ca/magazine/mt238407.aspx>`_
diff --git a/pybind11/docs/advanced/classes.rst b/pybind11/docs/advanced/classes.rst
index e20895e6d..93deeec62 100644
--- a/pybind11/docs/advanced/classes.rst
+++ b/pybind11/docs/advanced/classes.rst
@@ -45,9 +45,7 @@ Normally, the binding code for these classes would look as follows:
 
 .. code-block:: cpp
 
-    PYBIND11_PLUGIN(example) {
-        py::module m("example", "pybind11 example plugin");
-
+    PYBIND11_MODULE(example, m) {
         py::class_<Animal> animal(m, "Animal");
         animal
             .def("go", &Animal::go);
@@ -56,8 +54,6 @@ Normally, the binding code for these classes would look as follows:
             .def(py::init<>());
 
         m.def("call_go", &call_go);
-
-        return m.ptr();
     }
 
 However, these bindings are impossible to extend: ``Animal`` is not
@@ -79,7 +75,7 @@ helper class that is defined as follows:
             PYBIND11_OVERLOAD_PURE(
                 std::string, /* Return type */
                 Animal,      /* Parent class */
-                go,          /* Name of function */
+                go,          /* Name of function in C++ (must match Python name) */
                 n_times      /* Argument(s) */
             );
         }
@@ -90,17 +86,16 @@ functions, and :func:`PYBIND11_OVERLOAD` should be used for functions which have
 a default implementation.  There are also two alternate macros
 :func:`PYBIND11_OVERLOAD_PURE_NAME` and :func:`PYBIND11_OVERLOAD_NAME` which
 take a string-valued name argument between the *Parent class* and *Name of the
-function* slots. This is useful when the C++ and Python versions of the
+function* slots, which defines the name of the function in Python. This is
+required when the C++ and Python versions of the
 function have different names, e.g.  ``operator()`` vs ``__call__``.
 
 The binding code also needs a few minor adaptations (highlighted):
 
 .. code-block:: cpp
-    :emphasize-lines: 4,6,7
-
-    PYBIND11_PLUGIN(example) {
-        py::module m("example", "pybind11 example plugin");
+    :emphasize-lines: 2,4,5
 
+    PYBIND11_MODULE(example, m) {
         py::class_<Animal, PyAnimal /* <--- trampoline*/> animal(m, "Animal");
         animal
             .def(py::init<>())
@@ -110,18 +105,25 @@ The binding code also needs a few minor adaptations (highlighted):
             .def(py::init<>());
 
         m.def("call_go", &call_go);
-
-        return m.ptr();
     }
 
 Importantly, pybind11 is made aware of the trampoline helper class by
-specifying it as an extra template argument to :class:`class_`.  (This can also
+specifying it as an extra template argument to :class:`class_`. (This can also
 be combined with other template arguments such as a custom holder type; the
 order of template types does not matter).  Following this, we are able to
 define a constructor as usual.
 
+Bindings should be made against the actual class, not the trampoline helper class.
+
+.. code-block:: cpp
+
+    py::class_<Animal, PyAnimal /* <--- trampoline*/> animal(m, "Animal");
+        animal
+            .def(py::init<>())
+            .def("go", &PyAnimal::go); /* <--- THIS IS WRONG, use &Animal::go */
+
 Note, however, that the above is sufficient for allowing python classes to
-extend ``Animal``, but not ``Dog``: see ref:`virtual_and_inheritance` for the
+extend ``Animal``, but not ``Dog``: see :ref:`virtual_and_inheritance` for the
 necessary steps required to providing proper overload support for inherited
 classes.
 
@@ -142,6 +144,30 @@ a virtual method call.
     >>> call_go(c)
     u'meow! meow! meow! '
 
+If you are defining a custom constructor in a derived Python class, you *must*
+ensure that you explicitly call the bound C++ constructor using ``__init__``,
+*regardless* of whether it is a default constructor or not. Otherwise, the
+memory for the C++ portion of the instance will be left uninitialized, which
+will generally leave the C++ instance in an invalid state and cause undefined
+behavior if the C++ instance is subsequently used.
+
+Here is an example:
+
+.. code-block:: python
+
+    class Dachshund(Dog):
+        def __init__(self, name):
+            Dog.__init__(self) # Without this, undefined behavior may occur if the C++ portions are referenced.
+            self.name = name
+        def bark(self):
+            return "yap!"
+
+Note that a direct ``__init__`` constructor *should be called*, and ``super()``
+should not be used. For simple cases of linear inheritance, ``super()``
+may work, but once you begin mixing Python and C++ multiple inheritance,
+things will fall apart due to differences between Python's MRO and C++'s
+mechanisms.
+
 Please take a look at the :ref:`macro_notes` before using this feature.
 
 .. note::
@@ -186,7 +212,7 @@ example as follows:
         virtual std::string go(int n_times) = 0;
         virtual std::string name() { return "unknown"; }
     };
-    class Dog : public class Animal {
+    class Dog : public Animal {
     public:
         std::string go(int n_times) override {
             std::string result;
@@ -220,6 +246,13 @@ override the ``name()`` method):
         std::string bark() override { PYBIND11_OVERLOAD(std::string, Dog, bark, ); }
     };
 
+.. note::
+
+    Note the trailing commas in the ``PYBIND11_OVERLOAD`` calls to ``name()``
+    and ``bark()``. These are needed to portably implement a trampoline for a
+    function that does not take any arguments. For functions that take
+    a nonzero number of arguments, the trailing comma must be omitted.
+
 A registered class derived from a pybind11-registered class with virtual
 methods requires a similar trampoline class, *even if* it doesn't explicitly
 declare or override any virtual methods itself:
@@ -228,7 +261,8 @@ declare or override any virtual methods itself:
 
     class Husky : public Dog {};
     class PyHusky : public Husky {
-        using Dog::Dog; // Inherit constructors
+    public:
+        using Husky::Husky; // Inherit constructors
         std::string go(int n_times) override { PYBIND11_OVERLOAD_PURE(std::string, Husky, go, n_times); }
         std::string name() override { PYBIND11_OVERLOAD(std::string, Husky, name, ); }
         std::string bark() override { PYBIND11_OVERLOAD(std::string, Husky, bark, ); }
@@ -242,11 +276,13 @@ follows:
 .. code-block:: cpp
 
     template <class AnimalBase = Animal> class PyAnimal : public AnimalBase {
+    public:
         using AnimalBase::AnimalBase; // Inherit constructors
         std::string go(int n_times) override { PYBIND11_OVERLOAD_PURE(std::string, AnimalBase, go, n_times); }
         std::string name() override { PYBIND11_OVERLOAD(std::string, AnimalBase, name, ); }
     };
     template <class DogBase = Dog> class PyDog : public PyAnimal<DogBase> {
+    public:
         using PyAnimal<DogBase>::PyAnimal; // Inherit constructors
         // Override PyAnimal's pure virtual go() with a non-pure one:
         std::string go(int n_times) override { PYBIND11_OVERLOAD(std::string, DogBase, go, n_times); }
@@ -286,6 +322,8 @@ can now create a python class that inherits from ``Dog``:
     See the file :file:`tests/test_virtual_functions.cpp` for complete examples
     using both the duplication and templated trampoline approaches.
 
+.. _extended_aliases:
+
 Extended trampoline class functionality
 =======================================
 
@@ -313,7 +351,7 @@ ensuring member initialization and (eventual) destruction.
 
 .. seealso::
 
-    See the file :file:`tests/test_alias_initialization.cpp` for complete examples
+    See the file :file:`tests/test_virtual_functions.cpp` for complete examples
     showing both normal and forced trampoline instantiation.
 
 .. _custom_constructors:
@@ -322,29 +360,129 @@ Custom constructors
 ===================
 
 The syntax for binding constructors was previously introduced, but it only
-works when a constructor with the given parameters actually exists on the C++
-side. To extend this to more general cases, let's take a look at what actually
-happens under the hood: the following statement
+works when a constructor of the appropriate arguments actually exists on the
+C++ side.  To extend this to more general cases, pybind11 makes it possible
+to bind factory functions as constructors. For example, suppose you have a
+class like this:
 
 .. code-block:: cpp
 
+    class Example {
+    private:
+        Example(int); // private constructor
+    public:
+        // Factory function:
+        static Example create(int a) { return Example(a); }
+    };
+
     py::class_<Example>(m, "Example")
-        .def(py::init<int>());
+        .def(py::init(&Example::create));
+
+While it is possible to create a straightforward binding of the static
+``create`` method, it may sometimes be preferable to expose it as a constructor
+on the Python side. This can be accomplished by calling ``.def(py::init(...))``
+with the function reference returning the new instance passed as an argument.
+It is also possible to use this approach to bind a function returning a new
+instance by raw pointer or by the holder (e.g. ``std::unique_ptr``).
 
-is short hand notation for
+The following example shows the different approaches:
 
 .. code-block:: cpp
 
+    class Example {
+    private:
+        Example(int); // private constructor
+    public:
+        // Factory function - returned by value:
+        static Example create(int a) { return Example(a); }
+
+        // These constructors are publicly callable:
+        Example(double);
+        Example(int, int);
+        Example(std::string);
+    };
+
     py::class_<Example>(m, "Example")
-        .def("__init__",
-            [](Example &instance, int arg) {
-                new (&instance) Example(arg);
-            }
-        );
+        // Bind the factory function as a constructor:
+        .def(py::init(&Example::create))
+        // Bind a lambda function returning a pointer wrapped in a holder:
+        .def(py::init([](std::string arg) {
+            return std::unique_ptr<Example>(new Example(arg));
+        }))
+        // Return a raw pointer:
+        .def(py::init([](int a, int b) { return new Example(a, b); }))
+        // You can mix the above with regular C++ constructor bindings as well:
+        .def(py::init<double>())
+        ;
+
+When the constructor is invoked from Python, pybind11 will call the factory
+function and store the resulting C++ instance in the Python instance.
+
+When combining factory function constructors with :ref:`virtual function
+trampolines <overriding_virtuals>` there are two approaches.  The first is to
+add a constructor to the alias class that takes a base value by
+rvalue-reference.  If such a constructor is available, it will be used to
+construct an alias instance from the value returned by the factory function.
+The second option is to provide two factory functions to ``py::init()``: the
+first will be invoked when no alias class is required (i.e. when the class is
+being used but not inherited from in Python), and the second will be invoked
+when an alias is required.
+
+You can also specify a single factory function that always returns an alias
+instance: this will result in behaviour similar to ``py::init_alias<...>()``,
+as described in the :ref:`extended trampoline class documentation
+<extended_aliases>`.
+
+The following example shows the different factory approaches for a class with
+an alias:
 
-In other words, :func:`init` creates an anonymous function that invokes an
-in-place constructor. Memory allocation etc. is already take care of beforehand
-within pybind11.
+.. code-block:: cpp
+
+    #include <pybind11/pybind11.h>
+    class Example {
+    public:
+        // ...
+        virtual ~Example() = default;
+    };
+    class PyExample : public Example {
+    public:
+        using Example::Example;
+        PyExample(Example &&base) : Example(std::move(base)) {}
+    };
+    py::class_<Example, PyExample>(m, "Example")
+        // Returns an Example pointer.  If a PyExample is needed, the Example
+        // instance will be moved via the extra constructor in PyExample, above.
+        .def(py::init([]() { return new Example(); }))
+        // Two callbacks:
+        .def(py::init([]() { return new Example(); } /* no alias needed */,
+                      []() { return new PyExample(); } /* alias needed */))
+        // *Always* returns an alias instance (like py::init_alias<>())
+        .def(py::init([]() { return new PyExample(); }))
+        ;
+
+Brace initialization
+--------------------
+
+``pybind11::init<>`` internally uses C++11 brace initialization to call the
+constructor of the target class. This means that it can be used to bind
+*implicit* constructors as well:
+
+.. code-block:: cpp
+
+    struct Aggregate {
+        int a;
+        std::string b;
+    };
+
+    py::class_<Aggregate>(m, "Aggregate")
+        .def(py::init<int, const std::string &>());
+
+.. note::
+
+    Note that brace initialization preferentially invokes constructor overloads
+    taking a ``std::initializer_list``. In the rare event that this causes an
+    issue, you can work around it by using ``py::init(...)`` with a lambda
+    function that constructs the new object as desired.
 
 .. _classes_with_non_public_destructors:
 
@@ -373,7 +511,9 @@ crucial that instances are deallocated on the C++ side to avoid memory leaks.
     /* ... binding code ... */
 
     py::class_<MyClass, std::unique_ptr<MyClass, py::nodelete>>(m, "MyClass")
-        .def(py::init<>)
+        .def(py::init<>())
+
+.. _implicit_conversions:
 
 Implicit conversions
 ====================
@@ -413,6 +553,10 @@ Python side:
     Implicit conversions from ``A`` to ``B`` only work when ``B`` is a custom
     data type that is exposed to Python via pybind11.
 
+    To prevent runaway recursion, implicit conversions are non-reentrant: an
+    implicit conversion invoked as part of another implicit conversion of the
+    same type (i.e. from ``A`` to ``B``) will fail.
+
 .. _static_properties:
 
 Static properties
@@ -422,24 +566,15 @@ The section on :ref:`properties` discussed the creation of instance properties
 that are implemented in terms of C++ getters and setters.
 
 Static properties can also be created in a similar way to expose getters and
-setters of static class attributes. Two things are important to note:
-
-1. Static properties are implemented by instrumenting the *metaclass* of the
-   class in question -- however, this requires the class to have a modifiable
-   metaclass in the first place. pybind11 provides a ``py::metaclass()``
-   annotation that must be specified in the ``class_`` constructor, or any
-   later method calls to ``def_{property_,∅}_{readwrite,readonly}_static`` will
-   fail (see the example below).
-
-2. For static properties defined in terms of setter and getter functions, note
-   that the implicit ``self`` argument also exists in this case and is used to
-   pass the Python ``type`` subclass instance. This parameter will often not be
-   needed by the C++ side, and the following example illustrates how to
-   instantiate a lambda getter function that ignores it:
+setters of static class attributes. Note that the implicit ``self`` argument
+also exists in this case and is used to pass the Python ``type`` subclass
+instance. This parameter will often not be needed by the C++ side, and the
+following example illustrates how to instantiate a lambda getter function
+that ignores it:
 
 .. code-block:: cpp
 
-    py::class_<Foo>(m, "Foo", py::metaclass())
+    py::class_<Foo>(m, "Foo")
         .def_property_readonly_static("foo", [](py::object /* self */) { return Foo(); });
 
 Operator overloading
@@ -478,18 +613,15 @@ to Python.
 
     #include <pybind11/operators.h>
 
-    PYBIND11_PLUGIN(example) {
-        py::module m("example", "pybind11 example plugin");
-
+    PYBIND11_MODULE(example, m) {
         py::class_<Vector2>(m, "Vector2")
             .def(py::init<float, float>())
             .def(py::self + py::self)
             .def(py::self += py::self)
             .def(py::self *= float())
             .def(float() * py::self)
+            .def(py::self * float())
             .def("__repr__", &Vector2::toString);
-
-        return m.ptr();
     }
 
 Note that a line like
@@ -523,13 +655,15 @@ throwing a type error.
     complete example that demonstrates how to work with overloaded operators in
     more detail.
 
+.. _pickling:
+
 Pickling support
 ================
 
 Python's ``pickle`` module provides a powerful facility to serialize and
 de-serialize a Python object graph into a binary data stream. To pickle and
-unpickle C++ classes using pybind11, two additional functions must be provided.
-Suppose the class in question has the following signature:
+unpickle C++ classes using pybind11, a ``py::pickle()`` definition must be
+provided. Suppose the class in question has the following signature:
 
 .. code-block:: cpp
 
@@ -545,8 +679,9 @@ Suppose the class in question has the following signature:
         int m_extra = 0;
     };
 
-The binding code including the requisite ``__setstate__`` and ``__getstate__`` methods [#f3]_
-looks as follows:
+Pickling support in Python is enabled by defining the ``__setstate__`` and
+``__getstate__`` methods [#f3]_. For pybind11 classes, use ``py::pickle()``
+to bind these two functions:
 
 .. code-block:: cpp
 
@@ -555,21 +690,28 @@ looks as follows:
         .def("value", &Pickleable::value)
         .def("extra", &Pickleable::extra)
         .def("setExtra", &Pickleable::setExtra)
-        .def("__getstate__", [](const Pickleable &p) {
-            /* Return a tuple that fully encodes the state of the object */
-            return py::make_tuple(p.value(), p.extra());
-        })
-        .def("__setstate__", [](Pickleable &p, py::tuple t) {
-            if (t.size() != 2)
-                throw std::runtime_error("Invalid state!");
-
-            /* Invoke the in-place constructor. Note that this is needed even
-               when the object just has a trivial default constructor */
-            new (&p) Pickleable(t[0].cast<std::string>());
-
-            /* Assign any additional state */
-            p.setExtra(t[1].cast<int>());
-        });
+        .def(py::pickle(
+            [](const Pickleable &p) { // __getstate__
+                /* Return a tuple that fully encodes the state of the object */
+                return py::make_tuple(p.value(), p.extra());
+            },
+            [](py::tuple t) { // __setstate__
+                if (t.size() != 2)
+                    throw std::runtime_error("Invalid state!");
+
+                /* Create a new C++ instance */
+                Pickleable p(t[0].cast<std::string>());
+
+                /* Assign any additional state */
+                p.setExtra(t[1].cast<int>());
+
+                return p;
+            }
+        ));
+
+The ``__setstate__`` part of the ``py::pickle()`` definition follows the same
+rules as the single-argument version of ``py::init()``. The return type can be
+a value, pointer or holder type. See :ref:`custom_constructors` for details.
 
 An instance can now be pickled as follows:
 
@@ -617,27 +759,243 @@ interspersed with alias types and holder types (discussed earlier in this
 document)---pybind11 will automatically find out which is which. The only
 requirement is that the first template argument is the type to be declared.
 
-There are two caveats regarding the implementation of this feature:
+It is also permitted to inherit multiply from exported C++ classes in Python,
+as well as inheriting from multiple Python and/or pybind-exported classes.
+
+There is one caveat regarding the implementation of this feature:
+
+When only one base type is specified for a C++ type that actually has multiple
+bases, pybind11 will assume that it does not participate in multiple
+inheritance, which can lead to undefined behavior. In such cases, add the tag
+``multiple_inheritance`` to the class constructor:
+
+.. code-block:: cpp
+
+    py::class_<MyType, BaseType2>(m, "MyType", py::multiple_inheritance());
+
+The tag is redundant and does not need to be specified when multiple base types
+are listed.
+
+.. _module_local:
+
+Module-local class bindings
+===========================
+
+When creating a binding for a class, pybind by default makes that binding
+"global" across modules.  What this means is that a type defined in one module
+can be returned from any module resulting in the same Python type.  For
+example, this allows the following:
+
+.. code-block:: cpp
+
+    // In the module1.cpp binding code for module1:
+    py::class_<Pet>(m, "Pet")
+        .def(py::init<std::string>())
+        .def_readonly("name", &Pet::name);
+
+.. code-block:: cpp
+
+    // In the module2.cpp binding code for module2:
+    m.def("create_pet", [](std::string name) { return new Pet(name); });
+
+.. code-block:: pycon
+
+    >>> from module1 import Pet
+    >>> from module2 import create_pet
+    >>> pet1 = Pet("Kitty")
+    >>> pet2 = create_pet("Doggy")
+    >>> pet2.name
+    'Doggy'
+
+When writing binding code for a library, this is usually desirable: this
+allows, for example, splitting up a complex library into multiple Python
+modules.
+
+In some cases, however, this can cause conflicts.  For example, suppose two
+unrelated modules make use of an external C++ library and each provide custom
+bindings for one of that library's classes.  This will result in an error when
+a Python program attempts to import both modules (directly or indirectly)
+because of conflicting definitions on the external type:
+
+.. code-block:: cpp
+
+    // dogs.cpp
+
+    // Binding for external library class:
+    py::class_<pets::Pet>(m, "Pet")
+        .def("name", &pets::Pet::name);
 
-1. When only one base type is specified for a C++ type that actually has
-   multiple bases, pybind11 will assume that it does not participate in
-   multiple inheritance, which can lead to undefined behavior. In such cases,
-   add the tag ``multiple_inheritance``:
+    // Binding for local extension class:
+    py::class_<Dog, pets::Pet>(m, "Dog")
+        .def(py::init<std::string>());
 
-    .. code-block:: cpp
+.. code-block:: cpp
+
+    // cats.cpp, in a completely separate project from the above dogs.cpp.
+
+    // Binding for external library class:
+    py::class_<pets::Pet>(m, "Pet")
+        .def("get_name", &pets::Pet::name);
+
+    // Binding for local extending class:
+    py::class_<Cat, pets::Pet>(m, "Cat")
+        .def(py::init<std::string>());
+
+.. code-block:: pycon
+
+    >>> import cats
+    >>> import dogs
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    ImportError: generic_type: type "Pet" is already registered!
+
+To get around this, you can tell pybind11 to keep the external class binding
+localized to the module by passing the ``py::module_local()`` attribute into
+the ``py::class_`` constructor:
+
+.. code-block:: cpp
 
-        py::class_<MyType, BaseType2>(m, "MyType", py::multiple_inheritance());
+    // Pet binding in dogs.cpp:
+    py::class_<pets::Pet>(m, "Pet", py::module_local())
+        .def("name", &pets::Pet::name);
+
+.. code-block:: cpp
+
+    // Pet binding in cats.cpp:
+    py::class_<pets::Pet>(m, "Pet", py::module_local())
+        .def("get_name", &pets::Pet::name);
+
+This makes the Python-side ``dogs.Pet`` and ``cats.Pet`` into distinct classes,
+avoiding the conflict and allowing both modules to be loaded.  C++ code in the
+``dogs`` module that casts or returns a ``Pet`` instance will result in a
+``dogs.Pet`` Python instance, while C++ code in the ``cats`` module will result
+in a ``cats.Pet`` Python instance.
+
+This does come with two caveats, however: First, external modules cannot return
+or cast a ``Pet`` instance to Python (unless they also provide their own local
+bindings).  Second, from the Python point of view they are two distinct classes.
+
+Note that the locality only applies in the C++ -> Python direction.  When
+passing such a ``py::module_local`` type into a C++ function, the module-local
+classes are still considered.  This means that if the following function is
+added to any module (including but not limited to the ``cats`` and ``dogs``
+modules above) it will be callable with either a ``dogs.Pet`` or ``cats.Pet``
+argument:
+
+.. code-block:: cpp
+
+    m.def("pet_name", [](const pets::Pet &pet) { return pet.name(); });
+
+For example, suppose the above function is added to each of ``cats.cpp``,
+``dogs.cpp`` and ``frogs.cpp`` (where ``frogs.cpp`` is some other module that
+does *not* bind ``Pet`` at all).
+
+.. code-block:: pycon
+
+    >>> import cats, dogs, frogs  # No error because of the added py::module_local()
+    >>> mycat, mydog = cats.Cat("Fluffy"), dogs.Dog("Rover")
+    >>> (cats.pet_name(mycat), dogs.pet_name(mydog))
+    ('Fluffy', 'Rover')
+    >>> (cats.pet_name(mydog), dogs.pet_name(mycat), frogs.pet_name(mycat))
+    ('Rover', 'Fluffy', 'Fluffy')
+
+It is possible to use ``py::module_local()`` registrations in one module even
+if another module registers the same type globally: within the module with the
+module-local definition, all C++ instances will be cast to the associated bound
+Python type.  In other modules any such values are converted to the global
+Python type created elsewhere.
+
+.. note::
+
+    STL bindings (as provided via the optional :file:`pybind11/stl_bind.h`
+    header) apply ``py::module_local`` by default when the bound type might
+    conflict with other modules; see :ref:`stl_bind` for details.
+
+.. note::
 
-   The tag is redundant and does not need to be specified when multiple base
-   types are listed.
+    The localization of the bound types is actually tied to the shared object
+    or binary generated by the compiler/linker.  For typical modules created
+    with ``PYBIND11_MODULE()``, this distinction is not significant.  It is
+    possible, however, when :ref:`embedding` to embed multiple modules in the
+    same binary (see :ref:`embedding_modules`).  In such a case, the
+    localization will apply across all embedded modules within the same binary.
+
+.. seealso::
+
+    The file :file:`tests/test_local_bindings.cpp` contains additional examples
+    that demonstrate how ``py::module_local()`` works.
+
+Binding protected member functions
+==================================
+
+It's normally not possible to expose ``protected`` member functions to Python:
+
+.. code-block:: cpp
+
+    class A {
+    protected:
+        int foo() const { return 42; }
+    };
+
+    py::class_<A>(m, "A")
+        .def("foo", &A::foo); // error: 'foo' is a protected member of 'A'
+
+On one hand, this is good because non-``public`` members aren't meant to be
+accessed from the outside. But we may want to make use of ``protected``
+functions in derived Python classes.
+
+The following pattern makes this possible:
+
+.. code-block:: cpp
+
+    class A {
+    protected:
+        int foo() const { return 42; }
+    };
+
+    class Publicist : public A { // helper type for exposing protected functions
+    public:
+        using A::foo; // inherited with different access modifier
+    };
+
+    py::class_<A>(m, "A") // bind the primary class
+        .def("foo", &Publicist::foo); // expose protected methods via the publicist
+
+This works because ``&Publicist::foo`` is exactly the same function as
+``&A::foo`` (same signature and address), just with a different access
+modifier. The only purpose of the ``Publicist`` helper class is to make
+the function name ``public``.
+
+If the intent is to expose ``protected`` ``virtual`` functions which can be
+overridden in Python, the publicist pattern can be combined with the previously
+described trampoline:
+
+.. code-block:: cpp
+
+    class A {
+    public:
+        virtual ~A() = default;
+
+    protected:
+        virtual int foo() const { return 42; }
+    };
+
+    class Trampoline : public A {
+    public:
+        int foo() const override { PYBIND11_OVERLOAD(int, A, foo, ); }
+    };
+
+    class Publicist : public A {
+    public:
+        using A::foo;
+    };
+
+    py::class_<A, Trampoline>(m, "A") // <-- `Trampoline` here
+        .def("foo", &Publicist::foo); // <-- `Publicist` here, not `Trampoline`!
+
+.. note::
 
-2. As was previously discussed in the section on :ref:`overriding_virtuals`, it
-   is easy to create Python types that derive from C++ classes. It is even
-   possible to make use of multiple inheritance to declare a Python class which
-   has e.g. a C++ and a Python class as bases. However, any attempt to create a
-   type that has *two or more* C++ classes in its hierarchy of base types will
-   fail with a fatal error message: ``TypeError: multiple bases have instance
-   lay-out conflict``. Core Python types that are implemented in C (e.g.
-   ``dict``, ``list``, ``Exception``, etc.) also fall under this combination
-   and cannot be combined with C++ types bound using pybind11 via multiple
-   inheritance.
+    MSVC 2015 has a compiler bug (fixed in version 2017) which
+    requires a more explicit function binding in the form of
+    ``.def("foo", static_cast<int (A::*)() const>(&Publicist::foo));``
+    where ``int (A::*)() const`` is the type of ``A::foo``.
diff --git a/pybind11/docs/advanced/embedding.rst b/pybind11/docs/advanced/embedding.rst
new file mode 100644
index 000000000..393031603
--- /dev/null
+++ b/pybind11/docs/advanced/embedding.rst
@@ -0,0 +1,261 @@
+.. _embedding:
+
+Embedding the interpreter
+#########################
+
+While pybind11 is mainly focused on extending Python using C++, it's also
+possible to do the reverse: embed the Python interpreter into a C++ program.
+All of the other documentation pages still apply here, so refer to them for
+general pybind11 usage. This section will cover a few extra things required
+for embedding.
+
+Getting started
+===============
+
+A basic executable with an embedded interpreter can be created with just a few
+lines of CMake and the ``pybind11::embed`` target, as shown below. For more
+information, see :doc:`/compiling`.
+
+.. code-block:: cmake
+
+    cmake_minimum_required(VERSION 3.0)
+    project(example)
+
+    find_package(pybind11 REQUIRED)  # or `add_subdirectory(pybind11)`
+
+    add_executable(example main.cpp)
+    target_link_libraries(example PRIVATE pybind11::embed)
+
+The essential structure of the ``main.cpp`` file looks like this:
+
+.. code-block:: cpp
+
+    #include <pybind11/embed.h> // everything needed for embedding
+    namespace py = pybind11;
+
+    int main() {
+        py::scoped_interpreter guard{}; // start the interpreter and keep it alive
+
+        py::print("Hello, World!"); // use the Python API
+    }
+
+The interpreter must be initialized before using any Python API, which includes
+all the functions and classes in pybind11. The RAII guard class `scoped_interpreter`
+takes care of the interpreter lifetime. After the guard is destroyed, the interpreter
+shuts down and clears its memory. No Python functions can be called after this.
+
+Executing Python code
+=====================
+
+There are a few different ways to run Python code. One option is to use `eval`,
+`exec` or `eval_file`, as explained in :ref:`eval`. Here is a quick example in
+the context of an executable with an embedded interpreter:
+
+.. code-block:: cpp
+
+    #include <pybind11/embed.h>
+    namespace py = pybind11;
+
+    int main() {
+        py::scoped_interpreter guard{};
+
+        py::exec(R"(
+            kwargs = dict(name="World", number=42)
+            message = "Hello, {name}! The answer is {number}".format(**kwargs)
+            print(message)
+        )");
+    }
+
+Alternatively, similar results can be achieved using pybind11's API (see
+:doc:`/advanced/pycpp/index` for more details).
+
+.. code-block:: cpp
+
+    #include <pybind11/embed.h>
+    namespace py = pybind11;
+    using namespace py::literals;
+
+    int main() {
+        py::scoped_interpreter guard{};
+
+        auto kwargs = py::dict("name"_a="World", "number"_a=42);
+        auto message = "Hello, {name}! The answer is {number}"_s.format(**kwargs);
+        py::print(message);
+    }
+
+The two approaches can also be combined:
+
+.. code-block:: cpp
+
+    #include <pybind11/embed.h>
+    #include <iostream>
+
+    namespace py = pybind11;
+    using namespace py::literals;
+
+    int main() {
+        py::scoped_interpreter guard{};
+
+        auto locals = py::dict("name"_a="World", "number"_a=42);
+        py::exec(R"(
+            message = "Hello, {name}! The answer is {number}".format(**locals())
+        )", py::globals(), locals);
+
+        auto message = locals["message"].cast<std::string>();
+        std::cout << message;
+    }
+
+Importing modules
+=================
+
+Python modules can be imported using `module::import()`:
+
+.. code-block:: cpp
+
+    py::module sys = py::module::import("sys");
+    py::print(sys.attr("path"));
+
+For convenience, the current working directory is included in ``sys.path`` when
+embedding the interpreter. This makes it easy to import local Python files:
+
+.. code-block:: python
+
+    """calc.py located in the working directory"""
+
+    def add(i, j):
+        return i + j
+
+
+.. code-block:: cpp
+
+    py::module calc = py::module::import("calc");
+    py::object result = calc.attr("add")(1, 2);
+    int n = result.cast<int>();
+    assert(n == 3);
+
+Modules can be reloaded using `module::reload()` if the source is modified e.g.
+by an external process. This can be useful in scenarios where the application
+imports a user defined data processing script which needs to be updated after
+changes by the user. Note that this function does not reload modules recursively.
+
+.. _embedding_modules:
+
+Adding embedded modules
+=======================
+
+Embedded binary modules can be added using the `PYBIND11_EMBEDDED_MODULE` macro.
+Note that the definition must be placed at global scope. They can be imported
+like any other module.
+
+.. code-block:: cpp
+
+    #include <pybind11/embed.h>
+    namespace py = pybind11;
+
+    PYBIND11_EMBEDDED_MODULE(fast_calc, m) {
+        // `m` is a `py::module` which is used to bind functions and classes
+        m.def("add", [](int i, int j) {
+            return i + j;
+        });
+    }
+
+    int main() {
+        py::scoped_interpreter guard{};
+
+        auto fast_calc = py::module::import("fast_calc");
+        auto result = fast_calc.attr("add")(1, 2).cast<int>();
+        assert(result == 3);
+    }
+
+Unlike extension modules where only a single binary module can be created, on
+the embedded side an unlimited number of modules can be added using multiple
+`PYBIND11_EMBEDDED_MODULE` definitions (as long as they have unique names).
+
+These modules are added to Python's list of builtins, so they can also be
+imported in pure Python files loaded by the interpreter. Everything interacts
+naturally:
+
+.. code-block:: python
+
+    """py_module.py located in the working directory"""
+    import cpp_module
+
+    a = cpp_module.a
+    b = a + 1
+
+
+.. code-block:: cpp
+
+    #include <pybind11/embed.h>
+    namespace py = pybind11;
+
+    PYBIND11_EMBEDDED_MODULE(cpp_module, m) {
+        m.attr("a") = 1;
+    }
+
+    int main() {
+        py::scoped_interpreter guard{};
+
+        auto py_module = py::module::import("py_module");
+
+        auto locals = py::dict("fmt"_a="{} + {} = {}", **py_module.attr("__dict__"));
+        assert(locals["a"].cast<int>() == 1);
+        assert(locals["b"].cast<int>() == 2);
+
+        py::exec(R"(
+            c = a + b
+            message = fmt.format(a, b, c)
+        )", py::globals(), locals);
+
+        assert(locals["c"].cast<int>() == 3);
+        assert(locals["message"].cast<std::string>() == "1 + 2 = 3");
+    }
+
+
+Interpreter lifetime
+====================
+
+The Python interpreter shuts down when `scoped_interpreter` is destroyed. After
+this, creating a new instance will restart the interpreter. Alternatively, the
+`initialize_interpreter` / `finalize_interpreter` pair of functions can be used
+to directly set the state at any time.
+
+Modules created with pybind11 can be safely re-initialized after the interpreter
+has been restarted. However, this may not apply to third-party extension modules.
+The issue is that Python itself cannot completely unload extension modules and
+there are several caveats with regard to interpreter restarting. In short, not
+all memory may be freed, either due to Python reference cycles or user-created
+global data. All the details can be found in the CPython documentation.
+
+.. warning::
+
+    Creating two concurrent `scoped_interpreter` guards is a fatal error. So is
+    calling `initialize_interpreter` for a second time after the interpreter
+    has already been initialized.
+
+    Do not use the raw CPython API functions ``Py_Initialize`` and
+    ``Py_Finalize`` as these do not properly handle the lifetime of
+    pybind11's internal data.
+
+
+Sub-interpreter support
+=======================
+
+Creating multiple copies of `scoped_interpreter` is not possible because it
+represents the main Python interpreter. Sub-interpreters are something different
+and they do permit the existence of multiple interpreters. This is an advanced
+feature of the CPython API and should be handled with care. pybind11 does not
+currently offer a C++ interface for sub-interpreters, so refer to the CPython
+documentation for all the details regarding this feature.
+
+We'll just mention a couple of caveats regarding sub-interpreter support in pybind11:
+
+ 1. Sub-interpreters will not receive independent copies of embedded modules.
+    Instead, these are shared and modifications in one interpreter may be
+    reflected in another.
+
+ 2. Managing multiple threads, multiple interpreters and the GIL can be
+    challenging and there are several caveats here, even within the pure
+    CPython API (please refer to the Python docs for details). As for
+    pybind11, keep in mind that `gil_scoped_release` and `gil_scoped_acquire`
+    do not take sub-interpreters into account.
diff --git a/pybind11/docs/advanced/functions.rst b/pybind11/docs/advanced/functions.rst
index f291e8222..c7892b5d3 100644
--- a/pybind11/docs/advanced/functions.rst
+++ b/pybind11/docs/advanced/functions.rst
@@ -6,6 +6,8 @@ with the basics of binding functions and classes, as explained in :doc:`/basics`
 and :doc:`/classes`. The following guide is applicable to both free and member
 functions, i.e. *methods* in Python.
 
+.. _return_value_policies:
+
 Return value policies
 =====================
 
@@ -14,7 +16,7 @@ lifetime of objects managed by them. This can lead to issues when creating
 bindings for functions that return a non-trivial type. Just by looking at the
 type information, it is not clear whether Python should take charge of the
 returned value and eventually free its resources, or if this is handled on the
-C++ side. For this reason, pybind11 provides a several `return value policy`
+C++ side. For this reason, pybind11 provides several *return value policy*
 annotations that can be passed to the :func:`module::def` and
 :func:`class_::def` functions. The default policy is
 :enum:`return_value_policy::automatic`.
@@ -24,11 +26,11 @@ Just to illustrate what can go wrong, consider the following simple example:
 
 .. code-block:: cpp
 
-    /* Function declaration */ 
+    /* Function declaration */
     Data *get_data() { return _data; /* (pointer to a static data structure) */ }
     ...
 
-    /* Binding code */ 
+    /* Binding code */
     m.def("get_data", &get_data); // <-- KABOOM, will cause crash when called from Python
 
 What's going on here? When ``get_data()`` is called from Python, the return
@@ -44,7 +46,7 @@ silent data corruption.
 
 In the above example, the policy :enum:`return_value_policy::reference` should have
 been specified so that the global data instance is only *referenced* without any
-implied transfer of ownership, i.e.: 
+implied transfer of ownership, i.e.:
 
 .. code-block:: cpp
 
@@ -88,11 +90,12 @@ The following table provides an overview of available policies:
 |                                                  | return value is referenced by Python. This is the default policy for       |
 |                                                  | property getters created via ``def_property``, ``def_readwrite``, etc.     |
 +--------------------------------------------------+----------------------------------------------------------------------------+
-| :enum:`return_value_policy::automatic`           | This is the default return value policy, which falls back to the policy    |
+| :enum:`return_value_policy::automatic`           | **Default policy.** This policy falls back to the policy                   |
 |                                                  | :enum:`return_value_policy::take_ownership` when the return value is a     |
-|                                                  | pointer. Otherwise, it uses :enum:`return_value::move` or                  |
-|                                                  | :enum:`return_value::copy` for rvalue and lvalue references, respectively. |
-|                                                  | See above for a description of what all of these different policies do.    |
+|                                                  | pointer. Otherwise, it uses :enum:`return_value_policy::move` or           |
+|                                                  | :enum:`return_value_policy::copy` for rvalue and lvalue references,        |
+|                                                  | respectively. See above for a description of what all of these different   |
+|                                                  | policies do.                                                               |
 +--------------------------------------------------+----------------------------------------------------------------------------+
 | :enum:`return_value_policy::automatic_reference` | As above, but use policy :enum:`return_value_policy::reference` when the   |
 |                                                  | return value is a pointer. This is the default conversion policy for       |
@@ -158,19 +161,26 @@ targeted arguments can be passed through the :class:`cpp_function` constructor:
 Additional call policies
 ========================
 
-In addition to the above return value policies, further `call policies` can be
-specified to indicate dependencies between parameters. There is currently just
-one policy named ``keep_alive<Nurse, Patient>``, which indicates that the
-argument with index ``Patient`` should be kept alive at least until the
-argument with index ``Nurse`` is freed by the garbage collector. Argument
+In addition to the above return value policies, further *call policies* can be
+specified to indicate dependencies between parameters or ensure a certain state
+for the function call.
+
+Keep alive
+----------
+
+In general, this policy is required when the C++ object is any kind of container
+and another object is being added to the container. ``keep_alive<Nurse, Patient>``
+indicates that the argument with index ``Patient`` should be kept alive at least
+until the argument with index ``Nurse`` is freed by the garbage collector. Argument
 indices start at one, while zero refers to the return value. For methods, index
 ``1`` refers to the implicit ``this`` pointer, while regular arguments begin at
 index ``2``. Arbitrarily many call policies can be specified. When a ``Nurse``
 with value ``None`` is detected at runtime, the call policy does nothing.
 
-This feature internally relies on the ability to create a *weak reference* to
-the nurse object, which is permitted by all classes exposed via pybind11. When
-the nurse object does not support weak references, an exception will be thrown.
+When the nurse is not a pybind11-registered type, the implementation internally
+relies on the ability to create a *weak reference* to the nurse object. When
+the nurse object is not a pybind11-registered type and does not support weak
+references, an exception will be thrown.
 
 Consider the following example: here, the binding code for a list append
 operation ties the lifetime of the newly added element to the underlying
@@ -181,16 +191,53 @@ container:
     py::class_<List>(m, "List")
         .def("append", &List::append, py::keep_alive<1, 2>());
 
+For consistency, the argument indexing is identical for constructors. Index
+``1`` still refers to the implicit ``this`` pointer, i.e. the object which is
+being constructed. Index ``0`` refers to the return type which is presumed to
+be ``void`` when a constructor is viewed like a function. The following example
+ties the lifetime of the constructor element to the constructed object:
+
+.. code-block:: cpp
+
+    py::class_<Nurse>(m, "Nurse")
+        .def(py::init<Patient &>(), py::keep_alive<1, 2>());
+
 .. note::
 
     ``keep_alive`` is analogous to the ``with_custodian_and_ward`` (if Nurse,
     Patient != 0) and ``with_custodian_and_ward_postcall`` (if Nurse/Patient ==
     0) policies from Boost.Python.
 
+Call guard
+----------
+
+The ``call_guard<T>`` policy allows any scope guard type ``T`` to be placed
+around the function call. For example, this definition:
+
+.. code-block:: cpp
+
+    m.def("foo", foo, py::call_guard<T>());
+
+is equivalent to the following pseudocode:
+
+.. code-block:: cpp
+
+    m.def("foo", [](args...) {
+        T scope_guard;
+        return foo(args...); // forwarded arguments
+    });
+
+The only requirement is that ``T`` is default-constructible, but otherwise any
+scope guard will work. This is very useful in combination with `gil_scoped_release`.
+See :ref:`gil`.
+
+Multiple guards can also be specified as ``py::call_guard<T1, T2, T3...>``. The
+constructor order is left to right and destruction happens in reverse.
+
 .. seealso::
 
-    The file :file:`tests/test_keep_alive.cpp` contains a complete example
-    that demonstrates using :class:`keep_alive` in more detail.
+    The file :file:`tests/test_call_policies.cpp` contains a complete example
+    that demonstrates using `keep_alive` and `call_guard` in more detail.
 
 .. _python_objects_as_args:
 
@@ -207,8 +254,8 @@ For instance, the following statement iterates over a Python ``dict``:
     void print_dict(py::dict dict) {
         /* Easily interact with Python types */
         for (auto item : dict)
-            std::cout << "key=" << item.first << ", "
-                      << "value=" << item.second << std::endl;
+            std::cout << "key=" << std::string(py::str(item.first)) << ", "
+                      << "value=" << std::string(py::str(item.second)) << std::endl;
     }
 
 It can be exported:
@@ -252,16 +299,21 @@ Such functions can also be created using pybind11:
    m.def("generic", &generic);
 
 The class ``py::args`` derives from ``py::tuple`` and ``py::kwargs`` derives
-from ``py::dict``. Note that the ``kwargs`` argument is invalid if no keyword
-arguments were actually provided. Please refer to the other examples for
-details on how to iterate over these, and on how to cast their entries into
-C++ objects. A demonstration is also available in
-``tests/test_kwargs_and_defaults.cpp``.
+from ``py::dict``.
 
-.. warning::
+You may also use just one or the other, and may combine these with other
+arguments as long as the ``py::args`` and ``py::kwargs`` arguments are the last
+arguments accepted by the function.
+
+Please refer to the other examples for details on how to iterate over these,
+and on how to cast their entries into C++ objects. A demonstration is also
+available in ``tests/test_kwargs_and_defaults.cpp``.
 
-   Unlike Python, pybind11 does not allow combining normal parameters with the
-   ``args`` / ``kwargs`` special parameters.
+.. note::
+
+    When combining \*args or \*\*kwargs with :ref:`keyword_args` you should
+    *not* include ``py::arg`` tags for the ``py::args`` and ``py::kwargs``
+    arguments.
 
 Default arguments revisited
 ===========================
@@ -309,3 +361,138 @@ like so:
 
     py::class_<MyClass>("MyClass")
         .def("myFunction", py::arg("arg") = (SomeType *) nullptr);
+
+.. _nonconverting_arguments:
+
+Non-converting arguments
+========================
+
+Certain argument types may support conversion from one type to another.  Some
+examples of conversions are:
+
+* :ref:`implicit_conversions` declared using ``py::implicitly_convertible<A,B>()``
+* Calling a method accepting a double with an integer argument
+* Calling a function taking a ``std::complex<float>`` argument with a
+  non-complex Python type (for example, with a float).  (Requires the optional
+  ``pybind11/complex.h`` header).
+* Calling a function taking an Eigen matrix reference with a numpy array of the
+  wrong type or of an incompatible data layout.  (Requires the optional
+  ``pybind11/eigen.h`` header).
+
+This behaviour is sometimes undesirable: the binding code may prefer to raise
+an error rather than convert the argument.  This behaviour can be obtained
+through ``py::arg`` by calling the ``.noconvert()`` method of the ``py::arg``
+object, such as:
+
+.. code-block:: cpp
+
+    m.def("floats_only", [](double f) { return 0.5 * f; }, py::arg("f").noconvert());
+    m.def("floats_preferred", [](double f) { return 0.5 * f; }, py::arg("f"));
+
+Attempting to call the second function (the one without ``.noconvert()``) with
+an integer will succeed, but attempting to call the ``.noconvert()`` version
+will fail with a ``TypeError``:
+
+.. code-block:: pycon
+
+    >>> floats_preferred(4)
+    2.0
+    >>> floats_only(4)
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    TypeError: floats_only(): incompatible function arguments. The following argument types are supported:
+        1. (f: float) -> float
+
+    Invoked with: 4
+
+You may, of course, combine this with the :var:`_a` shorthand notation (see
+:ref:`keyword_args`) and/or :ref:`default_args`.  It is also permitted to omit
+the argument name by using the ``py::arg()`` constructor without an argument
+name, i.e. by specifying ``py::arg().noconvert()``.
+
+.. note::
+
+    When specifying ``py::arg`` options it is necessary to provide the same
+    number of options as the bound function has arguments.  Thus if you want to
+    enable no-convert behaviour for just one of several arguments, you will
+    need to specify a ``py::arg()`` annotation for each argument with the
+    no-convert argument modified to ``py::arg().noconvert()``.
+
+.. _none_arguments:
+
+Allowing/Prohibiting None arguments
+===================================
+
+When a C++ type registered with :class:`py::class_` is passed as an argument to
+a function taking the instance as pointer or shared holder (e.g. ``shared_ptr``
+or a custom, copyable holder as described in :ref:`smart_pointers`), pybind
+allows ``None`` to be passed from Python which results in calling the C++
+function with ``nullptr`` (or an empty holder) for the argument.
+
+To explicitly enable or disable this behaviour, use the
+``.none`` method of the :class:`py::arg` object:
+
+.. code-block:: cpp
+
+    py::class_<Dog>(m, "Dog").def(py::init<>());
+    py::class_<Cat>(m, "Cat").def(py::init<>());
+    m.def("bark", [](Dog *dog) -> std::string {
+        if (dog) return "woof!"; /* Called with a Dog instance */
+        else return "(no dog)"; /* Called with None, dog == nullptr */
+    }, py::arg("dog").none(true));
+    m.def("meow", [](Cat *cat) -> std::string {
+        // Can't be called with None argument
+        return "meow";
+    }, py::arg("cat").none(false));
+
+With the above, the Python call ``bark(None)`` will return the string ``"(no
+dog)"``, while attempting to call ``meow(None)`` will raise a ``TypeError``:
+
+.. code-block:: pycon
+
+    >>> from animals import Dog, Cat, bark, meow
+    >>> bark(Dog())
+    'woof!'
+    >>> meow(Cat())
+    'meow'
+    >>> bark(None)
+    '(no dog)'
+    >>> meow(None)
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    TypeError: meow(): incompatible function arguments. The following argument types are supported:
+        1. (cat: animals.Cat) -> str
+
+    Invoked with: None
+
+The default behaviour when the tag is unspecified is to allow ``None``.
+
+Overload resolution order
+=========================
+
+When a function or method with multiple overloads is called from Python,
+pybind11 determines which overload to call in two passes.  The first pass
+attempts to call each overload without allowing argument conversion (as if
+every argument had been specified as ``py::arg().noconvert()`` as described
+above).
+
+If no overload succeeds in the no-conversion first pass, a second pass is
+attempted in which argument conversion is allowed (except where prohibited via
+an explicit ``py::arg().noconvert()`` attribute in the function definition).
+
+If the second pass also fails a ``TypeError`` is raised.
+
+Within each pass, overloads are tried in the order they were registered with
+pybind11.
+
+What this means in practice is that pybind11 will prefer any overload that does
+not require conversion of arguments to an overload that does, but otherwise prefers
+earlier-defined overloads to later-defined ones.
+
+.. note::
+
+    pybind11 does *not* further prioritize based on the number/pattern of
+    overloaded arguments.  That is, pybind11 does not prioritize a function
+    requiring one conversion over one requiring three, but only prioritizes
+    overloads requiring no conversion at all over overloads that require
+    conversion of at least one argument.
diff --git a/pybind11/docs/advanced/misc.rst b/pybind11/docs/advanced/misc.rst
index c13df7bf8..87481ba32 100644
--- a/pybind11/docs/advanced/misc.rst
+++ b/pybind11/docs/advanced/misc.rst
@@ -15,10 +15,12 @@ T2>, myFunc)``. In this case, the preprocessor assumes that the comma indicates
 the beginning of the next parameter. Use a ``typedef`` to bind the template to
 another name and use it in the macro to avoid this problem.
 
+.. _gil:
 
 Global Interpreter Lock (GIL)
 =============================
 
+When calling a C++ function from Python, the GIL is always held.
 The classes :class:`gil_scoped_release` and :class:`gil_scoped_acquire` can be
 used to acquire and release the global interpreter lock in the body of a C++
 function call. In this way, long-running C++ code can be parallelized using
@@ -26,7 +28,7 @@ multiple Python threads. Taking :ref:`overriding_virtuals` as an example, this
 could be realized as follows (important changes highlighted):
 
 .. code-block:: cpp
-    :emphasize-lines: 8,9,33,34
+    :emphasize-lines: 8,9,31,32
 
     class PyAnimal : public Animal {
     public:
@@ -47,9 +49,7 @@ could be realized as follows (important changes highlighted):
         }
     };
 
-    PYBIND11_PLUGIN(example) {
-        py::module m("example", "pybind11 example plugin");
-
+    PYBIND11_MODULE(example, m) {
         py::class_<Animal, PyAnimal> animal(m, "Animal");
         animal
             .def(py::init<>())
@@ -63,10 +63,15 @@ could be realized as follows (important changes highlighted):
             py::gil_scoped_release release;
             return call_go(animal);
         });
-
-        return m.ptr();
     }
 
+The ``call_go`` wrapper can also be simplified using the `call_guard` policy
+(see :ref:`call_policies`) which yields the same result:
+
+.. code-block:: cpp
+
+    m.def("call_go", &call_go, py::call_guard<py::gil_scoped_release>());
+
 
 Binding sequence data types, iterators, the slicing protocol, etc.
 ==================================================================
@@ -130,22 +135,16 @@ has been executed:
 
 Naturally, both methods will fail when there are cyclic dependencies.
 
-Note that compiling code which has its default symbol visibility set to
-*hidden* (e.g. via the command line flag ``-fvisibility=hidden`` on GCC/Clang) can interfere with the
-ability to access types defined in another extension module. Workarounds
-include changing the global symbol visibility (not recommended, because it will
-lead unnecessarily large binaries) or manually exporting types that are
-accessed by multiple extension modules:
+Note that pybind11 code compiled with hidden-by-default symbol visibility (e.g.
+via the command line flag ``-fvisibility=hidden`` on GCC/Clang), which is
+required for proper pybind11 functionality, can interfere with the ability to
+access types defined in another extension module.  Working around this requires
+manually exporting types that are accessed by multiple extension modules;
+pybind11 provides a macro to do just this:
 
 .. code-block:: cpp
 
-    #ifdef _WIN32
-    #  define EXPORT_TYPE __declspec(dllexport)
-    #else
-    #  define EXPORT_TYPE __attribute__ ((visibility("default")))
-    #endif
-
-    class EXPORT_TYPE Dog : public Animal {
+    class PYBIND11_EXPORT Dog : public Animal {
         ...
     };
 
@@ -169,6 +168,54 @@ would be then able to access the data behind the same pointer.
 
 .. [#f6] https://docs.python.org/3/extending/extending.html#using-capsules
 
+Module Destructors
+==================
+
+pybind11 does not provide an explicit mechanism to invoke cleanup code at
+module destruction time. In rare cases where such functionality is required, it
+is possible to emulate it using Python capsules or weak references with a
+destruction callback.
+
+.. code-block:: cpp
+
+    auto cleanup_callback = []() {
+        // perform cleanup here -- this function is called with the GIL held
+    };
+
+    m.add_object("_cleanup", py::capsule(cleanup_callback));
+
+This approach has the potential downside that instances of classes exposed
+within the module may still be alive when the cleanup callback is invoked
+(whether this is acceptable will generally depend on the application).
+
+Alternatively, the capsule may also be stashed within a type object, which
+ensures that it is not called before all instances of that type have been
+collected:
+
+.. code-block:: cpp
+
+    auto cleanup_callback = []() { /* ... */ };
+    m.attr("BaseClass").attr("_cleanup") = py::capsule(cleanup_callback);
+
+Both approaches also expose a potentially dangerous ``_cleanup`` attribute in
+Python, which may be undesirable from an API standpoint (a premature explicit
+call from Python might lead to undefined behavior). Yet another approach that 
+avoids this issue involves a weak reference with a cleanup callback:
+
+.. code-block:: cpp
+
+    // Register a callback function that is invoked when the BaseClass object is collected
+    py::cpp_function cleanup_callback(
+        [](py::handle weakref) {
+            // perform cleanup here -- this function is called with the GIL held
+
+            weakref.dec_ref(); // release weak reference
+        }
+    );
+
+    // Create a weak reference with a cleanup callback and initially leak it
+    (void) py::weakref(m.attr("BaseClass"), cleanup_callback).release();
+
 
 Generating documentation using Sphinx
 =====================================
@@ -210,15 +257,11 @@ The class ``options`` allows you to selectively suppress auto-generated signatur
 
 .. code-block:: cpp
 
-    PYBIND11_PLUGIN(example) {
-        py::module m("example", "pybind11 example plugin");
-
+    PYBIND11_MODULE(example, m) {
         py::options options;
         options.disable_function_signatures();
-        
+
         m.def("add", [](int a, int b) { return a + b; }, "A function which adds two numbers");
-        
-        return m.ptr();
     }
 
 Note that changes to the settings affect only function bindings created during the 
diff --git a/pybind11/docs/advanced/pycpp/numpy.rst b/pybind11/docs/advanced/pycpp/numpy.rst
index 111ff0e3c..98b0c25b9 100644
--- a/pybind11/docs/advanced/pycpp/numpy.rst
+++ b/pybind11/docs/advanced/pycpp/numpy.rst
@@ -57,11 +57,11 @@ specification.
 
     struct buffer_info {
         void *ptr;
-        size_t itemsize;
+        ssize_t itemsize;
         std::string format;
-        int ndim;
-        std::vector<size_t> shape;
-        std::vector<size_t> strides;
+        ssize_t ndim;
+        std::vector<ssize_t> shape;
+        std::vector<ssize_t> strides;
     };
 
 To create a C++ function that can take a Python buffer object as an argument,
@@ -95,11 +95,11 @@ buffer objects (e.g. a NumPy matrix).
                 throw std::runtime_error("Incompatible buffer dimension!");
 
             auto strides = Strides(
-                info.strides[rowMajor ? 0 : 1] / sizeof(Scalar),
-                info.strides[rowMajor ? 1 : 0] / sizeof(Scalar));
+                info.strides[rowMajor ? 0 : 1] / (py::ssize_t)sizeof(Scalar),
+                info.strides[rowMajor ? 1 : 0] / (py::ssize_t)sizeof(Scalar));
 
             auto map = Eigen::Map<Matrix, 0, Strides>(
-                static_cat<Scalar *>(info.ptr), info.shape[0], info.shape[1], strides);
+                static_cast<Scalar *>(info.ptr), info.shape[0], info.shape[1], strides);
 
             new (&m) Matrix(map);
         });
@@ -111,18 +111,14 @@ as follows:
 
     .def_buffer([](Matrix &m) -> py::buffer_info {
         return py::buffer_info(
-            m.data(),                /* Pointer to buffer */
-            sizeof(Scalar),          /* Size of one scalar */
-            /* Python struct-style format descriptor */
-            py::format_descriptor<Scalar>::format(),
-            /* Number of dimensions */
-            2,
-            /* Buffer dimensions */
-            { (size_t) m.rows(),
-              (size_t) m.cols() },
-            /* Strides (in bytes) for each index */
+            m.data(),                                /* Pointer to buffer */
+            sizeof(Scalar),                          /* Size of one scalar */
+            py::format_descriptor<Scalar>::format(), /* Python struct-style format descriptor */
+            2,                                       /* Number of dimensions */
+            { m.rows(), m.cols() },                  /* Buffer dimensions */
             { sizeof(Scalar) * (rowMajor ? m.cols() : 1),
               sizeof(Scalar) * (rowMajor ? 1 : m.rows()) }
+                                                     /* Strides (in bytes) for each index */
         );
      })
 
@@ -155,7 +151,7 @@ NumPy array containing double precision values.
 When it is invoked with a different type (e.g. an integer or a list of
 integers), the binding code will attempt to cast the input into a NumPy array
 of the requested type. Note that this feature requires the
-:file:``pybind11/numpy.h`` header to be included.
+:file:`pybind11/numpy.h` header to be included.
 
 Data in NumPy arrays is not guaranteed to packed in a dense manner;
 furthermore, entries can be separated by arbitrary column and row strides.
@@ -176,9 +172,10 @@ function overload.
 Structured types
 ================
 
-In order for ``py::array_t`` to work with structured (record) types, we first need
-to register the memory layout of the type. This can be done via ``PYBIND11_NUMPY_DTYPE``
-macro which expects the type followed by field names:
+In order for ``py::array_t`` to work with structured (record) types, we first
+need to register the memory layout of the type. This can be done via
+``PYBIND11_NUMPY_DTYPE`` macro, called in the plugin definition code, which
+expects the type followed by field names:
 
 .. code-block:: cpp
 
@@ -192,10 +189,21 @@ macro which expects the type followed by field names:
         A a;
     };
 
-    PYBIND11_NUMPY_DTYPE(A, x, y);
-    PYBIND11_NUMPY_DTYPE(B, z, a);
+    // ...
+    PYBIND11_MODULE(test, m) {
+        // ...
 
-    /* now both A and B can be used as template arguments to py::array_t */
+        PYBIND11_NUMPY_DTYPE(A, x, y);
+        PYBIND11_NUMPY_DTYPE(B, z, a);
+        /* now both A and B can be used as template arguments to py::array_t */
+    }
+
+The structure should consist of fundamental arithmetic types, ``std::complex``,
+previously registered substructures, and arrays of any of the above. Both C++
+arrays and ``std::array`` are supported. While there is a static assertion to
+prevent many types of unsupported structures, it is still the user's
+responsibility to use only "plain" structures that can be safely manipulated as
+raw memory without violating invariants.
 
 Vectorizing functions
 =====================
@@ -231,27 +239,13 @@ by the compiler. The result is returned as a NumPy array of type
 The scalar argument ``z`` is transparently replicated 4 times.  The input
 arrays ``x`` and ``y`` are automatically converted into the right types (they
 are of type  ``numpy.dtype.int64`` but need to be ``numpy.dtype.int32`` and
-``numpy.dtype.float32``, respectively)
-
-Sometimes we might want to explicitly exclude an argument from the vectorization
-because it makes little sense to wrap it in a NumPy array. For instance,
-suppose the function signature was
-
-.. code-block:: cpp
-
-    double my_func(int x, float y, my_custom_type *z);
-
-This can be done with a stateful Lambda closure:
+``numpy.dtype.float32``, respectively).
 
-.. code-block:: cpp
+.. note::
 
-    // Vectorize a lambda function with a capture object (e.g. to exclude some arguments from the vectorization)
-    m.def("vectorized_func",
-        [](py::array_t<int> x, py::array_t<float> y, my_custom_type *z) {
-            auto stateful_closure = [z](int x, float y) { return my_func(x, y, z); };
-            return py::vectorize(stateful_closure)(x, y);
-        }
-    );
+    Only arithmetic, complex, and POD types passed by value or by ``const &``
+    reference are vectorized; all other arguments are passed through as-is.
+    Functions taking rvalue reference arguments cannot be vectorized.
 
 In cases where the computation is too complicated to be reduced to
 ``vectorize``, it will be necessary to create and access the buffer contents
@@ -290,13 +284,83 @@ simply using ``vectorize``).
         return result;
     }
 
-    PYBIND11_PLUGIN(test) {
-        py::module m("test");
+    PYBIND11_MODULE(test, m) {
         m.def("add_arrays", &add_arrays, "Add two NumPy arrays");
-        return m.ptr();
     }
 
 .. seealso::
 
     The file :file:`tests/test_numpy_vectorize.cpp` contains a complete
     example that demonstrates using :func:`vectorize` in more detail.
+
+Direct access
+=============
+
+For performance reasons, particularly when dealing with very large arrays, it
+is often desirable to directly access array elements without internal checking
+of dimensions and bounds on every access when indices are known to be already
+valid.  To avoid such checks, the ``array`` class and ``array_t<T>`` template
+class offer an unchecked proxy object that can be used for this unchecked
+access through the ``unchecked<N>`` and ``mutable_unchecked<N>`` methods,
+where ``N`` gives the required dimensionality of the array:
+
+.. code-block:: cpp
+
+    m.def("sum_3d", [](py::array_t<double> x) {
+        auto r = x.unchecked<3>(); // x must have ndim = 3; can be non-writeable
+        double sum = 0;
+        for (ssize_t i = 0; i < r.shape(0); i++)
+            for (ssize_t j = 0; j < r.shape(1); j++)
+                for (ssize_t k = 0; k < r.shape(2); k++)
+                    sum += r(i, j, k);
+        return sum;
+    });
+    m.def("increment_3d", [](py::array_t<double> x) {
+        auto r = x.mutable_unchecked<3>(); // Will throw if ndim != 3 or flags.writeable is false
+        for (ssize_t i = 0; i < r.shape(0); i++)
+            for (ssize_t j = 0; j < r.shape(1); j++)
+                for (ssize_t k = 0; k < r.shape(2); k++)
+                    r(i, j, k) += 1.0;
+    }, py::arg().noconvert());
+
+To obtain the proxy from an ``array`` object, you must specify both the data
+type and number of dimensions as template arguments, such as ``auto r =
+myarray.mutable_unchecked<float, 2>()``.
+
+If the number of dimensions is not known at compile time, you can omit the
+dimensions template parameter (i.e. calling ``arr_t.unchecked()`` or
+``arr.unchecked<T>()``).  This will give you a proxy object that works in the
+same way, but results in less optimizable code and thus a small efficiency
+loss in tight loops.
+
+Note that the returned proxy object directly references the array's data, and
+only reads its shape, strides, and writeable flag when constructed.  You must
+take care to ensure that the referenced array is not destroyed or reshaped for
+the duration of the returned object, typically by limiting the scope of the
+returned instance.
+
+The returned proxy object supports some of the same methods as ``py::array`` so
+that it can be used as a drop-in replacement for some existing, index-checked
+uses of ``py::array``:
+
+- ``r.ndim()`` returns the number of dimensions
+
+- ``r.data(1, 2, ...)`` and ``r.mutable_data(1, 2, ...)`` return a pointer to
+  the ``const T`` or ``T`` data, respectively, at the given indices.  The
+  latter is only available to proxies obtained via ``a.mutable_unchecked()``.
+
+- ``itemsize()`` returns the size of an item in bytes, i.e. ``sizeof(T)``.
+
+- ``ndim()`` returns the number of dimensions.
+
+- ``shape(n)`` returns the size of dimension ``n``
+
+- ``size()`` returns the total number of elements (i.e. the product of the shapes).
+
+- ``nbytes()`` returns the number of bytes used by the referenced elements
+  (i.e. ``itemsize()`` times ``size()``).
+
+.. seealso::
+
+    The file :file:`tests/test_numpy_array.cpp` contains additional examples
+    demonstrating the use of this feature.
diff --git a/pybind11/docs/advanced/pycpp/object.rst b/pybind11/docs/advanced/pycpp/object.rst
index 8fc165d16..117131edc 100644
--- a/pybind11/docs/advanced/pycpp/object.rst
+++ b/pybind11/docs/advanced/pycpp/object.rst
@@ -33,10 +33,50 @@ The reverse direction uses the following syntax:
 
 When conversion fails, both directions throw the exception :class:`cast_error`.
 
+.. _python_libs:
+
+Accessing Python libraries from C++
+===================================
+
+It is also possible to import objects defined in the Python standard
+library or available in the current Python environment (``sys.path``) and work
+with these in C++.
+
+This example obtains a reference to the Python ``Decimal`` class.
+
+.. code-block:: cpp
+
+    // Equivalent to "from decimal import Decimal"
+    py::object Decimal = py::module::import("decimal").attr("Decimal");
+
+.. code-block:: cpp
+
+    // Try to import scipy
+    py::object scipy = py::module::import("scipy");
+    return scipy.attr("__version__");
+
+.. _calling_python_functions:
+
 Calling Python functions
 ========================
 
-It is also possible to call python functions via ``operator()``.
+It is also possible to call Python classes, functions and methods 
+via ``operator()``.
+
+.. code-block:: cpp
+
+    // Construct a Python object of class Decimal
+    py::object pi = Decimal("3.14159");
+
+.. code-block:: cpp
+
+    // Use Python to make our directories
+    py::object os = py::module::import("os");
+    py::object makedirs = os.attr("makedirs");
+    makedirs("/tmp/path/to/somewhere");
+
+One can convert the result obtained from Python to a pure C++ version 
+if a ``py::class_`` or type conversion is defined.
 
 .. code-block:: cpp
 
@@ -44,6 +84,37 @@ It is also possible to call python functions via ``operator()``.
     py::object result_py = f(1234, "hello", some_instance);
     MyClass &result = result_py.cast<MyClass>();
 
+.. _calling_python_methods:
+
+Calling Python methods
+========================
+
+To call an object's method, one can again use ``.attr`` to obtain access to the
+Python method.
+
+.. code-block:: cpp
+
+    // Calculate e^π in decimal
+    py::object exp_pi = pi.attr("exp")();
+    py::print(py::str(exp_pi));
+
+In the example above ``pi.attr("exp")`` is a *bound method*: it will always call
+the method for that same instance of the class. Alternately one can create an 
+*unbound method* via the Python class (instead of instance) and pass the ``self`` 
+object explicitly, followed by other arguments.
+
+.. code-block:: cpp
+
+    py::object decimal_exp = Decimal.attr("exp");
+
+    // Compute the e^n for n=0..4
+    for (int n = 0; n < 5; n++) {
+        py::print(decimal_exp(Decimal(n)));
+    }
+
+Keyword arguments
+=================
+
 Keyword arguments are also supported. In Python, there is the usual call syntax:
 
 .. code-block:: python
@@ -57,9 +128,12 @@ In C++, the same call can be made using:
 
 .. code-block:: cpp
 
-    using pybind11::literals; // to bring in the `_a` literal
+    using namespace pybind11::literals; // to bring in the `_a` literal
     f(1234, "say"_a="hello", "to"_a=some_instance); // keyword call in C++
 
+Unpacking arguments
+===================
+
 Unpacking of ``*args`` and ``**kwargs`` is also possible and can be mixed with
 other arguments:
 
@@ -88,7 +162,7 @@ Generalized unpacking according to PEP448_ is also supported:
 
 .. seealso::
 
-    The file :file:`tests/test_python_types.cpp` contains a complete
+    The file :file:`tests/test_pytypes.cpp` contains a complete
     example that demonstrates passing native Python types in more detail. The
     file :file:`tests/test_callbacks.cpp` presents a few examples of calling
     Python functions from C++, including keywords arguments and unpacking.
diff --git a/pybind11/docs/advanced/pycpp/utilities.rst b/pybind11/docs/advanced/pycpp/utilities.rst
index ba0dbef88..369e7c94d 100644
--- a/pybind11/docs/advanced/pycpp/utilities.rst
+++ b/pybind11/docs/advanced/pycpp/utilities.rst
@@ -21,19 +21,81 @@ expected in Python:
     auto args = py::make_tuple("unpacked", true);
     py::print("->", *args, "end"_a="<-"); // -> unpacked True <-
 
+.. _ostream_redirect:
+
+Capturing standard output from ostream
+======================================
+
+Often, a library will use the streams ``std::cout`` and ``std::cerr`` to print,
+but this does not play well with Python's standard ``sys.stdout`` and ``sys.stderr``
+redirection. Replacing a library's printing with `py::print <print>` may not
+be feasible. This can be fixed using a guard around the library function that
+redirects output to the corresponding Python streams:
+
+.. code-block:: cpp
+
+    #include <pybind11/iostream.h>
+
+    ...
+
+    // Add a scoped redirect for your noisy code
+    m.def("noisy_func", []() {
+        py::scoped_ostream_redirect stream(
+            std::cout,                               // std::ostream&
+            py::module::import("sys").attr("stdout") // Python output
+        );
+        call_noisy_func();
+    });
+
+This method respects flushes on the output streams and will flush if needed
+when the scoped guard is destroyed. This allows the output to be redirected in
+real time, such as to a Jupyter notebook. The two arguments, the C++ stream and
+the Python output, are optional, and default to standard output if not given. An
+extra type, `py::scoped_estream_redirect <scoped_estream_redirect>`, is identical
+except for defaulting to ``std::cerr`` and ``sys.stderr``; this can be useful with
+`py::call_guard`, which allows multiple items, but uses the default constructor:
+
+.. code-block:: cpp
+
+    // Alternative: Call single function using call guard
+    m.def("noisy_func", &call_noisy_function,
+          py::call_guard<py::scoped_ostream_redirect,
+                         py::scoped_estream_redirect>());
+
+The redirection can also be done in Python with the addition of a context
+manager, using the `py::add_ostream_redirect() <add_ostream_redirect>` function:
+
+.. code-block:: cpp
+
+    py::add_ostream_redirect(m, "ostream_redirect");
+
+The name in Python defaults to ``ostream_redirect`` if no name is passed.  This
+creates the following context manager in Python:
+
+.. code-block:: python
+
+    with ostream_redirect(stdout=True, stderr=True):
+        noisy_function()
+
+It defaults to redirecting both streams, though you can use the keyword
+arguments to disable one of the streams if needed.
+
+.. note::
+
+    The above methods will not redirect C-level output to file descriptors, such
+    as ``fprintf``. For those cases, you'll need to redirect the file
+    descriptors either directly in C or with Python's ``os.dup2`` function
+    in an operating-system dependent way.
+
+.. _eval:
+
 Evaluating Python expressions from strings and files
 ====================================================
 
-pybind11 provides the :func:`eval` and :func:`eval_file` functions to evaluate
+pybind11 provides the `eval`, `exec` and `eval_file` functions to evaluate
 Python expressions and statements. The following example illustrates how they
 can be used.
 
-Both functions accept a template parameter that describes how the argument
-should be interpreted. Possible choices include ``eval_expr`` (isolated
-expression), ``eval_single_statement`` (a single statement, return value is
-always ``none``), and ``eval_statements`` (sequence of statements, return value
-is always ``none``).
-
 .. code-block:: cpp
 
     // At beginning of file
@@ -48,10 +110,35 @@ is always ``none``).
     int result = py::eval("my_variable + 10", scope).cast<int>();
 
     // Evaluate a sequence of statements
-    py::eval<py::eval_statements>(
+    py::exec(
         "print('Hello')\n"
         "print('world!');",
         scope);
 
     // Evaluate the statements in an separate Python file on disk
     py::eval_file("script.py", scope);
+
+C++11 raw string literals are also supported and quite handy for this purpose.
+The only requirement is that the first statement must be on a new line following
+the raw string delimiter ``R"(``, ensuring all lines have common leading indent:
+
+.. code-block:: cpp
+
+    py::exec(R"(
+        x = get_answer()
+        if x == 42:
+            print('Hello World!')
+        else:
+            print('Bye!')
+        )", scope
+    );
+
+.. note::
+
+    `eval` and `eval_file` accept a template parameter that describes how the
+    string/file should be interpreted. Possible choices include ``eval_expr``
+    (isolated expression), ``eval_single_statement`` (a single statement, return
+    value is always ``none``), and ``eval_statements`` (sequence of statements,
+    return value is always ``none``). `eval` defaults to  ``eval_expr``,
+    `eval_file` defaults to ``eval_statements`` and `exec` is just a shortcut
+    for ``eval<eval_statements>``.
diff --git a/pybind11/docs/advanced/smart_ptrs.rst b/pybind11/docs/advanced/smart_ptrs.rst
index 3c982136c..da57748ca 100644
--- a/pybind11/docs/advanced/smart_ptrs.rst
+++ b/pybind11/docs/advanced/smart_ptrs.rst
@@ -63,16 +63,12 @@ code?
         std::shared_ptr<Child> child;
     };
 
-    PYBIND11_PLUGIN(example) {
-        py::module m("example");
-
+    PYBIND11_MODULE(example, m) {
         py::class_<Child, std::shared_ptr<Child>>(m, "Child");
 
         py::class_<Parent, std::shared_ptr<Parent>>(m, "Parent")
            .def(py::init<>())
            .def("get_child", &Parent::get_child);
-
-        return m.ptr();
     }
 
 The following Python code will cause undefined behavior (and likely a
@@ -149,6 +145,27 @@ situation where ``true`` should be passed is when the ``T`` instances use
 
 Please take a look at the :ref:`macro_notes` before using this feature.
 
+By default, pybind11 assumes that your custom smart pointer has a standard
+interface, i.e. provides a ``.get()`` member function to access the underlying
+raw pointer. If this is not the case, pybind11's ``holder_helper`` must be
+specialized:
+
+.. code-block:: cpp
+
+    // Always needed for custom holder types
+    PYBIND11_DECLARE_HOLDER_TYPE(T, SmartPtr<T>);
+
+    // Only needed if the type's `.get()` goes by another name
+    namespace pybind11 { namespace detail {
+        template <typename T>
+        struct holder_helper<SmartPtr<T>> { // <-- specialization
+            static const T *get(const SmartPtr<T> &p) { return p.getPointer(); }
+        };
+    }}
+
+The above specialization informs pybind11 that the custom ``SmartPtr`` class
+provides ``.get()`` functionality via ``.getPointer()``.
+
 .. seealso::
 
     The file :file:`tests/test_smart_ptr.cpp` contains a complete example
diff --git a/pybind11/docs/basics.rst b/pybind11/docs/basics.rst
index 33c60049d..447250ed9 100644
--- a/pybind11/docs/basics.rst
+++ b/pybind11/docs/basics.rst
@@ -73,6 +73,8 @@ For brevity, all code examples assume that the following two lines are present:
 
 Some features may require additional headers, but those will be specified as needed.
 
+.. _simple_example:
+
 Creating bindings for a simple function
 =======================================
 
@@ -96,25 +98,21 @@ a file named :file:`example.cpp` with the following contents:
         return i + j;
     }
 
-    namespace py = pybind11;
-
-    PYBIND11_PLUGIN(example) {
-        py::module m("example", "pybind11 example plugin");
+    PYBIND11_MODULE(example, m) {
+        m.doc() = "pybind11 example plugin"; // optional module docstring
 
         m.def("add", &add, "A function which adds two numbers");
-
-        return m.ptr();
     }
 
 .. [#f1] In practice, implementation and binding code will generally be located
          in separate files.
 
-The :func:`PYBIND11_PLUGIN` macro creates a function that will be called when an
-``import`` statement is issued from within Python. The next line creates a
-module named ``example`` (with the supplied docstring). The method
-:func:`module::def` generates binding code that exposes the
-``add()`` function to Python. The last line returns the internal Python object
-associated with ``m`` to the Python interpreter.
+The :func:`PYBIND11_MODULE` macro creates a function that will be called when an
+``import`` statement is issued from within Python. The module name (``example``)
+is given as the first macro argument (it should not be in quotes). The second
+argument (``m``) defines a variable of type :class:`py::module <module>` which
+is the main interface for creating bindings. The method :func:`module::def`
+generates binding code that exposes the ``add()`` function to Python.
 
 .. note::
 
@@ -124,23 +122,31 @@ associated with ``m`` to the Python interpreter.
     approach and the used syntax are borrowed from Boost.Python, though the
     underlying implementation is very different.
 
-pybind11 is a header-only-library, hence it is not necessary to link against
-any special libraries (other than Python itself). On Windows, use the CMake
-build file discussed in section :ref:`cmake`. On Linux and Mac OS, the above
-example can be compiled using the following command
+pybind11 is a header-only library, hence it is not necessary to link against
+any special libraries and there are no intermediate (magic) translation steps.
+On Linux, the above example can be compiled using the following command:
 
 .. code-block:: bash
 
-    $ c++ -O3 -shared -std=c++11 -I <path-to-pybind11>/include `python-config --cflags --ldflags` example.cpp -o example.so
+    $ c++ -O3 -Wall -shared -std=c++11 -fPIC `python3 -m pybind11 --includes` example.cpp -o example`python3-config --extension-suffix`
+
+For more details on the required compiler flags on Linux and MacOS, see
+:ref:`building_manually`. For complete cross-platform compilation instructions,
+refer to the :ref:`compiling` page.
+
+The `python_example`_ and `cmake_example`_ repositories are also a good place
+to start. They are both complete project examples with cross-platform build
+systems. The only difference between the two is that `python_example`_ uses
+Python's ``setuptools`` to build the module, while `cmake_example`_ uses CMake
+(which may be preferable for existing C++ projects).
 
-In general, it is advisable to include several additional build parameters
-that can considerably reduce the size of the created binary. Refer to section
-:ref:`cmake` for a detailed example of a suitable cross-platform CMake-based
-build system.
+.. _python_example: https://github.com/pybind/python_example
+.. _cmake_example: https://github.com/pybind/cmake_example
 
-Assuming that the created file :file:`example.so` (:file:`example.pyd` on Windows)
-is located in the current directory, the following interactive Python session
-shows how to load and execute the example.
+Building the above C++ code will produce a binary module file that can be
+imported to Python. Assuming that the compiled module is located in the
+current directory, the following interactive Python session shows how to
+load and execute the example:
 
 .. code-block:: pycon
 
@@ -261,12 +267,10 @@ converted using the function ``py::cast``.
 
 .. code-block:: cpp
 
-    PYBIND11_PLUGIN(example) {
-        py::module m("example", "pybind11 example plugin");
+    PYBIND11_MODULE(example, m) {
         m.attr("the_answer") = 42;
         py::object world = py::cast("World");
         m.attr("what") = world;
-        return m.ptr();
     }
 
 These are then accessible from Python:
diff --git a/pybind11/docs/benchmark.py b/pybind11/docs/benchmark.py
index 6f02e92ff..6dc0604ea 100644
--- a/pybind11/docs/benchmark.py
+++ b/pybind11/docs/benchmark.py
@@ -33,10 +33,8 @@ def generate_dummy_code_pybind11(nclasses=10):
     result = "#include <pybind11/pybind11.h>\n\n"
     result += "namespace py = pybind11;\n\n"
     result += decl + '\n'
-    result += "PYBIND11_PLUGIN(example) {\n"
-    result += "    py::module m(\"example\");"
+    result += "PYBIND11_MODULE(example, m) {\n"
     result += bindings
-    result += "    return m.ptr();"
     result += "}"
     return result
 
diff --git a/pybind11/docs/benchmark.rst b/pybind11/docs/benchmark.rst
index 8babaa319..59d533df9 100644
--- a/pybind11/docs/benchmark.rst
+++ b/pybind11/docs/benchmark.rst
@@ -31,8 +31,7 @@ Here is an example of the binding code for one class:
     };
     ...
 
-    PYBIND11_PLUGIN(example) {
-        py::module m("example");
+    PYBIND11_MODULE(example, m) {
         ...
         py::class_<cl034>(m, "cl034")
             .def("fn_000", &cl034::fn_000)
@@ -40,7 +39,6 @@ Here is an example of the binding code for one class:
             .def("fn_002", &cl034::fn_002)
             .def("fn_003", &cl034::fn_003)
         ...
-        return m.ptr();
     }
 
 The Boost.Python version looks almost identical except that a return value
diff --git a/pybind11/docs/changelog.rst b/pybind11/docs/changelog.rst
index 74bedde8b..1ca501d15 100644
--- a/pybind11/docs/changelog.rst
+++ b/pybind11/docs/changelog.rst
@@ -6,6 +6,460 @@ Changelog
 Starting with version 1.8.0, pybind11 releases use a `semantic versioning
 <http://semver.org>`_ policy.
 
+v2.3.0 (Not yet released)
+-----------------------------------------------------
+
+* TBD
+
+v2.2.1 (September 14, 2017)
+-----------------------------------------------------
+
+* Added ``py::module::reload()`` member function for reloading a module.
+  `#1040 <https://github.com/pybind/pybind11/pull/1040>`_.
+
+* Fixed a reference leak in the number converter.
+  `#1078 <https://github.com/pybind/pybind11/pull/1078>`_.
+
+* Fixed compilation with Clang on host GCC < 5 (old libstdc++ which isn't fully
+  C++11 compliant). `#1062 <https://github.com/pybind/pybind11/pull/1062>`_.
+
+* Fixed a regression where the automatic ``std::vector<bool>`` caster would
+  fail to compile. The same fix also applies to any container which returns
+  element proxies instead of references.
+  `#1053 <https://github.com/pybind/pybind11/pull/1053>`_.
+
+* Fixed a regression where the ``py::keep_alive`` policy could not be applied
+  to constructors. `#1065 <https://github.com/pybind/pybind11/pull/1065>`_.
+
+* Fixed a nullptr dereference when loading a ``py::module_local`` type
+  that's only registered in an external module.
+  `#1058 <https://github.com/pybind/pybind11/pull/1058>`_.
+
+* Fixed implicit conversion of accessors to types derived from ``py::object``.
+  `#1076 <https://github.com/pybind/pybind11/pull/1076>`_.
+
+* The ``name`` in ``PYBIND11_MODULE(name, variable)`` can now be a macro.
+  `#1082 <https://github.com/pybind/pybind11/pull/1082>`_.
+
+* Relaxed overly strict ``py::pickle()`` check for matching get and set types.
+  `#1064 <https://github.com/pybind/pybind11/pull/1064>`_.
+
+* Conversion errors now try to be more informative when it's likely that
+  a missing header is the cause (e.g. forgetting ``<pybind11/stl.h>``).
+  `#1077 <https://github.com/pybind/pybind11/pull/1077>`_.
+
+v2.2.0 (August 31, 2017)
+-----------------------------------------------------
+
+* Support for embedding the Python interpreter. See the
+  :doc:`documentation page </advanced/embedding>` for a
+  full overview of the new features.
+  `#774 <https://github.com/pybind/pybind11/pull/774>`_,
+  `#889 <https://github.com/pybind/pybind11/pull/889>`_,
+  `#892 <https://github.com/pybind/pybind11/pull/892>`_,
+  `#920 <https://github.com/pybind/pybind11/pull/920>`_.
+
+  .. code-block:: cpp
+
+      #include <pybind11/embed.h>
+      namespace py = pybind11;
+
+      int main() {
+          py::scoped_interpreter guard{}; // start the interpreter and keep it alive
+
+          py::print("Hello, World!"); // use the Python API
+      }
+
+* Support for inheriting from multiple C++ bases in Python.
+  `#693 <https://github.com/pybind/pybind11/pull/693>`_.
+
+  .. code-block:: python
+
+      from cpp_module import CppBase1, CppBase2
+
+      class PyDerived(CppBase1, CppBase2):
+          def __init__(self):
+              CppBase1.__init__(self)  # C++ bases must be initialized explicitly
+              CppBase2.__init__(self)
+
+* ``PYBIND11_MODULE`` is now the preferred way to create module entry points.
+  ``PYBIND11_PLUGIN`` is deprecated. See :ref:`macros` for details.
+  `#879 <https://github.com/pybind/pybind11/pull/879>`_.
+
+  .. code-block:: cpp
+
+      // new
+      PYBIND11_MODULE(example, m) {
+          m.def("add", [](int a, int b) { return a + b; });
+      }
+
+      // old
+      PYBIND11_PLUGIN(example) {
+          py::module m("example");
+          m.def("add", [](int a, int b) { return a + b; });
+          return m.ptr();
+      }
+
+* pybind11's headers and build system now more strictly enforce hidden symbol
+  visibility for extension modules. This should be seamless for most users,
+  but see the :doc:`upgrade` if you use a custom build system.
+  `#995 <https://github.com/pybind/pybind11/pull/995>`_.
+
+* Support for ``py::module_local`` types which allow multiple modules to
+  export the same C++ types without conflicts. This is useful for opaque
+  types like ``std::vector<int>``. ``py::bind_vector`` and ``py::bind_map``
+  now default to ``py::module_local`` if their elements are builtins or
+  local types. See :ref:`module_local` for details.
+  `#949 <https://github.com/pybind/pybind11/pull/949>`_,
+  `#981 <https://github.com/pybind/pybind11/pull/981>`_,
+  `#995 <https://github.com/pybind/pybind11/pull/995>`_,
+  `#997 <https://github.com/pybind/pybind11/pull/997>`_.
+
+* Custom constructors can now be added very easily using lambdas or factory
+  functions which return a class instance by value, pointer or holder. This
+  supersedes the old placement-new ``__init__`` technique.
+  See :ref:`custom_constructors` for details.
+  `#805 <https://github.com/pybind/pybind11/pull/805>`_,
+  `#1014 <https://github.com/pybind/pybind11/pull/1014>`_.
+
+  .. code-block:: cpp
+
+      struct Example {
+          Example(std::string);
+      };
+
+      py::class_<Example>(m, "Example")
+          .def(py::init<std::string>()) // existing constructor
+          .def(py::init([](int n) { // custom constructor
+              return std::make_unique<Example>(std::to_string(n));
+          }));
+
+* Similarly to custom constructors, pickling support functions are now bound
+  using the ``py::pickle()`` adaptor which improves type safety. See the
+  :doc:`upgrade` and :ref:`pickling` for details.
+  `#1038 <https://github.com/pybind/pybind11/pull/1038>`_.
+
+* Builtin support for converting C++17 standard library types and general
+  conversion improvements:
+
+  1. C++17 ``std::variant`` is supported right out of the box. C++11/14
+     equivalents (e.g. ``boost::variant``) can also be added with a simple
+     user-defined specialization. See :ref:`cpp17_container_casters` for details.
+     `#811 <https://github.com/pybind/pybind11/pull/811>`_,
+     `#845 <https://github.com/pybind/pybind11/pull/845>`_,
+     `#989 <https://github.com/pybind/pybind11/pull/989>`_.
+
+  2. Out-of-the-box support for C++17 ``std::string_view``.
+     `#906 <https://github.com/pybind/pybind11/pull/906>`_.
+
+  3. Improved compatibility of the builtin ``optional`` converter.
+     `#874 <https://github.com/pybind/pybind11/pull/874>`_.
+
+  4. The ``bool`` converter now accepts ``numpy.bool_`` and types which
+     define ``__bool__`` (Python 3.x) or ``__nonzero__`` (Python 2.7).
+     `#925 <https://github.com/pybind/pybind11/pull/925>`_.
+
+  5. C++-to-Python casters are now more efficient and move elements out
+     of rvalue containers whenever possible.
+     `#851 <https://github.com/pybind/pybind11/pull/851>`_,
+     `#936 <https://github.com/pybind/pybind11/pull/936>`_,
+     `#938 <https://github.com/pybind/pybind11/pull/938>`_.
+
+  6. Fixed ``bytes`` to ``std::string/char*`` conversion on Python 3.
+     `#817 <https://github.com/pybind/pybind11/pull/817>`_.
+
+  7. Fixed lifetime of temporary C++ objects created in Python-to-C++ conversions.
+     `#924 <https://github.com/pybind/pybind11/pull/924>`_.
+
+* Scope guard call policy for RAII types, e.g. ``py::call_guard<py::gil_scoped_release>()``,
+  ``py::call_guard<py::scoped_ostream_redirect>()``. See :ref:`call_policies` for details.
+  `#740 <https://github.com/pybind/pybind11/pull/740>`_.
+
+* Utility for redirecting C++ streams to Python (e.g. ``std::cout`` ->
+  ``sys.stdout``). Scope guard ``py::scoped_ostream_redirect`` in C++ and
+  a context manager in Python. See :ref:`ostream_redirect`.
+  `#1009 <https://github.com/pybind/pybind11/pull/1009>`_.
+
+* Improved handling of types and exceptions across module boundaries.
+  `#915 <https://github.com/pybind/pybind11/pull/915>`_,
+  `#951 <https://github.com/pybind/pybind11/pull/951>`_,
+  `#995 <https://github.com/pybind/pybind11/pull/995>`_.
+
+* Fixed destruction order of ``py::keep_alive`` nurse/patient objects
+  in reference cycles.
+  `#856 <https://github.com/pybind/pybind11/pull/856>`_.
+
+* Numpy and buffer protocol related improvements:
+
+  1. Support for negative strides in Python buffer objects/numpy arrays. This
+     required changing integers from unsigned to signed for the related C++ APIs.
+     Note: If you have compiler warnings enabled, you may notice some new conversion
+     warnings after upgrading. These can be resolved with ``static_cast``.
+     `#782 <https://github.com/pybind/pybind11/pull/782>`_.
+
+  2. Support ``std::complex`` and arrays inside ``PYBIND11_NUMPY_DTYPE``.
+     `#831 <https://github.com/pybind/pybind11/pull/831>`_,
+     `#832 <https://github.com/pybind/pybind11/pull/832>`_.
+
+  3. Support for constructing ``py::buffer_info`` and ``py::array`` using
+     arbitrary containers or iterators instead of requiring a ``std::vector``.
+     `#788 <https://github.com/pybind/pybind11/pull/788>`_,
+     `#822 <https://github.com/pybind/pybind11/pull/822>`_,
+     `#860 <https://github.com/pybind/pybind11/pull/860>`_.
+
+  4. Explicitly check numpy version and require >= 1.7.0.
+     `#819 <https://github.com/pybind/pybind11/pull/819>`_.
+
+* Support for allowing/prohibiting ``None`` for specific arguments and improved
+  ``None`` overload resolution order. See :ref:`none_arguments` for details.
+  `#843 <https://github.com/pybind/pybind11/pull/843>`_,
+  `#859 <https://github.com/pybind/pybind11/pull/859>`_.
+
+* Added ``py::exec()`` as a shortcut for ``py::eval<py::eval_statements>()``
+  and support for C++11 raw string literals as input. See :ref:`eval`.
+  `#766 <https://github.com/pybind/pybind11/pull/766>`_,
+  `#827 <https://github.com/pybind/pybind11/pull/827>`_.
+
+* ``py::vectorize()`` ignores non-vectorizable arguments and supports
+  member functions.
+  `#762 <https://github.com/pybind/pybind11/pull/762>`_.
+
+* Support for bound methods as callbacks (``pybind11/functional.h``).
+  `#815 <https://github.com/pybind/pybind11/pull/815>`_.
+
+* Allow aliasing pybind11 methods: ``cls.attr("foo") = cls.attr("bar")``.
+  `#802 <https://github.com/pybind/pybind11/pull/802>`_.
+
+* Don't allow mixed static/non-static overloads.
+  `#804 <https://github.com/pybind/pybind11/pull/804>`_.
+
+* Fixed overriding static properties in derived classes.
+  `#784 <https://github.com/pybind/pybind11/pull/784>`_.
+
+* Improved deduction of member functions of a derived class when its bases
+  aren't registered with pybind11.
+  `#855 <https://github.com/pybind/pybind11/pull/855>`_.
+
+  .. code-block:: cpp
+
+      struct Base {
+          int foo() { return 42; }
+      };
+
+      struct Derived : Base {};
+
+      // Now works, but previously required also binding `Base`
+      py::class_<Derived>(m, "Derived")
+          .def("foo", &Derived::foo); // function is actually from `Base`
+
+* The implementation of ``py::init<>`` now uses C++11 brace initialization
+  syntax to construct instances, which permits binding implicit constructors of
+  aggregate types. `#1015 <https://github.com/pybind/pybind11/pull/1015>`_.
+
+    .. code-block:: cpp
+
+        struct Aggregate {
+            int a;
+            std::string b;
+        };
+
+        py::class_<Aggregate>(m, "Aggregate")
+            .def(py::init<int, const std::string &>());
+
+* Fixed issues with multiple inheritance with offset base/derived pointers.
+  `#812 <https://github.com/pybind/pybind11/pull/812>`_,
+  `#866 <https://github.com/pybind/pybind11/pull/866>`_,
+  `#960 <https://github.com/pybind/pybind11/pull/960>`_.
+
+* Fixed reference leak of type objects.
+  `#1030 <https://github.com/pybind/pybind11/pull/1030>`_.
+
+* Improved support for the ``/std:c++14`` and ``/std:c++latest`` modes
+  on MSVC 2017.
+  `#841 <https://github.com/pybind/pybind11/pull/841>`_,
+  `#999 <https://github.com/pybind/pybind11/pull/999>`_.
+
+* Fixed detection of private operator new on MSVC.
+  `#893 <https://github.com/pybind/pybind11/pull/893>`_,
+  `#918 <https://github.com/pybind/pybind11/pull/918>`_.
+
+* Intel C++ compiler compatibility fixes.
+  `#937 <https://github.com/pybind/pybind11/pull/937>`_.
+
+* Fixed implicit conversion of ``py::enum_`` to integer types on Python 2.7.
+  `#821 <https://github.com/pybind/pybind11/pull/821>`_.
+
+* Added ``py::hash`` to fetch the hash value of Python objects, and
+  ``.def(hash(py::self))`` to provide the C++ ``std::hash`` as the Python
+  ``__hash__`` method.
+  `#1034 <https://github.com/pybind/pybind11/pull/1034>`_.
+
+* Fixed ``__truediv__`` on Python 2 and ``__itruediv__`` on Python 3.
+  `#867 <https://github.com/pybind/pybind11/pull/867>`_.
+
+* ``py::capsule`` objects now support the ``name`` attribute. This is useful
+  for interfacing with ``scipy.LowLevelCallable``.
+  `#902 <https://github.com/pybind/pybind11/pull/902>`_.
+
+* Fixed ``py::make_iterator``'s ``__next__()`` for past-the-end calls.
+  `#897 <https://github.com/pybind/pybind11/pull/897>`_.
+
+* Added ``error_already_set::matches()`` for checking Python exceptions.
+  `#772 <https://github.com/pybind/pybind11/pull/772>`_.
+
+* Deprecated ``py::error_already_set::clear()``. It's no longer needed
+  following a simplification of the ``py::error_already_set`` class.
+  `#954 <https://github.com/pybind/pybind11/pull/954>`_.
+
+* Deprecated ``py::handle::operator==()`` in favor of ``py::handle::is()``.
+  `#825 <https://github.com/pybind/pybind11/pull/825>`_.
+
+* Deprecated ``py::object::borrowed``/``py::object::stolen``.
+  Use ``py::object::borrowed_t{}``/``py::object::stolen_t{}`` instead.
+  `#771 <https://github.com/pybind/pybind11/pull/771>`_.
+
+* Changed internal data structure versioning to avoid conflicts between
+  modules compiled with different revisions of pybind11.
+  `#1012 <https://github.com/pybind/pybind11/pull/1012>`_.
+
+* Additional compile-time and run-time error checking and more informative messages.
+  `#786 <https://github.com/pybind/pybind11/pull/786>`_,
+  `#794 <https://github.com/pybind/pybind11/pull/794>`_,
+  `#803 <https://github.com/pybind/pybind11/pull/803>`_.
+
+* Various minor improvements and fixes.
+  `#764 <https://github.com/pybind/pybind11/pull/764>`_,
+  `#791 <https://github.com/pybind/pybind11/pull/791>`_,
+  `#795 <https://github.com/pybind/pybind11/pull/795>`_,
+  `#840 <https://github.com/pybind/pybind11/pull/840>`_,
+  `#844 <https://github.com/pybind/pybind11/pull/844>`_,
+  `#846 <https://github.com/pybind/pybind11/pull/846>`_,
+  `#849 <https://github.com/pybind/pybind11/pull/849>`_,
+  `#858 <https://github.com/pybind/pybind11/pull/858>`_,
+  `#862 <https://github.com/pybind/pybind11/pull/862>`_,
+  `#871 <https://github.com/pybind/pybind11/pull/871>`_,
+  `#872 <https://github.com/pybind/pybind11/pull/872>`_,
+  `#881 <https://github.com/pybind/pybind11/pull/881>`_,
+  `#888 <https://github.com/pybind/pybind11/pull/888>`_,
+  `#899 <https://github.com/pybind/pybind11/pull/899>`_,
+  `#928 <https://github.com/pybind/pybind11/pull/928>`_,
+  `#931 <https://github.com/pybind/pybind11/pull/931>`_,
+  `#944 <https://github.com/pybind/pybind11/pull/944>`_,
+  `#950 <https://github.com/pybind/pybind11/pull/950>`_,
+  `#952 <https://github.com/pybind/pybind11/pull/952>`_,
+  `#962 <https://github.com/pybind/pybind11/pull/962>`_,
+  `#965 <https://github.com/pybind/pybind11/pull/965>`_,
+  `#970 <https://github.com/pybind/pybind11/pull/970>`_,
+  `#978 <https://github.com/pybind/pybind11/pull/978>`_,
+  `#979 <https://github.com/pybind/pybind11/pull/979>`_,
+  `#986 <https://github.com/pybind/pybind11/pull/986>`_,
+  `#1020 <https://github.com/pybind/pybind11/pull/1020>`_,
+  `#1027 <https://github.com/pybind/pybind11/pull/1027>`_,
+  `#1037 <https://github.com/pybind/pybind11/pull/1037>`_.
+
+* Testing improvements.
+  `#798 <https://github.com/pybind/pybind11/pull/798>`_,
+  `#882 <https://github.com/pybind/pybind11/pull/882>`_,
+  `#898 <https://github.com/pybind/pybind11/pull/898>`_,
+  `#900 <https://github.com/pybind/pybind11/pull/900>`_,
+  `#921 <https://github.com/pybind/pybind11/pull/921>`_,
+  `#923 <https://github.com/pybind/pybind11/pull/923>`_,
+  `#963 <https://github.com/pybind/pybind11/pull/963>`_.
+
+v2.1.1 (April 7, 2017)
+-----------------------------------------------------
+
+* Fixed minimum version requirement for MSVC 2015u3
+  `#773 <https://github.com/pybind/pybind11/pull/773>`_.
+
+v2.1.0 (March 22, 2017)
+-----------------------------------------------------
+
+* pybind11 now performs function overload resolution in two phases. The first
+  phase only considers exact type matches, while the second allows for implicit
+  conversions to take place. A special ``noconvert()`` syntax can be used to
+  completely disable implicit conversions for specific arguments.
+  `#643 <https://github.com/pybind/pybind11/pull/643>`_,
+  `#634 <https://github.com/pybind/pybind11/pull/634>`_,
+  `#650 <https://github.com/pybind/pybind11/pull/650>`_.
+
+* Fixed a regression where static properties no longer worked with classes
+  using multiple inheritance. The ``py::metaclass`` attribute is no longer
+  necessary (and deprecated as of this release) when binding classes with
+  static properties.
+  `#679 <https://github.com/pybind/pybind11/pull/679>`_.
+
+* Classes bound using ``pybind11`` can now use custom metaclasses.
+  `#679 <https://github.com/pybind/pybind11/pull/679>`_.
+
+* ``py::args`` and ``py::kwargs`` can now be mixed with other positional
+  arguments when binding functions using pybind11.
+  `#611 <https://github.com/pybind/pybind11/pull/611>`_.
+
+* Improved support for C++11 unicode string and character types; added
+  extensive documentation regarding pybind11's string conversion behavior.
+  `#624 <https://github.com/pybind/pybind11/pull/624>`_,
+  `#636 <https://github.com/pybind/pybind11/pull/636>`_,
+  `#715 <https://github.com/pybind/pybind11/pull/715>`_.
+
+* pybind11 can now avoid expensive copies when converting Eigen arrays to NumPy
+  arrays (and vice versa). `#610 <https://github.com/pybind/pybind11/pull/610>`_.
+
+* The "fast path" in ``py::vectorize`` now works for any full-size group of C or
+  F-contiguous arrays. The non-fast path is also faster since it no longer performs
+  copies of the input arguments (except when type conversions are necessary).
+  `#610 <https://github.com/pybind/pybind11/pull/610>`_.
+
+* Added fast, unchecked access to NumPy arrays via a proxy object.
+  `#746 <https://github.com/pybind/pybind11/pull/746>`_.
+
+* Transparent support for class-specific ``operator new`` and
+  ``operator delete`` implementations.
+  `#755 <https://github.com/pybind/pybind11/pull/755>`_.
+
+* Slimmer and more efficient STL-compatible iterator interface for sequence types.
+  `#662 <https://github.com/pybind/pybind11/pull/662>`_.
+
+* Improved custom holder type support.
+  `#607 <https://github.com/pybind/pybind11/pull/607>`_.
+
+* ``nullptr`` to ``None`` conversion fixed in various builtin type casters.
+  `#732 <https://github.com/pybind/pybind11/pull/732>`_.
+
+* ``enum_`` now exposes its members via a special ``__members__`` attribute.
+  `#666 <https://github.com/pybind/pybind11/pull/666>`_.
+
+* ``std::vector`` bindings created using ``stl_bind.h`` can now optionally
+  implement the buffer protocol. `#488 <https://github.com/pybind/pybind11/pull/488>`_.
+
+* Automated C++ reference documentation using doxygen and breathe.
+  `#598 <https://github.com/pybind/pybind11/pull/598>`_.
+
+* Added minimum compiler version assertions.
+  `#727 <https://github.com/pybind/pybind11/pull/727>`_.
+
+* Improved compatibility with C++1z.
+  `#677 <https://github.com/pybind/pybind11/pull/677>`_.
+
+* Improved ``py::capsule`` API. Can be used to implement cleanup
+  callbacks that are invoked at module destruction time.
+  `#752 <https://github.com/pybind/pybind11/pull/752>`_.
+
+* Various minor improvements and fixes.
+  `#595 <https://github.com/pybind/pybind11/pull/595>`_,
+  `#588 <https://github.com/pybind/pybind11/pull/588>`_,
+  `#589 <https://github.com/pybind/pybind11/pull/589>`_,
+  `#603 <https://github.com/pybind/pybind11/pull/603>`_,
+  `#619 <https://github.com/pybind/pybind11/pull/619>`_,
+  `#648 <https://github.com/pybind/pybind11/pull/648>`_,
+  `#695 <https://github.com/pybind/pybind11/pull/695>`_,
+  `#720 <https://github.com/pybind/pybind11/pull/720>`_,
+  `#723 <https://github.com/pybind/pybind11/pull/723>`_,
+  `#729 <https://github.com/pybind/pybind11/pull/729>`_,
+  `#724 <https://github.com/pybind/pybind11/pull/724>`_,
+  `#742 <https://github.com/pybind/pybind11/pull/742>`_,
+  `#753 <https://github.com/pybind/pybind11/pull/753>`_.
+
 v2.0.1 (Jan 4, 2017)
 -----------------------------------------------------
 
diff --git a/pybind11/docs/classes.rst b/pybind11/docs/classes.rst
index 872977684..ca2477e83 100644
--- a/pybind11/docs/classes.rst
+++ b/pybind11/docs/classes.rst
@@ -27,18 +27,14 @@ The binding code for ``Pet`` looks as follows:
 
     namespace py = pybind11;
 
-    PYBIND11_PLUGIN(example) {
-        py::module m("example", "pybind11 example plugin");
-
+    PYBIND11_MODULE(example, m) {
         py::class_<Pet>(m, "Pet")
             .def(py::init<const std::string &>())
             .def("setName", &Pet::setName)
             .def("getName", &Pet::getName);
-
-        return m.ptr();
     }
 
-:class:`class_` creates bindings for a C++ `class` or `struct`-style data
+:class:`class_` creates bindings for a C++ *class* or *struct*-style data
 structure. :func:`init` is a convenience function that takes the types of a
 constructor's parameters as template arguments and wraps the corresponding
 constructor (see the :ref:`custom_constructors` section for details). An
@@ -229,8 +225,8 @@ just brings them on par.
 
 .. _inheritance:
 
-Inheritance
-===========
+Inheritance and automatic downcasting
+=====================================
 
 Suppose now that the example consists of two data structures with an
 inheritance relationship:
@@ -287,6 +283,65 @@ expose fields and methods of both types:
     >>> p.bark()
     u'woof!'
 
+The C++ classes defined above are regular non-polymorphic types with an
+inheritance relationship. This is reflected in Python:
+
+.. code-block:: cpp
+
+    // Return a base pointer to a derived instance
+    m.def("pet_store", []() { return std::unique_ptr<Pet>(new Dog("Molly")); });
+
+.. code-block:: pycon
+
+    >>> p = example.pet_store()
+    >>> type(p)  # `Dog` instance behind `Pet` pointer
+    Pet          # no pointer downcasting for regular non-polymorphic types
+    >>> p.bark()
+    AttributeError: 'Pet' object has no attribute 'bark'
+
+The function returned a ``Dog`` instance, but because it's a non-polymorphic
+type behind a base pointer, Python only sees a ``Pet``. In C++, a type is only
+considered polymorphic if it has at least one virtual function and pybind11
+will automatically recognize this:
+
+.. code-block:: cpp
+
+    struct PolymorphicPet {
+        virtual ~PolymorphicPet() = default;
+    };
+
+    struct PolymorphicDog : PolymorphicPet {
+        std::string bark() const { return "woof!"; }
+    };
+
+    // Same binding code
+    py::class_<PolymorphicPet>(m, "PolymorphicPet");
+    py::class_<PolymorphicDog, PolymorphicPet>(m, "PolymorphicDog")
+        .def(py::init<>())
+        .def("bark", &PolymorphicDog::bark);
+
+    // Again, return a base pointer to a derived instance
+    m.def("pet_store2", []() { return std::unique_ptr<PolymorphicPet>(new PolymorphicDog); });
+
+.. code-block:: pycon
+
+    >>> p = example.pet_store2()
+    >>> type(p)
+    PolymorphicDog  # automatically downcast
+    >>> p.bark()
+    u'woof!'
+
+Given a pointer to a polymorphic base, pybind11 performs automatic downcasting
+to the actual derived type. Note that this goes beyond the usual situation in
+C++: we don't just get access to the virtual functions of the base, we get the
+concrete derived type including functions and attributes that the base type may
+not even be aware of.
+
+.. seealso::
+
+    For more information about polymorphic behavior see :ref:`overriding_virtuals`.
+
+
 Overloaded methods
 ==================
 
@@ -298,8 +353,8 @@ different kinds of input arguments:
     struct Pet {
         Pet(const std::string &name, int age) : name(name), age(age) { }
 
-        void set(int age) { age = age; }
-        void set(const std::string &name) { name = name; }
+        void set(int age_) { age = age_; }
+        void set(const std::string &name_) { name = name_; }
 
         std::string name;
         int age;
@@ -423,6 +478,12 @@ typed enums.
     >>> int(p.type)
     1L
 
+The entries defined by the enumeration type are exposed in the ``__members__`` property:
+
+.. code-block:: pycon
+
+    >>> Pet.Kind.__members__
+    {'Dog': Kind.Dog, 'Cat': Kind.Cat}
 
 .. note::
 
diff --git a/pybind11/docs/compiling.rst b/pybind11/docs/compiling.rst
index c7053dbf9..b5d6ce948 100644
--- a/pybind11/docs/compiling.rst
+++ b/pybind11/docs/compiling.rst
@@ -1,3 +1,5 @@
+.. _compiling:
+
 Build systems
 #############
 
@@ -14,10 +16,10 @@ the [python_example]_ repository.
 Building with cppimport
 ========================
 
- cppimport is a small Python import hook that determines whether there is a C++
- source file whose name matches the requested module. If there is, the file is
- compiled as a Python extension using pybind11 and placed in the same folder as
- the C++ source file. Python is then able to find the module and load it.
+[cppimport]_ is a small Python import hook that determines whether there is a C++
+source file whose name matches the requested module. If there is, the file is
+compiled as a Python extension using pybind11 and placed in the same folder as
+the C++ source file. Python is then able to find the module and load it.
 
 .. [cppimport] https://github.com/tbenthompson/cppimport
 
@@ -74,13 +76,15 @@ removes this target from the default build (see CMake docs for details).
 
 Since pybind11 is a template library, ``pybind11_add_module`` adds compiler
 flags to ensure high quality code generation without bloat arising from long
-symbol names and duplication of code in different translation units. The
-additional flags enable LTO (Link Time Optimization), set default visibility
-to *hidden* and strip unneeded symbols. See the :ref:`FAQ entry <faq:symhidden>`
-for a more detailed explanation. These optimizations are never applied in
-``Debug`` mode. If ``NO_EXTRAS`` is given, they will always be disabled, even
-in ``Release`` mode. However, this will result in code bloat and is generally
-not recommended.
+symbol names and duplication of code in different translation units. It
+sets default visibility to *hidden*, which is required for some pybind11
+features and functionality when attempting to load multiple pybind11 modules
+compiled under different pybind11 versions.  It also adds additional flags
+enabling LTO (Link Time Optimization) and stripping unneeded symbols. See the
+:ref:`FAQ entry <faq:symhidden>` for a more detailed explanation. These
+latter optimizations are never applied in ``Debug`` mode.  If ``NO_EXTRAS`` is
+given, they will always be disabled, even in ``Release`` mode. However, this
+will result in code bloat and is generally not recommended.
 
 As stated above, LTO is enabled by default. Some newer compilers also support
 different flavors of LTO such as `ThinLTO`_. Setting ``THIN_LTO`` will cause
@@ -92,17 +96,28 @@ regular LTO if ``-flto=thin`` is not available.
 Configuration variables
 -----------------------
 
-By default, pybind11 will compile modules with the latest C++ standard
-available on the target compiler. To override this, the standard flag can
-be given explicitly in ``PYBIND11_CPP_STANDARD``:
+By default, pybind11 will compile modules with the C++14 standard, if available
+on the target compiler, falling back to C++11 if C++14 support is not
+available.  Note, however, that this default is subject to change: future
+pybind11 releases are expected to migrate to newer C++ standards as they become
+available.  To override this, the standard flag can be given explicitly in
+``PYBIND11_CPP_STANDARD``:
 
 .. code-block:: cmake
 
+    # Use just one of these:
+    # GCC/clang:
     set(PYBIND11_CPP_STANDARD -std=c++11)
+    set(PYBIND11_CPP_STANDARD -std=c++14)
+    set(PYBIND11_CPP_STANDARD -std=c++1z) # Experimental C++17 support
+    # MSVC:
+    set(PYBIND11_CPP_STANDARD /std:c++14)
+    set(PYBIND11_CPP_STANDARD /std:c++latest) # Enables some MSVC C++17 features
+
     add_subdirectory(pybind11)  # or find_package(pybind11)
 
 Note that this and all other configuration variables must be set **before** the
-call to ``add_subdiretory`` or ``find_package``. The variables can also be set
+call to ``add_subdirectory`` or ``find_package``. The variables can also be set
 when calling CMake from the command line using the ``-D<variable>=<value>`` flag.
 
 The target Python version can be selected by setting ``PYBIND11_PYTHON_VERSION``
@@ -170,11 +185,84 @@ to an independently constructed (through ``add_library``, not
     flags (i.e. this is up to you).
 
     These include Link Time Optimization (``-flto`` on GCC/Clang/ICPC, ``/GL``
-    and ``/LTCG`` on Visual Studio). Default-hidden symbols on GCC/Clang/ICPC
-    (``-fvisibility=hidden``) and .OBJ files with many sections on Visual Studio
-    (``/bigobj``). The :ref:`FAQ <faq:symhidden>` contains an
+    and ``/LTCG`` on Visual Studio) and .OBJ files with many sections on Visual
+    Studio (``/bigobj``).  The :ref:`FAQ <faq:symhidden>` contains an
     explanation on why these are needed.
 
+Embedding the Python interpreter
+--------------------------------
+
+In addition to extension modules, pybind11 also supports embedding Python into
+a C++ executable or library. In CMake, simply link with the ``pybind11::embed``
+target. It provides everything needed to get the interpreter running. The Python
+headers and libraries are attached to the target. Unlike ``pybind11::module``,
+there is no need to manually set any additional properties here. For more
+information about usage in C++, see :doc:`/advanced/embedding`.
+
+.. code-block:: cmake
+
+    cmake_minimum_required(VERSION 3.0)
+    project(example)
+
+    find_package(pybind11 REQUIRED)  # or add_subdirectory(pybind11)
+
+    add_executable(example main.cpp)
+    target_link_libraries(example PRIVATE pybind11::embed)
+
+.. _building_manually:
+
+Building manually
+=================
+
+pybind11 is a header-only library, hence it is not necessary to link against
+any special libraries and there are no intermediate (magic) translation steps.
+
+On Linux, you can compile an example such as the one given in
+:ref:`simple_example` using the following command:
+
+.. code-block:: bash
+
+    $ c++ -O3 -Wall -shared -std=c++11 -fPIC `python3 -m pybind11 --includes` example.cpp -o example`python3-config --extension-suffix`
+
+The flags given here assume that you're using Python 3. For Python 2, just
+change the executable appropriately (to ``python`` or ``python2``).
+
+The ``python3 -m pybind11 --includes`` command fetches the include paths for
+both pybind11 and Python headers. This assumes that pybind11 has been installed
+using ``pip`` or ``conda``. If it hasn't, you can also manually specify
+``-I <path-to-pybind11>/include`` together with the Python includes path
+``python3-config --includes``.
+
+Note that Python 2.7 modules don't use a special suffix, so you should simply
+use ``example.so`` instead of ``example`python3-config --extension-suffix```.
+Besides, the ``--extension-suffix`` option may or may not be available, depending
+on the distribution; in the latter case, the module extension can be manually
+set to ``.so``.
+
+On macOS, the build command is almost the same, but it also requires passing
+the ``-undefined dynamic_lookup`` flag so as to ignore missing symbols when
+building the module:
+
+.. code-block:: bash
+
+    $ c++ -O3 -Wall -shared -std=c++11 -undefined dynamic_lookup `python3 -m pybind11 --includes` example.cpp -o example`python3-config --extension-suffix`
+
+In general, it is advisable to include several additional build parameters
+that can considerably reduce the size of the created binary. Refer to section
+:ref:`cmake` for a detailed example of a suitable cross-platform CMake-based
+build system that works on all platforms including Windows.
+
+.. note::
+
+    On Linux and macOS, it's better to (intentionally) not link against
+    ``libpython``. The symbols will be resolved when the extension library
+    is loaded into a Python binary. This is preferable because you might
+    have several different installations of a given Python version (e.g. the
+    system-provided Python, and one that ships with a piece of commercial
+    software). In this way, the plugin will work with both versions, instead
+    of possibly importing a second Python library into a process that already
+    contains one (which will lead to a segfault).
+
 Generating binding code automatically
 =====================================
 
diff --git a/pybind11/docs/conf.py b/pybind11/docs/conf.py
index 0769f20be..cd0e17eb7 100644
--- a/pybind11/docs/conf.py
+++ b/pybind11/docs/conf.py
@@ -16,6 +16,7 @@
 import sys
 import os
 import shlex
+import subprocess
 
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
@@ -30,7 +31,11 @@ import shlex
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
-extensions = []
+extensions = ['breathe']
+
+breathe_projects = {'pybind11': '.build/doxygenxml/'}
+breathe_default_project = 'pybind11'
+breathe_domain_by_extension = {'h': 'cpp'}
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['.templates']
@@ -48,7 +53,7 @@ master_doc = 'index'
 
 # General information about the project.
 project = 'pybind11'
-copyright = '2016, Wenzel Jakob'
+copyright = '2017, Wenzel Jakob'
 author = 'Wenzel Jakob'
 
 # The version info for the project you're documenting, acts as replacement for
@@ -56,9 +61,9 @@ author = 'Wenzel Jakob'
 # built documents.
 #
 # The short X.Y version.
-version = '2.0'
+version = '2.2'
 # The full version, including alpha/beta/rc tags.
-release = '2.0.1'
+release = '2.2.1'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -79,7 +84,7 @@ exclude_patterns = ['.build', 'release.rst']
 
 # The reST default role (used for this markup: `text`) to use for all
 # documents.
-#default_role = None
+default_role = 'any'
 
 # If true, '()' will be appended to :func: etc. cross-reference text.
 #add_function_parentheses = True
@@ -306,3 +311,22 @@ texinfo_documents = [
 
 primary_domain = 'cpp'
 highlight_language = 'cpp'
+
+
+def generate_doxygen_xml(app):
+    build_dir = os.path.join(app.confdir, '.build')
+    if not os.path.exists(build_dir):
+        os.mkdir(build_dir)
+
+    try:
+        subprocess.call(['doxygen', '--version'])
+        retcode = subprocess.call(['doxygen'], cwd=app.confdir)
+        if retcode < 0:
+            sys.stderr.write("doxygen error code: {}\n".format(-retcode))
+    except OSError as e:
+        sys.stderr.write("doxygen execution failed: {}\n".format(e))
+
+
+def setup(app):
+    """Add hook for building doxygen xml when needed"""
+    app.connect("builder-inited", generate_doxygen_xml)
diff --git a/pybind11/docs/faq.rst b/pybind11/docs/faq.rst
index 34002b42d..8f33eb014 100644
--- a/pybind11/docs/faq.rst
+++ b/pybind11/docs/faq.rst
@@ -4,30 +4,24 @@ Frequently asked questions
 "ImportError: dynamic module does not define init function"
 ===========================================================
 
-1. Make sure that the name specified in ``pybind::module`` and
-   ``PYBIND11_PLUGIN`` is consistent and identical to the filename of the
-   extension library. The latter should not contain any extra prefixes (e.g.
-   ``test.so`` instead of ``libtest.so``).
-
-2. If the above did not fix your issue, then you are likely using an
-   incompatible version of Python (for instance, the extension library was
-   compiled against Python 2, while the interpreter is running on top of some
-   version of Python 3, or vice versa)
+You are likely using an incompatible version of Python (for instance, the
+extension library was compiled against Python 2, while the interpreter is
+running on top of some version of Python 3, or vice versa).
 
 "Symbol not found: ``__Py_ZeroStruct`` / ``_PyInstanceMethod_Type``"
 ========================================================================
 
-See item 2 of the first answer.
+See the first answer.
 
 "SystemError: dynamic module not initialized properly"
 ======================================================
 
-See item 2 of the first answer.
+See the first answer.
 
 The Python interpreter immediately crashes when importing my module
 ===================================================================
 
-See item 2 of the first answer.
+See the first answer.
 
 CMake doesn't detect the right Python version
 =============================================
@@ -104,14 +98,10 @@ following example:
     void init_ex2(py::module &);
     /* ... */
 
-    PYBIND11_PLUGIN(example) {
-        py::module m("example", "pybind example plugin");
-
+    PYBIND11_MODULE(example, m) {
         init_ex1(m);
         init_ex2(m);
         /* ... */
-
-        return m.ptr();
     }
 
 :file:`ex1.cpp`:
@@ -161,6 +151,33 @@ specifying a larger value, e.g. ``-ftemplate-depth=1024`` on GCC/Clang. The
 culprit is generally the generation of function signatures at compile time
 using C++14 template metaprogramming.
 
+.. _`faq:hidden_visibility`:
+
+"‘SomeClass’ declared with greater visibility than the type of its field ‘SomeClass::member’ [-Wattributes]"
+============================================================================================================
+
+This error typically indicates that you are compiling without the required
+``-fvisibility`` flag.  pybind11 code internally forces hidden visibility on
+all internal code, but if non-hidden (and thus *exported*) code attempts to
+include a pybind type (for example, ``py::object`` or ``py::list``) you can run
+into this warning.
+
+To avoid it, make sure you are specifying ``-fvisibility=hidden`` when
+compiling pybind code.
+
+As to why ``-fvisibility=hidden`` is necessary: because pybind modules could
+have been compiled under different versions of pybind itself, it is also
+important that the symbols defined in one module do not clash with the
+potentially-incompatible symbols defined in another.  While Python extension
+modules are usually loaded with localized symbols (under POSIX systems
+typically using ``dlopen`` with the ``RTLD_LOCAL`` flag), this Python default
+can be changed, and even if it isn't, it is not always enough to guarantee
+complete independence of the symbols involved when not using
+``-fvisibility=hidden``.
+
+Additionally, ``-fvisibility=hidden`` can deliver considerable binary size
+savings.  (See the following section for more details.)
+
 
 .. _`faq:symhidden`:
 
@@ -202,11 +219,14 @@ world. So we'll generally only want to export symbols for those functions which
 are actually called from the outside.
 
 This can be achieved by specifying the parameter ``-fvisibility=hidden`` to GCC
-and Clang, which sets the default symbol visibility to *hidden*. It's best to
-do this only for release builds, since the symbol names can be helpful in
-debugging sessions. On Visual Studio, symbols are already hidden by default, so
-nothing needs to be done there. Needless to say, this has a tremendous impact
-on the final binary size of the resulting extension library.
+and Clang, which sets the default symbol visibility to *hidden*, which has a
+tremendous impact on the final binary size of the resulting extension library.
+(On Visual Studio, symbols are already hidden by default, so nothing needs to
+be done there.)
+
+In addition to decreasing binary size, ``-fvisibility=hidden`` also avoids
+potential serious issues when loading multiple modules and is required for
+proper pybind operation.  See the previous FAQ entry for more details.
 
 Another aspect that can require a fair bit of code are function signature
 descriptions. pybind11 automatically generates human-readable function
diff --git a/pybind11/docs/index.rst b/pybind11/docs/index.rst
index cedf65209..d236611b7 100644
--- a/pybind11/docs/index.rst
+++ b/pybind11/docs/index.rst
@@ -14,6 +14,7 @@ pybind11 --- Seamless operability between C++11 and Python
 
    intro
    changelog
+   upgrade
 
 .. toctree::
    :caption: The Basics
@@ -33,6 +34,7 @@ pybind11 --- Seamless operability between C++11 and Python
    advanced/smart_ptrs
    advanced/cast/index
    advanced/pycpp/index
+   advanced/embedding
    advanced/misc
 
 .. toctree::
diff --git a/pybind11/docs/reference.rst b/pybind11/docs/reference.rst
index 542259eba..e41141bd9 100644
--- a/pybind11/docs/reference.rst
+++ b/pybind11/docs/reference.rst
@@ -9,239 +9,94 @@
 Reference
 #########
 
+.. _macros:
+
 Macros
 ======
 
-.. function:: PYBIND11_PLUGIN(const char *name)
-
-    This macro creates the entry point that will be invoked when the Python
-    interpreter imports a plugin library. Please create a
-    :class:`module` in the function body and return the pointer to its
-    underlying Python object at the end.
-
-    .. code-block:: cpp
-
-        PYBIND11_PLUGIN(example) {
-            pybind11::module m("example", "pybind11 example plugin");
-            /// Set up bindings here
-            return m.ptr();
-        }
+.. doxygendefine:: PYBIND11_MODULE
 
 .. _core_types:
 
 Convenience classes for arbitrary Python types
 ==============================================
 
-Without reference counting
---------------------------
-
-.. class:: handle
-
-    The :class:`handle` class is a thin wrapper around an arbitrary Python
-    object (i.e. a ``PyObject *`` in Python's C API). It does not perform any
-    automatic reference counting and merely provides a basic C++ interface to
-    various Python API functions.
-
-.. seealso::
-
-    The :class:`object` class inherits from :class:`handle` and adds automatic
-    reference counting features.
-
-.. function:: handle::handle()
-
-    The default constructor creates a handle with a ``nullptr``-valued pointer.
-
-.. function:: handle::handle(const handle&)
-
-    Copy constructor
-
-.. function:: handle::handle(PyObject *)
-
-    Creates a :class:`handle` from the given raw Python object pointer.
-
-.. function:: PyObject * handle::ptr() const
-
-    Return the ``PyObject *`` underlying a :class:`handle`.
-
-.. function:: const handle& handle::inc_ref() const
-
-    Manually increase the reference count of the Python object. Usually, it is
-    preferable to use the :class:`object` class which derives from
-    :class:`handle` and calls this function automatically. Returns a reference
-    to itself.
-
-.. function:: const handle& handle::dec_ref() const
-
-    Manually decrease the reference count of the Python object. Usually, it is
-    preferable to use the :class:`object` class which derives from
-    :class:`handle` and calls this function automatically. Returns a reference
-    to itself.
-
-.. function:: void handle::ref_count() const
-
-    Return the object's current reference count
-
-.. function:: handle handle::get_type() const
-
-    Return a handle to the Python type object underlying the instance
-
-.. function detail::accessor handle::operator[](handle key) const
-
-    Return an internal functor to invoke the object's sequence protocol.
-    Casting the returned ``detail::accessor`` instance to a :class:`handle` or
-    :class:`object` subclass causes a corresponding call to ``__getitem__``.
-    Assigning a :class:`handle` or :class:`object` subclass causes a call to
-    ``__setitem__``.
-
-.. function detail::accessor handle::operator[](const char *key) const
-
-    See the above function (the only difference is that they key is provided as
-    a string literal).
-
-.. function detail::accessor handle::attr(handle key) const
-
-    Return an internal functor to access the object's attributes.
-    Casting the returned ``detail::accessor`` instance to a :class:`handle` or
-    :class:`object` subclass causes a corresponding call to ``__getattr``.
-    Assigning a :class:`handle` or :class:`object` subclass causes a call to
-    ``__setattr``.
-
-.. function detail::accessor handle::attr(const char *key) const
-
-    See the above function (the only difference is that they key is provided as
-    a string literal).
-
-.. function operator handle::bool() const
-
-    Return ``true`` when the :class:`handle` wraps a valid Python object.
-
-.. function str handle::str() const
-
-    Return a string representation of the object. This is analogous to
-    the ``str()`` function in Python.
-
-.. function:: template <typename T> T handle::cast() const
-
-    Attempt to cast the Python object into the given C++ type. A
-    :class:`cast_error` will be throw upon failure.
+Common member functions
+-----------------------
 
-.. function:: template <typename ... Args> object handle::call(Args&&... args) const
+.. doxygenclass:: object_api
+    :members:
 
-    Assuming the Python object is a function or implements the ``__call__``
-    protocol, ``call()`` invokes the underlying function, passing an arbitrary
-    set of parameters. The result is returned as a :class:`object` and may need
-    to be converted back into a Python object using :func:`handle::cast`.
+Without reference counting
+--------------------------
 
-    When some of the arguments cannot be converted to Python objects, the
-    function will throw a :class:`cast_error` exception. When the Python
-    function call fails, a :class:`error_already_set` exception is thrown.
+.. doxygenclass:: handle
+    :members:
 
 With reference counting
 -----------------------
 
-.. class:: object : public handle
-
-    Like :class:`handle`, the object class is a thin wrapper around an
-    arbitrary Python object (i.e. a ``PyObject *`` in Python's C API). In
-    contrast to :class:`handle`, it optionally increases the object's reference
-    count upon construction, and it *always* decreases the reference count when
-    the :class:`object` instance goes out of scope and is destructed. When
-    using :class:`object` instances consistently, it is much easier to get
-    reference counting right at the first attempt.
-
-.. function:: object::object(const object &o)
-
-    Copy constructor; always increases the reference count
-
-.. function:: object::object(const handle &h, bool borrowed)
-
-    Creates a :class:`object` from the given :class:`handle`. The reference
-    count is only increased if the ``borrowed`` parameter is set to ``true``.
+.. doxygenclass:: object
+    :members:
 
-.. function:: object::object(PyObject *ptr, bool borrowed)
+.. doxygenfunction:: reinterpret_borrow
 
-    Creates a :class:`object` from the given raw Python object pointer. The
-    reference  count is only increased if the ``borrowed`` parameter is set to
-    ``true``.
-
-.. function:: object::object(object &&other)
-
-    Move constructor; steals the object from ``other`` and preserves its
-    reference count.
-
-.. function:: handle object::release()
-
-    Resets the internal pointer to ``nullptr`` without without decreasing the
-    object's reference count. The function returns a raw handle to the original
-    Python object.
-
-.. function:: object::~object()
-
-    Destructor, which automatically calls :func:`handle::dec_ref()`.
+.. doxygenfunction:: reinterpret_steal
 
 Convenience classes for specific Python types
 =============================================
 
+.. doxygenclass:: module
+    :members:
 
-.. class:: module : public object
-
-.. function:: module::module(const char *name, const char *doc = nullptr)
-
-    Create a new top-level Python module with the given name and docstring
-
-.. function:: module module::def_submodule(const char *name, const char *doc = nullptr)
-
-    Create and return a new Python submodule with the given name and docstring.
-    This also works recursively, i.e.
-
-    .. code-block:: cpp
-
-        pybind11::module m("example", "pybind11 example plugin");
-        pybind11::module m2 = m.def_submodule("sub", "A submodule of 'example'");
-        pybind11::module m3 = m2.def_submodule("subsub", "A submodule of 'example.sub'");
-
-.. cpp:function:: template <typename Func, typename ... Extra> module& module::def(const char *name, Func && f, Extra && ... extra)
-
-    Create Python binding for a new function within the module scope. ``Func``
-    can be a plain C++ function, a function pointer, or a lambda function. For
-    details on the ``Extra&& ... extra`` argument, see section :ref:`extras`.
+.. doxygengroup:: pytypes
+    :members:
 
 .. _extras:
 
-Passing extra arguments to the def function
-===========================================
+Passing extra arguments to ``def`` or ``class_``
+================================================
+
+.. doxygengroup:: annotations
+    :members:
 
-.. class:: arg
+Embedding the interpreter
+=========================
 
-.. function:: arg::arg(const char *name)
+.. doxygendefine:: PYBIND11_EMBEDDED_MODULE
 
-.. function:: template <typename T> arg_v arg::operator=(T &&value)
+.. doxygenfunction:: initialize_interpreter
 
-.. class:: arg_v : public arg
+.. doxygenfunction:: finalize_interpreter
 
-    Represents a named argument with a default value
+.. doxygenclass:: scoped_interpreter
 
-.. class:: sibling
+Redirecting C++ streams
+=======================
 
-    Used to specify a handle to an existing sibling function; used internally
-    to implement function overloading in :func:`module::def` and
-    :func:`class_::def`.
+.. doxygenclass:: scoped_ostream_redirect
 
-.. function:: sibling::sibling(handle handle)
+.. doxygenclass:: scoped_estream_redirect
 
-.. class doc
+.. doxygenfunction:: add_ostream_redirect
 
-    This is class is internally used by pybind11.
+Python built-in functions
+=========================
 
-.. function:: doc::doc(const char *value)
+.. doxygengroup:: python_builtins
+    :members:
 
-    Create a new docstring with the specified value
+Exceptions
+==========
 
-.. class name
+.. doxygenclass:: error_already_set
+    :members:
 
-    This is class is internally used by pybind11.
+.. doxygenclass:: builtin_exception
+    :members:
 
-.. function:: name::name(const char *value)
 
-    Used to specify the function name
+Literals
+========
 
+.. doxygennamespace:: literals
diff --git a/pybind11/docs/release.rst b/pybind11/docs/release.rst
index 30d159a6f..b31bbe97e 100644
--- a/pybind11/docs/release.rst
+++ b/pybind11/docs/release.rst
@@ -2,7 +2,7 @@ To release a new version of pybind11:
 
 - Update the version number and push to pypi
     - Update ``pybind11/_version.py`` (set release version, remove 'dev').
-    - Update ``PYBIND11_VERSION_MAJOR`` etc. in ``include/pybind11/common.h``.
+    - Update ``PYBIND11_VERSION_MAJOR`` etc. in ``include/pybind11/detail/common.h``.
     - Ensure that all the information in ``setup.py`` is up-to-date.
     - Update version in ``docs/conf.py``.
     - Tag release date in ``docs/changelog.rst``.
@@ -14,8 +14,9 @@ To release a new version of pybind11:
     - ``python setup.py sdist upload``.
     - ``python setup.py bdist_wheel upload``.
 - Update conda-forge (https://github.com/conda-forge/pybind11-feedstock) via PR
-    - change version number in ``recipe/meta.yml``
-    - update checksum to match the one computed by pypi
+    - download release package from GitHub: ``wget https://github.com/pybind/pybind11/archive/vX.Y.Z.tar.gz``
+    - compute checksum: ``shasum -a 256  vX.Y.Z.tar.gz``
+    - change version number and checksum in ``recipe/meta.yml``
 - Get back to work
     - Update ``_version.py`` (add 'dev' and increment minor).
     - Update version in ``docs/conf.py``
diff --git a/pybind11/docs/requirements.txt b/pybind11/docs/requirements.txt
new file mode 100644
index 000000000..3818fe80e
--- /dev/null
+++ b/pybind11/docs/requirements.txt
@@ -0,0 +1 @@
+breathe == 4.5.0
diff --git a/pybind11/docs/upgrade.rst b/pybind11/docs/upgrade.rst
new file mode 100644
index 000000000..3f5697391
--- /dev/null
+++ b/pybind11/docs/upgrade.rst
@@ -0,0 +1,404 @@
+Upgrade guide
+#############
+
+This is a companion guide to the :doc:`changelog`. While the changelog briefly
+lists all of the new features, improvements and bug fixes, this upgrade guide
+focuses only on the subset which directly impacts your experience when upgrading
+to a new version. But it goes into more detail. This includes things like
+deprecated APIs and their replacements, build system changes, general code
+modernization and other useful information.
+
+
+v2.2
+====
+
+Deprecation of the ``PYBIND11_PLUGIN`` macro
+--------------------------------------------
+
+``PYBIND11_MODULE`` is now the preferred way to create module entry points.
+The old macro emits a compile-time deprecation warning.
+
+.. code-block:: cpp
+
+    // old
+    PYBIND11_PLUGIN(example) {
+        py::module m("example", "documentation string");
+
+        m.def("add", [](int a, int b) { return a + b; });
+
+        return m.ptr();
+    }
+
+    // new
+    PYBIND11_MODULE(example, m) {
+        m.doc() = "documentation string"; // optional
+
+        m.def("add", [](int a, int b) { return a + b; });
+    }
+
+
+New API for defining custom constructors and pickling functions
+---------------------------------------------------------------
+
+The old placement-new custom constructors have been deprecated. The new approach
+uses ``py::init()`` and factory functions to greatly improve type safety.
+
+Placement-new can be called accidentally with an incompatible type (without any
+compiler errors or warnings), or it can initialize the same object multiple times
+if not careful with the Python-side ``__init__`` calls. The new-style custom
+constructors prevent such mistakes. See :ref:`custom_constructors` for details.
+
+.. code-block:: cpp
+
+    // old -- deprecated (runtime warning shown only in debug mode)
+    py::class_<Foo>(m, "Foo")
+        .def("__init__", [](Foo &self, ...) {
+            new (&self) Foo(...); // uses placement-new
+        });
+
+    // new
+    py::class_<Foo>(m, "Foo")
+        .def(py::init([](...) { // Note: no `self` argument
+            return new Foo(...); // return by raw pointer
+            // or: return std::make_unique<Foo>(...); // return by holder
+            // or: return Foo(...); // return by value (move constructor)
+        }));
+
+Mirroring the custom constructor changes, ``py::pickle()`` is now the preferred
+way to get and set object state. See :ref:`pickling` for details.
+
+.. code-block:: cpp
+
+    // old -- deprecated (runtime warning shown only in debug mode)
+    py::class_<Foo>(m, "Foo")
+        ...
+        .def("__getstate__", [](const Foo &self) {
+            return py::make_tuple(self.value1(), self.value2(), ...);
+        })
+        .def("__setstate__", [](Foo &self, py::tuple t) {
+            new (&self) Foo(t[0].cast<std::string>(), ...);
+        });
+
+    // new
+    py::class_<Foo>(m, "Foo")
+        ...
+        .def(py::pickle(
+            [](const Foo &self) { // __getstate__
+                return py::make_tuple(self.value1(), self.value2(), ...); // unchanged
+            },
+            [](py::tuple t) { // __setstate__, note: no `self` argument
+                return new Foo(t[0].cast<std::string>(), ...);
+                // or: return std::make_unique<Foo>(...); // return by holder
+                // or: return Foo(...); // return by value (move constructor)
+            }
+        ));
+
+For both the constructors and pickling, warnings are shown at module
+initialization time (on import, not when the functions are called).
+They're only visible when compiled in debug mode. Sample warning:
+
+.. code-block:: none
+
+    pybind11-bound class 'mymodule.Foo' is using an old-style placement-new '__init__'
+    which has been deprecated. See the upgrade guide in pybind11's docs.
+
+
+Stricter enforcement of hidden symbol visibility for pybind11 modules
+---------------------------------------------------------------------
+
+pybind11 now tries to actively enforce hidden symbol visibility for modules.
+If you're using either one of pybind11's :doc:`CMake or Python build systems
+<compiling>` (the two example repositories) and you haven't been exporting any
+symbols, there's nothing to be concerned about. All the changes have been done
+transparently in the background. If you were building manually or relied on
+specific default visibility, read on.
+
+Setting default symbol visibility to *hidden* has always been recommended for
+pybind11 (see :ref:`faq:symhidden`). On Linux and macOS, hidden symbol
+visibility (in conjunction with the ``strip`` utility) yields much smaller
+module binaries. `CPython's extension docs`_ also recommend hiding symbols
+by default, with the goal of avoiding symbol name clashes between modules.
+Starting with v2.2, pybind11 enforces this more strictly: (1) by declaring
+all symbols inside the ``pybind11`` namespace as hidden and (2) by including
+the ``-fvisibility=hidden`` flag on Linux and macOS (only for extension
+modules, not for embedding the interpreter).
+
+.. _CPython's extension docs: https://docs.python.org/3/extending/extending.html#providing-a-c-api-for-an-extension-module
+
+The namespace-scope hidden visibility is done automatically in pybind11's
+headers and it's generally transparent to users. It ensures that:
+
+* Modules compiled with different pybind11 versions don't clash with each other.
+
+* Some new features, like ``py::module_local`` bindings, can work as intended.
+
+The ``-fvisibility=hidden`` flag applies the same visibility to user bindings
+outside of the ``pybind11`` namespace. It's now set automatically by pybind11's
+CMake and Python build systems, but this needs to be done manually by users
+of other build systems. Adding this flag:
+
+* Minimizes the chances of symbol conflicts between modules. E.g. if two
+  unrelated modules were statically linked to different (ABI-incompatible)
+  versions of the same third-party library, a symbol clash would be likely
+  (and would end with unpredictable results).
+
+* Produces smaller binaries on Linux and macOS, as pointed out previously.
+
+Within pybind11's CMake build system, ``pybind11_add_module`` has always been
+setting the ``-fvisibility=hidden`` flag in release mode. From now on, it's
+being applied unconditionally, even in debug mode and it can no longer be opted
+out of with the ``NO_EXTRAS`` option. The ``pybind11::module`` target now also
+adds this flag to its interface. The ``pybind11::embed`` target is unchanged.
+
+The most significant change here is for the ``pybind11::module`` target. If you
+were previously relying on default visibility, i.e. if your Python module was
+doubling as a shared library with dependents, you'll need to either export
+symbols manually (recommended for cross-platform libraries) or factor out the
+shared library (and have the Python module link to it like the other
+dependents). As a temporary workaround, you can also restore default visibility
+using the CMake code below, but this is not recommended in the long run:
+
+.. code-block:: cmake
+
+    target_link_libraries(mymodule PRIVATE pybind11::module)
+
+    add_library(restore_default_visibility INTERFACE)
+    target_compile_options(restore_default_visibility INTERFACE -fvisibility=default)
+    target_link_libraries(mymodule PRIVATE restore_default_visibility)
+
+
+Local STL container bindings
+----------------------------
+
+Previous pybind11 versions could only bind types globally -- all pybind11
+modules, even unrelated ones, would have access to the same exported types.
+However, this would also result in a conflict if two modules exported the
+same C++ type, which is especially problematic for very common types, e.g.
+``std::vector<int>``. :ref:`module_local` were added to resolve this (see
+that section for a complete usage guide).
+
+``py::class_`` still defaults to global bindings (because these types are
+usually unique across modules), however in order to avoid clashes of opaque
+types, ``py::bind_vector`` and ``py::bind_map`` will now bind STL containers
+as ``py::module_local`` if their elements are: builtins (``int``, ``float``,
+etc.), not bound using ``py::class_``, or bound as ``py::module_local``. For
+example, this change allows multiple modules to bind ``std::vector<int>``
+without causing conflicts. See :ref:`stl_bind` for more details.
+
+When upgrading to this version, if you have multiple modules which depend on
+a single global binding of an STL container, note that all modules can still
+accept foreign ``py::module_local`` types in the direction of Python-to-C++.
+The locality only affects the C++-to-Python direction. If this is needed in
+multiple modules, you'll need to either:
+
+* Add a copy of the same STL binding to all of the modules which need it.
+
+* Restore the global status of that single binding by marking it
+  ``py::module_local(false)``.
+
+The latter is an easy workaround, but in the long run it would be best to
+localize all common type bindings in order to avoid conflicts with
+third-party modules.
+
+
+Negative strides for Python buffer objects and numpy arrays
+-----------------------------------------------------------
+
+Support for negative strides required changing the integer type from unsigned
+to signed in the interfaces of ``py::buffer_info`` and ``py::array``. If you
+have compiler warnings enabled, you may notice some new conversion warnings
+after upgrading. These can be resolved using ``static_cast``.
+
+
+Deprecation of some ``py::object`` APIs
+---------------------------------------
+
+To compare ``py::object`` instances by pointer, you should now use
+``obj1.is(obj2)`` which is equivalent to ``obj1 is obj2`` in Python.
+Previously, pybind11 used ``operator==`` for this (``obj1 == obj2``), but
+that could be confusing and is now deprecated (so that it can eventually
+be replaced with proper rich object comparison in a future release).
+
+For classes which inherit from ``py::object``, ``borrowed`` and ``stolen``
+were previously available as protected constructor tags. Now the types
+should be used directly instead: ``borrowed_t{}`` and ``stolen_t{}``
+(`#771 <https://github.com/pybind/pybind11/pull/771>`_).
+
+
+Stricter compile-time error checking
+------------------------------------
+
+Some error checks have been moved from run time to compile time. Notably,
+automatic conversion of ``std::shared_ptr<T>`` is not possible when ``T`` is
+not directly registered with ``py::class_<T>`` (e.g. ``std::shared_ptr<int>``
+or ``std::shared_ptr<std::vector<T>>`` are not automatically convertible).
+Attempting to bind a function with such arguments now results in a compile-time
+error instead of waiting to fail at run time.
+
+``py::init<...>()`` constructor definitions are also stricter and now prevent
+bindings which could cause unexpected behavior:
+
+.. code-block:: cpp
+
+    struct Example {
+        Example(int &);
+    };
+
+    py::class_<Example>(m, "Example")
+        .def(py::init<int &>()); // OK, exact match
+        // .def(py::init<int>()); // compile-time error, mismatch
+
+A non-``const`` lvalue reference is not allowed to bind to an rvalue. However,
+note that a constructor taking ``const T &`` can still be registered using
+``py::init<T>()`` because a ``const`` lvalue reference can bind to an rvalue.
+
+v2.1
+====
+
+Minimum compiler versions are enforced at compile time
+------------------------------------------------------
+
+The minimums also apply to v2.0 but the check is now explicit and a compile-time
+error is raised if the compiler does not meet the requirements:
+
+* GCC >= 4.8
+* clang >= 3.3 (appleclang >= 5.0)
+* MSVC >= 2015u3
+* Intel C++ >= 15.0
+
+
+The ``py::metaclass`` attribute is not required for static properties
+---------------------------------------------------------------------
+
+Binding classes with static properties is now possible by default. The
+zero-parameter version of ``py::metaclass()`` is deprecated. However, a new
+one-parameter ``py::metaclass(python_type)`` version was added for rare
+cases when a custom metaclass is needed to override pybind11's default.
+
+.. code-block:: cpp
+
+    // old -- emits a deprecation warning
+    py::class_<Foo>(m, "Foo", py::metaclass())
+        .def_property_readonly_static("foo", ...);
+
+    // new -- static properties work without the attribute
+    py::class_<Foo>(m, "Foo")
+        .def_property_readonly_static("foo", ...);
+
+    // new -- advanced feature, override pybind11's default metaclass
+    py::class_<Bar>(m, "Bar", py::metaclass(custom_python_type))
+        ...
+
+
+v2.0
+====
+
+Breaking changes in ``py::class_``
+----------------------------------
+
+These changes were necessary to make type definitions in pybind11
+future-proof, to support PyPy via its ``cpyext`` mechanism (`#527
+<https://github.com/pybind/pybind11/pull/527>`_), and to improve efficiency
+(`rev. 86d825 <https://github.com/pybind/pybind11/commit/86d825>`_).
+
+1. Declarations of types that provide access via the buffer protocol must
+   now include the ``py::buffer_protocol()`` annotation as an argument to
+   the ``py::class_`` constructor.
+
+   .. code-block:: cpp
+
+       py::class_<Matrix>("Matrix", py::buffer_protocol())
+           .def(py::init<...>())
+           .def_buffer(...);
+
+2. Classes which include static properties (e.g. ``def_readwrite_static()``)
+   must now include the ``py::metaclass()`` attribute. Note: this requirement
+   has since been removed in v2.1. If you're upgrading from 1.x, it's
+   recommended to skip directly to v2.1 or newer.
+
+3. This version of pybind11 uses a redesigned mechanism for instantiating
+   trampoline classes that are used to override virtual methods from within
+   Python. This led to the following user-visible syntax change:
+
+   .. code-block:: cpp
+
+       // old v1.x syntax
+       py::class_<TrampolineClass>("MyClass")
+           .alias<MyClass>()
+           ...
+
+       // new v2.x syntax
+       py::class_<MyClass, TrampolineClass>("MyClass")
+           ...
+
+   Importantly, both the original and the trampoline class are now specified
+   as arguments to the ``py::class_`` template, and the ``alias<..>()`` call
+   is gone. The new scheme has zero overhead in cases when Python doesn't
+   override any functions of the underlying C++ class (see
+   `rev. 86d825 <https://github.com/pybind/pybind11/commit/86d825>`_).
+
+   The class type must be the first template argument given to ``py::class_``
+   while the trampoline can be mixed in arbitrary order with other arguments
+   (see the following section).
+
+
+Deprecation of the ``py::base<T>()`` attribute
+----------------------------------------------
+
+``py::base<T>()`` was deprecated in favor of specifying ``T`` as a template
+argument to ``py::class_``. This new syntax also supports multiple inheritance.
+Note that, while the type being exported must be the first argument in the
+``py::class_<Class, ...>`` template, the order of the following types (bases,
+holder and/or trampoline) is not important.
+
+.. code-block:: cpp
+
+    // old v1.x
+    py::class_<Derived>("Derived", py::base<Base>());
+
+    // new v2.x
+    py::class_<Derived, Base>("Derived");
+
+    // new -- multiple inheritance
+    py::class_<Derived, Base1, Base2>("Derived");
+
+    // new -- apart from `Derived` the argument order can be arbitrary
+    py::class_<Derived, Base1, Holder, Base2, Trampoline>("Derived");
+
+
+Out-of-the-box support for ``std::shared_ptr``
+----------------------------------------------
+
+The relevant type caster is now built in, so it's no longer necessary to
+include a declaration of the form:
+
+.. code-block:: cpp
+
+    PYBIND11_DECLARE_HOLDER_TYPE(T, std::shared_ptr<T>)
+
+Continuing to do so won't cause an error or even a deprecation warning,
+but it's completely redundant.
+
+
+Deprecation of a few ``py::object`` APIs
+----------------------------------------
+
+All of the old-style calls emit deprecation warnings.
+
++---------------------------------------+---------------------------------------------+
+|  Old syntax                           |  New syntax                                 |
++=======================================+=============================================+
+| ``obj.call(args...)``                 | ``obj(args...)``                            |
++---------------------------------------+---------------------------------------------+
+| ``obj.str()``                         | ``py::str(obj)``                            |
++---------------------------------------+---------------------------------------------+
+| ``auto l = py::list(obj); l.check()`` | ``py::isinstance<py::list>(obj)``           |
++---------------------------------------+---------------------------------------------+
+| ``py::object(ptr, true)``             | ``py::reinterpret_borrow<py::object>(ptr)`` |
++---------------------------------------+---------------------------------------------+
+| ``py::object(ptr, false)``            | ``py::reinterpret_steal<py::object>(ptr)``  |
++---------------------------------------+---------------------------------------------+
+| ``if (obj.attr("foo"))``              | ``if (py::hasattr(obj, "foo"))``            |
++---------------------------------------+---------------------------------------------+
+| ``if (obj["bar"])``                   | ``if (obj.contains("bar"))``                |
++---------------------------------------+---------------------------------------------+
diff --git a/pybind11/include/pybind11/attr.h b/pybind11/include/pybind11/attr.h
index 0676d5da6..dce875a6b 100644
--- a/pybind11/include/pybind11/attr.h
+++ b/pybind11/include/pybind11/attr.h
@@ -1,5 +1,5 @@
 /*
-    pybind11/pybind11.h: Infrastructure for processing custom
+    pybind11/attr.h: Infrastructure for processing custom
     type and function attributes
 
     Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
@@ -12,7 +12,10 @@
 
 #include "cast.h"
 
-NAMESPACE_BEGIN(pybind11)
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+/// \addtogroup annotations
+/// @{
 
 /// Annotation for methods
 struct is_method { handle class_; is_method(const handle &c) : class_(c) { } };
@@ -39,7 +42,7 @@ template <typename T> struct base {
 };
 
 /// Keep patient alive while nurse lives
-template <int Nurse, int Patient> struct keep_alive { };
+template <size_t Nurse, size_t Patient> struct keep_alive { };
 
 /// Annotation indicating that a class is involved in a multiple inheritance relationship
 struct multiple_inheritance { };
@@ -51,36 +54,87 @@ struct dynamic_attr { };
 struct buffer_protocol { };
 
 /// Annotation which requests that a special metaclass is created for a type
-struct metaclass { };
+struct metaclass {
+    handle value;
+
+    PYBIND11_DEPRECATED("py::metaclass() is no longer required. It's turned on by default now.")
+    metaclass() {}
+
+    /// Override pybind11's default metaclass
+    explicit metaclass(handle value) : value(value) { }
+};
+
+/// Annotation that marks a class as local to the module:
+struct module_local { const bool value; constexpr module_local(bool v = true) : value(v) { } };
 
 /// Annotation to mark enums as an arithmetic type
 struct arithmetic { };
 
+/** \rst
+    A call policy which places one or more guard variables (``Ts...``) around the function call.
+
+    For example, this definition:
+
+    .. code-block:: cpp
+
+        m.def("foo", foo, py::call_guard<T>());
+
+    is equivalent to the following pseudocode:
+
+    .. code-block:: cpp
+
+        m.def("foo", [](args...) {
+            T scope_guard;
+            return foo(args...); // forwarded arguments
+        });
+ \endrst */
+template <typename... Ts> struct call_guard;
+
+template <> struct call_guard<> { using type = detail::void_type; };
+
+template <typename T>
+struct call_guard<T> {
+    static_assert(std::is_default_constructible<T>::value,
+                  "The guard type must be default constructible");
+
+    using type = T;
+};
+
+template <typename T, typename... Ts>
+struct call_guard<T, Ts...> {
+    struct type {
+        T guard{}; // Compose multiple guard types with left-to-right default-constructor order
+        typename call_guard<Ts...>::type next{};
+    };
+};
+
+/// @} annotations
+
 NAMESPACE_BEGIN(detail)
 /* Forward declarations */
 enum op_id : int;
 enum op_type : int;
 struct undefined_t;
 template <op_id id, op_type ot, typename L = undefined_t, typename R = undefined_t> struct op_;
-template <typename... Args> struct init;
-template <typename... Args> struct init_alias;
-inline void keep_alive_impl(int Nurse, int Patient, handle args, handle ret);
+inline void keep_alive_impl(size_t Nurse, size_t Patient, function_call &call, handle ret);
 
 /// Internal data structure which holds metadata about a keyword argument
 struct argument_record {
     const char *name;  ///< Argument name
     const char *descr; ///< Human-readable version of the argument value
     handle value;      ///< Associated Python object
+    bool convert : 1;  ///< True if the argument is allowed to convert when loading
+    bool none : 1;     ///< True if None is allowed when loading
 
-    argument_record(const char *name, const char *descr, handle value)
-        : name(name), descr(descr), value(value) { }
+    argument_record(const char *name, const char *descr, handle value, bool convert, bool none)
+        : name(name), descr(descr), value(value), convert(convert), none(none) { }
 };
 
 /// Internal data structure which holds metadata about a bound function (signature, overloads, etc.)
 struct function_record {
     function_record()
-        : is_constructor(false), is_stateless(false), is_operator(false),
-          has_args(false), has_kwargs(false), is_method(false) { }
+        : is_constructor(false), is_new_style_constructor(false), is_stateless(false),
+          is_operator(false), has_args(false), has_kwargs(false), is_method(false) { }
 
     /// Function name
     char *name = nullptr; /* why no C++ strings? They generate heavier code.. */
@@ -95,7 +149,7 @@ struct function_record {
     std::vector<argument_record> args;
 
     /// Pointer to lambda function which converts arguments and performs the actual call
-    handle (*impl) (function_record *, handle, handle, handle) = nullptr;
+    handle (*impl) (function_call &) = nullptr;
 
     /// Storage for the wrapped function pointer and captured data, if any
     void *data[3] = { };
@@ -109,6 +163,9 @@ struct function_record {
     /// True if name == '__init__'
     bool is_constructor : 1;
 
+    /// True if this is a new-style `__init__` defined in `detail/init.h`
+    bool is_new_style_constructor : 1;
+
     /// True if this is a stateless function pointer
     bool is_stateless : 1;
 
@@ -124,8 +181,8 @@ struct function_record {
     /// True if this is a method
     bool is_method : 1;
 
-    /// Number of arguments
-    uint16_t nargs;
+    /// Number of arguments (including py::args and/or py::kwargs, if present)
+    std::uint16_t nargs;
 
     /// Python method object
     PyMethodDef *def = nullptr;
@@ -143,8 +200,7 @@ struct function_record {
 /// Special data structure which (temporarily) holds metadata about a bound class
 struct type_record {
     PYBIND11_NOINLINE type_record()
-        : multiple_inheritance(false), dynamic_attr(false),
-          buffer_protocol(false), metaclass(false) { }
+        : multiple_inheritance(false), dynamic_attr(false), buffer_protocol(false), module_local(false) { }
 
     /// Handle to the parent scope
     handle scope;
@@ -158,14 +214,17 @@ struct type_record {
     /// How large is the underlying C++ type?
     size_t type_size = 0;
 
-    /// How large is pybind11::instance<type>?
-    size_t instance_size = 0;
+    /// How large is the type's holder?
+    size_t holder_size = 0;
+
+    /// The global operator new can be overridden with a class-specific variant
+    void *(*operator_new)(size_t) = ::operator new;
 
-    /// Function pointer to class_<..>::init_holder
-    void (*init_holder)(PyObject *, const void *) = nullptr;
+    /// Function pointer to class_<..>::init_instance
+    void (*init_instance)(instance *, const void *) = nullptr;
 
     /// Function pointer to class_<..>::dealloc
-    void (*dealloc)(PyObject *) = nullptr;
+    void (*dealloc)(detail::value_and_holder &) = nullptr;
 
     /// List of base classes of the newly created type
     list bases;
@@ -173,6 +232,9 @@ struct type_record {
     /// Optional docstring
     const char *doc = nullptr;
 
+    /// Custom metaclass (optional)
+    handle metaclass;
+
     /// Multiple inheritance marker
     bool multiple_inheritance : 1;
 
@@ -182,28 +244,49 @@ struct type_record {
     /// Does the class implement the buffer protocol?
     bool buffer_protocol : 1;
 
-    /// Does the class require its own metaclass?
-    bool metaclass : 1;
+    /// Is the default (unique_ptr) holder type used?
+    bool default_holder : 1;
 
-    PYBIND11_NOINLINE void add_base(const std::type_info *base, void *(*caster)(void *)) {
-        auto base_info = detail::get_type_info(*base, false);
+    /// Is the class definition local to the module shared object?
+    bool module_local : 1;
+
+    PYBIND11_NOINLINE void add_base(const std::type_info &base, void *(*caster)(void *)) {
+        auto base_info = detail::get_type_info(base, false);
         if (!base_info) {
-            std::string tname(base->name());
+            std::string tname(base.name());
             detail::clean_type_id(tname);
             pybind11_fail("generic_type: type \"" + std::string(name) +
                           "\" referenced unknown base type \"" + tname + "\"");
         }
 
+        if (default_holder != base_info->default_holder) {
+            std::string tname(base.name());
+            detail::clean_type_id(tname);
+            pybind11_fail("generic_type: type \"" + std::string(name) + "\" " +
+                    (default_holder ? "does not have" : "has") +
+                    " a non-default holder type while its base \"" + tname + "\" " +
+                    (base_info->default_holder ? "does not" : "does"));
+        }
+
         bases.append((PyObject *) base_info->type);
 
         if (base_info->type->tp_dictoffset != 0)
             dynamic_attr = true;
 
         if (caster)
-            base_info->implicit_casts.push_back(std::make_pair(type, caster));
+            base_info->implicit_casts.emplace_back(type, caster);
     }
 };
 
+inline function_call::function_call(function_record &f, handle p) :
+        func(f), parent(p) {
+    args.reserve(f.nargs); // reserve capacity up front for the record's declared argument count
+    args_convert.reserve(f.nargs); // same capacity for the parallel args_convert container
+}
+
+/// Tag for a new-style `__init__` defined in `detail/init.h`
+struct is_new_style_constructor { };
+
 /**
  * Partial template specializations to process custom attributes provided to
  * cpp_function_ and class_. These are either used to initialize the respective
@@ -216,8 +299,8 @@ template <typename T> struct process_attribute_default {
     /// Default implementation: do nothing
     static void init(const T &, function_record *) { }
     static void init(const T &, type_record *) { }
-    static void precall(handle) { }
-    static void postcall(handle, handle) { }
+    static void precall(function_call &) { }
+    static void postcall(function_call &, handle) { }
 };
 
 /// Process an attribute specifying the function's name
@@ -262,12 +345,16 @@ template <> struct process_attribute<is_operator> : process_attribute_default<is
     static void init(const is_operator &, function_record *r) { r->is_operator = true; }
 };
 
+template <> struct process_attribute<is_new_style_constructor> : process_attribute_default<is_new_style_constructor> {
+    static void init(const is_new_style_constructor &, function_record *r) { r->is_new_style_constructor = true; }
+};
+
 /// Process a keyword argument attribute (*without* a default value)
 template <> struct process_attribute<arg> : process_attribute_default<arg> {
     static void init(const arg &a, function_record *r) {
         if (r->is_method && r->args.empty())
-            r->args.emplace_back("self", nullptr, handle());
-        r->args.emplace_back(a.name, nullptr, handle());
+            r->args.emplace_back("self", nullptr, handle(), true /*convert*/, false /*none not allowed*/);
+        r->args.emplace_back(a.name, nullptr, handle(), !a.flag_noconvert, a.flag_none);
     }
 };
 
@@ -275,32 +362,34 @@ template <> struct process_attribute<arg> : process_attribute_default<arg> {
 template <> struct process_attribute<arg_v> : process_attribute_default<arg_v> {
     static void init(const arg_v &a, function_record *r) {
         if (r->is_method && r->args.empty())
-            r->args.emplace_back("self", nullptr, handle());
+            r->args.emplace_back("self", nullptr /*descr*/, handle() /*value*/, true /*convert*/, false /*none not allowed*/);
 
         if (!a.value) {
 #if !defined(NDEBUG)
-            auto descr = "'" + std::string(a.name) + ": " + a.type + "'";
+            std::string descr("'");
+            if (a.name) descr += std::string(a.name) + ": ";
+            descr += a.type + "'";
             if (r->is_method) {
                 if (r->name)
                     descr += " in method '" + (std::string) str(r->scope) + "." + (std::string) r->name + "'";
                 else
                     descr += " in method of '" + (std::string) str(r->scope) + "'";
             } else if (r->name) {
-                descr += " in function named '" + (std::string) r->name + "'";
+                descr += " in function '" + (std::string) r->name + "'";
             }
-            pybind11_fail("arg(): could not convert default keyword argument "
+            pybind11_fail("arg(): could not convert default argument "
                           + descr + " into a Python object (type not registered yet?)");
 #else
-            pybind11_fail("arg(): could not convert default keyword argument "
+            pybind11_fail("arg(): could not convert default argument "
                           "into a Python object (type not registered yet?). "
                           "Compile in debug mode for more information.");
 #endif
         }
-        r->args.emplace_back(a.name, a.descr, a.value.inc_ref());
+        r->args.emplace_back(a.name, a.descr, a.value.inc_ref(), !a.flag_noconvert, a.flag_none);
     }
 };
 
-/// Process a parent class attribute
+/// Process a parent class attribute.  Single inheritance only (class_ itself already guarantees that)
 template <typename T>
 struct process_attribute<T, enable_if_t<is_pyobject<T>::value>> : process_attribute_default<handle> {
     static void init(const handle &h, type_record *r) { r->bases.append(h); }
@@ -309,7 +398,7 @@ struct process_attribute<T, enable_if_t<is_pyobject<T>::value>> : process_attrib
 /// Process a parent class attribute (deprecated, does not support multiple inheritance)
 template <typename T>
 struct process_attribute<base<T>> : process_attribute_default<base<T>> {
-    static void init(const base<T> &, type_record *r) { r->add_base(&typeid(T), nullptr); }
+    static void init(const base<T> &, type_record *r) { r->add_base(typeid(T), nullptr); }
 };
 
 /// Process a multiple inheritance attribute
@@ -330,28 +419,35 @@ struct process_attribute<buffer_protocol> : process_attribute_default<buffer_pro
 
 template <>
 struct process_attribute<metaclass> : process_attribute_default<metaclass> {
-    static void init(const metaclass &, type_record *r) { r->metaclass = true; }
+    static void init(const metaclass &m, type_record *r) { r->metaclass = m.value; }
 };
 
+template <>
+struct process_attribute<module_local> : process_attribute_default<module_local> {
+    static void init(const module_local &l, type_record *r) { r->module_local = l.value; }
+};
 
 /// Process an 'arithmetic' attribute for enums (does nothing here)
 template <>
 struct process_attribute<arithmetic> : process_attribute_default<arithmetic> {};
 
-/***
+template <typename... Ts>
+struct process_attribute<call_guard<Ts...>> : process_attribute_default<call_guard<Ts...>> { };
+
+/**
  * Process a keep_alive call policy -- invokes keep_alive_impl during the
  * pre-call handler if both Nurse, Patient != 0 and use the post-call handler
  * otherwise
  */
-template <int Nurse, int Patient> struct process_attribute<keep_alive<Nurse, Patient>> : public process_attribute_default<keep_alive<Nurse, Patient>> {
-    template <int N = Nurse, int P = Patient, enable_if_t<N != 0 && P != 0, int> = 0>
-    static void precall(handle args) { keep_alive_impl(Nurse, Patient, args, handle()); }
-    template <int N = Nurse, int P = Patient, enable_if_t<N != 0 && P != 0, int> = 0>
-    static void postcall(handle, handle) { }
-    template <int N = Nurse, int P = Patient, enable_if_t<N == 0 || P == 0, int> = 0>
-    static void precall(handle) { }
-    template <int N = Nurse, int P = Patient, enable_if_t<N == 0 || P == 0, int> = 0>
-    static void postcall(handle args, handle ret) { keep_alive_impl(Nurse, Patient, args, ret); }
+template <size_t Nurse, size_t Patient> struct process_attribute<keep_alive<Nurse, Patient>> : public process_attribute_default<keep_alive<Nurse, Patient>> {
+    template <size_t N = Nurse, size_t P = Patient, enable_if_t<N != 0 && P != 0, int> = 0>
+    static void precall(function_call &call) { keep_alive_impl(Nurse, Patient, call, handle()); }
+    template <size_t N = Nurse, size_t P = Patient, enable_if_t<N != 0 && P != 0, int> = 0>
+    static void postcall(function_call &, handle) { }
+    template <size_t N = Nurse, size_t P = Patient, enable_if_t<N == 0 || P == 0, int> = 0>
+    static void precall(function_call &) { }
+    template <size_t N = Nurse, size_t P = Patient, enable_if_t<N == 0 || P == 0, int> = 0>
+    static void postcall(function_call &call, handle ret) { keep_alive_impl(Nurse, Patient, call, ret); }
 };
 
 /// Recursively iterate over variadic template arguments
@@ -364,23 +460,30 @@ template <typename... Args> struct process_attributes {
         int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::init(args, r), 0) ... };
         ignore_unused(unused);
     }
-    static void precall(handle fn_args) {
-        int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::precall(fn_args), 0) ... };
+    static void precall(function_call &call) {
+        int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::precall(call), 0) ... };
         ignore_unused(unused);
     }
-    static void postcall(handle fn_args, handle fn_ret) {
-        int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::postcall(fn_args, fn_ret), 0) ... };
+    static void postcall(function_call &call, handle fn_ret) {
+        int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::postcall(call, fn_ret), 0) ... };
         ignore_unused(unused);
     }
 };
 
+template <typename T>
+using is_call_guard = is_instantiation<call_guard, T>;
+
+/// Extract the ``type`` from the first `call_guard` in `Extra...` (or `void_type` if none found)
+template <typename... Extra>
+using extract_guard_t = typename exactly_one_t<is_call_guard, call_guard<>, Extra...>::type;
+
 /// Check the number of named arguments at compile time
 template <typename... Extra,
           size_t named = constexpr_sum(std::is_base_of<arg, Extra>::value...),
           size_t self  = constexpr_sum(std::is_same<is_method, Extra>::value...)>
-constexpr bool expected_num_args(size_t nargs) {
-    return named == 0 || (self + named) == nargs;
+constexpr bool expected_num_args(size_t nargs, bool has_args, bool has_kwargs) {
+    return named == 0 || (self + named + has_args + has_kwargs) == nargs;
 }
 
 NAMESPACE_END(detail)
-NAMESPACE_END(pybind11)
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/buffer_info.h b/pybind11/include/pybind11/buffer_info.h
new file mode 100644
index 000000000..9f072fa73
--- /dev/null
+++ b/pybind11/include/pybind11/buffer_info.h
@@ -0,0 +1,108 @@
+/*
+    pybind11/buffer_info.h: Python buffer object interface
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "detail/common.h"
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+/// Information record describing a Python buffer object
+struct buffer_info {
+    void *ptr = nullptr;          // Pointer to the underlying storage
+    ssize_t itemsize = 0;         // Size of individual items in bytes
+    ssize_t size = 0;             // Total number of entries
+    std::string format;           // For homogeneous buffers, this should be set to format_descriptor<T>::format()
+    ssize_t ndim = 0;             // Number of dimensions
+    std::vector<ssize_t> shape;   // Shape of the tensor (1 entry per dimension)
+    std::vector<ssize_t> strides; // Number of bytes between adjacent entries (1 entry per dimension)
+
+    buffer_info() { }
+
+    buffer_info(void *ptr, ssize_t itemsize, const std::string &format, ssize_t ndim,
+                detail::any_container<ssize_t> shape_in, detail::any_container<ssize_t> strides_in)
+    : ptr(ptr), itemsize(itemsize), size(1), format(format), ndim(ndim),
+      shape(std::move(shape_in)), strides(std::move(strides_in)) {
+        if (ndim != (ssize_t) shape.size() || ndim != (ssize_t) strides.size())
+            pybind11_fail("buffer_info: ndim doesn't match shape and/or strides length");
+        for (size_t i = 0; i < (size_t) ndim; ++i)
+            size *= shape[i]; // total entry count is the product of all extents
+    }
+
+    template <typename T>
+    buffer_info(T *ptr, detail::any_container<ssize_t> shape_in, detail::any_container<ssize_t> strides_in)
+    : buffer_info(private_ctr_tag(), ptr, sizeof(T), format_descriptor<T>::format(), static_cast<ssize_t>(shape_in->size()), std::move(shape_in), std::move(strides_in)) { }
+
+    buffer_info(void *ptr, ssize_t itemsize, const std::string &format, ssize_t size)
+    : buffer_info(ptr, itemsize, format, 1, {size}, {itemsize}) { } // 1-D buffer: one extent, stride of one item
+
+    template <typename T>
+    buffer_info(T *ptr, ssize_t size)
+    : buffer_info(ptr, sizeof(T), format_descriptor<T>::format(), size) { }
+
+    explicit buffer_info(Py_buffer *view, bool ownview = true)
+    : buffer_info(view->buf, view->itemsize, view->format, view->ndim,
+            {view->shape, view->shape + view->ndim}, {view->strides, view->strides + view->ndim}) {
+        this->view = view;
+        this->ownview = ownview;
+    }
+
+    buffer_info(const buffer_info &) = delete;
+    buffer_info& operator=(const buffer_info &) = delete;
+
+    buffer_info(buffer_info &&other) {
+        (*this) = std::move(other);
+    }
+
+    buffer_info& operator=(buffer_info &&rhs) {
+        ptr = rhs.ptr;
+        itemsize = rhs.itemsize;
+        size = rhs.size;
+        format = std::move(rhs.format);
+        ndim = rhs.ndim;
+        shape = std::move(rhs.shape);
+        strides = std::move(rhs.strides);
+        std::swap(view, rhs.view); // swapped, not copied: rhs's destructor then handles the view *this held before
+        std::swap(ownview, rhs.ownview);
+        return *this;
+    }
+
+    ~buffer_info() {
+        if (view && ownview) { PyBuffer_Release(view); delete view; } // only an owned view is released and freed
+    }
+
+private:
+    struct private_ctr_tag { };
+
+    buffer_info(private_ctr_tag, void *ptr, ssize_t itemsize, const std::string &format, ssize_t ndim,
+                detail::any_container<ssize_t> &&shape_in, detail::any_container<ssize_t> &&strides_in)
+    : buffer_info(ptr, itemsize, format, ndim, std::move(shape_in), std::move(strides_in)) { }
+
+    Py_buffer *view = nullptr; // Py_buffer this record was built from, if any
+    bool ownview = false;      // If true, the destructor releases and deletes `view`
+};
+
+NAMESPACE_BEGIN(detail)
+
+template <typename T, typename SFINAE = void> struct compare_buffer_info {
+    static bool compare(const buffer_info& b) { // generic case: format string and item size must both match T
+        return b.format == format_descriptor<T>::format() && b.itemsize == (ssize_t) sizeof(T);
+    }
+};
+
+template <typename T> struct compare_buffer_info<T, detail::enable_if_t<std::is_integral<T>::value>> {
+    static bool compare(const buffer_info& b) { // integral T: item size must match; several format codes are accepted
+        return (size_t) b.itemsize == sizeof(T) && (b.format == format_descriptor<T>::value ||
+            ((sizeof(T) == sizeof(long)) && b.format == (std::is_unsigned<T>::value ? "L" : "l")) || // same size as long
+            ((sizeof(T) == sizeof(size_t)) && b.format == (std::is_unsigned<T>::value ? "N" : "n"))); // same size as size_t
+    }
+};
+
+NAMESPACE_END(detail)
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/cast.h b/pybind11/include/pybind11/cast.h
index b953cc897..eab904bee 100644
--- a/pybind11/include/pybind11/cast.h
+++ b/pybind11/include/pybind11/cast.h
@@ -11,89 +11,184 @@
 #pragma once
 
 #include "pytypes.h"
-#include "typeid.h"
-#include "descr.h"
+#include "detail/typeid.h"
+#include "detail/descr.h"
+#include "detail/internals.h"
 #include <array>
 #include <limits>
+#include <tuple>
+
+#if defined(PYBIND11_CPP17)
+#  if defined(__has_include)
+#    if __has_include(<string_view>)
+#      define PYBIND11_HAS_STRING_VIEW
+#    endif
+#  elif defined(_MSC_VER)
+#    define PYBIND11_HAS_STRING_VIEW
+#  endif
+#endif
+#ifdef PYBIND11_HAS_STRING_VIEW
+#include <string_view>
+#endif
 
-NAMESPACE_BEGIN(pybind11)
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 NAMESPACE_BEGIN(detail)
 
-/// Additional type information which does not fit into the PyTypeObject
-struct type_info {
-    PyTypeObject *type;
-    size_t type_size;
-    void (*init_holder)(PyObject *, const void *);
-    std::vector<PyObject *(*)(PyObject *, PyTypeObject *)> implicit_conversions;
-    std::vector<std::pair<const std::type_info *, void *(*)(void *)>> implicit_casts;
-    std::vector<bool (*)(PyObject *, void *&)> *direct_conversions;
-    buffer_info *(*get_buffer)(PyObject *, void *) = nullptr;
-    void *get_buffer_data = nullptr;
-    /** A simple type never occurs as a (direct or indirect) parent
-     * of a class that makes use of multiple inheritance */
-    bool simple_type = true;
+/// A life support system for temporary objects created by `type_caster::load()`.
+/// Adding a patient will keep it alive up until the enclosing function returns.
+class loader_life_support {
+public:
+    /// A new patient frame is created when a function is entered
+    loader_life_support() {
+        get_internals().loader_patient_stack.push_back(nullptr);
+    }
+
+    /// ... and destroyed after it returns
+    ~loader_life_support() {
+        auto &stack = get_internals().loader_patient_stack;
+        if (stack.empty())
+            pybind11_fail("loader_life_support: internal error");
+
+        auto ptr = stack.back();
+        stack.pop_back();
+        Py_CLEAR(ptr);
+
+        // A heuristic to reduce the stack's capacity (e.g. after long recursive calls)
+        if (stack.capacity() > 16 && stack.size() != 0 && stack.capacity() / stack.size() > 2)
+            stack.shrink_to_fit();
+    }
+
+    /// This can only be used inside a pybind11-bound function, either by `argument_loader`
+    /// at argument preparation time or by `py::cast()` at execution time.
+    PYBIND11_NOINLINE static void add_patient(handle h) {
+        auto &stack = get_internals().loader_patient_stack;
+        if (stack.empty())
+            throw cast_error("When called outside a bound function, py::cast() cannot "
+                             "do Python -> C++ conversions which require the creation "
+                             "of temporary values");
+
+        auto &list_ptr = stack.back();
+        if (list_ptr == nullptr) {
+            list_ptr = PyList_New(1);
+            if (!list_ptr)
+                pybind11_fail("loader_life_support: error allocating list");
+            PyList_SET_ITEM(list_ptr, 0, h.inc_ref().ptr());
+        } else {
+            auto result = PyList_Append(list_ptr, h.ptr());
+            if (result == -1)
+                pybind11_fail("loader_life_support: error adding patient");
+        }
+    }
 };
 
-PYBIND11_NOINLINE inline internals &get_internals() {
-    static internals *internals_ptr = nullptr;
-    if (internals_ptr)
-        return *internals_ptr;
-    handle builtins(PyEval_GetBuiltins());
-    const char *id = PYBIND11_INTERNALS_ID;
-    if (builtins.contains(id) && isinstance<capsule>(builtins[id])) {
-        internals_ptr = capsule(builtins[id]);
-    } else {
-        internals_ptr = new internals();
-        #if defined(WITH_THREAD)
-            PyEval_InitThreads();
-            PyThreadState *tstate = PyThreadState_Get();
-            internals_ptr->tstate = PyThread_create_key();
-            PyThread_set_key_value(internals_ptr->tstate, tstate);
-            internals_ptr->istate = tstate->interp;
-        #endif
-        builtins[id] = capsule(internals_ptr);
-        internals_ptr->registered_exception_translators.push_front(
-            [](std::exception_ptr p) -> void {
-                try {
-                    if (p) std::rethrow_exception(p);
-                } catch (error_already_set &e)           { e.restore();                                    return;
-                } catch (const builtin_exception &e)     { e.set_error();                                  return;
-                } catch (const std::bad_alloc &e)        { PyErr_SetString(PyExc_MemoryError,   e.what()); return;
-                } catch (const std::domain_error &e)     { PyErr_SetString(PyExc_ValueError,    e.what()); return;
-                } catch (const std::invalid_argument &e) { PyErr_SetString(PyExc_ValueError,    e.what()); return;
-                } catch (const std::length_error &e)     { PyErr_SetString(PyExc_ValueError,    e.what()); return;
-                } catch (const std::out_of_range &e)     { PyErr_SetString(PyExc_IndexError,    e.what()); return;
-                } catch (const std::range_error &e)      { PyErr_SetString(PyExc_ValueError,    e.what()); return;
-                } catch (const std::exception &e)        { PyErr_SetString(PyExc_RuntimeError,  e.what()); return;
-                } catch (...) {
-                    PyErr_SetString(PyExc_RuntimeError, "Caught an unknown exception!");
-                    return;
+// Gets the cache entry for the given type, creating it if necessary.  The return value is the pair
+// returned by emplace, i.e. an iterator for the entry and a bool set to `true` if the entry was
+// just created.
+inline std::pair<decltype(internals::registered_types_py)::iterator, bool> all_type_info_get_cache(PyTypeObject *type);
+
+// Populates a just-created cache entry.
+PYBIND11_NOINLINE inline void all_type_info_populate(PyTypeObject *t, std::vector<type_info *> &bases) {
+    std::vector<PyTypeObject *> check;
+    for (handle parent : reinterpret_borrow<tuple>(t->tp_bases))
+        check.push_back((PyTypeObject *) parent.ptr());
+
+    auto const &type_dict = get_internals().registered_types_py;
+    for (size_t i = 0; i < check.size(); i++) {
+        auto type = check[i];
+        // Ignore Python2 old-style class super type:
+        if (!PyType_Check((PyObject *) type)) continue;
+
+        // Check `type` in the current set of registered python types:
+        auto it = type_dict.find(type);
+        if (it != type_dict.end()) {
+            // We found a cache entry for it, so it's either pybind-registered or has pre-computed
+            // pybind bases, but we have to make sure we haven't already seen the type(s) before: we
+            // want to follow Python/virtual C++ rules that there should only be one instance of a
+            // common base.
+            for (auto *tinfo : it->second) {
+                // NB: Could use a second set here, rather than doing a linear search, but since
+                // having a large number of immediate pybind11-registered types seems fairly
+                // unlikely, that probably isn't worthwhile.
+                bool found = false;
+                for (auto *known : bases) {
+                    if (known == tinfo) { found = true; break; }
                 }
+                if (!found) bases.push_back(tinfo);
             }
-        );
+        }
+        else if (type->tp_bases) {
+            // It's some python type, so keep following its base classes to look for one or more
+            // registered types
+            if (i + 1 == check.size()) {
+                // When we're at the end, we can pop off the current element to avoid growing
+                // `check` when adding just one base (which is typical--i.e. when there is no
+                // multiple inheritance)
+                check.pop_back();
+                i--;
+            }
+            for (handle parent : reinterpret_borrow<tuple>(type->tp_bases))
+                check.push_back((PyTypeObject *) parent.ptr());
+        }
     }
-    return *internals_ptr;
 }
 
+/**
+ * Extracts vector of type_info pointers of pybind-registered roots of the given Python type.  Will
+ * be just 1 pybind type for the Python type of a pybind-registered class, or for any Python-side
+ * derived class that uses single inheritance.  Will contain as many types as required for a Python
+ * class that uses multiple inheritance to inherit (directly or indirectly) from multiple
+ * pybind-registered classes.  Will be empty if neither the type nor any base classes are
+ * pybind-registered.
+ *
+ * The value is cached for the lifetime of the Python type.
+ */
+inline const std::vector<detail::type_info *> &all_type_info(PyTypeObject *type) {
+    auto ins = all_type_info_get_cache(type);
+    if (ins.second)
+        // New cache entry: populate it
+        all_type_info_populate(type, ins.first->second);
+
+    return ins.first->second;
+}
+
+/**
+ * Gets a single pybind11 type info for a python type.  Returns nullptr if neither the type nor any
+ * ancestors are pybind11-registered.  Throws an exception if there are multiple bases--use
+ * `all_type_info` instead if you want to support multiple bases.
+ */
 PYBIND11_NOINLINE inline detail::type_info* get_type_info(PyTypeObject *type) {
-    auto const &type_dict = get_internals().registered_types_py;
-    do {
-        auto it = type_dict.find(type);
-        if (it != type_dict.end())
-            return (detail::type_info *) it->second;
-        type = type->tp_base;
-        if (!type)
-            return nullptr;
-    } while (true);
+    auto &bases = all_type_info(type);
+    if (bases.size() == 0)
+        return nullptr;
+    if (bases.size() > 1)
+        pybind11_fail("pybind11::detail::get_type_info: type has multiple pybind11-registered bases");
+    return bases.front();
 }
 
-PYBIND11_NOINLINE inline detail::type_info *get_type_info(const std::type_info &tp,
-                                                          bool throw_if_missing = false) {
-    auto &types = get_internals().registered_types_cpp;
+inline detail::type_info *get_local_type_info(const std::type_index &tp) {
+    auto &locals = registered_local_types_cpp();
+    auto it = locals.find(tp);
+    if (it != locals.end())
+        return it->second;
+    return nullptr;
+}
 
-    auto it = types.find(std::type_index(tp));
+inline detail::type_info *get_global_type_info(const std::type_index &tp) {
+    auto &types = get_internals().registered_types_cpp;
+    auto it = types.find(tp);
     if (it != types.end())
-        return (detail::type_info *) it->second;
+        return it->second;
+    return nullptr;
+}
+
+/// Return the type info for a given C++ type; on lookup failure can either throw or return nullptr.
+PYBIND11_NOINLINE inline detail::type_info *get_type_info(const std::type_index &tp,
+                                                          bool throw_if_missing = false) {
+    if (auto ltype = get_local_type_info(tp))
+        return ltype;
+    if (auto gtype = get_global_type_info(tp))
+        return gtype;
+
     if (throw_if_missing) {
         std::string tname = tp.name();
         detail::clean_type_id(tname);
@@ -107,6 +202,199 @@ PYBIND11_NOINLINE inline handle get_type_handle(const std::type_info &tp, bool t
     return handle(type_info ? ((PyObject *) type_info->type) : nullptr);
 }
 
+struct value_and_holder {
+    instance *inst;
+    size_t index;
+    const detail::type_info *type;
+    void **vh;
+
+    // Main constructor for a found value/holder:
+    value_and_holder(instance *i, const detail::type_info *type, size_t vpos, size_t index) :
+        inst{i}, index{index}, type{type},
+        vh{inst->simple_layout ? inst->simple_value_holder : &inst->nonsimple.values_and_holders[vpos]}
+    {}
+
+    // Default constructor (used to signal a value-and-holder not found by get_value_and_holder())
+    value_and_holder() : inst{nullptr} {}
+
+    // Used for past-the-end iterator
+    value_and_holder(size_t index) : index{index} {}
+
+    template <typename V = void> V *&value_ptr() const {
+        return reinterpret_cast<V *&>(vh[0]);
+    }
+    // True if this `value_and_holder` has a non-null value pointer
+    explicit operator bool() const { return value_ptr(); }
+
+    template <typename H> H &holder() const {
+        return reinterpret_cast<H &>(vh[1]);
+    }
+    bool holder_constructed() const {
+        return inst->simple_layout
+            ? inst->simple_holder_constructed
+            : inst->nonsimple.status[index] & instance::status_holder_constructed;
+    }
+    void set_holder_constructed(bool v = true) {
+        if (inst->simple_layout)
+            inst->simple_holder_constructed = v;
+        else if (v)
+            inst->nonsimple.status[index] |= instance::status_holder_constructed;
+        else
+            inst->nonsimple.status[index] &= (uint8_t) ~instance::status_holder_constructed;
+    }
+    bool instance_registered() const {
+        return inst->simple_layout
+            ? inst->simple_instance_registered
+            : inst->nonsimple.status[index] & instance::status_instance_registered;
+    }
+    void set_instance_registered(bool v = true) {
+        if (inst->simple_layout)
+            inst->simple_instance_registered = v;
+        else if (v)
+            inst->nonsimple.status[index] |= instance::status_instance_registered;
+        else
+            inst->nonsimple.status[index] &= (uint8_t) ~instance::status_instance_registered;
+    }
+};
+
+// Container for accessing and iterating over an instance's values/holders
+struct values_and_holders {
+private:
+    instance *inst;
+    using type_vec = std::vector<detail::type_info *>;
+    const type_vec &tinfo;
+
+public:
+    values_and_holders(instance *inst) : inst{inst}, tinfo(all_type_info(Py_TYPE(inst))) {}
+
+    struct iterator {
+    private:
+        instance *inst;
+        const type_vec *types;
+        value_and_holder curr;
+        friend struct values_and_holders;
+        iterator(instance *inst, const type_vec *tinfo)
+            : inst{inst}, types{tinfo},
+            curr(inst /* instance */,
+                 types->empty() ? nullptr : (*types)[0] /* type info */,
+                 0, /* vpos: (non-simple types only): the first vptr comes first */
+                 0 /* index */)
+        {}
+        // Past-the-end iterator:
+        iterator(size_t end) : curr(end) {}
+    public:
+        bool operator==(const iterator &other) { return curr.index == other.curr.index; }
+        bool operator!=(const iterator &other) { return curr.index != other.curr.index; }
+        iterator &operator++() {
+            if (!inst->simple_layout)
+                curr.vh += 1 + (*types)[curr.index]->holder_size_in_ptrs;
+            ++curr.index;
+            curr.type = curr.index < types->size() ? (*types)[curr.index] : nullptr;
+            return *this;
+        }
+        value_and_holder &operator*() { return curr; }
+        value_and_holder *operator->() { return &curr; }
+    };
+
+    iterator begin() { return iterator(inst, &tinfo); }
+    iterator end() { return iterator(tinfo.size()); }
+
+    iterator find(const type_info *find_type) {
+        auto it = begin(), endit = end();
+        while (it != endit && it->type != find_type) ++it;
+        return it;
+    }
+
+    size_t size() { return tinfo.size(); }
+};
+
+/**
+ * Extracts C++ value and holder pointer references from an instance (which may contain multiple
+ * values/holders for python-side multiple inheritance) that match the given type.  If the type is
+ * not a pybind11 base of the given instance, throws an error (or, when `throw_if_missing` is
+ * false, returns a default-constructed value_and_holder).  If `find_type` is omitted (or nullptr)
+ * the first value/holder are returned, regardless of type (and the resulting .type is nullptr).
+ *
+ * The returned object should be short-lived: in particular, it must not outlive the called-upon
+ * instance.
+ */
+PYBIND11_NOINLINE inline value_and_holder instance::get_value_and_holder(const type_info *find_type /*= nullptr default in common.h*/, bool throw_if_missing /*= true in common.h*/) {
+    // Optimize common case:
+    if (!find_type || Py_TYPE(this) == find_type->type)
+        return value_and_holder(this, find_type, 0, 0);
+
+    detail::values_and_holders vhs(this);
+    auto it = vhs.find(find_type);
+    if (it != vhs.end())
+        return *it;
+
+    if (!throw_if_missing)
+        return value_and_holder();
+
+#if defined(NDEBUG)
+    pybind11_fail("pybind11::detail::instance::get_value_and_holder: "
+            "type is not a pybind11 base of the given instance "
+            "(compile in debug mode for type details)");
+#else
+    pybind11_fail("pybind11::detail::instance::get_value_and_holder: `" +
+            std::string(find_type->type->tp_name) + "' is not a pybind11 base of the given `" +
+            std::string(Py_TYPE(this)->tp_name) + "' instance");
+#endif
+}
+
+PYBIND11_NOINLINE inline void instance::allocate_layout() {
+    auto &tinfo = all_type_info(Py_TYPE(this));
+
+    const size_t n_types = tinfo.size();
+
+    if (n_types == 0)
+        pybind11_fail("instance allocation failed: new instance has no pybind11-registered base types");
+
+    simple_layout =
+        n_types == 1 && tinfo.front()->holder_size_in_ptrs <= instance_simple_holder_in_ptrs();
+
+    // Simple path: no python-side multiple inheritance, and a small-enough holder
+    if (simple_layout) {
+        simple_value_holder[0] = nullptr;
+        simple_holder_constructed = false;
+        simple_instance_registered = false;
+    }
+    else { // multiple base types or a too-large holder
+        // Allocate space to hold: [v1*][h1][v2*][h2]...[bb...] where [vN*] is a value pointer,
+        // [hN] is the (uninitialized) holder instance for value N, and [bb...] is a set of status
+        // bytes (one per value) holding the holder_constructed and instance_registered flags.  Each
+        // [block] is padded, if necessary, to an integer multiple of sizeof(void *).
+        size_t space = 0;
+        for (auto t : tinfo) {
+            space += 1; // value pointer
+            space += t->holder_size_in_ptrs; // holder instance
+        }
+        size_t flags_at = space;
+        space += size_in_ptrs(n_types); // status bytes (holder_constructed and instance_registered)
+
+        // Allocate space for flags, values, and holders, and initialize it to 0 (flags and values,
+        // in particular, need to be 0).  Use Python's memory allocation functions: in Python 3.6
+        // they default to using pymalloc, which is designed to be efficient for small allocations
+        // like the one we're doing here; in earlier versions (and for larger allocations) they are
+        // just wrappers around malloc.
+#if PY_VERSION_HEX >= 0x03050000
+        nonsimple.values_and_holders = (void **) PyMem_Calloc(space, sizeof(void *));
+        if (!nonsimple.values_and_holders) throw std::bad_alloc();
+#else
+        nonsimple.values_and_holders = (void **) PyMem_New(void *, space);
+        if (!nonsimple.values_and_holders) throw std::bad_alloc();
+        std::memset(nonsimple.values_and_holders, 0, space * sizeof(void *));
+#endif
+        nonsimple.status = reinterpret_cast<uint8_t *>(&nonsimple.values_and_holders[flags_at]);
+    }
+    owned = true;
+}
+
+PYBIND11_NOINLINE inline void instance::deallocate_layout() {
+    if (!simple_layout)
+        PyMem_Free(nonsimple.values_and_holders);
+}
+
 PYBIND11_NOINLINE inline bool isinstance_generic(handle obj, const std::type_info &tp) {
     handle type = detail::get_type_handle(tp, false);
     if (!type)
@@ -155,7 +443,6 @@ PYBIND11_NOINLINE inline std::string error_string() {
                 handle(frame->f_code->co_name).cast<std::string>() + "\n";
             frame = frame->f_back;
         }
-        trace = trace->tb_next;
     }
 #endif
 
@@ -166,9 +453,10 @@ PYBIND11_NOINLINE inline handle get_object_handle(const void *ptr, const detail:
     auto &instances = get_internals().registered_instances;
     auto range = instances.equal_range(ptr);
     for (auto it = range.first; it != range.second; ++it) {
-        auto instance_type = detail::get_type_info(Py_TYPE(it->second));
-        if (instance_type && instance_type == type)
-            return handle((PyObject *) it->second);
+        for (auto vh : values_and_holders(it->second)) {
+            if (vh.type == type)
+                return handle((PyObject *) it->second);
+        }
     }
     return handle();
 }
@@ -187,136 +475,63 @@ inline PyThreadState *get_thread_state_unchecked() {
 #endif
 }
 
-// Forward declaration
+// Forward declarations
 inline void keep_alive_impl(handle nurse, handle patient);
+inline PyObject *make_new_instance(PyTypeObject *type);
 
 class type_caster_generic {
 public:
     PYBIND11_NOINLINE type_caster_generic(const std::type_info &type_info)
-     : typeinfo(get_type_info(type_info)) { }
+        : typeinfo(get_type_info(type_info)), cpptype(&type_info) { }
 
-    PYBIND11_NOINLINE bool load(handle src, bool convert) {
-        if (!src)
-            return false;
-        return load(src, convert, Py_TYPE(src.ptr()));
-    }
+    type_caster_generic(const type_info *typeinfo)
+        : typeinfo(typeinfo), cpptype(typeinfo ? typeinfo->cpptype : nullptr) { }
 
-    bool load(handle src, bool convert, PyTypeObject *tobj) {
-        if (!src || !typeinfo)
-            return false;
-        if (src.is_none()) {
-            value = nullptr;
-            return true;
-        }
-
-        if (typeinfo->simple_type) { /* Case 1: no multiple inheritance etc. involved */
-            /* Check if we can safely perform a reinterpret-style cast */
-            if (PyType_IsSubtype(tobj, typeinfo->type)) {
-                value = reinterpret_cast<instance<void> *>(src.ptr())->value;
-                return true;
-            }
-        } else { /* Case 2: multiple inheritance */
-            /* Check if we can safely perform a reinterpret-style cast */
-            if (tobj == typeinfo->type) {
-                value = reinterpret_cast<instance<void> *>(src.ptr())->value;
-                return true;
-            }
-
-            /* If this is a python class, also check the parents recursively */
-            auto const &type_dict = get_internals().registered_types_py;
-            bool new_style_class = PyType_Check((PyObject *) tobj);
-            if (type_dict.find(tobj) == type_dict.end() && new_style_class && tobj->tp_bases) {
-                auto parents = reinterpret_borrow<tuple>(tobj->tp_bases);
-                for (handle parent : parents) {
-                    bool result = load(src, convert, (PyTypeObject *) parent.ptr());
-                    if (result)
-                        return true;
-                }
-            }
-
-            /* Try implicit casts */
-            for (auto &cast : typeinfo->implicit_casts) {
-                type_caster_generic sub_caster(*cast.first);
-                if (sub_caster.load(src, convert)) {
-                    value = cast.second(sub_caster.value);
-                    return true;
-                }
-            }
-        }
-
-        /* Perform an implicit conversion */
-        if (convert) {
-            for (auto &converter : typeinfo->implicit_conversions) {
-                temp = reinterpret_steal<object>(converter(src.ptr(), typeinfo->type));
-                if (load(temp, false))
-                    return true;
-            }
-            for (auto &converter : *typeinfo->direct_conversions) {
-                if (converter(src.ptr(), value))
-                    return true;
-            }
-        }
-        return false;
+    bool load(handle src, bool convert) {
+        return load_impl<type_caster_generic>(src, convert);
     }
 
     PYBIND11_NOINLINE static handle cast(const void *_src, return_value_policy policy, handle parent,
-                                         const std::type_info *type_info,
-                                         const std::type_info *type_info_backup,
+                                         const detail::type_info *tinfo,
                                          void *(*copy_constructor)(const void *),
                                          void *(*move_constructor)(const void *),
                                          const void *existing_holder = nullptr) {
-        void *src = const_cast<void *>(_src);
-        if (src == nullptr)
-            return none().inc_ref();
-
-        auto &internals = get_internals();
-
-        auto it = internals.registered_types_cpp.find(std::type_index(*type_info));
-        if (it == internals.registered_types_cpp.end()) {
-            type_info = type_info_backup;
-            it = internals.registered_types_cpp.find(std::type_index(*type_info));
-        }
-
-        if (it == internals.registered_types_cpp.end()) {
-            std::string tname = type_info->name();
-            detail::clean_type_id(tname);
-            std::string msg = "Unregistered type : " + tname;
-            PyErr_SetString(PyExc_TypeError, msg.c_str());
+        if (!tinfo) // no type info: error will be set already
             return handle();
-        }
 
-        auto tinfo = (const detail::type_info *) it->second;
+        void *src = const_cast<void *>(_src);
+        if (src == nullptr)
+            return none().release();
 
-        auto it_instances = internals.registered_instances.equal_range(src);
+        auto it_instances = get_internals().registered_instances.equal_range(src);
         for (auto it_i = it_instances.first; it_i != it_instances.second; ++it_i) {
-            auto instance_type = detail::get_type_info(Py_TYPE(it_i->second));
-            if (instance_type && instance_type == tinfo)
-                return handle((PyObject *) it_i->second).inc_ref();
+            for (auto instance_type : detail::all_type_info(Py_TYPE(it_i->second))) {
+                if (instance_type && same_type(*instance_type->cpptype, *tinfo->cpptype))
+                    return handle((PyObject *) it_i->second).inc_ref();
+            }
         }
 
-        auto inst = reinterpret_steal<object>(PyType_GenericAlloc(tinfo->type, 0));
-
-        auto wrapper = (instance<void> *) inst.ptr();
-
-        wrapper->value = nullptr;
+        auto inst = reinterpret_steal<object>(make_new_instance(tinfo->type));
+        auto wrapper = reinterpret_cast<instance *>(inst.ptr());
         wrapper->owned = false;
+        void *&valueptr = values_and_holders(wrapper).begin()->value_ptr();
 
         switch (policy) {
             case return_value_policy::automatic:
             case return_value_policy::take_ownership:
-                wrapper->value = src;
+                valueptr = src;
                 wrapper->owned = true;
                 break;
 
             case return_value_policy::automatic_reference:
             case return_value_policy::reference:
-                wrapper->value = src;
+                valueptr = src;
                 wrapper->owned = false;
                 break;
 
             case return_value_policy::copy:
                 if (copy_constructor)
-                    wrapper->value = copy_constructor(src);
+                    valueptr = copy_constructor(src);
                 else
                     throw cast_error("return_value_policy = copy, but the "
                                      "object is non-copyable!");
@@ -325,9 +540,9 @@ public:
 
             case return_value_policy::move:
                 if (move_constructor)
-                    wrapper->value = move_constructor(src);
+                    valueptr = move_constructor(src);
                 else if (copy_constructor)
-                    wrapper->value = copy_constructor(src);
+                    valueptr = copy_constructor(src);
                 else
                     throw cast_error("return_value_policy = move, but the "
                                      "object is neither movable nor copyable!");
@@ -335,33 +550,210 @@ public:
                 break;
 
             case return_value_policy::reference_internal:
-                wrapper->value = src;
+                valueptr = src;
                 wrapper->owned = false;
-                detail::keep_alive_impl(inst, parent);
+                keep_alive_impl(inst, parent);
                 break;
 
             default:
                 throw cast_error("unhandled return_value_policy: should not happen!");
         }
 
-        tinfo->init_holder(inst.ptr(), existing_holder);
-
-        internals.registered_instances.emplace(wrapper->value, inst.ptr());
+        tinfo->init_instance(wrapper, existing_holder);
 
         return inst.release();
     }
 
-protected:
+    // Base methods for generic caster; these are overridden in copyable_holder_caster
+    void load_value(value_and_holder &&v_h) {
+        auto *&vptr = v_h.value_ptr();  // a reference: the assignment below writes through to v_h's slot
+        // Lazy allocation for unallocated values:
+        if (vptr == nullptr) {
+            auto *type = v_h.type ? v_h.type : typeinfo;  // prefer v_h's own type info, else this caster's
+            vptr = type->operator_new(type->type_size);
+        }
+        value = vptr;  // expose the (possibly just-allocated) storage as this caster's loaded value
+    }
+    bool try_implicit_casts(handle src, bool convert) {  // true as soon as one registered implicit cast succeeds
+        for (auto &cast : typeinfo->implicit_casts) {
+            type_caster_generic sub_caster(*cast.first);  // caster constructed from cast.first
+            if (sub_caster.load(src, convert)) {
+                value = cast.second(sub_caster.value);  // cast.second maps the loaded pointer to the one stored here
+                return true;
+            }
+        }
+        return false;
+    }
+    bool try_direct_conversions(handle src) {  // true as soon as one registered direct converter reports success
+        for (auto &converter : *typeinfo->direct_conversions) {
+            if (converter(src.ptr(), value))  // NOTE(review): presumably fills `value` by reference -- confirm against the converter signature
+                return true;
+        }
+        return false;
+    }
+    void check_holder_compat() {}  // deliberately a no-op here; load_impl invokes it through ThisT
+
+    PYBIND11_NOINLINE static void *local_load(PyObject *src, const type_info *ti) {  // loaded pointer, or nullptr on failure
+        auto caster = type_caster_generic(ti);
+        if (caster.load(src, false))  // second argument (convert) is false
+            return caster.value;
+        return nullptr;
+    }
+
+    /// Try to load with foreign typeinfo, if available. Used when there is no
+    /// native typeinfo, or when the native one wasn't able to produce a value.
+    PYBIND11_NOINLINE bool try_load_foreign_module_local(handle src) {
+        constexpr auto *local_key = PYBIND11_MODULE_LOCAL_ID;
+        const auto pytype = src.get_type();
+        if (!hasattr(pytype, local_key))  // the Python type carries no module-local attribute
+            return false;
+
+        type_info *foreign_typeinfo = reinterpret_borrow<capsule>(getattr(pytype, local_key));
+        // Only consider this foreign loader if actually foreign and is a loader of the correct cpp type
+        if (foreign_typeinfo->module_local_load == &local_load  // same loader function as ours: not foreign
+            || (cpptype && !same_type(*cpptype, *foreign_typeinfo->cpptype)))  // C++ type is only compared when cpptype is set
+            return false;
+
+        if (auto result = foreign_typeinfo->module_local_load(src.ptr(), foreign_typeinfo)) {  // null result: foreign loader failed
+            value = result;
+            return true;
+        }
+        return false;
+    }
+
+    // Implementation of `load`; this takes the type of `this` so that it can dispatch the relevant
+    // bits of code between here and copyable_holder_caster where the two classes need different
+    // logic (without having to resort to virtual inheritance).
+    template <typename ThisT>
+    PYBIND11_NOINLINE bool load_impl(handle src, bool convert) {
+        if (!src) return false;  // null handle: nothing to load
+        if (!typeinfo) return try_load_foreign_module_local(src);  // no native typeinfo: only a foreign loader can match
+        if (src.is_none()) {
+            // Defer accepting None to other overloads (if we aren't in convert mode):
+            if (!convert) return false;
+            value = nullptr;
+            return true;
+        }
+
+        auto &this_ = static_cast<ThisT &>(*this);  // static dispatch to ThisT's versions of the helper methods
+        this_.check_holder_compat();
+
+        PyTypeObject *srctype = Py_TYPE(src.ptr());
+
+        // Case 1: If src is an exact type match for the target type then we can reinterpret_cast
+        // the instance's value pointer to the target type:
+        if (srctype == typeinfo->type) {
+            this_.load_value(reinterpret_cast<instance *>(src.ptr())->get_value_and_holder());
+            return true;
+        }
+        // Case 2: We have a derived class
+        else if (PyType_IsSubtype(srctype, typeinfo->type)) {
+            auto &bases = all_type_info(srctype);
+            bool no_cpp_mi = typeinfo->simple_type;
+
+            // Case 2a: the python type is a Python-inherited derived class that inherits from just
+            // one simple (no MI) pybind11 class, or is an exact match, so the C++ instance is of
+            // the right type and we can use reinterpret_cast.
+            // (This is essentially the same as case 2b, but because not using multiple inheritance
+            // is extremely common, we handle it specially to avoid the loop iterator and type
+            // pointer lookup overhead)
+            if (bases.size() == 1 && (no_cpp_mi || bases.front()->type == typeinfo->type)) {
+                this_.load_value(reinterpret_cast<instance *>(src.ptr())->get_value_and_holder());
+                return true;
+            }
+            // Case 2b: the python type inherits from multiple C++ bases.  Check the bases to see if
+            // we can find an exact match (or, for a simple C++ type, an inherited match); if so, we
+            // can safely reinterpret_cast to the relevant pointer.
+            else if (bases.size() > 1) {
+                for (auto base : bases) {
+                    if (no_cpp_mi ? PyType_IsSubtype(base->type, typeinfo->type) : base->type == typeinfo->type) {
+                        this_.load_value(reinterpret_cast<instance *>(src.ptr())->get_value_and_holder(base));
+                        return true;
+                    }
+                }
+            }
+
+            // Case 2c: C++ multiple inheritance is involved and we couldn't find an exact type match
+            // in the registered bases, above, so try implicit casting (needed for proper C++ casting
+            // when MI is involved).
+            if (this_.try_implicit_casts(src, convert))
+                return true;
+        }
+
+        // Perform an implicit conversion
+        if (convert) {
+            for (auto &converter : typeinfo->implicit_conversions) {
+                auto temp = reinterpret_steal<object>(converter(src.ptr(), typeinfo->type));
+                if (load_impl<ThisT>(temp, false)) {  // retry on the converted object with convert switched off
+                    loader_life_support::add_patient(temp);  // NOTE(review): presumably keeps `temp` alive while `value` is in use -- confirm
+                    return true;
+                }
+            }
+            if (this_.try_direct_conversions(src))
+                return true;
+        }
+
+        // Failed to match local typeinfo. Try again with global.
+        if (typeinfo->module_local) {
+            if (auto gtype = get_global_type_info(*typeinfo->cpptype)) {
+                typeinfo = gtype;  // this caster now uses the global typeinfo for the retry below
+                return load(src, false);
+            }
+        }
+
+        // Global typeinfo has precedence over foreign module_local
+        return try_load_foreign_module_local(src);
+    }
+
+
+    // Called to do type lookup and wrap the pointer and type in a pair when a dynamic_cast
+    // isn't needed or can't be used.  If the type is unknown, sets the error and returns a pair
+    // with .second = nullptr.  (p.first = nullptr is not an error: it becomes None).
+    PYBIND11_NOINLINE static std::pair<const void *, const type_info *> src_and_type(
+            const void *src, const std::type_info &cast_type, const std::type_info *rtti_type = nullptr) {
+        if (auto *tpi = get_type_info(cast_type))  // lookup uses cast_type only; rtti_type just improves the error text
+            return {src, const_cast<const type_info *>(tpi)};
+
+        // Not found, set error:
+        std::string tname = rtti_type ? rtti_type->name() : cast_type.name();  // prefer the RTTI type's name when supplied
+        detail::clean_type_id(tname);  // NOTE(review): presumably rewrites tname into a readable form in place -- confirm
+        std::string msg = "Unregistered type : " + tname;
+        PyErr_SetString(PyExc_TypeError, msg.c_str());
+        return {nullptr, nullptr};
+    }
+
     const type_info *typeinfo = nullptr;
+    const std::type_info *cpptype = nullptr;
     void *value = nullptr;
-    object temp;
 };
 
-/* Determine suitable casting operator */
+/**
+ * Determine suitable casting operator for pointer-or-lvalue-casting type casters.  The type caster
+ * needs to provide both `operator T*()` and `operator T&()`.
+ *
+ * If the type supports moving the value away via an `operator T&&() &&` method, it should use
+ * `movable_cast_op_type` instead.
+ */
+template <typename T>
+using cast_op_type =
+    conditional_t<std::is_pointer<remove_reference_t<T>>::value,  // is T a pointer once references are stripped?
+        typename std::add_pointer<intrinsic_t<T>>::type,  // yes: pointer to intrinsic_t<T>
+        typename std::add_lvalue_reference<intrinsic_t<T>>::type>;  // no: lvalue reference to intrinsic_t<T>
+
+/**
+ * Determine suitable casting operator for a type caster with a movable value.  Such a type caster
+ * needs to provide `operator T*()`, `operator T&()`, and `operator T&&() &&`.  The latter will be
+ * called in appropriate contexts where the value can be moved rather than copied.
+ *
+ * These operators are automatically provided when using the PYBIND11_TYPE_CASTER macro.
+ */
 template <typename T>
-using cast_op_type = typename std::conditional<std::is_pointer<typename std::remove_reference<T>::type>::value,
-    typename std::add_pointer<intrinsic_t<T>>::type,
-    typename std::add_lvalue_reference<intrinsic_t<T>>::type>::type;
+using movable_cast_op_type =
+    conditional_t<std::is_pointer<typename std::remove_reference<T>::type>::value,
+        typename std::add_pointer<intrinsic_t<T>>::type,
+    conditional_t<std::is_rvalue_reference<T>::value,
+        typename std::add_rvalue_reference<intrinsic_t<T>>::type,
+        typename std::add_lvalue_reference<intrinsic_t<T>>::type>>;
 
 // std::is_copy_constructible isn't quite enough: it lets std::vector<T> (and similar) through when
 // T is non-copyable, but code containing such a copy constructor fails to actually compile.
@@ -370,10 +762,17 @@ template <typename T, typename SFINAE = void> struct is_copy_constructible : std
 // Specialization for types that appear to be copy constructible but also look like stl containers
 // (we specifically check for: has `value_type` and `reference` with `reference = value_type&`): if
 // so, copy constructability depends on whether the value_type is copy constructible.
-template <typename Container> struct is_copy_constructible<Container, enable_if_t<
-        std::is_copy_constructible<Container>::value &&
-        std::is_same<typename Container::value_type &, typename Container::reference>::value
-    >> : std::is_copy_constructible<typename Container::value_type> {};
+template <typename Container> struct is_copy_constructible<Container, enable_if_t<all_of<
+        std::is_copy_constructible<Container>,
+        std::is_same<typename Container::value_type &, typename Container::reference>
+    >::value>> : is_copy_constructible<typename Container::value_type> {};
+
+#if !defined(PYBIND11_CPP17)
+// Likewise for std::pair before C++17 (which mandates that the copy constructor not exist when the
+// two types aren't themselves copy constructible): a pair is copy constructible iff both members are.
+template <typename T1, typename T2> struct is_copy_constructible<std::pair<T1, T2>>
+    : all_of<is_copy_constructible<T1>, is_copy_constructible<T2>> {};
+#endif // !defined(PYBIND11_CPP17)
 
 /// Generic type caster for objects stored on the heap
 template <typename type> class type_caster_base : public type_caster_generic {
@@ -394,36 +793,71 @@ public:
         return cast(&src, return_value_policy::move, parent);
     }
 
+    // Returns a (pointer, type_info) pair taking care of necessary RTTI type lookup for a
+    // polymorphic type.  If the instance isn't derived, returns the non-RTTI base version.
+    template <typename T = itype, enable_if_t<std::is_polymorphic<T>::value, int> = 0>
+    static std::pair<const void *, const type_info *> src_and_type(const itype *src) {
+        const void *vsrc = src;
+        auto &cast_type = typeid(itype);  // static type requested by the caller
+        const std::type_info *instance_type = nullptr;  // stays null when src is null
+        if (vsrc) {
+            instance_type = &typeid(*src);  // dynamic type of the pointee (this overload is for polymorphic types)
+            if (!same_type(cast_type, *instance_type)) {
+                // This is a base pointer to a derived type; if it is a pybind11-registered type, we
+                // can get the correct derived pointer (which may be != base pointer) by a
+                // dynamic_cast to most derived type:
+                if (auto *tpi = get_type_info(*instance_type))
+                    return {dynamic_cast<const void *>(src), const_cast<const type_info *>(tpi)};
+            }
+        }
+        // Otherwise we have either a nullptr, an `itype` pointer, or an unknown derived pointer, so
+        // don't do a cast
+        return type_caster_generic::src_and_type(vsrc, cast_type, instance_type);
+    }
+
+    // Non-polymorphic type: no RTTI lookup or dynamic casting; just call the generic version directly
+    template <typename T = itype, enable_if_t<!std::is_polymorphic<T>::value, int> = 0>
+    static std::pair<const void *, const type_info *> src_and_type(const itype *src) {
+        return type_caster_generic::src_and_type(src, typeid(itype));  // rtti_type is left at its nullptr default
+    }
+
     static handle cast(const itype *src, return_value_policy policy, handle parent) {
+        auto st = src_and_type(src);
         return type_caster_generic::cast(
-            src, policy, parent, src ? &typeid(*src) : nullptr, &typeid(type),
+            st.first, policy, parent, st.second,
             make_copy_constructor(src), make_move_constructor(src));
     }
 
-    template <typename T> using cast_op_type = pybind11::detail::cast_op_type<T>;
+    static handle cast_holder(const itype *src, const void *holder) {  // cast using an already-existing holder
+        auto st = src_and_type(src);  // (pointer, type_info) pair for src
+        return type_caster_generic::cast(
+            st.first, return_value_policy::take_ownership, {}, st.second,  // `{}`: no parent handle
+            nullptr, nullptr, holder);  // no copy/move constructors are supplied; `holder` is passed through
+    }
+
+    template <typename T> using cast_op_type = cast_op_type<T>;
 
     operator itype*() { return (type *) value; }
     operator itype&() { if (!value) throw reference_cast_error(); return *((itype *) value); }
 
 protected:
-    typedef void *(*Constructor)(const void *stream);
-#if !defined(_MSC_VER)
+    using Constructor = void *(*)(const void *);
+
     /* Only enabled when the types are {copy,move}-constructible *and* when the type
-       does not have a private operator new implementaton. */
-    template <typename T = type, typename = enable_if_t<is_copy_constructible<T>::value>> static auto make_copy_constructor(const T *value) -> decltype(new T(*value), Constructor(nullptr)) {
-        return [](const void *arg) -> void * { return new T(*((const T *) arg)); }; }
-    template <typename T = type> static auto make_move_constructor(const T *value) -> decltype(new T(std::move(*((T *) value))), Constructor(nullptr)) {
-        return [](const void *arg) -> void * { return (void *) new T(std::move(*((T *) arg))); }; }
-#else
-    /* Visual Studio 2015's SFINAE implementation doesn't yet handle the above robustly in all situations.
-       Use a workaround that only tests for constructibility for now. */
-    template <typename T = type, typename = enable_if_t<is_copy_constructible<T>::value>>
-    static Constructor make_copy_constructor(const T *value) {
-        return [](const void *arg) -> void * { return new T(*((const T *)arg)); }; }
-    template <typename T = type, typename = enable_if_t<std::is_move_constructible<T>::value>>
-    static Constructor make_move_constructor(const T *value) {
-        return [](const void *arg) -> void * { return (void *) new T(std::move(*((T *)arg))); }; }
-#endif
+       does not have a private operator new implementation. */
+    template <typename T, typename = enable_if_t<is_copy_constructible<T>::value>>
+    static auto make_copy_constructor(const T *x) -> decltype(new T(*x), Constructor{}) {
+        return [](const void *arg) -> void * {
+            return new T(*reinterpret_cast<const T *>(arg));
+        };
+    }
+
+    template <typename T, typename = enable_if_t<std::is_move_constructible<T>::value>>
+    static auto make_move_constructor(const T *x) -> decltype(new T(std::move(*const_cast<T *>(x))), Constructor{}) {
+        return [](const void *arg) -> void * {
+            return new T(std::move(*const_cast<T *>(reinterpret_cast<const T *>(arg))));
+        };
+    }
 
     static Constructor make_copy_constructor(...) { return nullptr; }
     static Constructor make_move_constructor(...) { return nullptr; }
@@ -436,17 +870,30 @@ template <typename type> using make_caster = type_caster<intrinsic_t<type>>;
 template <typename T> typename make_caster<T>::template cast_op_type<T> cast_op(make_caster<T> &caster) {
     return caster.operator typename make_caster<T>::template cast_op_type<T>();
 }
-template <typename T> typename make_caster<T>::template cast_op_type<T> cast_op(make_caster<T> &&caster) {
-    return cast_op<T>(caster);
+template <typename T> typename make_caster<T>::template cast_op_type<typename std::add_rvalue_reference<T>::type>
+cast_op(make_caster<T> &&caster) {
+    return std::move(caster).operator
+        typename make_caster<T>::template cast_op_type<typename std::add_rvalue_reference<T>::type>();
 }
 
-template <typename type> class type_caster<std::reference_wrapper<type>> : public type_caster_base<type> {
+template <typename type> class type_caster<std::reference_wrapper<type>> {
+private:
+    using caster_t = make_caster<type>;
+    caster_t subcaster;
+    using subcaster_cast_op_type = typename caster_t::template cast_op_type<type>;
+    static_assert(std::is_same<typename std::remove_const<type>::type &, subcaster_cast_op_type>::value,
+            "std::reference_wrapper<T> caster requires T to have a caster with an `T &` operator");
 public:
+    bool load(handle src, bool convert) { return subcaster.load(src, convert); }
+    static PYBIND11_DESCR name() { return caster_t::name(); }
     static handle cast(const std::reference_wrapper<type> &src, return_value_policy policy, handle parent) {
-        return type_caster_base<type>::cast(&src.get(), policy, parent);
+        // It is definitely wrong to take ownership of this pointer, so mask that rvp
+        if (policy == return_value_policy::take_ownership || policy == return_value_policy::automatic)
+            policy = return_value_policy::automatic_reference;
+        return caster_t::cast(&src.get(), policy, parent);
     }
     template <typename T> using cast_op_type = std::reference_wrapper<type>;
-    operator std::reference_wrapper<type>() { return std::ref(*((type *) this->value)); }
+    operator std::reference_wrapper<type>() { return subcaster.operator subcaster_cast_op_type&(); }
 };
 
 #define PYBIND11_TYPE_CASTER(type, py_name) \
@@ -454,58 +901,72 @@ public:
         type value; \
     public: \
         static PYBIND11_DESCR name() { return type_descr(py_name); } \
-        static handle cast(const type *src, return_value_policy policy, handle parent) { \
-            return cast(*src, policy, parent); \
+        template <typename T_, enable_if_t<std::is_same<type, remove_cv_t<T_>>::value, int> = 0> \
+        static handle cast(T_ *src, return_value_policy policy, handle parent) { \
+            if (!src) return none().release(); \
+            if (policy == return_value_policy::take_ownership) { \
+                auto h = cast(std::move(*src), policy, parent); delete src; return h; \
+            } else { \
+                return cast(*src, policy, parent); \
+            } \
         } \
         operator type*() { return &value; } \
         operator type&() { return value; } \
-        template <typename _T> using cast_op_type = pybind11::detail::cast_op_type<_T>
+        operator type&&() && { return std::move(value); } \
+        template <typename T_> using cast_op_type = pybind11::detail::movable_cast_op_type<T_>
 
 
+template <typename CharT> using is_std_char_type = any_of<  // matches exactly the four character types listed below
+    std::is_same<CharT, char>, /* std::string */
+    std::is_same<CharT, char16_t>, /* std::u16string */
+    std::is_same<CharT, char32_t>, /* std::u32string */
+    std::is_same<CharT, wchar_t> /* std::wstring */
+>;
+
 template <typename T>
-struct type_caster<T, enable_if_t<std::is_arithmetic<T>::value>> {
-    typedef typename std::conditional<sizeof(T) <= sizeof(long), long, long long>::type _py_type_0;
-    typedef typename std::conditional<std::is_signed<T>::value, _py_type_0, typename std::make_unsigned<_py_type_0>::type>::type _py_type_1;
-    typedef typename std::conditional<std::is_floating_point<T>::value, double, _py_type_1>::type py_type;
+struct type_caster<T, enable_if_t<std::is_arithmetic<T>::value && !is_std_char_type<T>::value>> {
+    using _py_type_0 = conditional_t<sizeof(T) <= sizeof(long), long, long long>;
+    using _py_type_1 = conditional_t<std::is_signed<T>::value, _py_type_0, typename std::make_unsigned<_py_type_0>::type>;
+    using py_type = conditional_t<std::is_floating_point<T>::value, double, _py_type_1>;
 public:
 
-    bool load(handle src, bool) {
+    bool load(handle src, bool convert) {
         py_type py_value;
 
-        if (!src) {
+        if (!src)
             return false;
-        } if (std::is_floating_point<T>::value) {
-            py_value = (py_type) PyFloat_AsDouble(src.ptr());
-        } else if (sizeof(T) <= sizeof(long)) {
-            if (PyFloat_Check(src.ptr()))
-                return false;
-            if (std::is_signed<T>::value)
-                py_value = (py_type) PyLong_AsLong(src.ptr());
+
+        if (std::is_floating_point<T>::value) {
+            if (convert || PyFloat_Check(src.ptr()))
+                py_value = (py_type) PyFloat_AsDouble(src.ptr());
             else
-                py_value = (py_type) PyLong_AsUnsignedLong(src.ptr());
-        } else {
-            if (PyFloat_Check(src.ptr()))
                 return false;
-            if (std::is_signed<T>::value)
-                py_value = (py_type) PYBIND11_LONG_AS_LONGLONG(src.ptr());
-            else
-                py_value = (py_type) PYBIND11_LONG_AS_UNSIGNED_LONGLONG(src.ptr());
+        } else if (PyFloat_Check(src.ptr())) {
+            return false;
+        } else if (std::is_unsigned<py_type>::value) {
+            py_value = as_unsigned<py_type>(src.ptr());
+        } else { // signed integer:
+            py_value = sizeof(T) <= sizeof(long)
+                ? (py_type) PyLong_AsLong(src.ptr())
+                : (py_type) PYBIND11_LONG_AS_LONGLONG(src.ptr());
         }
 
-        if ((py_value == (py_type) -1 && PyErr_Occurred()) ||
-            (std::is_integral<T>::value && sizeof(py_type) != sizeof(T) &&
-               (py_value < (py_type) std::numeric_limits<T>::min() ||
-                py_value > (py_type) std::numeric_limits<T>::max()))) {
-#if PY_VERSION_HEX < 0x03000000
-            bool type_error = PyErr_ExceptionMatches(PyExc_SystemError);
+        bool py_err = py_value == (py_type) -1 && PyErr_Occurred();
+        if (py_err || (std::is_integral<T>::value && sizeof(py_type) != sizeof(T) &&
+                       (py_value < (py_type) std::numeric_limits<T>::min() ||
+                        py_value > (py_type) std::numeric_limits<T>::max()))) {
+            bool type_error = py_err && PyErr_ExceptionMatches(
+#if PY_VERSION_HEX < 0x03000000 && !defined(PYPY_VERSION)
+                PyExc_SystemError
 #else
-            bool type_error = PyErr_ExceptionMatches(PyExc_TypeError);
+                PyExc_TypeError
 #endif
+            );
             PyErr_Clear();
-            if (type_error && PyNumber_Check(src.ptr())) {
-                auto tmp = reinterpret_borrow<object>(std::is_floating_point<T>::value
-                                                      ? PyNumber_Float(src.ptr())
-                                                      : PyNumber_Long(src.ptr()));
+            if (type_error && convert && PyNumber_Check(src.ptr())) {
+                auto tmp = reinterpret_steal<object>(std::is_floating_point<T>::value
+                                                     ? PyNumber_Float(src.ptr())
+                                                     : PyNumber_Long(src.ptr()));
                 PyErr_Clear();
                 return load(tmp, false);
             }
@@ -537,7 +998,11 @@ public:
 
 template<typename T> struct void_caster {
 public:
-    bool load(handle, bool) { return false; }
+    bool load(handle src, bool) {
+        if (src && src.is_none())
+            return true;
+        return false;
+    }
     static handle cast(T, return_value_policy /* policy */, handle /* parent */) {
         return none().inc_ref();
     }
@@ -565,8 +1030,9 @@ public:
         }
 
         /* Check if this is a C++ type */
-        if (get_type_info((PyTypeObject *) h.get_type().ptr())) {
-            value = ((instance<void> *) h.ptr())->value;
+        auto &bases = all_type_info((PyTypeObject *) h.get_type().ptr());
+        if (bases.size() == 1) { // Only allowing loading from a single-value type
+            value = values_and_holders(reinterpret_cast<instance *>(h.ptr())).begin()->value_ptr();
             return true;
         }
 
@@ -588,15 +1054,41 @@ private:
     void *value = nullptr;
 };
 
-template <> class type_caster<std::nullptr_t> : public type_caster<void_type> { };
+template <> class type_caster<std::nullptr_t> : public void_caster<std::nullptr_t> { };
 
 template <> class type_caster<bool> {
 public:
-    bool load(handle src, bool) {
+    bool load(handle src, bool convert) {
         if (!src) return false;
         else if (src.ptr() == Py_True) { value = true; return true; }
         else if (src.ptr() == Py_False) { value = false; return true; }
-        else return false;
+        else if (convert || !strcmp("numpy.bool_", Py_TYPE(src.ptr())->tp_name)) {
+            // (allow non-implicit conversion for numpy booleans)
+
+            Py_ssize_t res = -1;
+            if (src.is_none()) {
+                res = 0;  // None is implicitly converted to False
+            }
+            #if defined(PYPY_VERSION)
+            // On PyPy, check that "__bool__" (or "__nonzero__" on Python 2.7) attr exists
+            else if (hasattr(src, PYBIND11_BOOL_ATTR)) {
+                res = PyObject_IsTrue(src.ptr());
+            }
+            #else
+            // Alternate approach for CPython: this does the same as the above, but optimized
+            // using the CPython API so as to avoid an unneeded attribute lookup.
+            else if (auto tp_as_number = src.ptr()->ob_type->tp_as_number) {
+                if (PYBIND11_NB_BOOL(tp_as_number)) {
+                    res = (*PYBIND11_NB_BOOL(tp_as_number))(src.ptr());
+                }
+            }
+            #endif
+            if (res == 0 || res == 1) {
+                value = (bool) res;
+                return true;
+            }
+        }
+        return false;
     }
     static handle cast(bool src, return_value_policy /* policy */, handle /* parent */) {
         return handle(src ? Py_True : Py_False).inc_ref();
@@ -604,180 +1096,210 @@ public:
     PYBIND11_TYPE_CASTER(bool, _("bool"));
 };
 
-template <> class type_caster<std::string> {
-public:
-    bool load(handle src, bool) {
-        object temp;
-        handle load_src = src;
-        if (!src) {
-            return false;
-        } else if (PyUnicode_Check(load_src.ptr())) {
-            temp = reinterpret_steal<object>(PyUnicode_AsUTF8String(load_src.ptr()));
-            if (!temp) { PyErr_Clear(); return false; }  // UnicodeEncodeError
-            load_src = temp;
-        }
-        char *buffer;
-        ssize_t length;
-        int err = PYBIND11_BYTES_AS_STRING_AND_SIZE(load_src.ptr(), &buffer, &length);
-        if (err == -1) { PyErr_Clear(); return false; }  // TypeError
-        value = std::string(buffer, (size_t) length);
-        success = true;
-        return true;
-    }
+// Helper class for UTF-{8,16,32} C++ stl strings:
+template <typename StringType, bool IsView = false> struct string_caster {
+    using CharT = typename StringType::value_type;
 
-    static handle cast(const std::string &src, return_value_policy /* policy */, handle /* parent */) {
-        return PyUnicode_FromStringAndSize(src.c_str(), (ssize_t) src.length());
-    }
-
-    PYBIND11_TYPE_CASTER(std::string, _(PYBIND11_STRING_NAME));
-protected:
-    bool success = false;
-};
+    // Simplify life by being able to assume standard char sizes (the standard only guarantees
+    // minimums, but Python requires exact sizes)
+    static_assert(!std::is_same<CharT, char>::value || sizeof(CharT) == 1, "Unsupported char size != 1");
+    static_assert(!std::is_same<CharT, char16_t>::value || sizeof(CharT) == 2, "Unsupported char16_t size != 2");
+    static_assert(!std::is_same<CharT, char32_t>::value || sizeof(CharT) == 4, "Unsupported char32_t size != 4");
+    // wchar_t can be either 16 bits (Windows) or 32 (everywhere else)
+    static_assert(!std::is_same<CharT, wchar_t>::value || sizeof(CharT) == 2 || sizeof(CharT) == 4,
+            "Unsupported wchar_t size != 2/4");
+    static constexpr size_t UTF_N = 8 * sizeof(CharT);
 
-template <typename type, typename deleter> class type_caster<std::unique_ptr<type, deleter>> {
-public:
-    static handle cast(std::unique_ptr<type, deleter> &&src, return_value_policy policy, handle parent) {
-        handle result = type_caster_base<type>::cast(src.get(), policy, parent);
-        if (result)
-            src.release();
-        return result;
-    }
-    static PYBIND11_DESCR name() { return type_caster_base<type>::name(); }
-};
-
-template <> class type_caster<std::wstring> {
-public:
     bool load(handle src, bool) {
+#if PY_MAJOR_VERSION < 3
         object temp;
+#endif
         handle load_src = src;
         if (!src) {
             return false;
         } else if (!PyUnicode_Check(load_src.ptr())) {
+#if PY_MAJOR_VERSION >= 3
+            return load_bytes(load_src);
+#else
+            if (sizeof(CharT) == 1) {
+                return load_bytes(load_src);
+            }
+
+            // The below is a guaranteed failure in Python 3 when PyUnicode_Check returns false
+            if (!PYBIND11_BYTES_CHECK(load_src.ptr()))
+                return false;
+
             temp = reinterpret_steal<object>(PyUnicode_FromObject(load_src.ptr()));
             if (!temp) { PyErr_Clear(); return false; }
             load_src = temp;
-        }
-        wchar_t *buffer = nullptr;
-        ssize_t length = -1;
-#if PY_MAJOR_VERSION >= 3
-        buffer = PyUnicode_AsWideCharString(load_src.ptr(), &length);
-#else
-        temp = reinterpret_steal<object>(PyUnicode_AsEncodedString(
-            load_src.ptr(), sizeof(wchar_t) == sizeof(short)
-            ? "utf16" : "utf32", nullptr));
-
-        if (temp) {
-            int err = PYBIND11_BYTES_AS_STRING_AND_SIZE(temp.ptr(), (char **) &buffer, &length);
-            if (err == -1) { buffer = nullptr; }  // TypeError
-            length = length / (ssize_t) sizeof(wchar_t) - 1; ++buffer; // Skip BOM
-        }
 #endif
-        if (!buffer) { PyErr_Clear(); return false; }
-        value = std::wstring(buffer, (size_t) length);
-        success = true;
-        return true;
-    }
+        }
 
-    static handle cast(const std::wstring &src, return_value_policy /* policy */, handle /* parent */) {
-        return PyUnicode_FromWideChar(src.c_str(), (ssize_t) src.length());
-    }
+        object utfNbytes = reinterpret_steal<object>(PyUnicode_AsEncodedString(
+            load_src.ptr(), UTF_N == 8 ? "utf-8" : UTF_N == 16 ? "utf-16" : "utf-32", nullptr));
+        if (!utfNbytes) { PyErr_Clear(); return false; }
 
-    PYBIND11_TYPE_CASTER(std::wstring, _(PYBIND11_STRING_NAME));
-protected:
-    bool success = false;
-};
+        const CharT *buffer = reinterpret_cast<const CharT *>(PYBIND11_BYTES_AS_STRING(utfNbytes.ptr()));
+        size_t length = (size_t) PYBIND11_BYTES_SIZE(utfNbytes.ptr()) / sizeof(CharT);
+        if (UTF_N > 8) { buffer++; length--; } // Skip BOM for UTF-16/32
+        value = StringType(buffer, length);
 
-template <> class type_caster<char> : public type_caster<std::string> {
-public:
-    bool load(handle src, bool convert) {
-        if (src.is_none()) return true;
-        return type_caster<std::string>::load(src, convert);
-    }
+        // If we're loading a string_view we need to keep the encoded Python object alive:
+        if (IsView)
+            loader_life_support::add_patient(utfNbytes);
 
-    static handle cast(const char *src, return_value_policy /* policy */, handle /* parent */) {
-        if (src == nullptr) return none().inc_ref();
-        return PyUnicode_FromString(src);
+        return true;
     }
 
-    static handle cast(char src, return_value_policy /* policy */, handle /* parent */) {
-        char str[2] = { src, '\0' };
-        return PyUnicode_DecodeLatin1(str, 1, nullptr);
+    static handle cast(const StringType &src, return_value_policy /* policy */, handle /* parent */) {
+        const char *buffer = reinterpret_cast<const char *>(src.data());
+        ssize_t nbytes = ssize_t(src.size() * sizeof(CharT));
+        handle s = decode_utfN(buffer, nbytes);
+        if (!s) throw error_already_set();
+        return s;
     }
 
-    operator char*() { return success ? (char *) value.c_str() : nullptr; }
-    operator char&() { return value[0]; }
-
-    static PYBIND11_DESCR name() { return type_descr(_(PYBIND11_STRING_NAME)); }
-};
+    PYBIND11_TYPE_CASTER(StringType, _(PYBIND11_STRING_NAME));
 
-template <> class type_caster<wchar_t> : public type_caster<std::wstring> {
-public:
-    bool load(handle src, bool convert) {
-        if (src.is_none()) return true;
-        return type_caster<std::wstring>::load(src, convert);
+private:
+    static handle decode_utfN(const char *buffer, ssize_t nbytes) {
+#if !defined(PYPY_VERSION)
+        return
+            UTF_N == 8  ? PyUnicode_DecodeUTF8(buffer, nbytes, nullptr) :
+            UTF_N == 16 ? PyUnicode_DecodeUTF16(buffer, nbytes, nullptr, nullptr) :
+                          PyUnicode_DecodeUTF32(buffer, nbytes, nullptr, nullptr);
+#else
+        // PyPy seems to have multiple problems related to PyUnicode_UTF*: the UTF8 version
+        // sometimes segfaults for unknown reasons, while the UTF16 and 32 versions require
+        // non-const char * arguments, which is also a nuisance, so bypass the whole thing by just
+        // passing the encoding as a string value, which works properly:
+        return PyUnicode_Decode(buffer, nbytes, UTF_N == 8 ? "utf-8" : UTF_N == 16 ? "utf-16" : "utf-32", nullptr);
+#endif
     }
 
-    static handle cast(const wchar_t *src, return_value_policy /* policy */, handle /* parent */) {
-        if (src == nullptr) return none().inc_ref();
-        return PyUnicode_FromWideChar(src, (ssize_t) wcslen(src));
-    }
+    // When loading into a std::string or char*, accept a bytes object as-is (i.e.
+    // without any encoding/decoding attempt).  For other C++ char sizes this is a no-op
+    // that always returns false (see the sizeof(C) != 1 overload below).
+    template <typename C = CharT>
+    bool load_bytes(enable_if_t<sizeof(C) == 1, handle> src) {
+        if (PYBIND11_BYTES_CHECK(src.ptr())) {
+            // We were passed a Python 3 raw bytes; accept it into a std::string or char*
+            // without any encoding attempt.
+            const char *bytes = PYBIND11_BYTES_AS_STRING(src.ptr());
+            if (bytes) {
+                value = StringType(bytes, (size_t) PYBIND11_BYTES_SIZE(src.ptr()));
+                return true;
+            }
+        }
 
-    static handle cast(wchar_t src, return_value_policy /* policy */, handle /* parent */) {
-        wchar_t wstr[2] = { src, L'\0' };
-        return PyUnicode_FromWideChar(wstr, 1);
+        return false;
     }
 
-    operator wchar_t*() { return success ? (wchar_t *) value.c_str() : nullptr; }
-    operator wchar_t&() { return value[0]; }
-
-    static PYBIND11_DESCR name() { return type_descr(_(PYBIND11_STRING_NAME)); }
+    template <typename C = CharT>
+    bool load_bytes(enable_if_t<sizeof(C) != 1, handle>) { return false; }
 };
 
-template <typename T1, typename T2> class type_caster<std::pair<T1, T2>> {
-    typedef std::pair<T1, T2> type;
+template <typename CharT, class Traits, class Allocator>
+struct type_caster<std::basic_string<CharT, Traits, Allocator>, enable_if_t<is_std_char_type<CharT>::value>>
+    : string_caster<std::basic_string<CharT, Traits, Allocator>> {};
+
+#ifdef PYBIND11_HAS_STRING_VIEW
+template <typename CharT, class Traits>
+struct type_caster<std::basic_string_view<CharT, Traits>, enable_if_t<is_std_char_type<CharT>::value>>
+    : string_caster<std::basic_string_view<CharT, Traits>, true> {};
+#endif
+
+// Type caster for C-style strings.  We basically use a std::string type caster, but also add the
+// ability to use None as a nullptr char* (which the string caster doesn't allow).
+template <typename CharT> struct type_caster<CharT, enable_if_t<is_std_char_type<CharT>::value>> {
+    using StringType = std::basic_string<CharT>;
+    using StringCaster = type_caster<StringType>;
+    StringCaster str_caster;
+    bool none = false;
 public:
     bool load(handle src, bool convert) {
-        if (!isinstance<sequence>(src))
-            return false;
-        const auto seq = reinterpret_borrow<sequence>(src);
-        if (seq.size() != 2)
-            return false;
-        return first.load(seq[0], convert) && second.load(seq[1], convert);
+        if (!src) return false;
+        if (src.is_none()) {
+            // Defer accepting None to other overloads (if we aren't in convert mode):
+            if (!convert) return false;
+            none = true;
+            return true;
+        }
+        return str_caster.load(src, convert);
     }
 
-    static handle cast(const type &src, return_value_policy policy, handle parent) {
-        auto o1 = reinterpret_steal<object>(make_caster<T1>::cast(src.first, policy, parent));
-        auto o2 = reinterpret_steal<object>(make_caster<T2>::cast(src.second, policy, parent));
-        if (!o1 || !o2)
-            return handle();
-        tuple result(2);
-        PyTuple_SET_ITEM(result.ptr(), 0, o1.release().ptr());
-        PyTuple_SET_ITEM(result.ptr(), 1, o2.release().ptr());
-        return result.release();
+    static handle cast(const CharT *src, return_value_policy policy, handle parent) {
+        if (src == nullptr) return pybind11::none().inc_ref();
+        return StringCaster::cast(StringType(src), policy, parent);
     }
 
-    static PYBIND11_DESCR name() {
-        return type_descr(
-            _("Tuple[") + make_caster<T1>::name() + _(", ") + make_caster<T2>::name() + _("]")
-        );
-    }
+    static handle cast(CharT src, return_value_policy policy, handle parent) {
+        if (std::is_same<char, CharT>::value) {
+            handle s = PyUnicode_DecodeLatin1((const char *) &src, 1, nullptr);
+            if (!s) throw error_already_set();
+            return s;
+        }
+        return StringCaster::cast(StringType(1, src), policy, parent);
+    }
+
+    operator CharT*() { return none ? nullptr : const_cast<CharT *>(static_cast<StringType &>(str_caster).c_str()); }
+    operator CharT() {
+        if (none)
+            throw value_error("Cannot convert None to a character");
+
+        auto &value = static_cast<StringType &>(str_caster);
+        size_t str_len = value.size();
+        if (str_len == 0)
+            throw value_error("Cannot convert empty string to a character");
+
+        // If we're in UTF-8 mode, we have two possible failures: one for a unicode character that
+        // is too high, and one for multiple unicode characters (caught later), so we need to figure
+        // out how long the first encoded character is in bytes to distinguish between these two
+        // errors.  We also want to allow unicode characters U+0080 through U+00FF, as those
+        // can fit into a single char value.
+        if (StringCaster::UTF_N == 8 && str_len > 1 && str_len <= 4) {
+            unsigned char v0 = static_cast<unsigned char>(value[0]);
+            size_t char0_bytes = !(v0 & 0x80) ? 1 : // low bits only: 0-127
+                (v0 & 0xE0) == 0xC0 ? 2 : // 0b110xxxxx - start of 2-byte sequence
+                (v0 & 0xF0) == 0xE0 ? 3 : // 0b1110xxxx - start of 3-byte sequence
+                4; // 0b11110xxx - start of 4-byte sequence
+
+            if (char0_bytes == str_len) {
+                // If we have a 128-255 value, we can decode it into a single char:
+                if (char0_bytes == 2 && (v0 & 0xFC) == 0xC0) { // 0b110000xx 0b10xxxxxx
+                    return static_cast<CharT>(((v0 & 3) << 6) + (static_cast<unsigned char>(value[1]) & 0x3F));
+                }
+                // Otherwise we have a single character, but it's > U+00FF
+                throw value_error("Character code point not in range(0x100)");
+            }
+        }
 
-    template <typename T> using cast_op_type = type;
+        // UTF-16 is much easier: we can only have a surrogate pair for values above U+FFFF, thus a
+        // surrogate pair with total length 2 instantly indicates a range error (but not a "your
+        // string was too long" error).
+        else if (StringCaster::UTF_N == 16 && str_len == 2) {
+            char16_t v0 = static_cast<char16_t>(value[0]);
+            if (v0 >= 0xD800 && v0 < 0xE000)
+                throw value_error("Character code point not in range(0x10000)");
+        }
+
+        if (str_len != 1)
+            throw value_error("Expected a character, but multi-character string found");
 
-    operator type() {
-        return type(cast_op<T1>(first), cast_op<T2>(second));
+        return value[0];
     }
-protected:
-    make_caster<T1> first;
-    make_caster<T2> second;
-};
 
-template <typename... Tuple> class type_caster<std::tuple<Tuple...>> {
-    using type = std::tuple<Tuple...>;
-    using indices = make_index_sequence<sizeof...(Tuple)>;
-    static constexpr auto size = sizeof...(Tuple);
+    static PYBIND11_DESCR name() { return type_descr(_(PYBIND11_STRING_NAME)); }
+    template <typename _T> using cast_op_type = remove_reference_t<pybind11::detail::cast_op_type<_T>>;
+};
 
+// Base implementation for std::tuple and std::pair
+template <template<typename...> class Tuple, typename... Ts> class tuple_caster {
+    using type = Tuple<Ts...>;
+    static constexpr auto size = sizeof...(Ts);
+    using indices = make_index_sequence<size>;
 public:
+
     bool load(handle src, bool convert) {
         if (!isinstance<sequence>(src))
             return false;
@@ -787,40 +1309,41 @@ public:
         return load_impl(seq, convert, indices{});
     }
 
-    static handle cast(const type &src, return_value_policy policy, handle parent) {
-        return cast_impl(src, policy, parent, indices{});
+    template <typename T>
+    static handle cast(T &&src, return_value_policy policy, handle parent) {
+        return cast_impl(std::forward<T>(src), policy, parent, indices{});
     }
 
     static PYBIND11_DESCR name() {
-        return type_descr(_("Tuple[") + detail::concat(make_caster<Tuple>::name()...) + _("]"));
+        return type_descr(_("Tuple[") + detail::concat(make_caster<Ts>::name()...) + _("]"));
     }
 
     template <typename T> using cast_op_type = type;
 
-    operator type() { return implicit_cast(indices{}); }
+    operator type() & { return implicit_cast(indices{}); }
+    operator type() && { return std::move(*this).implicit_cast(indices{}); }
 
 protected:
     template <size_t... Is>
-    type implicit_cast(index_sequence<Is...>) { return type(cast_op<Tuple>(std::get<Is>(value))...); }
+    type implicit_cast(index_sequence<Is...>) & { return type(cast_op<Ts>(std::get<Is>(subcasters))...); }
+    template <size_t... Is>
+    type implicit_cast(index_sequence<Is...>) && { return type(cast_op<Ts>(std::move(std::get<Is>(subcasters)))...); }
 
     static constexpr bool load_impl(const sequence &, bool, index_sequence<>) { return true; }
 
     template <size_t... Is>
     bool load_impl(const sequence &seq, bool convert, index_sequence<Is...>) {
-        for (bool r : {std::get<Is>(value).load(seq[Is], convert)...})
+        for (bool r : {std::get<Is>(subcasters).load(seq[Is], convert)...})
             if (!r)
                 return false;
         return true;
     }
 
-    static handle cast_impl(const type &, return_value_policy, handle,
-                            index_sequence<>) { return tuple().release(); }
-
     /* Implementation: Convert a C++ tuple into a Python tuple */
-    template <size_t... Is>
-    static handle cast_impl(const type &src, return_value_policy policy, handle parent, index_sequence<Is...>) {
-        std::array<object, size> entries {{
-            reinterpret_steal<object>(make_caster<Tuple>::cast(std::get<Is>(src), policy, parent))...
+    template <typename T, size_t... Is>
+    static handle cast_impl(T &&src, return_value_policy policy, handle parent, index_sequence<Is...>) {
+        std::array<object, size> entries{{
+            reinterpret_steal<object>(make_caster<Ts>::cast(std::get<Is>(std::forward<T>(src)), policy, parent))...
         }};
         for (const auto &entry: entries)
             if (!entry)
@@ -832,72 +1355,66 @@ protected:
         return result.release();
     }
 
-    std::tuple<make_caster<Tuple>...> value;
+    Tuple<make_caster<Ts>...> subcasters;
+};
+
+template <typename T1, typename T2> class type_caster<std::pair<T1, T2>>
+    : public tuple_caster<std::pair, T1, T2> {};
+
+template <typename... Ts> class type_caster<std::tuple<Ts...>>
+    : public tuple_caster<std::tuple, Ts...> {};
+
+/// Helper class which abstracts away certain actions. Users can provide specializations for
+/// custom holders, but it's only necessary if the type has a non-standard interface.
+template <typename T>
+struct holder_helper {
+    static auto get(const T &p) -> decltype(p.get()) { return p.get(); }
 };
 
 /// Type caster for holder types like std::shared_ptr, etc.
-template <typename type, typename holder_type> class type_caster_holder : public type_caster_base<type> {
+template <typename type, typename holder_type>
+struct copyable_holder_caster : public type_caster_base<type> {
 public:
     using base = type_caster_base<type>;
+    static_assert(std::is_base_of<base, type_caster<type>>::value,
+            "Holder classes are only supported for custom types");
     using base::base;
     using base::cast;
     using base::typeinfo;
     using base::value;
-    using base::temp;
 
-    PYBIND11_NOINLINE bool load(handle src, bool convert) {
-        return load(src, convert, Py_TYPE(src.ptr()));
+    bool load(handle src, bool convert) {
+        return base::template load_impl<copyable_holder_caster<type, holder_type>>(src, convert);
     }
 
-    bool load(handle src, bool convert, PyTypeObject *tobj) {
-        if (!src || !typeinfo)
-            return false;
-        if (src.is_none()) {
-            value = nullptr;
-            return true;
-        }
-
-        if (typeinfo->simple_type) { /* Case 1: no multiple inheritance etc. involved */
-            /* Check if we can safely perform a reinterpret-style cast */
-            if (PyType_IsSubtype(tobj, typeinfo->type))
-                return load_value_and_holder(src);
-        } else { /* Case 2: multiple inheritance */
-            /* Check if we can safely perform a reinterpret-style cast */
-            if (tobj == typeinfo->type)
-                return load_value_and_holder(src);
-
-            /* If this is a python class, also check the parents recursively */
-            auto const &type_dict = get_internals().registered_types_py;
-            bool new_style_class = PyType_Check((PyObject *) tobj);
-            if (type_dict.find(tobj) == type_dict.end() && new_style_class && tobj->tp_bases) {
-                auto parents = reinterpret_borrow<tuple>(tobj->tp_bases);
-                for (handle parent : parents) {
-                    bool result = load(src, convert, (PyTypeObject *) parent.ptr());
-                    if (result)
-                        return true;
-                }
-            }
+    explicit operator type*() { return this->value; }
+    explicit operator type&() { return *(this->value); }
+    explicit operator holder_type*() { return &holder; }
 
-            if (try_implicit_casts(src, convert))
-                return true;
-        }
+    // Workaround for Intel compiler bug
+    // see pybind11 issue 94
+    #if defined(__ICC) || defined(__INTEL_COMPILER)
+    operator holder_type&() { return holder; }
+    #else
+    explicit operator holder_type&() { return holder; }
+    #endif
 
-        if (convert) {
-            for (auto &converter : typeinfo->implicit_conversions) {
-                temp = reinterpret_steal<object>(converter(src.ptr(), typeinfo->type));
-                if (load(temp, false))
-                    return true;
-            }
-        }
+    static handle cast(const holder_type &src, return_value_policy, handle) {
+        const auto *ptr = holder_helper<holder_type>::get(src);
+        return type_caster_base<type>::cast_holder(ptr, &src);
+    }
 
-        return false;
+protected:
+    friend class type_caster_generic;
+    void check_holder_compat() {
+        if (typeinfo->default_holder)
+            throw cast_error("Unable to load a custom holder type from a default-holder instance");
     }
 
-    bool load_value_and_holder(handle src) {
-        auto inst = (instance<type, holder_type> *) src.ptr();
-        value = (void *) inst->value;
-        if (inst->holder_constructed) {
-            holder = inst->holder;
+    bool load_value(value_and_holder &&v_h) {
+        if (v_h.holder_constructed()) {
+            value = v_h.value_ptr();
+            holder = v_h.holder<holder_type>();
             return true;
         } else {
             throw cast_error("Unable to cast from non-held to held instance (T& to Holder<T>) "
@@ -915,7 +1432,7 @@ public:
     template <typename T = holder_type, detail::enable_if_t<std::is_constructible<T, const T &, type*>::value, int> = 0>
     bool try_implicit_casts(handle src, bool convert) {
         for (auto &cast : typeinfo->implicit_casts) {
-            type_caster_holder sub_caster(*cast.first);
+            copyable_holder_caster sub_caster(*cast.first);
             if (sub_caster.load(src, convert)) {
                 value = cast.second(sub_caster.value);
                 holder = holder_type(sub_caster.holder, (type *) value);
@@ -925,32 +1442,36 @@ public:
         return false;
     }
 
-    explicit operator type*() { return this->value; }
-    explicit operator type&() { return *(this->value); }
-    explicit operator holder_type*() { return &holder; }
-
-    // Workaround for Intel compiler bug
-    // see pybind11 issue 94
-    #if defined(__ICC) || defined(__INTEL_COMPILER)
-    operator holder_type&() { return holder; }
-    #else
-    explicit operator holder_type&() { return holder; }
-    #endif
+    static bool try_direct_conversions(handle) { return false; }
 
-    static handle cast(const holder_type &src, return_value_policy, handle) {
-        return type_caster_generic::cast(
-            src.get(), return_value_policy::take_ownership, handle(),
-            src.get() ? &typeid(*src.get()) : nullptr, &typeid(type),
-            nullptr, nullptr, &src);
-    }
 
-protected:
     holder_type holder;
 };
 
 /// Specialize for the common std::shared_ptr, so users don't need to
 template <typename T>
-class type_caster<std::shared_ptr<T>> : public type_caster_holder<T, std::shared_ptr<T>> { };
+class type_caster<std::shared_ptr<T>> : public copyable_holder_caster<T, std::shared_ptr<T>> { };
+
+template <typename type, typename holder_type>
+struct move_only_holder_caster {
+    static_assert(std::is_base_of<type_caster_base<type>, type_caster<type>>::value,
+            "Holder classes are only supported for custom types");
+
+    static handle cast(holder_type &&src, return_value_policy, handle) {
+        auto *ptr = holder_helper<holder_type>::get(src);
+        return type_caster_base<type>::cast_holder(ptr, &src);
+    }
+    static PYBIND11_DESCR name() { return type_caster_base<type>::name(); }
+};
+
+template <typename type, typename deleter>
+class type_caster<std::unique_ptr<type, deleter>>
+    : public move_only_holder_caster<type, std::unique_ptr<type, deleter>> { };
+
+template <typename type, typename holder_type>
+using type_caster_holder = conditional_t<is_copy_constructible<holder_type>::value,
+                                         copyable_holder_caster<type, holder_type>,
+                                         move_only_holder_caster<type, holder_type>>;
 
 template <typename T, bool Value = false> struct always_construct_holder { static constexpr bool value = Value; };
 
@@ -1007,13 +1528,13 @@ class type_caster<T, enable_if_t<is_pyobject<T>::value>> : public pyobject_caste
 // - if the type is non-copy-constructible, the object must be the sole owner of the type (i.e. it
 //   must have ref_count() == 1)h
 // If any of the above are not satisfied, we fall back to copying.
-template <typename T> using move_is_plain_type = none_of<
-    std::is_void<T>, std::is_pointer<T>, std::is_reference<T>, std::is_const<T>
+template <typename T> using move_is_plain_type = satisfies_none_of<T,
+    std::is_void, std::is_pointer, std::is_reference, std::is_const
 >;
 template <typename T, typename SFINAE = void> struct move_always : std::false_type {};
 template <typename T> struct move_always<T, enable_if_t<all_of<
     move_is_plain_type<T>,
-    negation<std::is_copy_constructible<T>>,
+    negation<is_copy_constructible<T>>,
     std::is_move_constructible<T>,
     std::is_same<decltype(std::declval<make_caster<T>>().operator T&()), T&>
 >::value>> : std::true_type {};
@@ -1035,6 +1556,17 @@ template <typename type> using cast_is_temporary_value_reference = bool_constant
     !std::is_base_of<type_caster_generic, make_caster<type>>::value
 >;
 
+// When a value returned from a C++ function is being cast back to Python, we almost always want to
+// force `policy = move`, regardless of the return value policy the function/method was declared
+// with.  Some classes (most notably Eigen::Ref and related) need to avoid this, and so can do so by
+// specializing this struct.
+template <typename Return, typename SFINAE = void> struct return_value_policy_override {
+    static return_value_policy policy(return_value_policy p) {
+        return !std::is_lvalue_reference<Return>::value && !std::is_pointer<Return>::value
+            ? return_value_policy::move : p;
+    }
+};
+
 // Basic python -> C++ casting; throws if casting fails
 template <typename T, typename SFINAE> type_caster<T, SFINAE> &load_type(type_caster<T, SFINAE> &conv, const handle &handle) {
     if (!conv.load(handle, true)) {
@@ -1153,18 +1685,19 @@ NAMESPACE_END(detail)
 
 template <return_value_policy policy = return_value_policy::automatic_reference,
           typename... Args> tuple make_tuple(Args&&... args_) {
-    const size_t size = sizeof...(Args);
+    constexpr size_t size = sizeof...(Args);
     std::array<object, size> args {
         { reinterpret_steal<object>(detail::make_caster<Args>::cast(
             std::forward<Args>(args_), policy, nullptr))... }
     };
-    for (auto &arg_value : args) {
-        if (!arg_value) {
+    for (size_t i = 0; i < args.size(); i++) {
+        if (!args[i]) {
 #if defined(NDEBUG)
             throw cast_error("make_tuple(): unable to convert arguments to Python object (compile in debug mode for details)");
 #else
-            throw cast_error("make_tuple(): unable to convert arguments of types '" +
-                (std::string) type_id<std::tuple<Args...>>() + "' to Python object");
+            std::array<std::string, size> argtypes { {type_id<Args>()...} };
+            throw cast_error("make_tuple(): unable to convert argument of type '" +
+                argtypes[i] + "' to Python object");
 #endif
         }
     }
@@ -1175,19 +1708,30 @@ template <return_value_policy policy = return_value_policy::automatic_reference,
     return result;
 }
 
-/// Annotation for keyword arguments
+/// \ingroup annotations
+/// Annotation for arguments
 struct arg {
-    constexpr explicit arg(const char *name) : name(name) { }
+    /// Constructs an argument with the name of the argument; if null or omitted, this is a positional argument.
+    constexpr explicit arg(const char *name = nullptr) : name(name), flag_noconvert(false), flag_none(true) { }
+    /// Assign a value to this argument
     template <typename T> arg_v operator=(T &&value) const;
-
-    const char *name;
+    /// Indicate that the type should not be converted in the type caster
+    arg &noconvert(bool flag = true) { flag_noconvert = flag; return *this; }
+    /// Indicates that the argument should/shouldn't allow None (e.g. for nullable pointer args)
+    arg &none(bool flag = true) { flag_none = flag; return *this; }
+
+    const char *name; ///< If non-null, this is a named kwargs argument
+    bool flag_noconvert : 1; ///< If set, do not allow conversion (requires a supporting type caster!)
+    bool flag_none : 1; ///< If set (the default), allow None to be passed to this argument
 };
 
-/// Annotation for keyword arguments with values
+/// \ingroup annotations
+/// Annotation for arguments with values
 struct arg_v : arg {
+private:
     template <typename T>
-    arg_v(const char *name, T &&x, const char *descr = nullptr)
-        : arg(name),
+    arg_v(arg &&base, T &&x, const char *descr = nullptr)
+        : arg(base),
           value(reinterpret_steal<object>(
               detail::make_caster<T>::cast(x, return_value_policy::automatic, {})
           )),
@@ -1197,108 +1741,128 @@ struct arg_v : arg {
 #endif
     { }
 
+public:
+    /// Direct construction with name, default, and description
+    template <typename T>
+    arg_v(const char *name, T &&x, const char *descr = nullptr)
+        : arg_v(arg(name), std::forward<T>(x), descr) { }
+
+    /// Called internally when invoking `py::arg("a") = value`
+    template <typename T>
+    arg_v(const arg &base, T &&x, const char *descr = nullptr)
+        : arg_v(arg(base), std::forward<T>(x), descr) { }
+
+    /// Same as `arg::noconvert()`, but returns *this as arg_v&, not arg&
+    arg_v &noconvert(bool flag = true) { arg::noconvert(flag); return *this; }
+
+    /// Same as `arg::none()`, but returns *this as arg_v&, not arg&
+    arg_v &none(bool flag = true) { arg::none(flag); return *this; }
+
+    /// The default value
     object value;
+    /// The (optional) description of the default value
     const char *descr;
 #if !defined(NDEBUG)
+    /// The C++ type name of the default value (only available when compiled in debug mode)
     std::string type;
 #endif
 };
 
 template <typename T>
-arg_v arg::operator=(T &&value) const { return {name, std::forward<T>(value)}; }
+arg_v arg::operator=(T &&value) const { return {std::move(*this), std::forward<T>(value)}; }
 
 /// Alias for backward compatibility -- to be removed in version 2.0
 template <typename /*unused*/> using arg_t = arg_v;
 
 inline namespace literals {
-/// String literal version of arg
+/** \rst
+    String literal version of `arg`
+ \endrst */
 constexpr arg operator"" _a(const char *name, size_t) { return arg(name); }
 }
 
 NAMESPACE_BEGIN(detail)
 
+// forward declaration (definition in attr.h)
+struct function_record;
+
+/// Internal data associated with a single function call
+struct function_call {
+    function_call(function_record &f, handle p); // Implementation in attr.h
+
+    /// The function data:
+    const function_record &func;
+
+    /// Arguments passed to the function:
+    std::vector<handle> args;
+
+    /// The `convert` value the arguments should be loaded with
+    std::vector<bool> args_convert;
+
+    /// The parent, if any
+    handle parent;
+
+    /// If this is a call to an initializer, this argument contains `self`
+    handle init_self;
+};
+
+
 /// Helper class which loads arguments for C++ functions called from Python
 template <typename... Args>
 class argument_loader {
-    using itypes = type_list<intrinsic_t<Args>...>;
     using indices = make_index_sequence<sizeof...(Args)>;
 
-public:
-    argument_loader() : value() {} // Helps gcc-7 properly initialize value
+    template <typename Arg> using argument_is_args   = std::is_same<intrinsic_t<Arg>, args>;
+    template <typename Arg> using argument_is_kwargs = std::is_same<intrinsic_t<Arg>, kwargs>;
+    // Get args/kwargs argument positions relative to the end of the argument list:
+    static constexpr auto args_pos = constexpr_first<argument_is_args, Args...>() - (int) sizeof...(Args),
+                        kwargs_pos = constexpr_first<argument_is_kwargs, Args...>() - (int) sizeof...(Args);
+
+    static constexpr bool args_kwargs_are_last = kwargs_pos >= - 1 && args_pos >= kwargs_pos - 1;
 
-    static constexpr auto has_kwargs = std::is_same<itypes, type_list<args, kwargs>>::value;
-    static constexpr auto has_args = has_kwargs || std::is_same<itypes, type_list<args>>::value;
+    static_assert(args_kwargs_are_last, "py::args/py::kwargs are only permitted as the last argument(s) of a function");
+
+public:
+    static constexpr bool has_kwargs = kwargs_pos < 0;
+    static constexpr bool has_args = args_pos < 0;
 
     static PYBIND11_DESCR arg_names() { return detail::concat(make_caster<Args>::name()...); }
 
-    bool load_args(handle args, handle kwargs) {
-        return load_impl(args, kwargs, itypes{});
+    bool load_args(function_call &call) {
+        return load_impl_sequence(call, indices{});
     }
 
-    template <typename Return, typename Func>
-    enable_if_t<!std::is_void<Return>::value, Return> call(Func &&f) {
-        return call_impl<Return>(std::forward<Func>(f), indices{});
+    template <typename Return, typename Guard, typename Func>
+    enable_if_t<!std::is_void<Return>::value, Return> call(Func &&f) && {
+        return std::move(*this).template call_impl<Return>(std::forward<Func>(f), indices{}, Guard{});
     }
 
-    template <typename Return, typename Func>
-    enable_if_t<std::is_void<Return>::value, void_type> call(Func &&f) {
-        call_impl<Return>(std::forward<Func>(f), indices{});
+    template <typename Return, typename Guard, typename Func>
+    enable_if_t<std::is_void<Return>::value, void_type> call(Func &&f) && {
+        std::move(*this).template call_impl<Return>(std::forward<Func>(f), indices{}, Guard{});
         return void_type();
     }
 
 private:
-    bool load_impl(handle args_, handle, type_list<args>) {
-        std::get<0>(value).load(args_, true);
-        return true;
-    }
-
-    bool load_impl(handle args_, handle kwargs_, type_list<args, kwargs>) {
-        std::get<0>(value).load(args_, true);
-        std::get<1>(value).load(kwargs_, true);
-        return true;
-    }
 
-    bool load_impl(handle args, handle, ... /* anything else */) {
-        return load_impl_sequence(args, indices{});
-    }
-
-    static bool load_impl_sequence(handle, index_sequence<>) { return true; }
+    static bool load_impl_sequence(function_call &, index_sequence<>) { return true; }
 
     template <size_t... Is>
-    bool load_impl_sequence(handle src, index_sequence<Is...>) {
-        for (bool r : {std::get<Is>(value).load(PyTuple_GET_ITEM(src.ptr(), Is), true)...})
+    bool load_impl_sequence(function_call &call, index_sequence<Is...>) {
+        for (bool r : {std::get<Is>(argcasters).load(call.args[Is], call.args_convert[Is])...})
             if (!r)
                 return false;
         return true;
     }
 
-    template <typename Return, typename Func, size_t... Is>
-    Return call_impl(Func &&f, index_sequence<Is...>) {
-        return std::forward<Func>(f)(cast_op<Args>(std::get<Is>(value))...);
+    template <typename Return, typename Func, size_t... Is, typename Guard>
+    Return call_impl(Func &&f, index_sequence<Is...>, Guard &&) {
+        return std::forward<Func>(f)(cast_op<Args>(std::move(std::get<Is>(argcasters)))...);
     }
 
-    std::tuple<make_caster<Args>...> value;
+    std::tuple<make_caster<Args>...> argcasters;
 };
 
-NAMESPACE_BEGIN(constexpr_impl)
-/// Implementation details for constexpr functions
-constexpr int first(int i) { return i; }
-template <typename T, typename... Ts>
-constexpr int first(int i, T v, Ts... vs) { return v ? i : first(i + 1, vs...); }
-
-constexpr int last(int /*i*/, int result) { return result; }
-template <typename T, typename... Ts>
-constexpr int last(int i, int result, T v, Ts... vs) { return last(i + 1, v ? i : result, vs...); }
-NAMESPACE_END(constexpr_impl)
-
-/// Return the index of the first type in Ts which satisfies Predicate<T>
-template <template<typename> class Predicate, typename... Ts>
-constexpr int constexpr_first() { return constexpr_impl::first(0, Predicate<Ts>::value...); }
-
-/// Return the index of the last type in Ts which satisfies Predicate<T>
-template <template<typename> class Predicate, typename... Ts>
-constexpr int constexpr_last() { return constexpr_impl::last(0, -1, Predicate<Ts>::value...); }
-
 /// Helper class which collects only positional arguments for a Python function call.
 /// A fancier version below can collect any argument, but this one is optimal for simple calls.
 template <return_value_policy policy>
@@ -1374,6 +1938,13 @@ private:
     }
 
     void process(list &/*args_list*/, arg_v a) {
+        if (!a.name)
+#if defined(NDEBUG)
+            nameless_argument_error();
+#else
+            nameless_argument_error(a.type);
+#endif
+
         if (m_kwargs.contains(a.name)) {
 #if defined(NDEBUG)
             multiple_values_error();
@@ -1406,6 +1977,15 @@ private:
         }
     }
 
+    [[noreturn]] static void nameless_argument_error() {
+        throw type_error("Got kwargs without a name; only named arguments "
+                         "may be passed via py::arg() to a python function call. "
+                         "(compile in debug mode for details)");
+    }
+    [[noreturn]] static void nameless_argument_error(std::string type) {
+        throw type_error("Got kwargs without a name of type '" + type + "'; only named "
+                         "arguments may be passed via py::arg() to a python function call. ");
+    }
     [[noreturn]] static void multiple_values_error() {
         throw type_error("Got multiple values for keyword argument "
                          "(compile in debug mode for details)");
@@ -1470,4 +2050,4 @@ NAMESPACE_END(detail)
         template<> class type_caster<Type> : public type_caster_base<Type> { }; \
     }}
 
-NAMESPACE_END(pybind11)
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/chrono.h b/pybind11/include/pybind11/chrono.h
index 2b37f56f1..95ada76e0 100644
--- a/pybind11/include/pybind11/chrono.h
+++ b/pybind11/include/pybind11/chrono.h
@@ -27,7 +27,7 @@
 #define PyDateTime_DELTA_GET_MICROSECONDS(o) (((PyDateTime_Delta*)o)->microseconds)
 #endif
 
-NAMESPACE_BEGIN(pybind11)
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 NAMESPACE_BEGIN(detail)
 
 template <typename type> class duration_caster {
@@ -85,9 +85,11 @@ public:
         using ss_t = duration<int, std::ratio<1>>;
         using us_t = duration<int, std::micro>;
 
-        return PyDelta_FromDSU(duration_cast<dd_t>(d).count(),
-                               duration_cast<ss_t>(d % days(1)).count(),
-                               duration_cast<us_t>(d % seconds(1)).count());
+        auto dd = duration_cast<dd_t>(d);
+        auto subd = d - dd;
+        auto ss = duration_cast<ss_t>(subd);
+        auto us = duration_cast<us_t>(subd - ss);
+        return PyDelta_FromDSU(dd.count(), ss.count(), us.count());
     }
 
     PYBIND11_TYPE_CASTER(type, _("datetime.timedelta"));
@@ -157,4 +159,4 @@ template <typename Rep, typename Period> class type_caster<std::chrono::duration
 };
 
 NAMESPACE_END(detail)
-NAMESPACE_END(pybind11)
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/common.h b/pybind11/include/pybind11/common.h
index c0c71b131..6c8a4f1e8 100644
--- a/pybind11/include/pybind11/common.h
+++ b/pybind11/include/pybind11/common.h
@@ -1,617 +1,2 @@
-/*
-    pybind11/common.h -- Basic macros
-
-    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
-
-    All rights reserved. Use of this source code is governed by a
-    BSD-style license that can be found in the LICENSE file.
-*/
-
-#pragma once
-
-#if !defined(NAMESPACE_BEGIN)
-#  define NAMESPACE_BEGIN(name) namespace name {
-#endif
-#if !defined(NAMESPACE_END)
-#  define NAMESPACE_END(name) }
-#endif
-
-// Neither MSVC nor Intel support enough of C++14 yet (in particular, as of MSVC 2015 and ICC 17
-// beta, neither support extended constexpr, which we rely on in descr.h), so don't enable pybind
-// CPP14 features for them.
-#if !defined(_MSC_VER) && !defined(__INTEL_COMPILER)
-#  if __cplusplus >= 201402L
-#    define PYBIND11_CPP14
-#    if __cplusplus > 201402L /* Temporary: should be updated to >= the final C++17 value once known */
-#      define PYBIND11_CPP17
-#    endif
-#  endif
-#endif
-
-#if !defined(PYBIND11_EXPORT)
-#  if defined(WIN32) || defined(_WIN32)
-#    define PYBIND11_EXPORT __declspec(dllexport)
-#  else
-#    define PYBIND11_EXPORT __attribute__ ((visibility("default")))
-#  endif
-#endif
-
-#if defined(_MSC_VER)
-#  define PYBIND11_NOINLINE __declspec(noinline)
-#else
-#  define PYBIND11_NOINLINE __attribute__ ((noinline))
-#endif
-
-#if defined(PYBIND11_CPP14)
-#  define PYBIND11_DEPRECATED(reason) [[deprecated(reason)]]
-#elif defined(__clang__)
-#  define PYBIND11_DEPRECATED(reason) __attribute__((deprecated(reason)))
-#elif defined(__GNUG__)
-#  define PYBIND11_DEPRECATED(reason) __attribute__((deprecated))
-#elif defined(_MSC_VER)
-#  define PYBIND11_DEPRECATED(reason) __declspec(deprecated)
-#endif
-
-#define PYBIND11_VERSION_MAJOR 2
-#define PYBIND11_VERSION_MINOR 0
-#define PYBIND11_VERSION_PATCH 1
-
-/// Include Python header, disable linking to pythonX_d.lib on Windows in debug mode
-#if defined(_MSC_VER)
-#  define HAVE_ROUND
-#  pragma warning(push)
-#  pragma warning(disable: 4510 4610 4512 4005)
-#  if _DEBUG
-#    define PYBIND11_DEBUG_MARKER
-#    undef _DEBUG
-#  endif
-#endif
-
-#include <Python.h>
-#include <frameobject.h>
-#include <pythread.h>
-
-#if defined(_WIN32) && (defined(min) || defined(max))
-#  error Macro clash with min and max -- define NOMINMAX when compiling your program on Windows
-#endif
-
-#if defined(isalnum)
-#  undef isalnum
-#  undef isalpha
-#  undef islower
-#  undef isspace
-#  undef isupper
-#  undef tolower
-#  undef toupper
-#endif
-
-#if defined(_MSC_VER)
-#  if defined(PYBIND11_DEBUG_MARKER)
-#    define _DEBUG
-#    undef PYBIND11_DEBUG_MARKER
-#  endif
-#  pragma warning(pop)
-#endif
-
-#include <cstddef>
-#include <forward_list>
-#include <vector>
-#include <string>
-#include <stdexcept>
-#include <unordered_set>
-#include <unordered_map>
-#include <memory>
-#include <typeindex>
-#include <type_traits>
-
-#if PY_MAJOR_VERSION >= 3 /// Compatibility macros for various Python versions
-#define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) PyInstanceMethod_New(ptr)
-#define PYBIND11_BYTES_CHECK PyBytes_Check
-#define PYBIND11_BYTES_FROM_STRING PyBytes_FromString
-#define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyBytes_FromStringAndSize
-#define PYBIND11_BYTES_AS_STRING_AND_SIZE PyBytes_AsStringAndSize
-#define PYBIND11_BYTES_AS_STRING PyBytes_AsString
-#define PYBIND11_LONG_CHECK(o) PyLong_Check(o)
-#define PYBIND11_LONG_AS_LONGLONG(o) PyLong_AsLongLong(o)
-#define PYBIND11_LONG_AS_UNSIGNED_LONGLONG(o) PyLong_AsUnsignedLongLong(o)
-#define PYBIND11_BYTES_NAME "bytes"
-#define PYBIND11_STRING_NAME "str"
-#define PYBIND11_SLICE_OBJECT PyObject
-#define PYBIND11_FROM_STRING PyUnicode_FromString
-#define PYBIND11_STR_TYPE ::pybind11::str
-#define PYBIND11_OB_TYPE(ht_type) (ht_type).ob_base.ob_base.ob_type
-#define PYBIND11_PLUGIN_IMPL(name) \
-    extern "C" PYBIND11_EXPORT PyObject *PyInit_##name()
-#else
-#define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) PyMethod_New(ptr, nullptr, class_)
-#define PYBIND11_BYTES_CHECK PyString_Check
-#define PYBIND11_BYTES_FROM_STRING PyString_FromString
-#define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyString_FromStringAndSize
-#define PYBIND11_BYTES_AS_STRING_AND_SIZE PyString_AsStringAndSize
-#define PYBIND11_BYTES_AS_STRING PyString_AsString
-#define PYBIND11_LONG_CHECK(o) (PyInt_Check(o) || PyLong_Check(o))
-#define PYBIND11_LONG_AS_LONGLONG(o) (PyInt_Check(o) ? (long long) PyLong_AsLong(o) : PyLong_AsLongLong(o))
-#define PYBIND11_LONG_AS_UNSIGNED_LONGLONG(o) (PyInt_Check(o) ? (unsigned long long) PyLong_AsUnsignedLong(o) : PyLong_AsUnsignedLongLong(o))
-#define PYBIND11_BYTES_NAME "str"
-#define PYBIND11_STRING_NAME "unicode"
-#define PYBIND11_SLICE_OBJECT PySliceObject
-#define PYBIND11_FROM_STRING PyString_FromString
-#define PYBIND11_STR_TYPE ::pybind11::bytes
-#define PYBIND11_OB_TYPE(ht_type) (ht_type).ob_type
-#define PYBIND11_PLUGIN_IMPL(name) \
-    extern "C" PYBIND11_EXPORT PyObject *init##name()
-#endif
-
-#if PY_VERSION_HEX >= 0x03050000 && PY_VERSION_HEX < 0x03050200
-extern "C" {
-    struct _Py_atomic_address { void *value; };
-    PyAPI_DATA(_Py_atomic_address) _PyThreadState_Current;
-}
-#endif
-
-#define PYBIND11_TRY_NEXT_OVERLOAD ((PyObject *) 1) // special failure return code
-#define PYBIND11_STRINGIFY(x) #x
-#define PYBIND11_TOSTRING(x) PYBIND11_STRINGIFY(x)
-#define PYBIND11_INTERNALS_ID "__pybind11_" \
-    PYBIND11_TOSTRING(PYBIND11_VERSION_MAJOR) "_" PYBIND11_TOSTRING(PYBIND11_VERSION_MINOR) "__"
-
-#define PYBIND11_PLUGIN(name)                                                  \
-    static PyObject *pybind11_init();                                          \
-    PYBIND11_PLUGIN_IMPL(name) {                                               \
-        int major, minor;                                                      \
-        if (sscanf(Py_GetVersion(), "%i.%i", &major, &minor) != 2) {           \
-            PyErr_SetString(PyExc_ImportError, "Can't parse Python version."); \
-            return nullptr;                                                    \
-        } else if (major != PY_MAJOR_VERSION || minor != PY_MINOR_VERSION) {   \
-            PyErr_Format(PyExc_ImportError,                                    \
-                         "Python version mismatch: module was compiled for "   \
-                         "version %i.%i, while the interpreter is running "    \
-                         "version %i.%i.", PY_MAJOR_VERSION, PY_MINOR_VERSION, \
-                         major, minor);                                        \
-            return nullptr;                                                    \
-        }                                                                      \
-        try {                                                                  \
-            return pybind11_init();                                            \
-        } catch (const std::exception &e) {                                    \
-            PyErr_SetString(PyExc_ImportError, e.what());                      \
-            return nullptr;                                                    \
-        }                                                                      \
-    }                                                                          \
-    PyObject *pybind11_init()
-
-// Function return value and argument type deduction support.  When compiling under C++17 these
-// differ as C++17 makes the noexcept specifier part of the function type, while it is not part of
-// the type under earlier standards.
-#ifdef __cpp_noexcept_function_type
-#  define PYBIND11_NOEXCEPT_TPL_ARG , bool NoExceptions
-#  define PYBIND11_NOEXCEPT_SPECIFIER noexcept(NoExceptions)
-#else
-#  define PYBIND11_NOEXCEPT_TPL_ARG
-#  define PYBIND11_NOEXCEPT_SPECIFIER
-#endif
-
-NAMESPACE_BEGIN(pybind11)
-
-using ssize_t = Py_ssize_t;
-using size_t  = std::size_t;
-
-/// Approach used to cast a previously unknown C++ instance into a Python object
-enum class return_value_policy : uint8_t {
-    /** This is the default return value policy, which falls back to the policy
-        return_value_policy::take_ownership when the return value is a pointer.
-        Otherwise, it uses return_value::move or return_value::copy for rvalue
-        and lvalue references, respectively. See below for a description of what
-        all of these different policies do. */
-    automatic = 0,
-
-    /** As above, but use policy return_value_policy::reference when the return
-        value is a pointer. This is the default conversion policy for function
-        arguments when calling Python functions manually from C++ code (i.e. via
-        handle::operator()). You probably won't need to use this. */
-    automatic_reference,
-
-    /** Reference an existing object (i.e. do not create a new copy) and take
-        ownership. Python will call the destructor and delete operator when the
-        object’s reference count reaches zero. Undefined behavior ensues when
-        the C++ side does the same.. */
-    take_ownership,
-
-    /** Create a new copy of the returned object, which will be owned by
-        Python. This policy is comparably safe because the lifetimes of the two
-        instances are decoupled. */
-    copy,
-
-    /** Use std::move to move the return value contents into a new instance
-        that will be owned by Python. This policy is comparably safe because the
-        lifetimes of the two instances (move source and destination) are
-        decoupled. */
-    move,
-
-    /** Reference an existing object, but do not take ownership. The C++ side
-        is responsible for managing the object’s lifetime and deallocating it
-        when it is no longer used. Warning: undefined behavior will ensue when
-        the C++ side deletes an object that is still referenced and used by
-        Python. */
-    reference,
-
-    /** This policy only applies to methods and properties. It references the
-        object without taking ownership similar to the above
-        return_value_policy::reference policy. In contrast to that policy, the
-        function or property’s implicit this argument (called the parent) is
-        considered to be the the owner of the return value (the child).
-        pybind11 then couples the lifetime of the parent to the child via a
-        reference relationship that ensures that the parent cannot be garbage
-        collected while Python is still using the child. More advanced
-        variations of this scheme are also possible using combinations of
-        return_value_policy::reference and the keep_alive call policy */
-    reference_internal
-};
-
-/// Information record describing a Python buffer object
-struct buffer_info {
-    void *ptr = nullptr;         // Pointer to the underlying storage
-    size_t itemsize = 0;         // Size of individual items in bytes
-    size_t size = 0;             // Total number of entries
-    std::string format;          // For homogeneous buffers, this should be set to format_descriptor<T>::format()
-    size_t ndim = 0;             // Number of dimensions
-    std::vector<size_t> shape;   // Shape of the tensor (1 entry per dimension)
-    std::vector<size_t> strides; // Number of entries between adjacent entries (for each per dimension)
-
-    buffer_info() { }
-
-    buffer_info(void *ptr, size_t itemsize, const std::string &format, size_t ndim,
-                const std::vector<size_t> &shape, const std::vector<size_t> &strides)
-        : ptr(ptr), itemsize(itemsize), size(1), format(format),
-          ndim(ndim), shape(shape), strides(strides) {
-        for (size_t i = 0; i < ndim; ++i)
-            size *= shape[i];
-    }
-
-    buffer_info(void *ptr, size_t itemsize, const std::string &format, size_t size)
-    : buffer_info(ptr, itemsize, format, 1, std::vector<size_t> { size },
-                  std::vector<size_t> { itemsize }) { }
-
-    explicit buffer_info(Py_buffer *view, bool ownview = true)
-        : ptr(view->buf), itemsize((size_t) view->itemsize), size(1), format(view->format),
-          ndim((size_t) view->ndim), shape((size_t) view->ndim), strides((size_t) view->ndim), view(view), ownview(ownview) {
-        for (size_t i = 0; i < (size_t) view->ndim; ++i) {
-            shape[i] = (size_t) view->shape[i];
-            strides[i] = (size_t) view->strides[i];
-            size *= shape[i];
-        }
-    }
-
-    buffer_info(const buffer_info &) = delete;
-    buffer_info& operator=(const buffer_info &) = delete;
-
-    buffer_info(buffer_info &&other) {
-        (*this) = std::move(other);
-    }
-
-    buffer_info& operator=(buffer_info &&rhs) {
-        ptr = rhs.ptr;
-        itemsize = rhs.itemsize;
-        size = rhs.size;
-        format = std::move(rhs.format);
-        ndim = rhs.ndim;
-        shape = std::move(rhs.shape);
-        strides = std::move(rhs.strides);
-        std::swap(view, rhs.view);
-        std::swap(ownview, rhs.ownview);
-        return *this;
-    }
-
-    ~buffer_info() {
-        if (view && ownview) { PyBuffer_Release(view); delete view; }
-    }
-
-private:
-    Py_buffer *view = nullptr;
-    bool ownview = false;
-};
-
-NAMESPACE_BEGIN(detail)
-
-inline static constexpr int log2(size_t n, int k = 0) { return (n <= 1) ? k : log2(n >> 1, k + 1); }
-
-inline std::string error_string();
-
-/// Core part of the 'instance' type which POD (needed to be able to use 'offsetof')
-template <typename type> struct instance_essentials {
-    PyObject_HEAD
-    type *value;
-    PyObject *weakrefs;
-    bool owned : 1;
-    bool holder_constructed : 1;
-};
-
-/// PyObject wrapper around generic types, includes a special holder type that is responsible for lifetime management
-template <typename type, typename holder_type = std::unique_ptr<type>> struct instance : instance_essentials<type> {
-    holder_type holder;
-};
-
-struct overload_hash {
-    inline size_t operator()(const std::pair<const PyObject *, const char *>& v) const {
-        size_t value = std::hash<const void *>()(v.first);
-        value ^= std::hash<const void *>()(v.second)  + 0x9e3779b9 + (value<<6) + (value>>2);
-        return value;
-    }
-};
-
-/// Internal data struture used to track registered instances and types
-struct internals {
-    std::unordered_map<std::type_index, void*> registered_types_cpp;   // std::type_index -> type_info
-    std::unordered_map<const void *, void*> registered_types_py;       // PyTypeObject* -> type_info
-    std::unordered_multimap<const void *, void*> registered_instances; // void * -> PyObject*
-    std::unordered_set<std::pair<const PyObject *, const char *>, overload_hash> inactive_overload_cache;
-    std::unordered_map<std::type_index, std::vector<bool (*)(PyObject *, void *&)>> direct_conversions;
-    std::forward_list<void (*) (std::exception_ptr)> registered_exception_translators;
-    std::unordered_map<std::string, void *> shared_data; // Custom data to be shared across extensions
-#if defined(WITH_THREAD)
-    decltype(PyThread_create_key()) tstate = 0; // Usually an int but a long on Cygwin64 with Python 3.x
-    PyInterpreterState *istate = nullptr;
-#endif
-};
-
-/// Return a reference to the current 'internals' information
-inline internals &get_internals();
-
-/// from __cpp_future__ import (convenient aliases from C++14/17)
-#ifdef PYBIND11_CPP14
-using std::enable_if_t;
-using std::conditional_t;
-#else
-template <bool B, typename T = void> using enable_if_t = typename std::enable_if<B, T>::type;
-template <bool B, typename T, typename F> using conditional_t = typename std::conditional<B, T, F>::type;
-#endif
-
-/// Index sequences
-#if defined(PYBIND11_CPP14) || defined(_MSC_VER)
-using std::index_sequence;
-using std::make_index_sequence;
-#else
-template<size_t ...> struct index_sequence  { };
-template<size_t N, size_t ...S> struct make_index_sequence_impl : make_index_sequence_impl <N - 1, N - 1, S...> { };
-template<size_t ...S> struct make_index_sequence_impl <0, S...> { typedef index_sequence<S...> type; };
-template<size_t N> using make_index_sequence = typename make_index_sequence_impl<N>::type;
-#endif
-
-#if defined(PYBIND11_CPP17) || defined(_MSC_VER)
-using std::bool_constant;
-using std::negation;
-#else
-template <bool B> using bool_constant = std::integral_constant<bool, B>;
-template <class T> using negation = bool_constant<!T::value>;
-#endif
-
-/// Compile-time all/any/none of that check the ::value of all template types
-#ifdef PYBIND11_CPP17
-template <class... Ts> using all_of = bool_constant<(Ts::value && ...)>;
-template <class... Ts> using any_of = bool_constant<(Ts::value || ...)>;
-#elif !defined(_MSC_VER)
-template <bool...> struct bools {};
-template <class... Ts> using all_of = std::is_same<
-    bools<Ts::value..., true>,
-    bools<true, Ts::value...>>;
-template <class... Ts> using any_of = negation<all_of<negation<Ts>...>>;
-#else
-// MSVC has trouble with the above, but supports std::conjunction, which we can use instead (albeit
-// at a slight loss of compilation efficiency).
-template <class... Ts> using all_of = std::conjunction<Ts...>;
-template <class... Ts> using any_of = std::disjunction<Ts...>;
-#endif
-template <class... Ts> using none_of = negation<any_of<Ts...>>;
-
-/// Strip the class from a method type
-template <typename T> struct remove_class { };
-template <typename C, typename R, typename... A> struct remove_class<R (C::*)(A...)> { typedef R type(A...); };
-template <typename C, typename R, typename... A> struct remove_class<R (C::*)(A...) const> { typedef R type(A...); };
-
-/// Helper template to strip away type modifiers
-template <typename T> struct intrinsic_type                       { typedef T type; };
-template <typename T> struct intrinsic_type<const T>              { typedef typename intrinsic_type<T>::type type; };
-template <typename T> struct intrinsic_type<T*>                   { typedef typename intrinsic_type<T>::type type; };
-template <typename T> struct intrinsic_type<T&>                   { typedef typename intrinsic_type<T>::type type; };
-template <typename T> struct intrinsic_type<T&&>                  { typedef typename intrinsic_type<T>::type type; };
-template <typename T, size_t N> struct intrinsic_type<const T[N]> { typedef typename intrinsic_type<T>::type type; };
-template <typename T, size_t N> struct intrinsic_type<T[N]>       { typedef typename intrinsic_type<T>::type type; };
-template <typename T> using intrinsic_t = typename intrinsic_type<T>::type;
-
-/// Helper type to replace 'void' in some expressions
-struct void_type { };
-
-/// Helper template which holds a list of types
-template <typename...> struct type_list { };
-
-/// Compile-time integer sum
-constexpr size_t constexpr_sum() { return 0; }
-template <typename T, typename... Ts>
-constexpr size_t constexpr_sum(T n, Ts... ns) { return size_t{n} + constexpr_sum(ns...); }
-
-// Extracts the first type from the template parameter pack matching the predicate, or Default if none match.
-template <template<class> class Predicate, class Default, class... Ts> struct first_of;
-template <template<class> class Predicate, class Default> struct first_of<Predicate, Default> {
-    using type = Default;
-};
-template <template<class> class Predicate, class Default, class T, class... Ts>
-struct first_of<Predicate, Default, T, Ts...> {
-    using type = typename std::conditional<
-        Predicate<T>::value,
-        T,
-        typename first_of<Predicate, Default, Ts...>::type
-    >::type;
-};
-template <template<class> class Predicate, class Default, class... T> using first_of_t = typename first_of<Predicate, Default, T...>::type;
-
-/// Defer the evaluation of type T until types Us are instantiated
-template <typename T, typename... /*Us*/> struct deferred_type { using type = T; };
-template <typename T, typename... Us> using deferred_t = typename deferred_type<T, Us...>::type;
-
-template <template<typename...> class Base>
-struct is_template_base_of_impl {
-    template <typename... Us> static std::true_type check(Base<Us...> *);
-    static std::false_type check(...);
-};
-
-/// Check if a template is the base of a type. For example:
-/// `is_template_base_of<Base, T>` is true if `struct T : Base<U> {}` where U can be anything
-template <template<typename...> class Base, typename T>
-#if !defined(_MSC_VER)
-using is_template_base_of = decltype(is_template_base_of_impl<Base>::check((T*)nullptr));
-#else // MSVC2015 has trouble with decltype in template aliases
-struct is_template_base_of : decltype(is_template_base_of_impl<Base>::check((T*)nullptr)) { };
-#endif
-
-/// Check if T is std::shared_ptr<U> where U can be anything
-template <typename T> struct is_shared_ptr : std::false_type { };
-template <typename U> struct is_shared_ptr<std::shared_ptr<U>> : std::true_type { };
-
-/// Ignore that a variable is unused in compiler warnings
-inline void ignore_unused(const int *) { }
-
-NAMESPACE_END(detail)
-
-/// Returns a named pointer that is shared among all extension modules (using the same
-/// pybind11 version) running in the current interpreter. Names starting with underscores
-/// are reserved for internal usage. Returns `nullptr` if no matching entry was found.
-inline PYBIND11_NOINLINE void* get_shared_data(const std::string& name) {
-    auto& internals = detail::get_internals();
-    auto it = internals.shared_data.find(name);
-    return it != internals.shared_data.end() ? it->second : nullptr;
-}
-
-/// Set the shared data that can be later recovered by `get_shared_data()`.
-inline PYBIND11_NOINLINE void *set_shared_data(const std::string& name, void *data) {
-    detail::get_internals().shared_data[name] = data;
-    return data;
-}
-
-/// Returns a typed reference to a shared data entry (by using `get_shared_data()`) if
-/// such entry exists. Otherwise, a new object of default-constructible type `T` is
-/// added to the shared data under the given name and a reference to it is returned.
-template<typename T> T& get_or_create_shared_data(const std::string& name) {
-    auto& internals = detail::get_internals();
-    auto it = internals.shared_data.find(name);
-    T* ptr = (T*) (it != internals.shared_data.end() ? it->second : nullptr);
-    if (!ptr) {
-        ptr = new T();
-        internals.shared_data[name] = ptr;
-    }
-    return *ptr;
-}
-
-/// Fetch and hold an error which was already set in Python
-class error_already_set : public std::runtime_error {
-public:
-    error_already_set() : std::runtime_error(detail::error_string()) {
-        PyErr_Fetch(&type, &value, &trace);
-    }
-
-    error_already_set(const error_already_set &) = delete;
-
-    error_already_set(error_already_set &&e)
-        : std::runtime_error(e.what()), type(e.type), value(e.value),
-          trace(e.trace) { e.type = e.value = e.trace = nullptr; }
-
-    inline ~error_already_set(); // implementation in pybind11.h
-
-    error_already_set& operator=(const error_already_set &) = delete;
-
-    /// Give the error back to Python
-    void restore() { PyErr_Restore(type, value, trace); type = value = trace = nullptr; }
-
-private:
-    PyObject *type, *value, *trace;
-};
-
-/// C++ bindings of builtin Python exceptions
-class builtin_exception : public std::runtime_error {
-public:
-    using std::runtime_error::runtime_error;
-    virtual void set_error() const = 0; /// Set the error using the Python C API
-};
-
-#define PYBIND11_RUNTIME_EXCEPTION(name, type) \
-    class name : public builtin_exception { public: \
-        using builtin_exception::builtin_exception; \
-        name() : name("") { } \
-        void set_error() const override { PyErr_SetString(type, what()); } \
-    };
-
-PYBIND11_RUNTIME_EXCEPTION(stop_iteration, PyExc_StopIteration)
-PYBIND11_RUNTIME_EXCEPTION(index_error, PyExc_IndexError)
-PYBIND11_RUNTIME_EXCEPTION(key_error, PyExc_KeyError)
-PYBIND11_RUNTIME_EXCEPTION(value_error, PyExc_ValueError)
-PYBIND11_RUNTIME_EXCEPTION(type_error, PyExc_TypeError)
-PYBIND11_RUNTIME_EXCEPTION(cast_error, PyExc_RuntimeError) /// Thrown when pybind11::cast or handle::call fail due to a type casting error
-PYBIND11_RUNTIME_EXCEPTION(reference_cast_error, PyExc_RuntimeError) /// Used internally
-
-[[noreturn]] PYBIND11_NOINLINE inline void pybind11_fail(const char *reason) { throw std::runtime_error(reason); }
-[[noreturn]] PYBIND11_NOINLINE inline void pybind11_fail(const std::string &reason) { throw std::runtime_error(reason); }
-
-/// Format strings for basic number types
-#define PYBIND11_DECL_FMT(t, v) template<> struct format_descriptor<t> \
-    { static constexpr const char* value = v; /* for backwards compatibility */ \
-      static std::string format() { return value; } }
-
-template <typename T, typename SFINAE = void> struct format_descriptor { };
-
-template <typename T> struct format_descriptor<T, detail::enable_if_t<std::is_integral<T>::value>> {
-    static constexpr const char c = "bBhHiIqQ"[detail::log2(sizeof(T))*2 + std::is_unsigned<T>::value];
-    static constexpr const char value[2] = { c, '\0' };
-    static std::string format() { return std::string(1, c); }
-};
-
-template <typename T> constexpr const char format_descriptor<
-    T, detail::enable_if_t<std::is_integral<T>::value>>::value[2];
-
-/// RAII wrapper that temporarily clears any Python error state
-struct error_scope {
-    PyObject *type, *value, *trace;
-    error_scope() { PyErr_Fetch(&type, &value, &trace); }
-    ~error_scope() { PyErr_Restore(type, value, trace); }
-};
-
-PYBIND11_DECL_FMT(float, "f");
-PYBIND11_DECL_FMT(double, "d");
-PYBIND11_DECL_FMT(bool, "?");
-
-/// Dummy destructor wrapper that can be used to expose classes with a private destructor
-struct nodelete { template <typename T> void operator()(T*) { } };
-
-// overload_cast requires variable templates: C++14 or MSVC 2015 Update 2
-#if defined(PYBIND11_CPP14) || _MSC_FULL_VER >= 190023918
-#define PYBIND11_OVERLOAD_CAST 1
-
-NAMESPACE_BEGIN(detail)
-template <typename... Args>
-struct overload_cast_impl {
-    template <typename Return /*,*/ PYBIND11_NOEXCEPT_TPL_ARG>
-    constexpr auto operator()(Return (*pf)(Args...) PYBIND11_NOEXCEPT_SPECIFIER) const noexcept
-                              -> decltype(pf) { return pf; }
-
-    template <typename Return, typename Class /*,*/ PYBIND11_NOEXCEPT_TPL_ARG>
-    constexpr auto operator()(Return (Class::*pmf)(Args...) PYBIND11_NOEXCEPT_SPECIFIER, std::false_type = {}) const noexcept
-                              -> decltype(pmf) { return pmf; }
-
-    template <typename Return, typename Class /*,*/ PYBIND11_NOEXCEPT_TPL_ARG>
-    constexpr auto operator()(Return (Class::*pmf)(Args...) const PYBIND11_NOEXCEPT_SPECIFIER, std::true_type) const noexcept
-                              -> decltype(pmf) { return pmf; }
-};
-NAMESPACE_END(detail)
-
-/// Syntax sugar for resolving overloaded function pointers:
-///  - regular: static_cast<Return (Class::*)(Arg0, Arg1, Arg2)>(&Class::func)
-///  - sweet:   overload_cast<Arg0, Arg1, Arg2>(&Class::func)
-template <typename... Args>
-static constexpr detail::overload_cast_impl<Args...> overload_cast = {};
-// MSVC 2015 only accepts this particular initialization syntax for this variable template.
-
-/// Const member function selector for overload_cast
-///  - regular: static_cast<Return (Class::*)(Arg) const>(&Class::func)
-///  - sweet:   overload_cast<Arg>(&Class::func, const_)
-static constexpr auto const_ = std::true_type{};
-
-#endif // overload_cast
-
-NAMESPACE_END(pybind11)
+#include "detail/common.h"
+#warning "Including 'common.h' is deprecated. It will be removed in v3.0. Use 'pybind11.h'."
diff --git a/pybind11/include/pybind11/complex.h b/pybind11/include/pybind11/complex.h
index f767f354c..5dac27cc4 100644
--- a/pybind11/include/pybind11/complex.h
+++ b/pybind11/include/pybind11/complex.h
@@ -17,17 +17,31 @@
 #  undef I
 #endif
 
-NAMESPACE_BEGIN(pybind11)
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 
-PYBIND11_DECL_FMT(std::complex<float>, "Zf");
-PYBIND11_DECL_FMT(std::complex<double>, "Zd");
+template <typename T> struct format_descriptor<std::complex<T>, detail::enable_if_t<std::is_floating_point<T>::value>> {
+    static constexpr const char c = format_descriptor<T>::c;
+    static constexpr const char value[3] = { 'Z', c, '\0' };
+    static std::string format() { return std::string(value); }
+};
+
+template <typename T> constexpr const char format_descriptor<
+    std::complex<T>, detail::enable_if_t<std::is_floating_point<T>::value>>::value[3];
 
 NAMESPACE_BEGIN(detail)
+
+template <typename T> struct is_fmt_numeric<std::complex<T>, detail::enable_if_t<std::is_floating_point<T>::value>> {
+    static constexpr bool value = true;
+    static constexpr int index = is_fmt_numeric<T>::index + 3;
+};
+
 template <typename T> class type_caster<std::complex<T>> {
 public:
-    bool load(handle src, bool) {
+    bool load(handle src, bool convert) {
         if (!src)
             return false;
+        if (!convert && !PyComplex_Check(src.ptr()))
+            return false;
         Py_complex result = PyComplex_AsCComplex(src.ptr());
         if (result.real == -1.0 && PyErr_Occurred()) {
             PyErr_Clear();
@@ -44,4 +58,4 @@ public:
     PYBIND11_TYPE_CASTER(std::complex<T>, _("complex"));
 };
 NAMESPACE_END(detail)
-NAMESPACE_END(pybind11)
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/detail/class.h b/pybind11/include/pybind11/detail/class.h
new file mode 100644
index 000000000..f745992a0
--- /dev/null
+++ b/pybind11/include/pybind11/detail/class.h
@@ -0,0 +1,606 @@
+/*
+    pybind11/detail/class.h: Python C API implementation details for py::class_
+
+    Copyright (c) 2017 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "../attr.h"
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+NAMESPACE_BEGIN(detail)
+
+inline PyTypeObject *type_incref(PyTypeObject *type) {
+    Py_INCREF(type);
+    return type;
+}
+
+#if !defined(PYPY_VERSION)
+
+/// `pybind11_static_property.__get__()`: Always pass the class instead of the instance.
+extern "C" inline PyObject *pybind11_static_get(PyObject *self, PyObject * /*ob*/, PyObject *cls) {
+    return PyProperty_Type.tp_descr_get(self, cls, cls);
+}
+
+/// `pybind11_static_property.__set__()`: Just like the above `__get__()`.
+extern "C" inline int pybind11_static_set(PyObject *self, PyObject *obj, PyObject *value) {
+    PyObject *cls = PyType_Check(obj) ? obj : (PyObject *) Py_TYPE(obj);
+    return PyProperty_Type.tp_descr_set(self, cls, value);
+}
+
+/** A `static_property` is the same as a `property` but the `__get__()` and `__set__()`
+    methods are modified to always use the object type instead of a concrete instance.
+    Return value: New reference. */
+inline PyTypeObject *make_static_property_type() {
+    constexpr auto *name = "pybind11_static_property";
+    auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
+
+    /* Danger zone: from now (and until PyType_Ready), make sure to
+       issue no Python C API calls which could potentially invoke the
+       garbage collector (the GC will call type_traverse(), which will in
+       turn find the newly constructed type in an invalid state) */
+    auto heap_type = (PyHeapTypeObject *) PyType_Type.tp_alloc(&PyType_Type, 0);
+    if (!heap_type)
+        pybind11_fail("make_static_property_type(): error allocating type!");
+
+    heap_type->ht_name = name_obj.inc_ref().ptr();
+#if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 3
+    heap_type->ht_qualname = name_obj.inc_ref().ptr();
+#endif
+
+    auto type = &heap_type->ht_type;
+    type->tp_name = name;
+    type->tp_base = type_incref(&PyProperty_Type);
+    type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+    type->tp_descr_get = pybind11_static_get;
+    type->tp_descr_set = pybind11_static_set;
+
+    if (PyType_Ready(type) < 0)
+        pybind11_fail("make_static_property_type(): failure in PyType_Ready()!");
+
+    setattr((PyObject *) type, "__module__", str("pybind11_builtins"));
+
+    return type;
+}
+
+#else // PYPY
+
+/** PyPy has some issues with the above C API, so we evaluate Python code instead.
+    This function will only be called once so performance isn't really a concern.
+    Return value: New reference. */
+inline PyTypeObject *make_static_property_type() {
+    auto d = dict();
+    PyObject *result = PyRun_String(R"(\
+        class pybind11_static_property(property):
+            def __get__(self, obj, cls):
+                return property.__get__(self, cls, cls)
+
+            def __set__(self, obj, value):
+                cls = obj if isinstance(obj, type) else type(obj)
+                property.__set__(self, cls, value)
+        )", Py_file_input, d.ptr(), d.ptr()
+    );
+    if (result == nullptr)
+        throw error_already_set();
+    Py_DECREF(result);
+    return (PyTypeObject *) d["pybind11_static_property"].cast<object>().release().ptr();
+}
+
+#endif // PYPY
+
+/** Types with static properties need to handle `Type.static_prop = x` in a specific way.
+    By default, Python replaces the `static_property` itself, but for wrapped C++ types
+    we need to call `static_property.__set__()` in order to propagate the new value to
+    the underlying C++ data structure. */
+extern "C" inline int pybind11_meta_setattro(PyObject* obj, PyObject* name, PyObject* value) {
+    // Use `_PyType_Lookup()` instead of `PyObject_GetAttr()` in order to get the raw
+    // descriptor (`property`) instead of calling `tp_descr_get` (`property.__get__()`).
+    PyObject *descr = _PyType_Lookup((PyTypeObject *) obj, name);
+
+    // The following assignment combinations are possible:
+    //   1. `Type.static_prop = value`             --> descr_set: `Type.static_prop.__set__(value)`
+    //   2. `Type.static_prop = other_static_prop` --> setattro:  replace existing `static_prop`
+    //   3. `Type.regular_attribute = value`       --> setattro:  regular attribute assignment
+    const auto static_prop = (PyObject *) get_internals().static_property_type;
+    const auto call_descr_set = descr && PyObject_IsInstance(descr, static_prop)
+                                && !PyObject_IsInstance(value, static_prop);
+    if (call_descr_set) {
+        // Call `static_property.__set__()` instead of replacing the `static_property`.
+#if !defined(PYPY_VERSION)
+        return Py_TYPE(descr)->tp_descr_set(descr, obj, value);
+#else
+        if (PyObject *result = PyObject_CallMethod(descr, "__set__", "OO", obj, value)) {
+            Py_DECREF(result);
+            return 0;
+        } else {
+            return -1;
+        }
+#endif
+    } else {
+        // Replace existing attribute.
+        return PyType_Type.tp_setattro(obj, name, value);
+    }
+}
+
+#if PY_MAJOR_VERSION >= 3
+/**
+ * Python 3's PyInstanceMethod_Type hides itself via its tp_descr_get, which prevents aliasing
+ * methods via cls.attr("m2") = cls.attr("m1"): instead the tp_descr_get returns a plain function,
+ * when called on a class, or a PyMethod, when called on an instance.  Override that behaviour here
+ * to do a special case bypass for PyInstanceMethod_Types.
+ */
+extern "C" inline PyObject *pybind11_meta_getattro(PyObject *obj, PyObject *name) {
+    PyObject *descr = _PyType_Lookup((PyTypeObject *) obj, name);
+    if (descr && PyInstanceMethod_Check(descr)) {
+        Py_INCREF(descr);
+        return descr;
+    }
+    else {
+        return PyType_Type.tp_getattro(obj, name);
+    }
+}
+#endif
+
+/** This metaclass is assigned by default to all pybind11 types and is required in order
+    for static properties to function correctly. Users may override this using `py::metaclass`.
+    Return value: New reference. */
+inline PyTypeObject* make_default_metaclass() {
+    constexpr auto *name = "pybind11_type";
+    auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
+
+    /* Danger zone: from now (and until PyType_Ready), make sure to
+       issue no Python C API calls which could potentially invoke the
+       garbage collector (the GC will call type_traverse(), which will in
+       turn find the newly constructed type in an invalid state) */
+    auto heap_type = (PyHeapTypeObject *) PyType_Type.tp_alloc(&PyType_Type, 0);
+    if (!heap_type)
+        pybind11_fail("make_default_metaclass(): error allocating metaclass!");
+
+    heap_type->ht_name = name_obj.inc_ref().ptr();
+#if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 3
+    heap_type->ht_qualname = name_obj.inc_ref().ptr();
+#endif
+
+    auto type = &heap_type->ht_type;
+    type->tp_name = name;
+    type->tp_base = type_incref(&PyType_Type);
+    type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+
+    type->tp_setattro = pybind11_meta_setattro;
+#if PY_MAJOR_VERSION >= 3
+    type->tp_getattro = pybind11_meta_getattro;
+#endif
+
+    if (PyType_Ready(type) < 0)
+        pybind11_fail("make_default_metaclass(): failure in PyType_Ready()!");
+
+    setattr((PyObject *) type, "__module__", str("pybind11_builtins"));
+
+    return type;
+}
+
+/// For multiple inheritance types we need to recursively register/deregister base pointers for any
+/// base classes with pointers that are different from the instance value pointer so that we can
+/// correctly recognize an offset base class pointer. This calls a function with any offset base ptrs.
+inline void traverse_offset_bases(void *valueptr, const detail::type_info *tinfo, instance *self,
+        bool (*f)(void * /*parentptr*/, instance * /*self*/)) {
+    for (handle h : reinterpret_borrow<tuple>(tinfo->type->tp_bases)) {
+        if (auto parent_tinfo = get_type_info((PyTypeObject *) h.ptr())) {
+            for (auto &c : parent_tinfo->implicit_casts) {
+                if (c.first == tinfo->cpptype) {
+                    auto *parentptr = c.second(valueptr);
+                    if (parentptr != valueptr)
+                        f(parentptr, self);
+                    traverse_offset_bases(parentptr, parent_tinfo, self, f);
+                    break;
+                }
+            }
+        }
+    }
+}
+
+inline bool register_instance_impl(void *ptr, instance *self) {
+    get_internals().registered_instances.emplace(ptr, self);
+    return true; // unused, but gives the same signature as the deregister func
+}
+inline bool deregister_instance_impl(void *ptr, instance *self) { // removes the (ptr, self) entry
+    auto &registered_instances = get_internals().registered_instances;
+    auto range = registered_instances.equal_range(ptr); // every instance registered under `ptr`
+    for (auto it = range.first; it != range.second; ++it) {
+        if (self == it->second) { // identity, not Py_TYPE: a same-typed instance may share `ptr`
+            registered_instances.erase(it);
+            return true;
+        }
+    }
+    return false; // `self` was not registered under `ptr`
+}
+
+inline void register_instance(instance *self, void *valptr, const type_info *tinfo) {
+    register_instance_impl(valptr, self);
+    if (!tinfo->simple_ancestors)
+        traverse_offset_bases(valptr, tinfo, self, register_instance_impl);
+}
+
+inline bool deregister_instance(instance *self, void *valptr, const type_info *tinfo) {
+    bool ret = deregister_instance_impl(valptr, self);
+    if (!tinfo->simple_ancestors)
+        traverse_offset_bases(valptr, tinfo, self, deregister_instance_impl);
+    return ret;
+}
+
+/// Instance creation function for all pybind11 types. It allocates the internal instance layout for
+/// holding C++ objects and holders.  Allocation is done lazily (the first time the instance is cast
+/// to a reference or pointer), and initialization is done by an `__init__` function.
+inline PyObject *make_new_instance(PyTypeObject *type) {
+#if defined(PYPY_VERSION)
+    // PyPy gets tp_basicsize wrong (issue 2482) under multiple inheritance when the first inherited
+    // object is a plain Python type (i.e. not derived from an extension type).  Fix it.
+    ssize_t instance_size = static_cast<ssize_t>(sizeof(instance));
+    if (type->tp_basicsize < instance_size) {
+        type->tp_basicsize = instance_size;
+    }
+#endif
+    PyObject *self = type->tp_alloc(type, 0);
+    auto inst = reinterpret_cast<instance *>(self);
+    // Allocate the value/holder internals:
+    inst->allocate_layout();
+
+    inst->owned = true;
+
+    return self;
+}
+
+/// Instance creation function for all pybind11 types. It only allocates space for the
+/// C++ object, but doesn't call the constructor -- an `__init__` function must do that.
+extern "C" inline PyObject *pybind11_object_new(PyTypeObject *type, PyObject *, PyObject *) {
+    return make_new_instance(type);
+}
+
+/// An `__init__` function constructs the C++ object. Users should provide at least one
+/// of these using `py::init` or directly with `.def(__init__, ...)`. Otherwise, the
+/// following default function will be used which simply throws an exception.
+extern "C" inline int pybind11_object_init(PyObject *self, PyObject *, PyObject *) {
+    PyTypeObject *type = Py_TYPE(self);
+    std::string msg;
+#if defined(PYPY_VERSION)
+    msg += handle((PyObject *) type).attr("__module__").cast<std::string>() + ".";
+#endif
+    msg += type->tp_name;
+    msg += ": No constructor defined!";
+    PyErr_SetString(PyExc_TypeError, msg.c_str());
+    return -1;
+}
+
+inline void add_patient(PyObject *nurse, PyObject *patient) {
+    auto &internals = get_internals();
+    auto instance = reinterpret_cast<detail::instance *>(nurse);
+    instance->has_patients = true;
+    Py_INCREF(patient);
+    internals.patients[nurse].push_back(patient);
+}
+
+inline void clear_patients(PyObject *self) {
+    auto instance = reinterpret_cast<detail::instance *>(self);
+    auto &internals = get_internals();
+    auto pos = internals.patients.find(self);
+    assert(pos != internals.patients.end());
+    // Clearing the patients can cause more Python code to run, which
+    // can invalidate the iterator. Extract the vector of patients
+    // from the unordered_map first.
+    auto patients = std::move(pos->second);
+    internals.patients.erase(pos);
+    instance->has_patients = false;
+    for (PyObject *&patient : patients)
+        Py_CLEAR(patient);
+}
+
+/// Clears all internal data from the instance and removes it from registered instances in
+/// preparation for deallocation.
+inline void clear_instance(PyObject *self) {
+    auto instance = reinterpret_cast<detail::instance *>(self);
+
+    // Deallocate any values/holders, if present:
+    for (auto &v_h : values_and_holders(instance)) {
+        if (v_h) {
+
+            // We have to deregister before we call dealloc because, for virtual MI types, we still
+            // need to be able to get the parent pointers.
+            if (v_h.instance_registered() && !deregister_instance(instance, v_h.value_ptr(), v_h.type))
+                pybind11_fail("pybind11_object_dealloc(): Tried to deallocate unregistered instance!");
+
+            if (instance->owned || v_h.holder_constructed())
+                v_h.type->dealloc(v_h);
+        }
+    }
+    // Deallocate the value/holder layout internals:
+    instance->deallocate_layout();
+
+    if (instance->weakrefs)
+        PyObject_ClearWeakRefs(self);
+
+    PyObject **dict_ptr = _PyObject_GetDictPtr(self);
+    if (dict_ptr)
+        Py_CLEAR(*dict_ptr);
+
+    if (instance->has_patients)
+        clear_patients(self);
+}
+
+/// Instance destructor function for all pybind11 types. It calls `type_info.dealloc`
+/// to destroy the C++ object itself, while the rest is Python bookkeeping.
+extern "C" inline void pybind11_object_dealloc(PyObject *self) {
+    clear_instance(self);
+
+    auto type = Py_TYPE(self);
+    type->tp_free(self);
+
+    // `type->tp_dealloc != pybind11_object_dealloc` means that we're being called
+    // as part of a derived type's dealloc, in which case we're not allowed to decref
+    // the type here. For cross-module compatibility, we shouldn't compare directly
+    // with `pybind11_object_dealloc`, but with the common one stashed in internals.
+    auto pybind11_object_type = (PyTypeObject *) get_internals().instance_base;
+    if (type->tp_dealloc == pybind11_object_type->tp_dealloc)
+        Py_DECREF(type);
+}
+
+/** Create the type which can be used as a common base for all classes.  This is
+    needed in order to satisfy Python's requirements for multiple inheritance.
+    Return value: New reference. */
+inline PyObject *make_object_base_type(PyTypeObject *metaclass) {
+    constexpr auto *name = "pybind11_object";
+    auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
+
+    /* Danger zone: from now (and until PyType_Ready), make sure to
+       issue no Python C API calls which could potentially invoke the
+       garbage collector (the GC will call type_traverse(), which will in
+       turn find the newly constructed type in an invalid state) */
+    auto heap_type = (PyHeapTypeObject *) metaclass->tp_alloc(metaclass, 0);
+    if (!heap_type)
+        pybind11_fail("make_object_base_type(): error allocating type!");
+
+    heap_type->ht_name = name_obj.inc_ref().ptr();
+#if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 3
+    heap_type->ht_qualname = name_obj.inc_ref().ptr();
+#endif
+
+    auto type = &heap_type->ht_type;
+    type->tp_name = name;
+    type->tp_base = type_incref(&PyBaseObject_Type);
+    type->tp_basicsize = static_cast<ssize_t>(sizeof(instance));
+    type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+
+    type->tp_new = pybind11_object_new;
+    type->tp_init = pybind11_object_init;
+    type->tp_dealloc = pybind11_object_dealloc;
+
+    /* Support weak references (needed for the keep_alive feature) */
+    type->tp_weaklistoffset = offsetof(instance, weakrefs);
+
+    if (PyType_Ready(type) < 0)
+        pybind11_fail("PyType_Ready failed in make_object_base_type():" + error_string());
+
+    setattr((PyObject *) type, "__module__", str("pybind11_builtins"));
+
+    assert(!PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC));
+    return (PyObject *) heap_type;
+}
+
+/// dynamic_attr: Support for `d = instance.__dict__`.
+extern "C" inline PyObject *pybind11_get_dict(PyObject *self, void *) {
+    PyObject *&dict = *_PyObject_GetDictPtr(self);
+    if (!dict)
+        dict = PyDict_New();
+    Py_XINCREF(dict);
+    return dict;
+}
+
+/// dynamic_attr: Support for `instance.__dict__ = dict()`.
+extern "C" inline int pybind11_set_dict(PyObject *self, PyObject *new_dict, void *) {
+    if (!PyDict_Check(new_dict)) {
+        PyErr_Format(PyExc_TypeError, "__dict__ must be set to a dictionary, not a '%.200s'",
+                     Py_TYPE(new_dict)->tp_name);
+        return -1;
+    }
+    PyObject *&dict = *_PyObject_GetDictPtr(self);
+    Py_INCREF(new_dict);
+    Py_CLEAR(dict);
+    dict = new_dict;
+    return 0;
+}
+
+/// dynamic_attr: Allow the garbage collector to traverse the internal instance `__dict__`.
+extern "C" inline int pybind11_traverse(PyObject *self, visitproc visit, void *arg) {
+    PyObject *&dict = *_PyObject_GetDictPtr(self);
+    Py_VISIT(dict);
+    return 0;
+}
+
+/// dynamic_attr: Allow the GC to clear the dictionary.
+extern "C" inline int pybind11_clear(PyObject *self) {
+    PyObject *&dict = *_PyObject_GetDictPtr(self);
+    Py_CLEAR(dict);
+    return 0;
+}
+
+/// Give instances of this type a `__dict__` and opt into garbage collection.
+inline void enable_dynamic_attributes(PyHeapTypeObject *heap_type) {
+    auto type = &heap_type->ht_type;
+#if defined(PYPY_VERSION)
+    pybind11_fail(std::string(type->tp_name) + ": dynamic attributes are "
+                                               "currently not supported in "
+                                               "conjunction with PyPy!");
+#endif
+    type->tp_flags |= Py_TPFLAGS_HAVE_GC;
+    type->tp_dictoffset = type->tp_basicsize; // place dict at the end
+    type->tp_basicsize += (ssize_t)sizeof(PyObject *); // and allocate enough space for it
+    type->tp_traverse = pybind11_traverse;
+    type->tp_clear = pybind11_clear;
+
+    static PyGetSetDef getset[] = {
+        {const_cast<char*>("__dict__"), pybind11_get_dict, pybind11_set_dict, nullptr, nullptr},
+        {nullptr, nullptr, nullptr, nullptr, nullptr}
+    };
+    type->tp_getset = getset;
+}
+
+/// buffer_protocol: Fill in the view as specified by flags.
+extern "C" inline int pybind11_getbuffer(PyObject *obj, Py_buffer *view, int flags) {
+    // Look for a `get_buffer` implementation in this type's info or any bases (following MRO).
+    type_info *tinfo = nullptr;
+    if (obj != nullptr) { // guard first: Py_TYPE(obj) must not be evaluated for a null object
+        for (auto type : reinterpret_borrow<tuple>(Py_TYPE(obj)->tp_mro)) {
+            tinfo = get_type_info((PyTypeObject *) type.ptr());
+            if (tinfo && tinfo->get_buffer) break; // first type in the MRO providing a buffer
+        }
+    }
+    if (view == nullptr || obj == nullptr || !tinfo || !tinfo->get_buffer) {
+        if (view) view->obj = nullptr; // report "no exporter" to the caller on failure
+        PyErr_SetString(PyExc_BufferError, "pybind11_getbuffer(): Internal error");
+        return -1;
+    }
+    std::memset(view, 0, sizeof(Py_buffer));
+    buffer_info *info = tinfo->get_buffer(obj, tinfo->get_buffer_data);
+    view->obj = obj;
+    view->ndim = 1;
+    view->internal = info; // deleted again in pybind11_releasebuffer()
+    view->buf = info->ptr;
+    view->itemsize = info->itemsize;
+    view->len = view->itemsize; // total byte length: itemsize times every shape extent
+    for (auto s : info->shape)
+        view->len *= s;
+    if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT)
+        view->format = const_cast<char *>(info->format.c_str());
+    if ((flags & PyBUF_STRIDES) == PyBUF_STRIDES) {
+        view->ndim = (int) info->ndim;
+        view->strides = &info->strides[0];
+        view->shape = &info->shape[0];
+    }
+    Py_INCREF(view->obj); // the view holds a reference to its exporter
+    return 0; // success
+}
+
+/// buffer_protocol: Release the resources of the buffer.
+extern "C" inline void pybind11_releasebuffer(PyObject *, Py_buffer *view) {
+    delete (buffer_info *) view->internal;
+}
+
+/// Give this type a buffer interface.
+inline void enable_buffer_protocol(PyHeapTypeObject *heap_type) {
+    heap_type->ht_type.tp_as_buffer = &heap_type->as_buffer;
+#if PY_MAJOR_VERSION < 3
+    heap_type->ht_type.tp_flags |= Py_TPFLAGS_HAVE_NEWBUFFER;
+#endif
+
+    heap_type->as_buffer.bf_getbuffer = pybind11_getbuffer;
+    heap_type->as_buffer.bf_releasebuffer = pybind11_releasebuffer;
+}
+
+/** Create a brand new Python type according to the `type_record` specification.
+    Return value: New reference. */
+inline PyObject* make_new_python_type(const type_record &rec) {
+    auto name = reinterpret_steal<object>(PYBIND11_FROM_STRING(rec.name));
+
+#if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 3
+    auto ht_qualname = name;
+    if (rec.scope && hasattr(rec.scope, "__qualname__")) {
+        ht_qualname = reinterpret_steal<object>(
+            PyUnicode_FromFormat("%U.%U", rec.scope.attr("__qualname__").ptr(), name.ptr()));
+    }
+#endif
+
+    object module;
+    if (rec.scope) {
+        if (hasattr(rec.scope, "__module__"))
+            module = rec.scope.attr("__module__");
+        else if (hasattr(rec.scope, "__name__"))
+            module = rec.scope.attr("__name__");
+    }
+
+    auto full_name = c_str(
+#if !defined(PYPY_VERSION)
+        module ? str(module).cast<std::string>() + "." + rec.name :
+#endif
+        rec.name);
+
+    char *tp_doc = nullptr;
+    if (rec.doc && options::show_user_defined_docstrings()) {
+        /* Allocate memory for docstring (using PyObject_MALLOC, since
+           Python will free this later on) */
+        size_t size = strlen(rec.doc) + 1;
+        tp_doc = (char *) PyObject_MALLOC(size);
+        memcpy((void *) tp_doc, rec.doc, size);
+    }
+
+    auto &internals = get_internals();
+    auto bases = tuple(rec.bases);
+    auto base = (bases.size() == 0) ? internals.instance_base
+                                    : bases[0].ptr();
+
+    /* Danger zone: from now (and until PyType_Ready), make sure to
+       issue no Python C API calls which could potentially invoke the
+       garbage collector (the GC will call type_traverse(), which will in
+       turn find the newly constructed type in an invalid state) */
+    auto metaclass = rec.metaclass.ptr() ? (PyTypeObject *) rec.metaclass.ptr()
+                                         : internals.default_metaclass;
+
+    auto heap_type = (PyHeapTypeObject *) metaclass->tp_alloc(metaclass, 0);
+    if (!heap_type)
+        pybind11_fail(std::string(rec.name) + ": Unable to create type object!");
+
+    heap_type->ht_name = name.release().ptr();
+#if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 3
+    heap_type->ht_qualname = ht_qualname.release().ptr();
+#endif
+
+    auto type = &heap_type->ht_type;
+    type->tp_name = full_name;
+    type->tp_doc = tp_doc;
+    type->tp_base = type_incref((PyTypeObject *)base);
+    type->tp_basicsize = static_cast<ssize_t>(sizeof(instance));
+    if (bases.size() > 0)
+        type->tp_bases = bases.release().ptr();
+
+    /* Don't inherit base __init__ */
+    type->tp_init = pybind11_object_init;
+
+    /* Supported protocols */
+    type->tp_as_number = &heap_type->as_number;
+    type->tp_as_sequence = &heap_type->as_sequence;
+    type->tp_as_mapping = &heap_type->as_mapping;
+
+    /* Flags */
+    type->tp_flags |= Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+#if PY_MAJOR_VERSION < 3
+    type->tp_flags |= Py_TPFLAGS_CHECKTYPES;
+#endif
+
+    if (rec.dynamic_attr)
+        enable_dynamic_attributes(heap_type);
+
+    if (rec.buffer_protocol)
+        enable_buffer_protocol(heap_type);
+
+    if (PyType_Ready(type) < 0)
+        pybind11_fail(std::string(rec.name) + ": PyType_Ready failed (" + error_string() + ")!");
+
+    assert(rec.dynamic_attr ? PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC)
+                            : !PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC));
+
+    /* Register type with the parent scope */
+    if (rec.scope)
+        setattr(rec.scope, rec.name, (PyObject *) type);
+    else
+        Py_INCREF(type); // Keep it alive forever (reference leak)
+
+    if (module) // Needed by pydoc
+        setattr((PyObject *) type, "__module__", module);
+
+    return (PyObject *) type;
+}
+
+NAMESPACE_END(detail)
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/detail/common.h b/pybind11/include/pybind11/detail/common.h
new file mode 100644
index 000000000..8f763f08a
--- /dev/null
+++ b/pybind11/include/pybind11/detail/common.h
@@ -0,0 +1,800 @@
+/*
+    pybind11/detail/common.h -- Basic macros
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#if !defined(NAMESPACE_BEGIN)
+#  define NAMESPACE_BEGIN(name) namespace name {
+#endif
+#if !defined(NAMESPACE_END)
+#  define NAMESPACE_END(name) }
+#endif
+
+// Robust support for some features and loading modules compiled against different pybind versions
+// requires forcing hidden visibility on pybind code, so we enforce this by setting the attribute on
+// the main `pybind11` namespace.
+#if !defined(PYBIND11_NAMESPACE)
+#  ifdef __GNUG__
+#    define PYBIND11_NAMESPACE pybind11 __attribute__((visibility("hidden")))
+#  else
+#    define PYBIND11_NAMESPACE pybind11
+#  endif
+#endif
+
+#if !defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#  if __cplusplus >= 201402L
+#    define PYBIND11_CPP14
+#    if __cplusplus > 201402L /* Temporary: should be updated to >= the final C++17 value once known */
+#      define PYBIND11_CPP17
+#    endif
+#  endif
+#elif defined(_MSC_VER)
+// MSVC sets _MSVC_LANG rather than __cplusplus (supposedly until the standard is fully implemented)
+#  if _MSVC_LANG >= 201402L
+#    define PYBIND11_CPP14
+#    if _MSVC_LANG > 201402L && _MSC_VER >= 1910
+#      define PYBIND11_CPP17
+#    endif
+#  endif
+#endif
+
+// Compiler version assertions
+#if defined(__INTEL_COMPILER)
+#  if __INTEL_COMPILER < 1500
+#    error pybind11 requires Intel C++ compiler v15 or newer
+#  endif
+#elif defined(__clang__) && !defined(__apple_build_version__)
+#  if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 3)
+#    error pybind11 requires clang 3.3 or newer
+#  endif
+#elif defined(__clang__)
+// Apple changes clang version macros to its Xcode version; the first Xcode release based on
+// (upstream) clang 3.3 was Xcode 5:
+#  if __clang_major__ < 5
+#    error pybind11 requires Xcode/clang 5.0 or newer
+#  endif
+#elif defined(__GNUG__)
+#  if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8)
+#    error pybind11 requires gcc 4.8 or newer
+#  endif
+#elif defined(_MSC_VER)
+// Pybind hits various compiler bugs in 2015u2 and earlier, and also makes use of some stl features
+// (e.g. std::negation) added in 2015u3:
+#  if _MSC_FULL_VER < 190024210
+#    error pybind11 requires MSVC 2015 update 3 or newer
+#  endif
+#endif
+
+#if !defined(PYBIND11_EXPORT)
+#  if defined(WIN32) || defined(_WIN32)
+#    define PYBIND11_EXPORT __declspec(dllexport)
+#  else
+#    define PYBIND11_EXPORT __attribute__ ((visibility("default")))
+#  endif
+#endif
+
+#if defined(_MSC_VER)
+#  define PYBIND11_NOINLINE __declspec(noinline)
+#else
+#  define PYBIND11_NOINLINE __attribute__ ((noinline))
+#endif
+
+#if defined(PYBIND11_CPP14)
+#  define PYBIND11_DEPRECATED(reason) [[deprecated(reason)]]
+#else
+#  define PYBIND11_DEPRECATED(reason) __attribute__((deprecated(reason)))
+#endif
+
+#define PYBIND11_VERSION_MAJOR 2
+#define PYBIND11_VERSION_MINOR 2
+#define PYBIND11_VERSION_PATCH 1
+
+/// Include Python header, disable linking to pythonX_d.lib on Windows in debug mode
+#if defined(_MSC_VER)
+#  if (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION < 4)
+#    define HAVE_ROUND 1
+#  endif
+#  pragma warning(push)
+#  pragma warning(disable: 4510 4610 4512 4005)
+#  if defined(_DEBUG)
+#    define PYBIND11_DEBUG_MARKER
+#    undef _DEBUG
+#  endif
+#endif
+
+#include <Python.h>
+#include <frameobject.h>
+#include <pythread.h>
+
+#if defined(_WIN32) && (defined(min) || defined(max))
+#  error Macro clash with min and max -- define NOMINMAX when compiling your program on Windows
+#endif
+
+#if defined(isalnum)
+#  undef isalnum
+#  undef isalpha
+#  undef islower
+#  undef isspace
+#  undef isupper
+#  undef tolower
+#  undef toupper
+#endif
+
+#if defined(_MSC_VER)
+#  if defined(PYBIND11_DEBUG_MARKER)
+#    define _DEBUG
+#    undef PYBIND11_DEBUG_MARKER
+#  endif
+#  pragma warning(pop)
+#endif
+
+#include <cstddef>
+#include <cstring>
+#include <forward_list>
+#include <vector>
+#include <string>
+#include <stdexcept>
+#include <unordered_set>
+#include <unordered_map>
+#include <memory>
+#include <typeindex>
+#include <type_traits>
+
+#if PY_MAJOR_VERSION >= 3 /// Compatibility macros for various Python versions
+#define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) PyInstanceMethod_New(ptr)
+#define PYBIND11_INSTANCE_METHOD_CHECK PyInstanceMethod_Check
+#define PYBIND11_INSTANCE_METHOD_GET_FUNCTION PyInstanceMethod_GET_FUNCTION
+#define PYBIND11_BYTES_CHECK PyBytes_Check
+#define PYBIND11_BYTES_FROM_STRING PyBytes_FromString
+#define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyBytes_FromStringAndSize
+#define PYBIND11_BYTES_AS_STRING_AND_SIZE PyBytes_AsStringAndSize
+#define PYBIND11_BYTES_AS_STRING PyBytes_AsString
+#define PYBIND11_BYTES_SIZE PyBytes_Size
+#define PYBIND11_LONG_CHECK(o) PyLong_Check(o)
+#define PYBIND11_LONG_AS_LONGLONG(o) PyLong_AsLongLong(o)
+#define PYBIND11_BYTES_NAME "bytes"
+#define PYBIND11_STRING_NAME "str"
+#define PYBIND11_SLICE_OBJECT PyObject
+#define PYBIND11_FROM_STRING PyUnicode_FromString
+#define PYBIND11_STR_TYPE ::pybind11::str
+#define PYBIND11_BOOL_ATTR "__bool__"
+#define PYBIND11_NB_BOOL(ptr) ((ptr)->nb_bool)
+#define PYBIND11_PLUGIN_IMPL(name) \
+    extern "C" PYBIND11_EXPORT PyObject *PyInit_##name()
+
+#else
+#define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) PyMethod_New(ptr, nullptr, class_)
+#define PYBIND11_INSTANCE_METHOD_CHECK PyMethod_Check
+#define PYBIND11_INSTANCE_METHOD_GET_FUNCTION PyMethod_GET_FUNCTION
+#define PYBIND11_BYTES_CHECK PyString_Check
+#define PYBIND11_BYTES_FROM_STRING PyString_FromString
+#define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyString_FromStringAndSize
+#define PYBIND11_BYTES_AS_STRING_AND_SIZE PyString_AsStringAndSize
+#define PYBIND11_BYTES_AS_STRING PyString_AsString
+#define PYBIND11_BYTES_SIZE PyString_Size
+#define PYBIND11_LONG_CHECK(o) (PyInt_Check(o) || PyLong_Check(o))
+#define PYBIND11_LONG_AS_LONGLONG(o) (PyInt_Check(o) ? (long long) PyLong_AsLong(o) : PyLong_AsLongLong(o))
+#define PYBIND11_BYTES_NAME "str"
+#define PYBIND11_STRING_NAME "unicode"
+#define PYBIND11_SLICE_OBJECT PySliceObject
+#define PYBIND11_FROM_STRING PyString_FromString
+#define PYBIND11_STR_TYPE ::pybind11::bytes
+#define PYBIND11_BOOL_ATTR "__nonzero__"
+#define PYBIND11_NB_BOOL(ptr) ((ptr)->nb_nonzero)
+#define PYBIND11_PLUGIN_IMPL(name) \
+    static PyObject *pybind11_init_wrapper();               \
+    extern "C" PYBIND11_EXPORT void init##name() {          \
+        (void)pybind11_init_wrapper();                      \
+    }                                                       \
+    PyObject *pybind11_init_wrapper()
+#endif
+
+#if PY_VERSION_HEX >= 0x03050000 && PY_VERSION_HEX < 0x03050200
+extern "C" {
+    struct _Py_atomic_address { void *value; };
+    PyAPI_DATA(_Py_atomic_address) _PyThreadState_Current;
+}
+#endif
+
+#define PYBIND11_TRY_NEXT_OVERLOAD ((PyObject *) 1) // special failure return code
+#define PYBIND11_STRINGIFY(x) #x
+#define PYBIND11_TOSTRING(x) PYBIND11_STRINGIFY(x)
+#define PYBIND11_CONCAT(first, second) first##second
+
+/** \rst
+    ***Deprecated in favor of PYBIND11_MODULE***
+
+    This macro creates the entry point that will be invoked when the Python interpreter
+    imports a plugin library. Please create a `module` in the function body and return
+    the pointer to its underlying Python object at the end.
+
+    .. code-block:: cpp
+
+        PYBIND11_PLUGIN(example) {
+            pybind11::module m("example", "pybind11 example plugin");
+            /// Set up bindings here
+            return m.ptr();
+        }
+\endrst */
+#define PYBIND11_PLUGIN(name)                                                  \
+    PYBIND11_DEPRECATED("PYBIND11_PLUGIN is deprecated, use PYBIND11_MODULE")  \
+    static PyObject *pybind11_init();                                          \
+    PYBIND11_PLUGIN_IMPL(name) {                                               \
+        int major, minor;                                                      \
+        if (sscanf(Py_GetVersion(), "%i.%i", &major, &minor) != 2) {           \
+            PyErr_SetString(PyExc_ImportError, "Can't parse Python version."); \
+            return nullptr;                                                    \
+        } else if (major != PY_MAJOR_VERSION || minor != PY_MINOR_VERSION) {   \
+            PyErr_Format(PyExc_ImportError,                                    \
+                         "Python version mismatch: module was compiled for "   \
+                         "version %i.%i, while the interpreter is running "    \
+                         "version %i.%i.", PY_MAJOR_VERSION, PY_MINOR_VERSION, \
+                         major, minor);                                        \
+            return nullptr;                                                    \
+        }                                                                      \
+        try {                                                                  \
+            return pybind11_init();                                            \
+        } catch (pybind11::error_already_set &e) {                             \
+            PyErr_SetString(PyExc_ImportError, e.what());                      \
+            return nullptr;                                                    \
+        } catch (const std::exception &e) {                                    \
+            PyErr_SetString(PyExc_ImportError, e.what());                      \
+            return nullptr;                                                    \
+        }                                                                      \
+    }                                                                          \
+    PyObject *pybind11_init()
+
+/** \rst
+    This macro creates the entry point that will be invoked when the Python interpreter
+    imports an extension module. The module name is given as the first argument and it
+    should not be in quotes. The second macro argument defines a variable of type
+    `py::module` which can be used to initialize the module.
+
+    .. code-block:: cpp
+
+        PYBIND11_MODULE(example, m) {
+            m.doc() = "pybind11 example module";
+
+            // Add bindings here
+            m.def("foo", []() {
+                return "Hello, World!";
+            });
+        }
+\endrst */
+#define PYBIND11_MODULE(name, variable)                                        \
+    static void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &);     \
+    PYBIND11_PLUGIN_IMPL(name) {                                               \
+        int major, minor;                                                      \
+        if (sscanf(Py_GetVersion(), "%i.%i", &major, &minor) != 2) {           \
+            PyErr_SetString(PyExc_ImportError, "Can't parse Python version."); \
+            return nullptr;                                                    \
+        } else if (major != PY_MAJOR_VERSION || minor != PY_MINOR_VERSION) {   \
+            PyErr_Format(PyExc_ImportError,                                    \
+                         "Python version mismatch: module was compiled for "   \
+                         "version %i.%i, while the interpreter is running "    \
+                         "version %i.%i.", PY_MAJOR_VERSION, PY_MINOR_VERSION, \
+                         major, minor);                                        \
+            return nullptr;                                                    \
+        }                                                                      \
+        auto m = pybind11::module(PYBIND11_TOSTRING(name));                    \
+        try {                                                                  \
+            PYBIND11_CONCAT(pybind11_init_, name)(m);                          \
+            return m.ptr();                                                    \
+        } catch (pybind11::error_already_set &e) {                             \
+            PyErr_SetString(PyExc_ImportError, e.what());                      \
+            return nullptr;                                                    \
+        } catch (const std::exception &e) {                                    \
+            PyErr_SetString(PyExc_ImportError, e.what());                      \
+            return nullptr;                                                    \
+        }                                                                      \
+    }                                                                          \
+    void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &variable)
+
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+using ssize_t = Py_ssize_t;
+using size_t  = std::size_t;
+
+/// Approach used to cast a previously unknown C++ instance into a Python object
+enum class return_value_policy : uint8_t {
+    /** This is the default return value policy, which falls back to the policy
+        return_value_policy::take_ownership when the return value is a pointer.
+        Otherwise, it uses return_value_policy::move or return_value_policy::copy for rvalue
+        and lvalue references, respectively. See below for a description of what
+        all of these different policies do. */
+    automatic = 0,
+
+    /** As above, but use policy return_value_policy::reference when the return
+        value is a pointer. This is the default conversion policy for function
+        arguments when calling Python functions manually from C++ code (i.e. via
+        handle::operator()). You probably won't need to use this. */
+    automatic_reference,
+
+    /** Reference an existing object (i.e. do not create a new copy) and take
+        ownership. Python will call the destructor and delete operator when the
+        object’s reference count reaches zero. Undefined behavior ensues when
+        the C++ side does the same. */
+    take_ownership,
+
+    /** Create a new copy of the returned object, which will be owned by
+        Python. This policy is comparably safe because the lifetimes of the two
+        instances are decoupled. */
+    copy,
+
+    /** Use std::move to move the return value contents into a new instance
+        that will be owned by Python. This policy is comparably safe because the
+        lifetimes of the two instances (move source and destination) are
+        decoupled. */
+    move,
+
+    /** Reference an existing object, but do not take ownership. The C++ side
+        is responsible for managing the object’s lifetime and deallocating it
+        when it is no longer used. Warning: undefined behavior will ensue when
+        the C++ side deletes an object that is still referenced and used by
+        Python. */
+    reference,
+
+    /** This policy only applies to methods and properties. It references the
+        object without taking ownership similar to the above
+        return_value_policy::reference policy. In contrast to that policy, the
+        function or property’s implicit this argument (called the parent) is
+        considered to be the owner of the return value (the child).
+        pybind11 then couples the lifetime of the parent to the child via a
+        reference relationship that ensures that the parent cannot be garbage
+        collected while Python is still using the child. More advanced
+        variations of this scheme are also possible using combinations of
+        return_value_policy::reference and the keep_alive call policy */
+    reference_internal
+};
+
+NAMESPACE_BEGIN(detail)
+
+// Compile-time floor(log2(n)) added to k; yields k itself (0 by default) when n <= 1.
+inline static constexpr int log2(size_t n, int k = 0) { return (n <= 1) ? k : log2(n >> 1, k + 1); }
+// Returns the size as a multiple of sizeof(void *), rounded up; zero bytes need zero pointers (no unsigned wrap).
+inline static constexpr size_t size_in_ptrs(size_t s) { return s == 0 ? 0 : 1 + ((s - 1) >> log2(sizeof(void *))); }
+
+/**
+ * The space to allocate for simple layout instance holders (see below) in multiple of the size of
+ * a pointer (e.g.  2 means 16 bytes on 64-bit architectures).  The default is the minimum required
+ * to hold either a std::unique_ptr or std::shared_ptr (which is almost always
+ * sizeof(std::shared_ptr<T>)).
+ */
+constexpr size_t instance_simple_holder_in_ptrs() {
+    static_assert(sizeof(std::shared_ptr<int>) >= sizeof(std::unique_ptr<int>),
+            "pybind assumes std::shared_ptrs are at least as big as std::unique_ptrs");
+    return size_in_ptrs(sizeof(std::shared_ptr<int>));
+}
+
+// Forward declarations
+struct type_info;
+struct value_and_holder;
+
+/// The 'instance' type which needs to be standard layout (need to be able to use 'offsetof')
+struct instance {
+    PyObject_HEAD
+    /// Storage for pointers and holder; see simple_layout, below, for a description
+    union {
+        void *simple_value_holder[1 + instance_simple_holder_in_ptrs()];
+        struct {
+            void **values_and_holders;
+            uint8_t *status;
+        } nonsimple;
+    };
+    /// Weak references (needed for keep alive):
+    PyObject *weakrefs;
+    /// If true, the pointer is owned which means we're free to manage it with a holder.
+    bool owned : 1;
+    /**
+     * An instance has two possible value/holder layouts.
+     *
+     * Simple layout (when this flag is true), means the `simple_value_holder` is set with a pointer
+     * and the holder object governing that pointer, i.e. [val1*][holder].  This layout is applied
+     * whenever there is no python-side multiple inheritance of bound C++ types *and* the type's
+     * holder will fit in the default space (which is large enough to hold either a std::unique_ptr
+     * or std::shared_ptr).
+     *
+     * Non-simple layout applies when using custom holders that require more space than `shared_ptr`
+     * (which is typically the size of two pointers), or when multiple inheritance is used on the
+     * python side.  Non-simple layout allocates the required amount of memory to have multiple
+     * bound C++ classes as parents.  Under this layout, `nonsimple.values_and_holders` is set to a
+     * pointer to allocated space of the required space to hold a sequence of value pointers and
+     * holders followed by `status`, a set of bit flags (1 byte each), i.e.
+     * [val1*][holder1][val2*][holder2]...[bb...]  where each [block] is rounded up to a multiple of
+     * `sizeof(void *)`.  `nonsimple.status` is, for convenience, a pointer to the
+     * beginning of the [bb...] block (but not independently allocated).
+     *
+     * Status bits indicate whether the associated holder is constructed (&
+     * status_holder_constructed) and whether the value pointer is registered (&
+     * status_instance_registered) in `registered_instances`.
+     */
+    bool simple_layout : 1;
+    /// For simple layout, tracks whether the holder has been constructed
+    bool simple_holder_constructed : 1;
+    /// For simple layout, tracks whether the instance is registered in `registered_instances`
+    bool simple_instance_registered : 1;
+    /// If true, get_internals().patients has an entry for this object
+    bool has_patients : 1;
+
+    /// Initializes all of the above type/values/holders data (but not the instance values themselves)
+    void allocate_layout();
+
+    /// Destroys/deallocates all of the above
+    void deallocate_layout();
+
+    /// Returns the value_and_holder wrapper for the given type (or the first, if `find_type`
+    /// omitted).  Returns a default-constructed (with `.inst = nullptr`) object on failure if
+    /// `throw_if_missing` is false.
+    value_and_holder get_value_and_holder(const type_info *find_type = nullptr, bool throw_if_missing = true);
+
+    /// Bit values for the non-simple status flags
+    static constexpr uint8_t status_holder_constructed  = 1;
+    static constexpr uint8_t status_instance_registered = 2;
+};
+
+static_assert(std::is_standard_layout<instance>::value, "Internal error: `pybind11::detail::instance` is not standard layout!");
+
+/// from __cpp_future__ import (convenient aliases from C++14/17)
+#if defined(PYBIND11_CPP14) && (!defined(_MSC_VER) || _MSC_VER >= 1910)
+using std::enable_if_t;
+using std::conditional_t;
+using std::remove_cv_t;
+using std::remove_reference_t;
+#else
+template <bool B, typename T = void> using enable_if_t = typename std::enable_if<B, T>::type;
+template <bool B, typename T, typename F> using conditional_t = typename std::conditional<B, T, F>::type;
+template <typename T> using remove_cv_t = typename std::remove_cv<T>::type;
+template <typename T> using remove_reference_t = typename std::remove_reference<T>::type;
+#endif
+
+/// Index sequences
+#if defined(PYBIND11_CPP14)
+using std::index_sequence;
+using std::make_index_sequence;
+#else
+template<size_t ...> struct index_sequence  { };
+template<size_t N, size_t ...S> struct make_index_sequence_impl : make_index_sequence_impl <N - 1, N - 1, S...> { };
+template<size_t ...S> struct make_index_sequence_impl <0, S...> { typedef index_sequence<S...> type; };
+template<size_t N> using make_index_sequence = typename make_index_sequence_impl<N>::type;
+#endif
+
+/// Make an index sequence of the indices of true arguments
+template <typename ISeq, size_t, bool...> struct select_indices_impl { using type = ISeq; };
+template <size_t... IPrev, size_t I, bool B, bool... Bs> struct select_indices_impl<index_sequence<IPrev...>, I, B, Bs...>
+    : select_indices_impl<conditional_t<B, index_sequence<IPrev..., I>, index_sequence<IPrev...>>, I + 1, Bs...> {};
+template <bool... Bs> using select_indices = typename select_indices_impl<index_sequence<>, 0, Bs...>::type;
+
+/// Backports of std::bool_constant and std::negation to accommodate older compilers
+template <bool B> using bool_constant = std::integral_constant<bool, B>;
+template <typename T> struct negation : bool_constant<!T::value> { };
+
+template <typename...> struct void_t_impl { using type = void; };
+template <typename... Ts> using void_t = typename void_t_impl<Ts...>::type;
+
+/// Compile-time all/any/none of that check the boolean value of all template types
+#ifdef __cpp_fold_expressions
+template <class... Ts> using all_of = bool_constant<(Ts::value && ...)>;
+template <class... Ts> using any_of = bool_constant<(Ts::value || ...)>;
+#elif !defined(_MSC_VER)
+template <bool...> struct bools {};
+template <class... Ts> using all_of = std::is_same<
+    bools<Ts::value..., true>,
+    bools<true, Ts::value...>>;
+template <class... Ts> using any_of = negation<all_of<negation<Ts>...>>;
+#else
+// MSVC has trouble with the above, but supports std::conjunction, which we can use instead (albeit
+// at a slight loss of compilation efficiency).
+template <class... Ts> using all_of = std::conjunction<Ts...>;
+template <class... Ts> using any_of = std::disjunction<Ts...>;
+#endif
+template <class... Ts> using none_of = negation<any_of<Ts...>>;
+
+template <class T, template<class> class... Predicates> using satisfies_all_of = all_of<Predicates<T>...>;
+template <class T, template<class> class... Predicates> using satisfies_any_of = any_of<Predicates<T>...>;
+template <class T, template<class> class... Predicates> using satisfies_none_of = none_of<Predicates<T>...>;
+
+/// Strip the class from a method type
+template <typename T> struct remove_class { };
+template <typename C, typename R, typename... A> struct remove_class<R (C::*)(A...)> { typedef R type(A...); };
+template <typename C, typename R, typename... A> struct remove_class<R (C::*)(A...) const> { typedef R type(A...); };
+
+/// Helper template to strip away type modifiers
+template <typename T> struct intrinsic_type                       { typedef T type; };
+template <typename T> struct intrinsic_type<const T>              { typedef typename intrinsic_type<T>::type type; };
+template <typename T> struct intrinsic_type<T*>                   { typedef typename intrinsic_type<T>::type type; };
+template <typename T> struct intrinsic_type<T&>                   { typedef typename intrinsic_type<T>::type type; };
+template <typename T> struct intrinsic_type<T&&>                  { typedef typename intrinsic_type<T>::type type; };
+template <typename T, size_t N> struct intrinsic_type<const T[N]> { typedef typename intrinsic_type<T>::type type; };
+template <typename T, size_t N> struct intrinsic_type<T[N]>       { typedef typename intrinsic_type<T>::type type; };
+template <typename T> using intrinsic_t = typename intrinsic_type<T>::type;
+
+/// Helper type to replace 'void' in some expressions
+struct void_type { };
+
+/// Helper template which holds a list of types
+template <typename...> struct type_list { };
+
+/// Compile-time integer sum
+#ifdef __cpp_fold_expressions
+template <typename... Ts> constexpr size_t constexpr_sum(Ts... ns) { return (0 + ... + size_t{ns}); }
+#else
+constexpr size_t constexpr_sum() { return 0; }
+template <typename T, typename... Ts>
+constexpr size_t constexpr_sum(T n, Ts... ns) { return size_t{n} + constexpr_sum(ns...); }
+#endif
+
+NAMESPACE_BEGIN(constexpr_impl)
+/// Implementation details for constexpr functions
+constexpr int first(int i) { return i; }
+template <typename T, typename... Ts>
+constexpr int first(int i, T v, Ts... vs) { return v ? i : first(i + 1, vs...); }
+
+constexpr int last(int /*i*/, int result) { return result; }
+template <typename T, typename... Ts>
+constexpr int last(int i, int result, T v, Ts... vs) { return last(i + 1, v ? i : result, vs...); }
+NAMESPACE_END(constexpr_impl)
+
+/// Return the index of the first type in Ts which satisfies Predicate<T>.  Returns sizeof...(Ts) if
+/// none match.
+template <template<typename> class Predicate, typename... Ts>
+constexpr int constexpr_first() { return constexpr_impl::first(0, Predicate<Ts>::value...); }
+
+/// Return the index of the last type in Ts which satisfies Predicate<T>, or -1 if none match.
+template <template<typename> class Predicate, typename... Ts>
+constexpr int constexpr_last() { return constexpr_impl::last(0, -1, Predicate<Ts>::value...); }
+
+/// Return the Nth element from the parameter pack
+template <size_t N, typename T, typename... Ts>
+struct pack_element { using type = typename pack_element<N - 1, Ts...>::type; };
+template <typename T, typename... Ts>
+struct pack_element<0, T, Ts...> { using type = T; };
+
+/// Return the one and only type which matches the predicate, or Default if none match.
+/// If more than one type matches the predicate, fail at compile-time.
+template <template<typename> class Predicate, typename Default, typename... Ts>
+struct exactly_one {
+    static constexpr auto found = constexpr_sum(Predicate<Ts>::value...);
+    static_assert(found <= 1, "Found more than one type matching the predicate");
+
+    static constexpr auto index = found ? constexpr_first<Predicate, Ts...>() : 0;
+    using type = conditional_t<found, typename pack_element<index, Ts...>::type, Default>;
+};
+template <template<typename> class P, typename Default>
+struct exactly_one<P, Default> { using type = Default; };
+
+template <template<typename> class Predicate, typename Default, typename... Ts>
+using exactly_one_t = typename exactly_one<Predicate, Default, Ts...>::type;
+
+/// Defer the evaluation of type T until types Us are instantiated
+template <typename T, typename... /*Us*/> struct deferred_type { using type = T; };
+template <typename T, typename... Us> using deferred_t = typename deferred_type<T, Us...>::type;
+
+/// Like is_base_of, but requires a strict base (i.e. `is_strict_base_of<T, T>::value == false`,
+/// unlike `std::is_base_of`)
+template <typename Base, typename Derived> using is_strict_base_of = bool_constant<
+    std::is_base_of<Base, Derived>::value && !std::is_same<Base, Derived>::value>;
+
+template <template<typename...> class Base>
+struct is_template_base_of_impl {
+    template <typename... Us> static std::true_type check(Base<Us...> *);
+    static std::false_type check(...);
+};
+
+/// Check if a template is the base of a type. For example:
+/// `is_template_base_of<Base, T>` is true if `struct T : Base<U> {}` where U can be anything
+template <template<typename...> class Base, typename T>
+#if !defined(_MSC_VER)
+using is_template_base_of = decltype(is_template_base_of_impl<Base>::check((intrinsic_t<T>*)nullptr));
+#else // MSVC2015 has trouble with decltype in template aliases
+struct is_template_base_of : decltype(is_template_base_of_impl<Base>::check((intrinsic_t<T>*)nullptr)) { };
+#endif
+
+/// Check if T is an instantiation of the template `Class`. For example:
+/// `is_instantiation<shared_ptr, T>` is true if `T == shared_ptr<U>` where U can be anything.
+template <template<typename...> class Class, typename T>
+struct is_instantiation : std::false_type { };
+template <template<typename...> class Class, typename... Us>
+struct is_instantiation<Class, Class<Us...>> : std::true_type { };
+
+/// Check if T is std::shared_ptr<U> where U can be anything
+template <typename T> using is_shared_ptr = is_instantiation<std::shared_ptr, T>;
+
+/// Check if T looks like an input iterator
+template <typename T, typename = void> struct is_input_iterator : std::false_type {};
+template <typename T>
+struct is_input_iterator<T, void_t<decltype(*std::declval<T &>()), decltype(++std::declval<T &>())>>
+    : std::true_type {};
+
+template <typename T> using is_function_pointer = bool_constant<
+    std::is_pointer<T>::value && std::is_function<typename std::remove_pointer<T>::type>::value>;
+
+template <typename F> struct strip_function_object {
+    using type = typename remove_class<decltype(&F::operator())>::type;
+};
+
+// Extracts the function signature from a function, function pointer or lambda.
+template <typename Function, typename F = remove_reference_t<Function>>
+using function_signature_t = conditional_t<
+    std::is_function<F>::value,
+    F,
+    typename conditional_t<
+        std::is_pointer<F>::value || std::is_member_pointer<F>::value,
+        std::remove_pointer<F>,
+        strip_function_object<F>
+    >::type
+>;
+
+/// Returns true if the type looks like a lambda: that is, isn't a function, pointer or member
+/// pointer.  Note that this can catch all sorts of other things, too; this is intended to be used
+/// in a place where passing a lambda makes sense.
+template <typename T> using is_lambda = satisfies_none_of<remove_reference_t<T>,
+        std::is_function, std::is_pointer, std::is_member_pointer>;
+
+/// Ignore that a variable is unused in compiler warnings
+inline void ignore_unused(const int *) { }
+
+/// Apply a function over each element of a parameter pack
+#ifdef __cpp_fold_expressions
+#define PYBIND11_EXPAND_SIDE_EFFECTS(PATTERN) (((PATTERN), void()), ...)
+#else
+using expand_side_effects = bool[];
+#define PYBIND11_EXPAND_SIDE_EFFECTS(PATTERN) pybind11::detail::expand_side_effects{ ((PATTERN), void(), false)..., false }
+#endif
+
+NAMESPACE_END(detail)
+
+/// C++ bindings of builtin Python exceptions
+class builtin_exception : public std::runtime_error {
+public:
+    using std::runtime_error::runtime_error;
+    /// Set the error using the Python C API
+    virtual void set_error() const = 0;
+};
+
+#define PYBIND11_RUNTIME_EXCEPTION(name, type) \
+    class name : public builtin_exception { public: \
+        using builtin_exception::builtin_exception; \
+        name() : name("") { } \
+        void set_error() const override { PyErr_SetString(type, what()); } \
+    };
+
+PYBIND11_RUNTIME_EXCEPTION(stop_iteration, PyExc_StopIteration)
+PYBIND11_RUNTIME_EXCEPTION(index_error, PyExc_IndexError)
+PYBIND11_RUNTIME_EXCEPTION(key_error, PyExc_KeyError)
+PYBIND11_RUNTIME_EXCEPTION(value_error, PyExc_ValueError)
+PYBIND11_RUNTIME_EXCEPTION(type_error, PyExc_TypeError)
+PYBIND11_RUNTIME_EXCEPTION(cast_error, PyExc_RuntimeError) /// Thrown when pybind11::cast or handle::call fail due to a type casting error
+PYBIND11_RUNTIME_EXCEPTION(reference_cast_error, PyExc_RuntimeError) /// Used internally
+
+[[noreturn]] PYBIND11_NOINLINE inline void pybind11_fail(const char *reason) { throw std::runtime_error(reason); }
+[[noreturn]] PYBIND11_NOINLINE inline void pybind11_fail(const std::string &reason) { throw std::runtime_error(reason); }
+
+template <typename T, typename SFINAE = void> struct format_descriptor { };
+
+NAMESPACE_BEGIN(detail)
+// Returns the index of the given type in the type char array below, and in the list in numpy.h
+// The order here is: bool; 8 ints ((signed,unsigned)x(8,16,32,64)bits); float,double,long double;
+// complex float,double,long double.  Note that the long double types only participate when long
+// double is actually longer than double (it isn't under MSVC).
+// NB: not only the string below but also complex.h and numpy.h rely on this order.
+template <typename T, typename SFINAE = void> struct is_fmt_numeric { static constexpr bool value = false; };
+template <typename T> struct is_fmt_numeric<T, enable_if_t<std::is_arithmetic<T>::value>> {
+    static constexpr bool value = true;
+    static constexpr int index = std::is_same<T, bool>::value ? 0 : 1 + (
+        std::is_integral<T>::value ? detail::log2(sizeof(T))*2 + std::is_unsigned<T>::value : 8 + (
+        std::is_same<T, double>::value ? 1 : std::is_same<T, long double>::value ? 2 : 0));
+};
+NAMESPACE_END(detail)
+
+template <typename T> struct format_descriptor<T, detail::enable_if_t<std::is_arithmetic<T>::value>> {
+    static constexpr const char c = "?bBhHiIqQfdg"[detail::is_fmt_numeric<T>::index];
+    static constexpr const char value[2] = { c, '\0' };
+    static std::string format() { return std::string(1, c); }
+};
+
+template <typename T> constexpr const char format_descriptor<
+    T, detail::enable_if_t<std::is_arithmetic<T>::value>>::value[2];
+
+/// RAII wrapper that temporarily clears any Python error state
+struct error_scope {
+    PyObject *type, *value, *trace;
+    error_scope() { PyErr_Fetch(&type, &value, &trace); }
+    ~error_scope() { PyErr_Restore(type, value, trace); }
+};
+
+/// Dummy destructor wrapper that can be used to expose classes with a private destructor
+struct nodelete { template <typename T> void operator()(T*) { } };
+
+// overload_cast requires variable templates: C++14
+#if defined(PYBIND11_CPP14)
+#define PYBIND11_OVERLOAD_CAST 1
+
+NAMESPACE_BEGIN(detail)
+template <typename... Args>
+struct overload_cast_impl {
+    constexpr overload_cast_impl() {} // MSVC 2015 needs this
+
+    template <typename Return>
+    constexpr auto operator()(Return (*pf)(Args...)) const noexcept
+                              -> decltype(pf) { return pf; }
+
+    template <typename Return, typename Class>
+    constexpr auto operator()(Return (Class::*pmf)(Args...), std::false_type = {}) const noexcept
+                              -> decltype(pmf) { return pmf; }
+
+    template <typename Return, typename Class>
+    constexpr auto operator()(Return (Class::*pmf)(Args...) const, std::true_type) const noexcept
+                              -> decltype(pmf) { return pmf; }
+};
+NAMESPACE_END(detail)
+
+/// Syntax sugar for resolving overloaded function pointers:
+///  - regular: static_cast<Return (Class::*)(Arg0, Arg1, Arg2)>(&Class::func)
+///  - sweet:   overload_cast<Arg0, Arg1, Arg2>(&Class::func)
+template <typename... Args>
+static constexpr detail::overload_cast_impl<Args...> overload_cast = {};
+// MSVC 2015 only accepts this particular initialization syntax for this variable template.
+
+/// Const member function selector for overload_cast
+///  - regular: static_cast<Return (Class::*)(Arg) const>(&Class::func)
+///  - sweet:   overload_cast<Arg>(&Class::func, const_)
+static constexpr auto const_ = std::true_type{};
+
+#else // no overload_cast: providing something that static_assert-fails:
+template <typename... Args> struct overload_cast {
+    static_assert(detail::deferred_t<std::false_type, Args...>::value,
+                  "pybind11::overload_cast<...> requires compiling in C++14 mode");
+};
+#endif // overload_cast
+
+NAMESPACE_BEGIN(detail)
+
+// Adaptor for converting arbitrary container arguments into a vector; implicitly convertible from
+// any standard container (or C-style array) supporting std::begin/std::end, any singleton
+// arithmetic type (if T is arithmetic), or explicitly constructible from an iterator pair.
+template <typename T>
+class any_container {
+    std::vector<T> v;
+public:
+    any_container() = default;
+
+    // Can construct from a pair of iterators
+    template <typename It, typename = enable_if_t<is_input_iterator<It>::value>>
+    any_container(It first, It last) : v(first, last) { }
+
+    // Implicit conversion constructor from any arbitrary container type with values convertible to T
+    template <typename Container, typename = enable_if_t<std::is_convertible<decltype(*std::begin(std::declval<const Container &>())), T>::value>>
+    any_container(const Container &c) : any_container(std::begin(c), std::end(c)) { }
+
+    // initializer_list's aren't deducible, so don't get matched by the above template; we need this
+    // to explicitly allow implicit conversion from one:
+    template <typename TIn, typename = enable_if_t<std::is_convertible<TIn, T>::value>>
+    any_container(const std::initializer_list<TIn> &c) : any_container(c.begin(), c.end()) { }
+
+    // Avoid copying if given an rvalue vector of the correct type.
+    any_container(std::vector<T> &&v) : v(std::move(v)) { }
+
+    // Moves the vector out of an rvalue any_container
+    operator std::vector<T> &&() && { return std::move(v); }
+
+    // Dereferencing obtains a reference to the underlying vector
+    std::vector<T> &operator*() { return v; }
+    const std::vector<T> &operator*() const { return v; }
+
+    // -> lets you call methods on the underlying vector
+    std::vector<T> *operator->() { return &v; }
+    const std::vector<T> *operator->() const { return &v; }
+};
+
+NAMESPACE_END(detail)
+
+
+
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/descr.h b/pybind11/include/pybind11/detail/descr.h
similarity index 92%
rename from pybind11/include/pybind11/descr.h
rename to pybind11/include/pybind11/detail/descr.h
index 2c3fb3d13..e3bf2ba97 100644
--- a/pybind11/include/pybind11/descr.h
+++ b/pybind11/include/pybind11/detail/descr.h
@@ -1,5 +1,5 @@
 /*
-    pybind11/descr.h: Helper type for concatenating type signatures
+    pybind11/detail/descr.h: Helper type for concatenating type signatures
     either at runtime (C++11) or compile time (C++14)
 
     Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
@@ -12,10 +12,12 @@
 
 #include "common.h"
 
-NAMESPACE_BEGIN(pybind11)
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 NAMESPACE_BEGIN(detail)
 
-#if defined(PYBIND11_CPP14) /* Concatenate type signatures at compile time using C++14 */
+/* Concatenate type signatures at compile time using C++14 */
+#if defined(PYBIND11_CPP14) && !defined(_MSC_VER)
+#define PYBIND11_CONSTEXPR_DESCR
 
 template <size_t Size1, size_t Size2> class descr {
     template <size_t Size1_, size_t Size2_> friend class descr;
@@ -113,20 +115,20 @@ public:
         memcpy(m_types, types, nTypes * sizeof(const std::type_info *));
     }
 
-    PYBIND11_NOINLINE descr friend operator+(descr &&d1, descr &&d2) {
+    PYBIND11_NOINLINE descr operator+(descr &&d2) && {
         descr r;
 
-        size_t nChars1 = len(d1.m_text), nTypes1 = len(d1.m_types);
+        size_t nChars1 = len(m_text),    nTypes1 = len(m_types);
         size_t nChars2 = len(d2.m_text), nTypes2 = len(d2.m_types);
 
         r.m_text  = new char[nChars1 + nChars2 - 1];
         r.m_types = new const std::type_info *[nTypes1 + nTypes2 - 1];
-        memcpy(r.m_text, d1.m_text, (nChars1-1) * sizeof(char));
+        memcpy(r.m_text, m_text, (nChars1-1) * sizeof(char));
         memcpy(r.m_text + nChars1 - 1, d2.m_text, nChars2 * sizeof(char));
-        memcpy(r.m_types, d1.m_types, (nTypes1-1) * sizeof(std::type_info *));
+        memcpy(r.m_types, m_types, (nTypes1-1) * sizeof(std::type_info *));
         memcpy(r.m_types + nTypes1 - 1, d2.m_types, nTypes2 * sizeof(std::type_info *));
 
-        delete[] d1.m_text; delete[] d1.m_types;
+        delete[] m_text;    delete[] m_types;
         delete[] d2.m_text; delete[] d2.m_types;
 
         return r;
@@ -180,4 +182,4 @@ PYBIND11_NOINLINE inline descr type_descr(descr&& d) { return _("{") + std::move
 #endif
 
 NAMESPACE_END(detail)
-NAMESPACE_END(pybind11)
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/detail/init.h b/pybind11/include/pybind11/detail/init.h
new file mode 100644
index 000000000..c3594a190
--- /dev/null
+++ b/pybind11/include/pybind11/detail/init.h
@@ -0,0 +1,325 @@
+/*
+    pybind11/detail/init.h: init factory function implementation and support code.
+
+    Copyright (c) 2017 Jason Rhinelander <jason@imaginary.ca>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "class.h"
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+NAMESPACE_BEGIN(detail)
+
+template <>
+class type_caster<value_and_holder> {
+public:
+    bool load(handle h, bool) {
+        value = reinterpret_cast<value_and_holder *>(h.ptr());
+        return true;
+    }
+
+    template <typename> using cast_op_type = value_and_holder &;
+    operator value_and_holder &() { return *value; }
+    static PYBIND11_DESCR name() { return type_descr(_<value_and_holder>()); }
+
+private:
+    value_and_holder *value = nullptr;
+};
+
+NAMESPACE_BEGIN(initimpl)
+
+inline void no_nullptr(void *ptr) {
+    if (!ptr) throw type_error("pybind11::init(): factory function returned nullptr");
+}
+
+// Implementing functions for all forms of py::init<...> and py::init(...)
+template <typename Class> using Cpp = typename Class::type;
+template <typename Class> using Alias = typename Class::type_alias;
+template <typename Class> using Holder = typename Class::holder_type;
+
+template <typename Class> using is_alias_constructible = std::is_constructible<Alias<Class>, Cpp<Class> &&>;
+
+// Takes a Cpp pointer and returns true if it actually is a polymorphic Alias instance.
+template <typename Class, enable_if_t<Class::has_alias, int> = 0>
+bool is_alias(Cpp<Class> *ptr) {
+    return dynamic_cast<Alias<Class> *>(ptr) != nullptr;
+}
+// Failing fallback version of the above for a no-alias class (always returns false)
+template <typename /*Class*/>
+constexpr bool is_alias(void *) { return false; }
+
+// Attempts to construct an alias using an `Alias(Cpp &&)` constructor.  This allows types with
+// an alias to provide only a single Cpp factory function as long as the Alias can be
+// constructed from an rvalue reference of the base Cpp type.  This means that Alias classes
+// can, when appropriate, simply define an `Alias(Cpp &&)` constructor rather than needing to
+// inherit all the base class constructors.
+template <typename Class>
+void construct_alias_from_cpp(std::true_type /*is_alias_constructible*/,
+                              value_and_holder &v_h, Cpp<Class> &&base) {
+    v_h.value_ptr() = new Alias<Class>(std::move(base));
+}
+template <typename Class>
+[[noreturn]] void construct_alias_from_cpp(std::false_type /*!is_alias_constructible*/,
+                                           value_and_holder &, Cpp<Class> &&) {
+    throw type_error("pybind11::init(): unable to convert returned instance to required "
+                     "alias class: no `Alias<Class>(Class &&)` constructor available");
+}
+
+// Error-generating fallback for factories that don't match one of the below construction
+// mechanisms.
+template <typename Class>
+void construct(...) {
+    static_assert(!std::is_same<Class, Class>::value /* always false */,
+            "pybind11::init(): init function must return a compatible pointer, "
+            "holder, or value");
+}
+
+// Pointer return v1: the factory function returns a class pointer for a registered class.
+// If we don't need an alias (because this class doesn't have one, or because the final type is
+// inherited on the Python side) we can simply take over ownership.  Otherwise we need to try to
+// construct an Alias from the returned base instance.
+template <typename Class>
+void construct(value_and_holder &v_h, Cpp<Class> *ptr, bool need_alias) {
+    no_nullptr(ptr);
+    if (Class::has_alias && need_alias && !is_alias<Class>(ptr)) {
+        // We're going to try to construct an alias by moving the cpp type.  Whether or not
+        // that succeeds, we still need to destroy the original cpp pointer (either the
+        // moved away leftover, if the alias construction works, or the value itself if we
+        // throw an error), but we can't just call `delete ptr`: it might have a special
+        // deleter, or might be shared_from_this.  So we construct a holder around it as if
+        // it was a normal instance, then steal the holder away into a local variable; thus
+        // the holder and destruction happens when we leave the C++ scope, and the holder
+        // class gets to handle the destruction however it likes.
+        v_h.value_ptr() = ptr;
+        v_h.set_instance_registered(true); // To prevent init_instance from registering it
+        v_h.type->init_instance(v_h.inst, nullptr); // Set up the holder
+        Holder<Class> temp_holder(std::move(v_h.holder<Holder<Class>>())); // Steal the holder
+        v_h.type->dealloc(v_h); // Destroys the moved-out holder remains, resets value ptr to null
+        v_h.set_instance_registered(false);
+
+        construct_alias_from_cpp<Class>(is_alias_constructible<Class>{}, v_h, std::move(*ptr));
+    } else {
+        // Otherwise the type isn't inherited, so we don't need an Alias
+        v_h.value_ptr() = ptr;
+    }
+}
+
+// Pointer return v2: a factory that always returns an alias instance ptr.  We simply take over
+// ownership of the pointer.
+template <typename Class, enable_if_t<Class::has_alias, int> = 0>
+void construct(value_and_holder &v_h, Alias<Class> *alias_ptr, bool) {
+    no_nullptr(alias_ptr);
+    v_h.value_ptr() = static_cast<Cpp<Class> *>(alias_ptr);
+}
+
+// Holder return: copy its pointer, and move or copy the returned holder into the new instance's
+// holder.  This also handles types like std::shared_ptr<T> and std::unique_ptr<T> where T is a
+// derived type (through those holder's implicit conversion from derived class holder constructors).
+template <typename Class>
+void construct(value_and_holder &v_h, Holder<Class> holder, bool need_alias) {
+    auto *ptr = holder_helper<Holder<Class>>::get(holder);
+    // If we need an alias, check that the held pointer is actually an alias instance
+    if (Class::has_alias && need_alias && !is_alias<Class>(ptr))
+        throw type_error("pybind11::init(): construction failed: returned holder-wrapped instance "
+                         "is not an alias instance");
+
+    v_h.value_ptr() = ptr;
+    v_h.type->init_instance(v_h.inst, &holder);
+}
+
+// return-by-value version 1: returning a cpp class by value.  If the class has an alias and an
+// alias is required the alias must have an `Alias(Cpp &&)` constructor so that we can construct
+// the alias from the base when needed (i.e. because of Python-side inheritance).  When we don't
+// need it, we simply move-construct the cpp value into a new instance.
+template <typename Class>
+void construct(value_and_holder &v_h, Cpp<Class> &&result, bool need_alias) {
+    static_assert(std::is_move_constructible<Cpp<Class>>::value,
+        "pybind11::init() return-by-value factory function requires a movable class");
+    if (Class::has_alias && need_alias)
+        construct_alias_from_cpp<Class>(is_alias_constructible<Class>{}, v_h, std::move(result));
+    else
+        v_h.value_ptr() = new Cpp<Class>(std::move(result));
+}
+
+// return-by-value version 2: returning a value of the alias type itself.  We move-construct an
+// Alias instance (even if no python-side inheritance is involved).  This is intended for
+// cases where Alias initialization is always desired.
+template <typename Class>
+void construct(value_and_holder &v_h, Alias<Class> &&result, bool) {
+    static_assert(std::is_move_constructible<Alias<Class>>::value,
+        "pybind11::init() return-by-alias-value factory function requires a movable alias class");
+    v_h.value_ptr() = new Alias<Class>(std::move(result));
+}
+
+// Implementing class for py::init<...>()
+template <typename... Args>
+struct constructor {
+    template <typename Class, typename... Extra, enable_if_t<!Class::has_alias, int> = 0>
+    static void execute(Class &cl, const Extra&... extra) {
+        cl.def("__init__", [](value_and_holder &v_h, Args... args) {
+            v_h.value_ptr() = new Cpp<Class>{std::forward<Args>(args)...};
+        }, is_new_style_constructor(), extra...);
+    }
+
+    template <typename Class, typename... Extra,
+              enable_if_t<Class::has_alias &&
+                          std::is_constructible<Cpp<Class>, Args...>::value, int> = 0>
+    static void execute(Class &cl, const Extra&... extra) {
+        cl.def("__init__", [](value_and_holder &v_h, Args... args) {
+            if (Py_TYPE(v_h.inst) == v_h.type->type)
+                v_h.value_ptr() = new Cpp<Class>{std::forward<Args>(args)...};
+            else
+                v_h.value_ptr() = new Alias<Class>{std::forward<Args>(args)...};
+        }, is_new_style_constructor(), extra...);
+    }
+
+    template <typename Class, typename... Extra,
+              enable_if_t<Class::has_alias &&
+                          !std::is_constructible<Cpp<Class>, Args...>::value, int> = 0>
+    static void execute(Class &cl, const Extra&... extra) {
+        cl.def("__init__", [](value_and_holder &v_h, Args... args) {
+            v_h.value_ptr() = new Alias<Class>{std::forward<Args>(args)...};
+        }, is_new_style_constructor(), extra...);
+    }
+};
+
+// Implementing class for py::init_alias<...>()
+template <typename... Args> struct alias_constructor {
+    template <typename Class, typename... Extra,
+              enable_if_t<Class::has_alias && std::is_constructible<Alias<Class>, Args...>::value, int> = 0>
+    static void execute(Class &cl, const Extra&... extra) {
+        cl.def("__init__", [](value_and_holder &v_h, Args... args) {
+            v_h.value_ptr() = new Alias<Class>{std::forward<Args>(args)...};
+        }, is_new_style_constructor(), extra...);
+    }
+};
+
+// Implementation class for py::init(Func) and py::init(Func, AliasFunc)
+template <typename CFunc, typename AFunc = void_type (*)(),
+          typename = function_signature_t<CFunc>, typename = function_signature_t<AFunc>>
+struct factory;
+
+// Specialization for py::init(Func)
+template <typename Func, typename Return, typename... Args>
+struct factory<Func, void_type (*)(), Return(Args...)> {
+    remove_reference_t<Func> class_factory;
+
+    factory(Func &&f) : class_factory(std::forward<Func>(f)) { }
+
+    // The given class either has no alias or has no separate alias factory;
+    // this always constructs the class itself.  If the class is registered with an alias
+    // type and an alias instance is needed (i.e. because the final type is a Python class
+    // inheriting from the C++ type) the returned value needs to either already be an alias
+    // instance, or the alias needs to be constructible from a `Class &&` argument.
+    template <typename Class, typename... Extra>
+    void execute(Class &cl, const Extra &...extra) && {
+        #if defined(PYBIND11_CPP14)
+        cl.def("__init__", [func = std::move(class_factory)]
+        #else
+        auto &func = class_factory;
+        cl.def("__init__", [func]
+        #endif
+        (value_and_holder &v_h, Args... args) {
+            construct<Class>(v_h, func(std::forward<Args>(args)...),
+                             Py_TYPE(v_h.inst) != v_h.type->type);
+        }, is_new_style_constructor(), extra...);
+    }
+};
+
+// Specialization for py::init(Func, AliasFunc)
+template <typename CFunc, typename AFunc,
+          typename CReturn, typename... CArgs, typename AReturn, typename... AArgs>
+struct factory<CFunc, AFunc, CReturn(CArgs...), AReturn(AArgs...)> {
+    static_assert(sizeof...(CArgs) == sizeof...(AArgs),
+                  "pybind11::init(class_factory, alias_factory): class and alias factories "
+                  "must have identical argument signatures");
+    static_assert(all_of<std::is_same<CArgs, AArgs>...>::value,
+                  "pybind11::init(class_factory, alias_factory): class and alias factories "
+                  "must have identical argument signatures");
+
+    remove_reference_t<CFunc> class_factory;
+    remove_reference_t<AFunc> alias_factory;
+
+    factory(CFunc &&c, AFunc &&a)
+        : class_factory(std::forward<CFunc>(c)), alias_factory(std::forward<AFunc>(a)) { }
+
+    // The class factory is called when the `self` type passed to `__init__` is the direct
+    // class (i.e. not inherited), the alias factory when `self` is a Python-side subtype.
+    template <typename Class, typename... Extra>
+    void execute(Class &cl, const Extra&... extra) && {
+        static_assert(Class::has_alias, "The two-argument version of `py::init()` can "
+                                        "only be used if the class has an alias");
+        #if defined(PYBIND11_CPP14)
+        cl.def("__init__", [class_func = std::move(class_factory), alias_func = std::move(alias_factory)]
+        #else
+        auto &class_func = class_factory;
+        auto &alias_func = alias_factory;
+        cl.def("__init__", [class_func, alias_func]
+        #endif
+        (value_and_holder &v_h, CArgs... args) {
+            if (Py_TYPE(v_h.inst) == v_h.type->type)
+                // If the instance type equals the registered type we don't have inheritance, so
+                // don't need the alias and can construct using the class function:
+                construct<Class>(v_h, class_func(std::forward<CArgs>(args)...), false);
+            else
+                construct<Class>(v_h, alias_func(std::forward<CArgs>(args)...), true);
+        }, is_new_style_constructor(), extra...);
+    }
+};
+
+/// Set just the C++ state. Same as `__init__`.
+template <typename Class, typename T>
+void setstate(value_and_holder &v_h, T &&result, bool need_alias) {
+    construct<Class>(v_h, std::forward<T>(result), need_alias);
+}
+
+/// Set both the C++ and Python states
+template <typename Class, typename T, typename O,
+          enable_if_t<std::is_convertible<O, handle>::value, int> = 0>
+void setstate(value_and_holder &v_h, std::pair<T, O> &&result, bool need_alias) {
+    construct<Class>(v_h, std::move(result.first), need_alias);
+    setattr((PyObject *) v_h.inst, "__dict__", result.second);
+}
+
+/// Implementation for py::pickle(GetState, SetState)
+template <typename Get, typename Set,
+          typename = function_signature_t<Get>, typename = function_signature_t<Set>>
+struct pickle_factory;
+
+template <typename Get, typename Set,
+          typename RetState, typename Self, typename NewInstance, typename ArgState>
+struct pickle_factory<Get, Set, RetState(Self), NewInstance(ArgState)> {
+    static_assert(std::is_same<intrinsic_t<RetState>, intrinsic_t<ArgState>>::value,
+                  "The type returned by `__getstate__` must be the same "
+                  "as the argument accepted by `__setstate__`");
+
+    remove_reference_t<Get> get;
+    remove_reference_t<Set> set;
+
+    pickle_factory(Get get, Set set)
+        : get(std::forward<Get>(get)), set(std::forward<Set>(set)) { }
+
+    template <typename Class, typename... Extra>
+    void execute(Class &cl, const Extra &...extra) && {
+        cl.def("__getstate__", std::move(get));
+
+#if defined(PYBIND11_CPP14)
+        cl.def("__setstate__", [func = std::move(set)]
+#else
+        auto &func = set;
+        cl.def("__setstate__", [func]
+#endif
+        (value_and_holder &v_h, ArgState state) {
+            setstate<Class>(v_h, func(std::forward<ArgState>(state)),
+                            Py_TYPE(v_h.inst) != v_h.type->type);
+        }, is_new_style_constructor(), extra...);
+    }
+};
+
+NAMESPACE_END(initimpl)
+NAMESPACE_END(detail)
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/detail/internals.h b/pybind11/include/pybind11/detail/internals.h
new file mode 100644
index 000000000..213cbaeb2
--- /dev/null
+++ b/pybind11/include/pybind11/detail/internals.h
@@ -0,0 +1,247 @@
+/*
+    pybind11/detail/internals.h: Internal data structure and related functions
+
+    Copyright (c) 2017 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "../pytypes.h"
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+NAMESPACE_BEGIN(detail)
+// Forward declarations
+inline PyTypeObject *make_static_property_type();
+inline PyTypeObject *make_default_metaclass();
+inline PyObject *make_object_base_type(PyTypeObject *metaclass);
+
+// Python loads modules by default with dlopen with the RTLD_LOCAL flag; under libc++ and possibly
+// other STLs, this means `typeid(A)` from one module won't equal `typeid(A)` from another module
+// even when `A` is the same, non-hidden-visibility type (e.g. from a common include).  Under
+// libstdc++, this doesn't happen: equality and the type_index hash are based on the type name,
+// which works.  If not under a known-good stl, provide our own name-based hash and equality
+// functions that use the type name.
+#if defined(__GLIBCXX__)
+inline bool same_type(const std::type_info &lhs, const std::type_info &rhs) { return lhs == rhs; }
+using type_hash = std::hash<std::type_index>;
+using type_equal_to = std::equal_to<std::type_index>;
+#else
+inline bool same_type(const std::type_info &lhs, const std::type_info &rhs) {
+    return lhs.name() == rhs.name() || std::strcmp(lhs.name(), rhs.name()) == 0;
+}
+
+struct type_hash {
+    size_t operator()(const std::type_index &t) const {
+        size_t hash = 5381;
+        const char *ptr = t.name();
+        while (auto c = static_cast<unsigned char>(*ptr++))
+            hash = (hash * 33) ^ c;
+        return hash;
+    }
+};
+
+struct type_equal_to {
+    bool operator()(const std::type_index &lhs, const std::type_index &rhs) const {
+        return lhs.name() == rhs.name() || std::strcmp(lhs.name(), rhs.name()) == 0;
+    }
+};
+#endif
+
+template <typename value_type>
+using type_map = std::unordered_map<std::type_index, value_type, type_hash, type_equal_to>;
+
+struct overload_hash {
+    inline size_t operator()(const std::pair<const PyObject *, const char *>& v) const {
+        size_t value = std::hash<const void *>()(v.first);
+        value ^= std::hash<const void *>()(v.second)  + 0x9e3779b9 + (value<<6) + (value>>2);
+        return value;
+    }
+};
+
+/// Internal data structure used to track registered instances and types.
+/// Whenever binary incompatible changes are made to this structure,
+/// `PYBIND11_INTERNALS_VERSION` must be incremented.
+struct internals {
+    type_map<type_info *> registered_types_cpp; // std::type_index -> pybind11's type information
+    std::unordered_map<PyTypeObject *, std::vector<type_info *>> registered_types_py; // PyTypeObject* -> base type_info(s)
+    std::unordered_multimap<const void *, instance*> registered_instances; // void * -> instance*
+    std::unordered_set<std::pair<const PyObject *, const char *>, overload_hash> inactive_overload_cache;
+    type_map<std::vector<bool (*)(PyObject *, void *&)>> direct_conversions;
+    std::unordered_map<const PyObject *, std::vector<PyObject *>> patients;
+    std::forward_list<void (*) (std::exception_ptr)> registered_exception_translators;
+    std::unordered_map<std::string, void *> shared_data; // Custom data to be shared across extensions
+    std::vector<PyObject *> loader_patient_stack; // Used by `loader_life_support`
+    std::forward_list<std::string> static_strings; // Stores the std::strings backing detail::c_str()
+    PyTypeObject *static_property_type;
+    PyTypeObject *default_metaclass;
+    PyObject *instance_base;
+#if defined(WITH_THREAD)
+    decltype(PyThread_create_key()) tstate = 0; // Usually an int but a long on Cygwin64 with Python 3.x
+    PyInterpreterState *istate = nullptr;
+#endif
+};
+
+/// Additional type information which does not fit into the PyTypeObject.
+/// Changes to this struct also require bumping `PYBIND11_INTERNALS_VERSION`.
+struct type_info {
+    PyTypeObject *type;
+    const std::type_info *cpptype;
+    size_t type_size, holder_size_in_ptrs;
+    void *(*operator_new)(size_t);
+    void (*init_instance)(instance *, const void *);
+    void (*dealloc)(value_and_holder &v_h);
+    std::vector<PyObject *(*)(PyObject *, PyTypeObject *)> implicit_conversions;
+    std::vector<std::pair<const std::type_info *, void *(*)(void *)>> implicit_casts;
+    std::vector<bool (*)(PyObject *, void *&)> *direct_conversions;
+    buffer_info *(*get_buffer)(PyObject *, void *) = nullptr;
+    void *get_buffer_data = nullptr;
+    void *(*module_local_load)(PyObject *, const type_info *) = nullptr;
+    /* A simple type never occurs as a (direct or indirect) parent
+     * of a class that makes use of multiple inheritance */
+    bool simple_type : 1;
+    /* True if there is no multiple inheritance in this type's inheritance tree */
+    bool simple_ancestors : 1;
+    /* for base vs derived holder_type checks */
+    bool default_holder : 1;
+    /* true if this is a type registered with py::module_local */
+    bool module_local : 1;
+};
+
+/// Tracks the `internals` and `type_info` ABI version independent of the main library version
+#define PYBIND11_INTERNALS_VERSION 1
+
+#if defined(WITH_THREAD)
+#  define PYBIND11_INTERNALS_KIND ""
+#else
+#  define PYBIND11_INTERNALS_KIND "_without_thread"
+#endif
+
+#define PYBIND11_INTERNALS_ID "__pybind11_internals_v" \
+    PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND "__"
+
+#define PYBIND11_MODULE_LOCAL_ID "__pybind11_module_local_v" \
+    PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND "__"
+
+/// Each module locally stores a pointer to the `internals` data. The data
+/// itself is shared among modules with the same `PYBIND11_INTERNALS_ID`.
+inline internals *&get_internals_ptr() {
+    static internals *internals_ptr = nullptr;
+    return internals_ptr;
+}
+
+/// Return a reference to the current `internals` data
+PYBIND11_NOINLINE inline internals &get_internals() {
+    auto *&internals_ptr = get_internals_ptr();
+    if (internals_ptr)
+        return *internals_ptr;
+
+    constexpr auto *id = PYBIND11_INTERNALS_ID;
+    auto builtins = handle(PyEval_GetBuiltins());
+    if (builtins.contains(id) && isinstance<capsule>(builtins[id])) {
+        internals_ptr = *static_cast<internals **>(capsule(builtins[id]));
+
+        // We loaded internals through python's builtins, which means that our `error_already_set`
+        // and `builtin_exception` may be different local classes than the ones set up in the
+        // initial exception translator, below, so add another for our local exception classes.
+        //
+        // libstdc++ doesn't require this (types there are identified only by name)
+#if !defined(__GLIBCXX__)
+        internals_ptr->registered_exception_translators.push_front(
+            [](std::exception_ptr p) -> void {
+                try {
+                    if (p) std::rethrow_exception(p);
+                } catch (error_already_set &e)       { e.restore();   return;
+                } catch (const builtin_exception &e) { e.set_error(); return;
+                }
+            }
+        );
+#endif
+    } else {
+        internals_ptr = new internals();
+#if defined(WITH_THREAD)
+        PyEval_InitThreads();
+        PyThreadState *tstate = PyThreadState_Get();
+        internals_ptr->tstate = PyThread_create_key();
+        PyThread_set_key_value(internals_ptr->tstate, tstate);
+        internals_ptr->istate = tstate->interp;
+#endif
+        builtins[id] = capsule(&internals_ptr);
+        internals_ptr->registered_exception_translators.push_front(
+            [](std::exception_ptr p) -> void {
+                try {
+                    if (p) std::rethrow_exception(p);
+                } catch (error_already_set &e)           { e.restore();                                    return;
+                } catch (const builtin_exception &e)     { e.set_error();                                  return;
+                } catch (const std::bad_alloc &e)        { PyErr_SetString(PyExc_MemoryError,   e.what()); return;
+                } catch (const std::domain_error &e)     { PyErr_SetString(PyExc_ValueError,    e.what()); return;
+                } catch (const std::invalid_argument &e) { PyErr_SetString(PyExc_ValueError,    e.what()); return;
+                } catch (const std::length_error &e)     { PyErr_SetString(PyExc_ValueError,    e.what()); return;
+                } catch (const std::out_of_range &e)     { PyErr_SetString(PyExc_IndexError,    e.what()); return;
+                } catch (const std::range_error &e)      { PyErr_SetString(PyExc_ValueError,    e.what()); return;
+                } catch (const std::exception &e)        { PyErr_SetString(PyExc_RuntimeError,  e.what()); return;
+                } catch (...) {
+                    PyErr_SetString(PyExc_RuntimeError, "Caught an unknown exception!");
+                    return;
+                }
+            }
+        );
+        internals_ptr->static_property_type = make_static_property_type();
+        internals_ptr->default_metaclass = make_default_metaclass();
+        internals_ptr->instance_base = make_object_base_type(internals_ptr->default_metaclass);
+    }
+    return *internals_ptr;
+}
+
+/// Works like `internals.registered_types_cpp`, but for module-local registered types (kept in a function-local static):
+inline type_map<type_info *> &registered_local_types_cpp() {
+    static type_map<type_info *> locals{};
+    return locals;
+}
+
+/// Constructs a std::string with the given arguments, stores it in `internals`, and returns its
+/// `c_str()`.  Such string objects have a long storage duration -- the internal strings are only
+/// cleared when the program exits or after interpreter shutdown (when embedding), and so are
+/// suitable for c-style strings needed by Python internals (such as PyTypeObject's tp_name).
+template <typename... Args>
+const char *c_str(Args &&...args) {
+    auto &strings = get_internals().static_strings;
+    strings.emplace_front(std::forward<Args>(args)...); // newest string goes to the front ...
+    return strings.front().c_str();                     // ... so this is the one just added
+}
+
+NAMESPACE_END(detail)
+
+/// Looks up `name` among the pointers shared by all extension modules (built against the same
+/// pybind11 version) running in the current interpreter. Names starting with underscores
+/// are reserved for internal usage. Yields `nullptr` when there is no entry for `name`.
+inline PYBIND11_NOINLINE void *get_shared_data(const std::string &name) {
+    auto &shared = detail::get_internals().shared_data;
+    auto found = shared.find(name);
+    return found == shared.end() ? nullptr : found->second;
+}
+
+/// Set the shared data that can be later recovered by `get_shared_data()`. Returns `data` unchanged.
+inline PYBIND11_NOINLINE void *set_shared_data(const std::string &name, void *data) {
+    detail::get_internals().shared_data[name] = data;
+    return data;
+}
+
+/// Typed accessor on top of the shared data map: when a non-null entry called `name` already
+/// exists it is returned as a `T &`. Otherwise a default-constructed `T` is allocated with
+/// `new`, stored under `name`, and a reference to the new object is returned.
+template<typename T>
+T &get_or_create_shared_data(const std::string &name) {
+    auto &shared = detail::get_internals().shared_data;
+    auto found = shared.find(name);
+    auto *ptr = static_cast<T *>(found == shared.end() ? nullptr : found->second);
+    if (!ptr) {
+        ptr = new T();
+        shared[name] = ptr;
+    }
+    return *ptr;
+}
+
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/typeid.h b/pybind11/include/pybind11/detail/typeid.h
similarity index 89%
rename from pybind11/include/pybind11/typeid.h
rename to pybind11/include/pybind11/detail/typeid.h
index c903fb14c..6f36aab75 100644
--- a/pybind11/include/pybind11/typeid.h
+++ b/pybind11/include/pybind11/detail/typeid.h
@@ -1,5 +1,5 @@
 /*
-    pybind11/typeid.h: Compiler-independent access to type identifiers
+    pybind11/detail/typeid.h: Compiler-independent access to type identifiers
 
     Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
 
@@ -16,7 +16,7 @@
 #include <cxxabi.h>
 #endif
 
-NAMESPACE_BEGIN(pybind11)
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 NAMESPACE_BEGIN(detail)
 /// Erase all occurrences of a substring
 inline void erase_all(std::string &string, const std::string &search) {
@@ -50,4 +50,4 @@ template <typename T> static std::string type_id() {
     return name;
 }
 
-NAMESPACE_END(pybind11)
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/eigen.h b/pybind11/include/pybind11/eigen.h
index ff720d5f6..a702bf39e 100644
--- a/pybind11/include/pybind11/eigen.h
+++ b/pybind11/include/pybind11/eigen.h
@@ -30,153 +30,522 @@
 #  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
 #endif
 
-NAMESPACE_BEGIN(pybind11)
+// Eigen prior to 3.2.7 doesn't have proper move constructors--but worse, some classes get implicit
+// move constructors that break things.  We could detect this and explicitly copy, but an extra copy
+// of matrices seems highly undesirable.
+static_assert(EIGEN_VERSION_AT_LEAST(3,2,7), "Eigen support in pybind11 requires Eigen >= 3.2.7");
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+// Provide a convenience alias for easier pass-by-ref usage with fully dynamic strides:
+using EigenDStride = Eigen::Stride<Eigen::Dynamic, Eigen::Dynamic>;
+template <typename MatrixType> using EigenDRef = Eigen::Ref<MatrixType, 0, EigenDStride>;
+template <typename MatrixType> using EigenDMap = Eigen::Map<MatrixType, 0, EigenDStride>;
+
 NAMESPACE_BEGIN(detail)
 
-template <typename T> using is_eigen_dense = is_template_base_of<Eigen::DenseBase, T>;
-template <typename T> using is_eigen_sparse = is_template_base_of<Eigen::SparseMatrixBase, T>;
-template <typename T> using is_eigen_ref = is_template_base_of<Eigen::RefBase, T>;
+#if EIGEN_VERSION_AT_LEAST(3,3,0)
+using EigenIndex = Eigen::Index;
+#else
+using EigenIndex = EIGEN_DEFAULT_DENSE_INDEX_TYPE;
+#endif
 
+// Matches Eigen::Map, Eigen::Ref, blocks, etc:
+template <typename T> using is_eigen_dense_map = all_of<is_template_base_of<Eigen::DenseBase, T>, std::is_base_of<Eigen::MapBase<T, Eigen::ReadOnlyAccessors>, T>>;
+template <typename T> using is_eigen_mutable_map = std::is_base_of<Eigen::MapBase<T, Eigen::WriteAccessors>, T>;
+template <typename T> using is_eigen_dense_plain = all_of<negation<is_eigen_dense_map<T>>, is_template_base_of<Eigen::PlainObjectBase, T>>;
+template <typename T> using is_eigen_sparse = is_template_base_of<Eigen::SparseMatrixBase, T>;
 // Test for objects inheriting from EigenBase<Derived> that aren't captured by the above.  This
 // basically covers anything that can be assigned to a dense matrix but that don't have a typical
 // matrix data layout that can be copied from their .data().  For example, DiagonalMatrix and
 // SelfAdjointView fall into this category.
-template <typename T> using is_eigen_base = all_of<
+template <typename T> using is_eigen_other = all_of<
     is_template_base_of<Eigen::EigenBase, T>,
-    negation<is_eigen_dense<T>>,
-    negation<is_eigen_sparse<T>>
+    negation<any_of<is_eigen_dense_map<T>, is_eigen_dense_plain<T>, is_eigen_sparse<T>>>
 >;
 
-template<typename Type>
-struct type_caster<Type, enable_if_t<is_eigen_dense<Type>::value && !is_eigen_ref<Type>::value>> {
-    typedef typename Type::Scalar Scalar;
-    static constexpr bool rowMajor = Type::Flags & Eigen::RowMajorBit;
-    static constexpr bool isVector = Type::IsVectorAtCompileTime;
+// Captures numpy/eigen conformability status (returned by EigenProps::conformable()):
+template <bool EigenRowMajor> struct EigenConformable {
+    bool conformable = false;
+    EigenIndex rows = 0, cols = 0;
+    EigenDStride stride{0, 0};      // Only valid if negativestrides is false!
+    bool negativestrides = false;   // If true, do not use stride!
+
+    EigenConformable(bool fits = false) : conformable{fits} {} // status only; rows/cols/stride keep their zero defaults
+    // Matrix type: takes the row and column strides and stores them in (outer, inner) order:
+    EigenConformable(EigenIndex r, EigenIndex c,
+            EigenIndex rstride, EigenIndex cstride) :
+        conformable{true}, rows{r}, cols{c} {
+        // TODO: when Eigen bug #747 is fixed, remove the tests for non-negativity. http://eigen.tuxfamily.org/bz/show_bug.cgi?id=747
+        if (rstride < 0 || cstride < 0) {
+            negativestrides = true;
+        } else {
+            stride = {EigenRowMajor ? rstride : cstride /* outer stride */,
+                      EigenRowMajor ? cstride : rstride /* inner stride */ };
+        }
+    }
+    // Vector type: expands the single stride into a (row, column) stride pair for the matrix constructor:
+    EigenConformable(EigenIndex r, EigenIndex c, EigenIndex stride)
+        : EigenConformable(r, c, r == 1 ? c*stride : stride, c == 1 ? r : r*stride) {}
+
+    template <typename props> bool stride_compatible() const {
+        // To have compatible strides, we need (on both dimensions) one of fully dynamic strides,
+        // matching strides, or a dimension size of 1 (in which case the stride value is irrelevant)
+        return
+            !negativestrides &&
+            (props::inner_stride == Eigen::Dynamic || props::inner_stride == stride.inner() ||
+                (EigenRowMajor ? cols : rows) == 1) &&
+            (props::outer_stride == Eigen::Dynamic || props::outer_stride == stride.outer() ||
+                (EigenRowMajor ? rows : cols) == 1);
+    }
+    operator bool() const { return conformable; } // lets callers test the result directly, e.g. `if (!fits)`
+};
 
-    bool load(handle src, bool) {
-        auto buf = array_t<Scalar>::ensure(src);
-        if (!buf)
+template <typename Type> struct eigen_extract_stride { using type = Type; }; // primary template: yields `Type` itself
+template <typename PlainObjectType, int MapOptions, typename StrideType>
+struct eigen_extract_stride<Eigen::Map<PlainObjectType, MapOptions, StrideType>> { using type = StrideType; }; // Map: its StrideType
+template <typename PlainObjectType, int Options, typename StrideType>
+struct eigen_extract_stride<Eigen::Ref<PlainObjectType, Options, StrideType>> { using type = StrideType; }; // Ref: its StrideType
+
+// Helper struct for extracting information from an Eigen type
+template <typename Type_> struct EigenProps {
+    using Type = Type_;
+    using Scalar = typename Type::Scalar;
+    using StrideType = typename eigen_extract_stride<Type>::type;
+    static constexpr EigenIndex
+        rows = Type::RowsAtCompileTime,
+        cols = Type::ColsAtCompileTime,
+        size = Type::SizeAtCompileTime;
+    static constexpr bool
+        row_major = Type::IsRowMajor,
+        vector = Type::IsVectorAtCompileTime, // At least one dimension has fixed size 1
+        fixed_rows = rows != Eigen::Dynamic,
+        fixed_cols = cols != Eigen::Dynamic,
+        fixed = size != Eigen::Dynamic, // Fully-fixed size
+        dynamic = !fixed_rows && !fixed_cols; // Fully-dynamic size
+    // `if_zero<i, d>` is `d` when `i == 0` and `i` otherwise; below it replaces a zero compile-time stride by a default.
+    template <EigenIndex i, EigenIndex ifzero> using if_zero = std::integral_constant<EigenIndex, i == 0 ? ifzero : i>;
+    static constexpr EigenIndex inner_stride = if_zero<StrideType::InnerStrideAtCompileTime, 1>::value,
+                                outer_stride = if_zero<StrideType::OuterStrideAtCompileTime,
+                                                       vector ? size : row_major ? cols : rows>::value;
+    static constexpr bool dynamic_stride = inner_stride == Eigen::Dynamic && outer_stride == Eigen::Dynamic;
+    static constexpr bool requires_row_major = !dynamic_stride && !vector && (row_major ? inner_stride : outer_stride) == 1;
+    static constexpr bool requires_col_major = !dynamic_stride && !vector && (row_major ? outer_stride : inner_stride) == 1;
+
+    // Takes an input array and determines whether we can make it fit into the Eigen type.  If
+    // the array is a vector, we attempt to fit it into either an Eigen 1xN or Nx1 vector
+    // (preferring the latter if it will fit in either, i.e. for a fully dynamic matrix type).
+    static EigenConformable<row_major> conformable(const array &a) {
+        const auto dims = a.ndim();
+        if (dims < 1 || dims > 2)
             return false;
 
-        if (buf.ndim() == 1) {
-            typedef Eigen::InnerStride<> Strides;
-            if (!isVector &&
-                !(Type::RowsAtCompileTime == Eigen::Dynamic &&
-                  Type::ColsAtCompileTime == Eigen::Dynamic))
-                return false;
+        if (dims == 2) { // Matrix type: require exact match (or dynamic)
 
-            if (Type::SizeAtCompileTime != Eigen::Dynamic &&
-                buf.shape(0) != (size_t) Type::SizeAtCompileTime)
+            EigenIndex
+                np_rows = a.shape(0),
+                np_cols = a.shape(1),
+                np_rstride = a.strides(0) / static_cast<ssize_t>(sizeof(Scalar)),
+                np_cstride = a.strides(1) / static_cast<ssize_t>(sizeof(Scalar));
+            if ((fixed_rows && np_rows != rows) || (fixed_cols && np_cols != cols))
                 return false;
 
-            Strides::Index n_elts = (Strides::Index) buf.shape(0);
-            Strides::Index unity = 1;
+            return {np_rows, np_cols, np_rstride, np_cstride};
+        }
 
-            value = Eigen::Map<Type, 0, Strides>(
-                buf.mutable_data(),
-                rowMajor ? unity : n_elts,
-                rowMajor ? n_elts : unity,
-                Strides(buf.strides(0) / sizeof(Scalar))
-            );
-        } else if (buf.ndim() == 2) {
-            typedef Eigen::Stride<Eigen::Dynamic, Eigen::Dynamic> Strides;
+        // Otherwise we're storing an n-vector.  Only one of the strides will be used, but whichever
+        // is used, we want the (single) numpy stride value.
+        const EigenIndex n = a.shape(0),
+              stride = a.strides(0) / static_cast<ssize_t>(sizeof(Scalar));
 
-            if ((Type::RowsAtCompileTime != Eigen::Dynamic && buf.shape(0) != (size_t) Type::RowsAtCompileTime) ||
-                (Type::ColsAtCompileTime != Eigen::Dynamic && buf.shape(1) != (size_t) Type::ColsAtCompileTime))
-                return false;
+        if (vector) { // Eigen type is a compile-time vector
+            if (fixed && size != n)
+                return false; // Vector size mismatch
+            return {rows == 1 ? 1 : n, cols == 1 ? 1 : n, stride};
+        }
+        else if (fixed) {
+            // The type has a fixed size, but is not a vector: abort
+            return false;
+        }
+        else if (fixed_cols) {
+            // Since this isn't a vector, cols must be != 1.  We allow this only if it exactly
+            // equals the number of elements (rows is Dynamic, and so 1 row is allowed).
+            if (cols != n) return false;
+            return {1, n, stride};
+        }
+        else {
+            // Otherwise it's either fully dynamic, or column dynamic; both become a column vector
+            if (fixed_rows && rows != n) return false;
+            return {n, 1, stride};
+        }
+    }
 
-            value = Eigen::Map<Type, 0, Strides>(
-                buf.mutable_data(),
-                typename Strides::Index(buf.shape(0)),
-                typename Strides::Index(buf.shape(1)),
-                Strides(buf.strides(rowMajor ? 0 : 1) / sizeof(Scalar),
-                        buf.strides(rowMajor ? 1 : 0) / sizeof(Scalar))
-            );
-        } else {
+    static PYBIND11_DESCR descriptor() {
+        constexpr bool show_writeable = is_eigen_dense_map<Type>::value && is_eigen_mutable_map<Type>::value;
+        constexpr bool show_order = is_eigen_dense_map<Type>::value;
+        constexpr bool show_c_contiguous = show_order && requires_row_major;
+        constexpr bool show_f_contiguous = !show_c_contiguous && show_order && requires_col_major;
+        // Assemble the description: scalar type, then rows/cols (`m`/`n` when not fixed), then optional flags.
+        return type_descr(_("numpy.ndarray[") + npy_format_descriptor<Scalar>::name() +
+            _("[")  + _<fixed_rows>(_<(size_t) rows>(), _("m")) +
+            _(", ") + _<fixed_cols>(_<(size_t) cols>(), _("n")) +
+            _("]") +
+            // For a reference type (e.g. Ref<MatrixXd>) we have other constraints that might need to be
+            // satisfied: writeable=True (for a mutable reference), and, depending on the map's stride
+            // options, possibly f_contiguous or c_contiguous.  We include them in the descriptor output
+            // to provide some hint as to why a TypeError is occurring (otherwise it can be confusing to
+            // see that a function accepts a 'numpy.ndarray[float64[3,2]]' and an error message that you
+            // *gave* a numpy.ndarray of the right type and dimensions).
+            _<show_writeable>(", flags.writeable", "") +
+            _<show_c_contiguous>(", flags.c_contiguous", "") +
+            _<show_f_contiguous>(", flags.f_contiguous", "") +
+            _("]")
+        );
+    }
+};
+
+// Casts an Eigen type to numpy array.  If given a base, the numpy array references the src data,
+// otherwise it'll make a copy.  writeable lets you turn off the writeable flag for the array.
+template <typename props> handle eigen_array_cast(typename props::Type const &src, handle base = handle(), bool writeable = true) {
+    constexpr ssize_t elem_size = sizeof(typename props::Scalar); // scales Eigen's strides by the scalar size for numpy
+    array a;
+    if (props::vector) // compile-time vector types become 1-dimensional arrays
+        a = array({ src.size() }, { elem_size * src.innerStride() }, src.data(), base);
+    else
+        a = array({ src.rows(), src.cols() }, { elem_size * src.rowStride(), elem_size * src.colStride() },
+                  src.data(), base);
+
+    if (!writeable) // clear the writeable bit directly in the array's flags
+        array_proxy(a.ptr())->flags &= ~detail::npy_api::NPY_ARRAY_WRITEABLE_;
+
+    return a.release();
+}
+
+// Takes an lvalue ref to some Eigen type and a (python) parent object, creating a numpy array that
+// references the Eigen object's data with `parent` passed on as the array's base (if omitted,
+// the base will be set to None, and lifetime management is up to the caller).  The numpy array is
+// non-writeable if the given type is const.
+template <typename props, typename Type>
+handle eigen_ref_array(Type &src, handle parent = none()) {
+    // none here is to get past array's should-we-copy detection, which currently always
+    // copies when there is no base.  Setting the base to None should be harmless.
+    return eigen_array_cast<props>(src, parent, !std::is_const<Type>::value);
+}
+
+// Takes a pointer to some dense, plain Eigen type, builds a capsule around it, then returns a numpy
+// array that references the encapsulated data with a python-side reference to the capsule to tie
+// its destruction to that of any dependent python objects.  Const-ness is determined by whether or
+// not the Type of the pointer given is const.
+template <typename props, typename Type, typename = enable_if_t<is_eigen_dense_plain<Type>::value>>
+handle eigen_encapsulate(Type *src) {
+    capsule base(src, [](void *o) { delete static_cast<Type *>(o); }); // the capsule's destructor callback deletes *src
+    return eigen_ref_array<props>(*src, base);
+}
+
+// Type caster for regular, dense matrix types (e.g. MatrixXd), but not maps/refs/etc. of dense
+// types.
+template<typename Type>
+struct type_caster<Type, enable_if_t<is_eigen_dense_plain<Type>::value>> {
+    using Scalar = typename Type::Scalar;
+    using props = EigenProps<Type>;
+
+    bool load(handle src, bool convert) {
+        // If we're in no-convert mode, only load if given an array of the correct type
+        if (!convert && !isinstance<array_t<Scalar>>(src))
+            return false;
+
+        // Coerce into an array, but don't do type conversion yet; the copy below handles it.
+        auto buf = array::ensure(src);
+
+        if (!buf)
+            return false;
+
+        auto dims = buf.ndim();
+        if (dims < 1 || dims > 2)
+            return false;
+
+        auto fits = props::conformable(buf);
+        if (!fits)
+            return false;
+
+        // Allocate the new type, then build a numpy reference into it
+        value = Type(fits.rows, fits.cols);
+        auto ref = reinterpret_steal<array>(eigen_ref_array<props>(value));
+        if (dims == 1) ref = ref.squeeze(); // 1-d input: presumably so the reference's shape matches `buf`
+        // Copy the input into the array that references `value`; this is the copy mentioned above.
+        int result = detail::npy_api::get().PyArray_CopyInto_(ref.ptr(), buf.ptr());
+
+        if (result < 0) { // Copy failed!
+            PyErr_Clear();
             return false;
         }
+
         return true;
     }
 
-    static handle cast(const Type &src, return_value_policy /* policy */, handle /* parent */) {
-        if (isVector) {
-            return array(
-                { (size_t) src.size() },                                      // shape
-                { sizeof(Scalar) * static_cast<size_t>(src.innerStride()) },  // strides
-                src.data()                                                    // data
-            ).release();
-        } else {
-            return array(
-                { (size_t) src.rows(),                                        // shape
-                  (size_t) src.cols() },
-                { sizeof(Scalar) * static_cast<size_t>(src.rowStride()),      // strides
-                  sizeof(Scalar) * static_cast<size_t>(src.colStride()) },
-                src.data()                                                    // data
-            ).release();
+private:
+
+    // Cast implementation behind the public `cast` overloads below
+    template <typename CType>
+    static handle cast_impl(CType *src, return_value_policy policy, handle parent) {
+        switch (policy) {
+            case return_value_policy::take_ownership:
+            case return_value_policy::automatic:
+                return eigen_encapsulate<props>(src); // the capsule takes over *src itself
+            case return_value_policy::move:
+                return eigen_encapsulate<props>(new CType(std::move(*src))); // the capsule owns a new heap object
+            case return_value_policy::copy:
+                return eigen_array_cast<props>(*src); // no base passed, so the array gets its own copy
+            case return_value_policy::reference:
+            case return_value_policy::automatic_reference:
+                return eigen_ref_array<props>(*src); // references *src; base defaults to None
+            case return_value_policy::reference_internal:
+                return eigen_ref_array<props>(*src, parent); // references *src with parent as base
+            default:
+                throw cast_error("unhandled return_value_policy: should not happen!");
+        };
+    }
+
+public:
+
+    // Normal returned non-reference, non-const value (the requested policy is ignored; always `move`):
+    static handle cast(Type &&src, return_value_policy /* policy */, handle parent) {
+        return cast_impl(&src, return_value_policy::move, parent);
+    }
+    // If you return a non-reference const, we mark the numpy array readonly:
+    static handle cast(const Type &&src, return_value_policy /* policy */, handle parent) {
+        return cast_impl(&src, return_value_policy::move, parent);
+    }
+    // lvalue reference return; default (automatic) becomes copy
+    static handle cast(Type &src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference)
+            policy = return_value_policy::copy;
+        return cast_impl(&src, policy, parent);
+    }
+    // const lvalue reference return; default (automatic) becomes copy
+    static handle cast(const Type &src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference)
+            policy = return_value_policy::copy;
+        return cast_impl(&src, policy, parent); // call the implementation directly, like the non-const overload
+    }
+    // non-const pointer return
+    static handle cast(Type *src, return_value_policy policy, handle parent) {
+        return cast_impl(src, policy, parent);
+    }
+    // const pointer return
+    static handle cast(const Type *src, return_value_policy policy, handle parent) {
+        return cast_impl(src, policy, parent);
+    }
+
+    static PYBIND11_DESCR name() { return props::descriptor(); }
+
+    operator Type*() { return &value; }
+    operator Type&() { return value; }
+    operator Type&&() && { return std::move(value); }
+    template <typename T> using cast_op_type = movable_cast_op_type<T>;
+
+private:
+    Type value;
+};
+
+// Eigen Ref/Map classes have slightly different policy requirements, meaning we don't want to force
+// `move` when a Ref/Map rvalue is returned; we treat Ref<> sort of like a pointer (we care about
+// the underlying data, not the outer shell).
+template <typename Return>
+struct return_value_policy_override<Return, enable_if_t<is_eigen_dense_map<Return>::value>> {
+    static return_value_policy policy(return_value_policy p) { return p; } // hand the requested policy back unchanged
+};
+
+// Base class for casting reference/map/block/etc. objects back to python.
+template <typename MapType> struct eigen_map_caster {
+private:
+    using props = EigenProps<MapType>;
+
+public:
+
+    // Directly referencing a ref/map's data is a bit dangerous (whatever the map/ref points to has
+    // to stay around), but we'll allow it under the assumption that you know what you're doing (and
+    // have an appropriate keep_alive in place).  We return a numpy array pointing directly at the
+    // ref's data (The numpy array ends up read-only if the ref was to a const matrix type.) Note
+    // that this means you need to ensure you don't destroy the object in some other way (e.g. with
+    // an appropriate keep_alive, or with a reference to a statically allocated matrix).
+    static handle cast(const MapType &src, return_value_policy policy, handle parent) {
+        switch (policy) {
+            case return_value_policy::copy:
+                return eigen_array_cast<props>(src); // no base passed: the array gets its own copy
+            case return_value_policy::reference_internal:
+                return eigen_array_cast<props>(src, parent, is_eigen_mutable_map<MapType>::value); // parent as base
+            case return_value_policy::reference:
+            case return_value_policy::automatic:
+            case return_value_policy::automatic_reference:
+                return eigen_array_cast<props>(src, none(), is_eigen_mutable_map<MapType>::value); // None as base
+            default:
+                // move, take_ownership don't make any sense for a ref/map:
+                pybind11_fail("Invalid return_value_policy for Eigen Map/Ref/Block type");
         }
     }
 
-    PYBIND11_TYPE_CASTER(Type, _("numpy.ndarray[") + npy_format_descriptor<Scalar>::name() +
-            _("[") + rows() + _(", ") + cols() + _("]]"));
+    static PYBIND11_DESCR name() { return props::descriptor(); }
 
-protected:
-    template <typename T = Type, enable_if_t<T::RowsAtCompileTime == Eigen::Dynamic, int> = 0>
-    static PYBIND11_DESCR rows() { return _("m"); }
-    template <typename T = Type, enable_if_t<T::RowsAtCompileTime != Eigen::Dynamic, int> = 0>
-    static PYBIND11_DESCR rows() { return _<T::RowsAtCompileTime>(); }
-    template <typename T = Type, enable_if_t<T::ColsAtCompileTime == Eigen::Dynamic, int> = 0>
-    static PYBIND11_DESCR cols() { return _("n"); }
-    template <typename T = Type, enable_if_t<T::ColsAtCompileTime != Eigen::Dynamic, int> = 0>
-    static PYBIND11_DESCR cols() { return _<T::ColsAtCompileTime>(); }
+    // Explicitly delete these: we don't support python -> C++ conversion on these (i.e. these can be
+    // return types but not bound arguments).  We still provide them (explicitly deleted) so that
+    // you end up here if you try anyway.
+    bool load(handle, bool) = delete;
+    operator MapType() = delete;
+    template <typename> using cast_op_type = MapType;
 };
 
-// Eigen::Ref<Derived> satisfies is_eigen_dense, but isn't constructable, so it needs a special
-// type_caster to handle argument copying/forwarding.
-template <typename CVDerived, int Options, typename StrideType>
-struct type_caster<Eigen::Ref<CVDerived, Options, StrideType>> {
-protected:
-    using Type = Eigen::Ref<CVDerived, Options, StrideType>;
-    using Derived = typename std::remove_const<CVDerived>::type;
-    using DerivedCaster = make_caster<Derived>;
-    DerivedCaster derived_caster;
-    std::unique_ptr<Type> value;
+// We can return any map-like object (but can only load Refs, specialized next):
+template <typename Type> struct type_caster<Type, enable_if_t<is_eigen_dense_map<Type>::value>>
+    : eigen_map_caster<Type> {};
+
+// Loader for Ref<...> arguments.  See the documentation for info on how to make this work without
+// copying (it requires some extra effort in many cases).
+template <typename PlainObjectType, typename StrideType>
+struct type_caster<
+    Eigen::Ref<PlainObjectType, 0, StrideType>,
+    enable_if_t<is_eigen_dense_map<Eigen::Ref<PlainObjectType, 0, StrideType>>::value>
+> : public eigen_map_caster<Eigen::Ref<PlainObjectType, 0, StrideType>> {
+private:
+    using Type = Eigen::Ref<PlainObjectType, 0, StrideType>;
+    using props = EigenProps<Type>;
+    using Scalar = typename props::Scalar;
+    using MapType = Eigen::Map<PlainObjectType, 0, StrideType>;
+    using Array = array_t<Scalar, array::forcecast |
+                ((props::row_major ? props::inner_stride : props::outer_stride) == 1 ? array::c_style :
+                 (props::row_major ? props::outer_stride : props::inner_stride) == 1 ? array::f_style : 0)>;
+    static constexpr bool need_writeable = is_eigen_mutable_map<Type>::value;
+    // Delay construction (these have no default constructor)
+    std::unique_ptr<MapType> map;
+    std::unique_ptr<Type> ref;
+    // Our array.  When possible, this is just a numpy array pointing to the source data, but
+    // sometimes we can't avoid copying (e.g. input is not a numpy array at all, has an incompatible
+    // layout, or is an array of a type that needs to be converted).  Using a numpy temporary
+    // (rather than an Eigen temporary) saves an extra copy when we need both type conversion and
+    // storage order conversion.  (Note that we refuse to use this temporary copy when loading an
+    // argument for a Ref<M> with M non-const, i.e. a read-write reference).
+    Array copy_or_ref;
 public:
-    bool load(handle src, bool convert) { if (derived_caster.load(src, convert)) { value.reset(new Type(derived_caster.operator Derived&())); return true; } return false; }
-    static handle cast(const Type &src, return_value_policy policy, handle parent) { return DerivedCaster::cast(src, policy, parent); }
-    static handle cast(const Type *src, return_value_policy policy, handle parent) { return DerivedCaster::cast(*src, policy, parent); }
+    bool load(handle src, bool convert) {
+        // First check whether what we have is already an array of the right type.  If not, we can't
+        // avoid a copy (because the copy is also going to do type conversion).
+        bool need_copy = !isinstance<Array>(src);
+
+        EigenConformable<props::row_major> fits;
+        if (!need_copy) {
+            // We don't need a converting copy, but we also need to check whether the strides are
+            // compatible with the Ref's stride requirements
+            Array aref = reinterpret_borrow<Array>(src);
+
+            if (aref && (!need_writeable || aref.writeable())) {
+                fits = props::conformable(aref);
+                if (!fits) return false; // Incompatible dimensions
+                if (!fits.template stride_compatible<props>())
+                    need_copy = true;
+                else
+                    copy_or_ref = std::move(aref);
+            }
+            else {
+                need_copy = true;
+            }
+        }
+
+        if (need_copy) {
+            // We need to copy: If we need a mutable reference, or we're not supposed to convert
+            // (either because we're in the no-convert overload pass, or because we're explicitly
+            // instructed not to copy via `py::arg().noconvert()`), we have to fail loading.
+            if (!convert || need_writeable) return false;
+
+            Array copy = Array::ensure(src);
+            if (!copy) return false;
+            fits = props::conformable(copy);
+            if (!fits || !fits.template stride_compatible<props>())
+                return false;
+            copy_or_ref = std::move(copy);
+            loader_life_support::add_patient(copy_or_ref);
+        }
+
+        ref.reset();
+        map.reset(new MapType(data(copy_or_ref), fits.rows, fits.cols, make_stride(fits.stride.outer(), fits.stride.inner())));
+        ref.reset(new Type(*map));
 
-    static PYBIND11_DESCR name() { return DerivedCaster::name(); }
+        return true;
+    }
 
-    operator Type*() { return value.get(); }
-    operator Type&() { if (!value) pybind11_fail("Eigen::Ref<...> value not loaded"); return *value; }
+    operator Type*() { return ref.get(); }
+    operator Type&() { return *ref; }
     template <typename _T> using cast_op_type = pybind11::detail::cast_op_type<_T>;
+
+private:
+    template <typename T = Type, enable_if_t<is_eigen_mutable_map<T>::value, int> = 0>
+    Scalar *data(Array &a) { return a.mutable_data(); }
+
+    template <typename T = Type, enable_if_t<!is_eigen_mutable_map<T>::value, int> = 0>
+    const Scalar *data(Array &a) { return a.data(); }
+
+    // Attempt to figure out a constructor of `Stride` that will work.
+    // If both strides are fixed, use a default constructor:
+    template <typename S> using stride_ctor_default = bool_constant<
+        S::InnerStrideAtCompileTime != Eigen::Dynamic && S::OuterStrideAtCompileTime != Eigen::Dynamic &&
+        std::is_default_constructible<S>::value>;
+    // Otherwise, if there is a two-index constructor, assume it is (outer,inner) like
+    // Eigen::Stride, and use it:
+    template <typename S> using stride_ctor_dual = bool_constant<
+        !stride_ctor_default<S>::value && std::is_constructible<S, EigenIndex, EigenIndex>::value>;
+    // Otherwise, if there is a one-index constructor, and just one of the strides is dynamic, use
+    // it (passing whichever stride is dynamic).
+    template <typename S> using stride_ctor_outer = bool_constant<
+        !any_of<stride_ctor_default<S>, stride_ctor_dual<S>>::value &&
+        S::OuterStrideAtCompileTime == Eigen::Dynamic && S::InnerStrideAtCompileTime != Eigen::Dynamic &&
+        std::is_constructible<S, EigenIndex>::value>;
+    template <typename S> using stride_ctor_inner = bool_constant<
+        !any_of<stride_ctor_default<S>, stride_ctor_dual<S>>::value &&
+        S::InnerStrideAtCompileTime == Eigen::Dynamic && S::OuterStrideAtCompileTime != Eigen::Dynamic &&
+        std::is_constructible<S, EigenIndex>::value>;
+
+    template <typename S = StrideType, enable_if_t<stride_ctor_default<S>::value, int> = 0>
+    static S make_stride(EigenIndex, EigenIndex) { return S(); }
+    template <typename S = StrideType, enable_if_t<stride_ctor_dual<S>::value, int> = 0>
+    static S make_stride(EigenIndex outer, EigenIndex inner) { return S(outer, inner); }
+    template <typename S = StrideType, enable_if_t<stride_ctor_outer<S>::value, int> = 0>
+    static S make_stride(EigenIndex outer, EigenIndex) { return S(outer); }
+    template <typename S = StrideType, enable_if_t<stride_ctor_inner<S>::value, int> = 0>
+    static S make_stride(EigenIndex, EigenIndex inner) { return S(inner); }
+
 };
 
-// type_caster for special matrix types (e.g. DiagonalMatrix): load() is not supported, but we can
-// cast them into the python domain by first copying to a regular Eigen::Matrix, then casting that.
+// type_caster for special matrix types (e.g. DiagonalMatrix), which are EigenBase, but not
+// EigenDense (i.e. they don't have a data(), at least not with the usual matrix layout).
+// load() is not supported, but we can cast them into the python domain by first copying to a
+// regular Eigen::Matrix, then casting that.
 template <typename Type>
-struct type_caster<Type, enable_if_t<is_eigen_base<Type>::value && !is_eigen_ref<Type>::value>> {
+struct type_caster<Type, enable_if_t<is_eigen_other<Type>::value>> {
 protected:
-    using Matrix = Eigen::Matrix<typename Type::Scalar, Eigen::Dynamic, Eigen::Dynamic>;
-    using MatrixCaster = make_caster<Matrix>;
+    using Matrix = Eigen::Matrix<typename Type::Scalar, Type::RowsAtCompileTime, Type::ColsAtCompileTime>;
+    using props = EigenProps<Matrix>;
 public:
-    [[noreturn]] bool load(handle, bool) { pybind11_fail("Unable to load() into specialized EigenBase object"); }
-    static handle cast(const Type &src, return_value_policy policy, handle parent) { return MatrixCaster::cast(Matrix(src), policy, parent); }
-    static handle cast(const Type *src, return_value_policy policy, handle parent) { return MatrixCaster::cast(Matrix(*src), policy, parent); }
+    static handle cast(const Type &src, return_value_policy /* policy */, handle /* parent */) {
+        handle h = eigen_encapsulate<props>(new Matrix(src));
+        return h;
+    }
+    static handle cast(const Type *src, return_value_policy policy, handle parent) { return cast(*src, policy, parent); }
 
-    static PYBIND11_DESCR name() { return MatrixCaster::name(); }
+    static PYBIND11_DESCR name() { return props::descriptor(); }
 
-    [[noreturn]] operator Type*() { pybind11_fail("Loading not supported for specialized EigenBase object"); }
-    [[noreturn]] operator Type&() { pybind11_fail("Loading not supported for specialized EigenBase object"); }
-    template <typename _T> using cast_op_type = pybind11::detail::cast_op_type<_T>;
+    // Explicitly delete these: we do not support python -> C++ conversion on these (i.e. these can
+    // be return types but not bound arguments).  We still provide them (with an explicit delete) so
+    // that you end up here if you try anyway.
+    bool load(handle, bool) = delete;
+    operator Type() = delete;
+    template <typename> using cast_op_type = Type;
 };
 
 template<typename Type>
 struct type_caster<Type, enable_if_t<is_eigen_sparse<Type>::value>> {
     typedef typename Type::Scalar Scalar;
-    typedef typename std::remove_reference<decltype(*std::declval<Type>().outerIndexPtr())>::type StorageIndex;
+    typedef remove_reference_t<decltype(*std::declval<Type>().outerIndexPtr())> StorageIndex;
     typedef typename Type::Index Index;
-    static constexpr bool rowMajor = Type::Flags & Eigen::RowMajorBit;
+    static constexpr bool rowMajor = Type::IsRowMajor;
 
     bool load(handle src, bool) {
         if (!src)
@@ -187,7 +556,7 @@ struct type_caster<Type, enable_if_t<is_eigen_sparse<Type>::value>> {
         object matrix_type = sparse_module.attr(
             rowMajor ? "csr_matrix" : "csc_matrix");
 
-        if (obj.get_type() != matrix_type.ptr()) {
+        if (!obj.get_type().is(matrix_type)) {
             try {
                 obj = matrix_type(obj);
             } catch (const error_already_set &) {
@@ -217,9 +586,9 @@ struct type_caster<Type, enable_if_t<is_eigen_sparse<Type>::value>> {
         object matrix_type = module::import("scipy.sparse").attr(
             rowMajor ? "csr_matrix" : "csc_matrix");
 
-        array data((size_t) src.nonZeros(), src.valuePtr());
-        array outerIndices((size_t) (rowMajor ? src.rows() : src.cols()) + 1, src.outerIndexPtr());
-        array innerIndices((size_t) src.nonZeros(), src.innerIndexPtr());
+        array data(src.nonZeros(), src.valuePtr());
+        array outerIndices((rowMajor ? src.rows() : src.cols()) + 1, src.outerIndexPtr());
+        array innerIndices(src.nonZeros(), src.innerIndexPtr());
 
         return matrix_type(
             std::make_tuple(data, innerIndices, outerIndices),
@@ -227,12 +596,12 @@ struct type_caster<Type, enable_if_t<is_eigen_sparse<Type>::value>> {
         ).release();
     }
 
-    PYBIND11_TYPE_CASTER(Type, _<(Type::Flags & Eigen::RowMajorBit) != 0>("scipy.sparse.csr_matrix[", "scipy.sparse.csc_matrix[")
+    PYBIND11_TYPE_CASTER(Type, _<(Type::IsRowMajor) != 0>("scipy.sparse.csr_matrix[", "scipy.sparse.csc_matrix[")
             + npy_format_descriptor<Scalar>::name() + _("]"));
 };
 
 NAMESPACE_END(detail)
-NAMESPACE_END(pybind11)
+NAMESPACE_END(PYBIND11_NAMESPACE)
 
 #if defined(__GNUG__) || defined(__clang__)
 #  pragma GCC diagnostic pop
diff --git a/pybind11/include/pybind11/embed.h b/pybind11/include/pybind11/embed.h
new file mode 100644
index 000000000..6664967c1
--- /dev/null
+++ b/pybind11/include/pybind11/embed.h
@@ -0,0 +1,194 @@
+/*
+    pybind11/embed.h: Support for embedding the interpreter
+
+    Copyright (c) 2017 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+#include "eval.h"
+
+#if defined(PYPY_VERSION)
+#  error Embedding the interpreter is not supported with PyPy
+#endif
+
+#if PY_MAJOR_VERSION >= 3
+#  define PYBIND11_EMBEDDED_MODULE_IMPL(name)            \
+      extern "C" PyObject *pybind11_init_impl_##name() { \
+          return pybind11_init_wrapper_##name();         \
+      }
+#else
+#  define PYBIND11_EMBEDDED_MODULE_IMPL(name)            \
+      extern "C" void pybind11_init_impl_##name() {      \
+          pybind11_init_wrapper_##name();                \
+      }
+#endif
+
+/** \rst
+    Add a new module to the table of builtins for the interpreter. Must be
+    defined in global scope. The first macro parameter is the name of the
+    module (without quotes). The second parameter is the variable which will
+    be used as the interface to add functions and classes to the module.
+
+    .. code-block:: cpp
+
+        PYBIND11_EMBEDDED_MODULE(example, m) {
+            // ... initialize functions and classes here
+            m.def("foo", []() {
+                return "Hello, World!";
+            });
+        }
+ \endrst */
+#define PYBIND11_EMBEDDED_MODULE(name, variable)                              \
+    static void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &);    \
+    static PyObject PYBIND11_CONCAT(*pybind11_init_wrapper_, name)() {        \
+        auto m = pybind11::module(PYBIND11_TOSTRING(name));                   \
+        try {                                                                 \
+            PYBIND11_CONCAT(pybind11_init_, name)(m);                         \
+            return m.ptr();                                                   \
+        } catch (pybind11::error_already_set &e) {                            \
+            PyErr_SetString(PyExc_ImportError, e.what());                     \
+            return nullptr;                                                   \
+        } catch (const std::exception &e) {                                   \
+            PyErr_SetString(PyExc_ImportError, e.what());                     \
+            return nullptr;                                                   \
+        }                                                                     \
+    }                                                                         \
+    PYBIND11_EMBEDDED_MODULE_IMPL(name)                                       \
+    pybind11::detail::embedded_module name(PYBIND11_TOSTRING(name),           \
+                               PYBIND11_CONCAT(pybind11_init_impl_, name));   \
+    void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &variable)
+
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+NAMESPACE_BEGIN(detail)
+
+/// Python 2.7/3.x compatible version of `PyImport_AppendInittab` and error checks.
+struct embedded_module {
+#if PY_MAJOR_VERSION >= 3
+    using init_t = PyObject *(*)();
+#else
+    using init_t = void (*)();
+#endif
+    embedded_module(const char *name, init_t init) {
+        if (Py_IsInitialized())
+            pybind11_fail("Can't add new modules after the interpreter has been initialized");
+
+        auto result = PyImport_AppendInittab(name, init);
+        if (result == -1)
+            pybind11_fail("Insufficient memory to add a new module");
+    }
+};
+
+NAMESPACE_END(detail)
+
+/** \rst
+    Initialize the Python interpreter. No other pybind11 or CPython API functions can be
+    called before this is done; with the exception of `PYBIND11_EMBEDDED_MODULE`. The
+    optional parameter can be used to skip the registration of signal handlers (see the
+    Python documentation for details). Calling this function again after the interpreter
+    has already been initialized is a fatal error.
+ \endrst */
+inline void initialize_interpreter(bool init_signal_handlers = true) {
+    if (Py_IsInitialized())
+        pybind11_fail("The interpreter is already running");
+
+    Py_InitializeEx(init_signal_handlers ? 1 : 0);
+
+    // Make .py files in the working directory available by default
+    module::import("sys").attr("path").cast<list>().append(".");
+}
+
+/** \rst
+    Shut down the Python interpreter. No pybind11 or CPython API functions can be called
+    after this. In addition, pybind11 objects must not outlive the interpreter:
+
+    .. code-block:: cpp
+
+        { // BAD
+            py::initialize_interpreter();
+            auto hello = py::str("Hello, World!");
+            py::finalize_interpreter();
+        } // <-- BOOM, hello's destructor is called after interpreter shutdown
+
+        { // GOOD
+            py::initialize_interpreter();
+            { // scoped
+                auto hello = py::str("Hello, World!");
+            } // <-- OK, hello is cleaned up properly
+            py::finalize_interpreter();
+        }
+
+        { // BETTER
+            py::scoped_interpreter guard{};
+            auto hello = py::str("Hello, World!");
+        }
+
+    .. warning::
+
+        The interpreter can be restarted by calling `initialize_interpreter` again.
+        Modules created using pybind11 can be safely re-initialized. However, Python
+        itself cannot completely unload binary extension modules and there are several
+        caveats with regard to interpreter restarting. All the details can be found
+        in the CPython documentation. In short, not all interpreter memory may be
+        freed, either due to reference cycles or user-created global data.
+
+ \endrst */
+inline void finalize_interpreter() {
+    handle builtins(PyEval_GetBuiltins());
+    const char *id = PYBIND11_INTERNALS_ID;
+
+    // Get the internals pointer (without creating it if it doesn't exist).  It's possible for the
+    // internals to be created during Py_Finalize() (e.g. if a py::capsule calls `get_internals()`
+    // during destruction), so we get the pointer-pointer here and check it after Py_Finalize().
+    detail::internals **internals_ptr_ptr = &detail::get_internals_ptr();
+    // It could also be stashed in builtins, so look there too:
+    if (builtins.contains(id) && isinstance<capsule>(builtins[id]))
+        internals_ptr_ptr = capsule(builtins[id]);
+
+    Py_Finalize();
+
+    if (internals_ptr_ptr) {
+        delete *internals_ptr_ptr;
+        *internals_ptr_ptr = nullptr;
+    }
+}
+
+/** \rst
+    Scope guard version of `initialize_interpreter` and `finalize_interpreter`.
+    This is a move-only guard and only a single instance can exist.
+
+    .. code-block:: cpp
+
+        #include <pybind11/embed.h>
+
+        int main() {
+            py::scoped_interpreter guard{};
+            py::print("Hello, World!");
+        } // <-- interpreter shutdown
+ \endrst */
+class scoped_interpreter {
+public:
+    scoped_interpreter(bool init_signal_handlers = true) {
+        initialize_interpreter(init_signal_handlers);
+    }
+
+    scoped_interpreter(const scoped_interpreter &) = delete;
+    scoped_interpreter(scoped_interpreter &&other) noexcept { other.is_valid = false; }
+    scoped_interpreter &operator=(const scoped_interpreter &) = delete;
+    scoped_interpreter &operator=(scoped_interpreter &&) = delete;
+
+    ~scoped_interpreter() {
+        if (is_valid)
+            finalize_interpreter();
+    }
+
+private:
+    bool is_valid = true;
+};
+
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/eval.h b/pybind11/include/pybind11/eval.h
index 5b2b98272..ea85ba1db 100644
--- a/pybind11/include/pybind11/eval.h
+++ b/pybind11/include/pybind11/eval.h
@@ -11,11 +11,9 @@
 
 #pragma once
 
-#pragma once
-
 #include "pybind11.h"
 
-NAMESPACE_BEGIN(pybind11)
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 
 enum eval_mode {
     /// Evaluate a string containing an isolated expression
@@ -29,12 +27,7 @@ enum eval_mode {
 };
 
 template <eval_mode mode = eval_expr>
-object eval(str expr, object global = object(), object local = object()) {
-    if (!global) {
-        global = reinterpret_borrow<object>(PyEval_GetGlobals());
-        if (!global)
-            global = dict();
-    }
+object eval(str expr, object global = globals(), object local = object()) {
     if (!local)
         local = global;
 
@@ -56,13 +49,25 @@ object eval(str expr, object global = object(), object local = object()) {
     return reinterpret_steal<object>(result);
 }
 
+template <eval_mode mode = eval_expr, size_t N>
+object eval(const char (&s)[N], object global = globals(), object local = object()) {
+    /* Support raw string literals by removing common leading whitespace */
+    auto expr = (s[0] == '\n') ? str(module::import("textwrap").attr("dedent")(s))
+                               : str(s);
+    return eval<mode>(expr, global, local);
+}
+
+inline void exec(str expr, object global = globals(), object local = object()) {
+    eval<eval_statements>(expr, global, local);
+}
+
+template <size_t N>
+void exec(const char (&s)[N], object global = globals(), object local = object()) {
+    eval<eval_statements>(s, global, local);
+}
+
 template <eval_mode mode = eval_statements>
-object eval_file(str fname, object global = object(), object local = object()) {
-    if (!global) {
-        global = reinterpret_borrow<object>(PyEval_GetGlobals());
-        if (!global)
-            global = dict();
-    }
+object eval_file(str fname, object global = globals(), object local = object()) {
     if (!local)
         local = global;
 
@@ -109,4 +114,4 @@ object eval_file(str fname, object global = object(), object local = object()) {
     return reinterpret_steal<object>(result);
 }
 
-NAMESPACE_END(pybind11)
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/functional.h b/pybind11/include/pybind11/functional.h
index 8e7e75e6b..eda14ba58 100644
--- a/pybind11/include/pybind11/functional.h
+++ b/pybind11/include/pybind11/functional.h
@@ -12,24 +12,28 @@
 #include "pybind11.h"
 #include <functional>
 
-NAMESPACE_BEGIN(pybind11)
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 NAMESPACE_BEGIN(detail)
 
-template <typename Return, typename... Args /*,*/ PYBIND11_NOEXCEPT_TPL_ARG>
-struct type_caster<std::function<Return(Args...) PYBIND11_NOEXCEPT_SPECIFIER>> {
-    using type = std::function<Return(Args...) PYBIND11_NOEXCEPT_SPECIFIER>;
+template <typename Return, typename... Args>
+struct type_caster<std::function<Return(Args...)>> {
+    using type = std::function<Return(Args...)>;
     using retval_type = conditional_t<std::is_same<Return, void>::value, void_type, Return>;
-    using function_type = Return (*) (Args...) PYBIND11_NOEXCEPT_SPECIFIER;
+    using function_type = Return (*) (Args...);
 
 public:
-    bool load(handle src_, bool) {
-        if (src_.is_none())
+    bool load(handle src, bool convert) {
+        if (src.is_none()) {
+            // Defer accepting None to other overloads (if we aren't in convert mode):
+            if (!convert) return false;
             return true;
+        }
 
-        src_ = detail::get_function(src_);
-        if (!src_ || !PyCallable_Check(src_.ptr()))
+        if (!isinstance<function>(src))
             return false;
 
+        auto func = reinterpret_borrow<function>(src);
+
         /*
            When passing a C++ function as an argument to another C++
            function via Python, every function call would normally involve
@@ -38,21 +42,21 @@ public:
            stateless (i.e. function pointer or lambda function without
            captured variables), in which case the roundtrip can be avoided.
          */
-        if (PyCFunction_Check(src_.ptr())) {
-            auto c = reinterpret_borrow<capsule>(PyCFunction_GET_SELF(src_.ptr()));
+        if (auto cfunc = func.cpp_function()) {
+            auto c = reinterpret_borrow<capsule>(PyCFunction_GET_SELF(cfunc.ptr()));
             auto rec = (function_record *) c;
 
-            if (rec && rec->is_stateless && rec->data[1] == &typeid(function_type)) {
+            if (rec && rec->is_stateless &&
+                    same_type(typeid(function_type), *reinterpret_cast<const std::type_info *>(rec->data[1]))) {
                 struct capture { function_type f; };
                 value = ((capture *) &rec->data)->f;
                 return true;
             }
         }
 
-        auto src = reinterpret_borrow<object>(src_);
-        value = [src](Args... args) -> Return {
+        value = [func](Args... args) -> Return {
             gil_scoped_acquire acq;
-            object retval(src(std::move(args)...));
+            object retval(func(std::forward<Args>(args)...));
             /* Visual studio 2015 parser issue: need parentheses around this expression */
             return (retval.template cast<Return>());
         };
@@ -78,4 +82,4 @@ public:
 };
 
 NAMESPACE_END(detail)
-NAMESPACE_END(pybind11)
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/iostream.h b/pybind11/include/pybind11/iostream.h
new file mode 100644
index 000000000..a9c27aac1
--- /dev/null
+++ b/pybind11/include/pybind11/iostream.h
@@ -0,0 +1,200 @@
+/*
+    pybind11/iostream.h -- Tools to assist with redirecting cout and cerr to Python
+
+    Copyright (c) 2017 Henry F. Schreiner
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+
+#include <streambuf>
+#include <ostream>
+#include <string>
+#include <memory>
+#include <iostream>
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+NAMESPACE_BEGIN(detail)
+
+// Buffer that writes to Python instead of C++
+class pythonbuf : public std::streambuf {
+private:
+    using traits_type = std::streambuf::traits_type;
+
+    char d_buffer[1024];
+    object pywrite;
+    object pyflush;
+
+    int overflow(int c) { // streambuf hook: store `c` (unless it is eof), then flush to Python
+        if (!traits_type::eq_int_type(c, traits_type::eof())) {
+            *pptr() = traits_type::to_char_type(c); // fits: setp() keeps one spare byte in d_buffer
+            pbump(1);
+        }
+        return sync() == 0 ? traits_type::not_eof(c) : traits_type::eof(); // sync() returns 0 on success
+    }
+
+    int sync() {
+        if (pbase() != pptr()) {
+            // This subtraction cannot be negative, so dropping the sign
+            str line(pbase(), static_cast<size_t>(pptr() - pbase()));
+
+            pywrite(line);
+            pyflush();
+
+            setp(pbase(), epptr());
+        }
+        return 0;
+    }
+
+public:
+    pythonbuf(object pyostream)
+        : pywrite(pyostream.attr("write")),
+          pyflush(pyostream.attr("flush")) {
+        setp(d_buffer, d_buffer + sizeof(d_buffer) - 1);
+    }
+
+    /// Sync before destroy
+    ~pythonbuf() {
+        sync();
+    }
+};
+
+NAMESPACE_END(detail)
+
+
+/** \rst
+    This is a move-only guard that redirects output.
+
+    .. code-block:: cpp
+
+        #include <pybind11/iostream.h>
+
+        ...
+
+        {
+            py::scoped_ostream_redirect output;
+            std::cout << "Hello, World!"; // Python stdout
+        } // <-- return std::cout to normal
+
+    You can explicitly pass the c++ stream and the python object,
+    for example to guard stderr instead.
+
+    .. code-block:: cpp
+
+        {
+            py::scoped_ostream_redirect output{std::cerr, py::module::import("sys").attr("stderr")};
+            std::cerr << "Hello, World!";
+        }
+ \endrst */
+class scoped_ostream_redirect {
+protected:
+    std::streambuf *old;
+    std::ostream &costream;
+    detail::pythonbuf buffer;
+
+public:
+    scoped_ostream_redirect(
+            std::ostream &costream = std::cout,
+            object pyostream = module::import("sys").attr("stdout"))
+        : costream(costream), buffer(pyostream) {
+        old = costream.rdbuf(&buffer);
+    }
+
+    ~scoped_ostream_redirect() {
+        costream.rdbuf(old);
+    }
+
+    scoped_ostream_redirect(const scoped_ostream_redirect &) = delete;
+    scoped_ostream_redirect(scoped_ostream_redirect &&other) = default;
+    scoped_ostream_redirect &operator=(const scoped_ostream_redirect &) = delete;
+    scoped_ostream_redirect &operator=(scoped_ostream_redirect &&) = delete;
+};
+
+
+/** \rst
+    Like `scoped_ostream_redirect`, but redirects cerr by default. This class
+    is provided primarily to make ``py::call_guard`` easier to use.
+
+    .. code-block:: cpp
+
+     m.def("noisy_func", &noisy_func,
+           py::call_guard<scoped_ostream_redirect,
+                          scoped_estream_redirect>());
+
+\endrst */
+class scoped_estream_redirect : public scoped_ostream_redirect {
+public:
+    scoped_estream_redirect(
+            std::ostream &costream = std::cerr,
+            object pyostream = module::import("sys").attr("stderr"))
+        : scoped_ostream_redirect(costream,pyostream) {}
+};
+
+
+NAMESPACE_BEGIN(detail)
+
+// Class to redirect output as a context manager. C++ backend.
+class OstreamRedirect {
+    bool do_stdout_;
+    bool do_stderr_;
+    std::unique_ptr<scoped_ostream_redirect> redirect_stdout;
+    std::unique_ptr<scoped_estream_redirect> redirect_stderr;
+
+public:
+    OstreamRedirect(bool do_stdout = true, bool do_stderr = true)
+        : do_stdout_(do_stdout), do_stderr_(do_stderr) {}
+
+    void enter() {
+        if (do_stdout_)
+            redirect_stdout.reset(new scoped_ostream_redirect());
+        if (do_stderr_)
+            redirect_stderr.reset(new scoped_estream_redirect());
+    }
+
+    void exit() {
+        redirect_stdout.reset();
+        redirect_stderr.reset();
+    }
+};
+
+NAMESPACE_END(detail)
+
+/** \rst
+    This is a helper function to add a C++ redirect context manager to Python
+    instead of using a C++ guard. To use it, add the following to your binding code:
+
+    .. code-block:: cpp
+
+        #include <pybind11/iostream.h>
+
+        ...
+
+        py::add_ostream_redirect(m, "ostream_redirect");
+
+    You now have a Python context manager that redirects your output:
+
+    .. code-block:: python
+
+        with m.ostream_redirect():
+            m.print_to_cout_function()
+
+    This manager can optionally be told which streams to operate on:
+
+    .. code-block:: python
+
+        with m.ostream_redirect(stdout=True, stderr=True):
+            m.noisy_function_with_error_printing()
+
+ \endrst */
+inline class_<detail::OstreamRedirect> add_ostream_redirect(module m, std::string name = "ostream_redirect") {
+    return class_<detail::OstreamRedirect>(m, name.c_str(), module_local())
+        .def(init<bool,bool>(), arg("stdout")=true, arg("stderr")=true)
+        .def("__enter__", &detail::OstreamRedirect::enter)
+        .def("__exit__", [](detail::OstreamRedirect &self, args) { self.exit(); });
+}
+
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/numpy.h b/pybind11/include/pybind11/numpy.h
index 6fecf2853..55bb81698 100644
--- a/pybind11/include/pybind11/numpy.h
+++ b/pybind11/include/pybind11/numpy.h
@@ -29,15 +29,17 @@
 #endif
 
 /* This will be true on all flat address space platforms and allows us to reduce the
-   whole npy_intp / size_t / Py_intptr_t business down to just size_t for all size
+   whole npy_intp / ssize_t / Py_intptr_t business down to just ssize_t for all size
    and dimension types (e.g. shape, strides, indexing), instead of inflicting this
    upon the library user. */
-static_assert(sizeof(size_t) == sizeof(Py_intptr_t), "size_t != Py_intptr_t");
+static_assert(sizeof(ssize_t) == sizeof(Py_intptr_t), "ssize_t != Py_intptr_t");
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+class array; // Forward declaration
 
-NAMESPACE_BEGIN(pybind11)
 NAMESPACE_BEGIN(detail)
-template <typename type, typename SFINAE = void> struct npy_format_descriptor { };
-template <typename type> struct is_pod_struct;
+template <typename type, typename SFINAE = void> struct npy_format_descriptor;
 
 struct PyArrayDescr_Proxy {
     PyObject_HEAD
@@ -108,11 +110,11 @@ inline numpy_internals& get_numpy_internals() {
 
 struct npy_api {
     enum constants {
-        NPY_C_CONTIGUOUS_ = 0x0001,
-        NPY_F_CONTIGUOUS_ = 0x0002,
+        NPY_ARRAY_C_CONTIGUOUS_ = 0x0001,
+        NPY_ARRAY_F_CONTIGUOUS_ = 0x0002,
         NPY_ARRAY_OWNDATA_ = 0x0004,
         NPY_ARRAY_FORCECAST_ = 0x0010,
-        NPY_ENSURE_ARRAY_ = 0x0040,
+        NPY_ARRAY_ENSUREARRAY_ = 0x0040,
         NPY_ARRAY_ALIGNED_ = 0x0100,
         NPY_ARRAY_WRITEABLE_ = 0x0400,
         NPY_BOOL_ = 0,
@@ -127,6 +129,11 @@ struct npy_api {
         NPY_STRING_, NPY_UNICODE_, NPY_VOID_
     };
 
+    typedef struct {
+        Py_intptr_t *ptr;
+        int len;
+    } PyArray_Dims;
+
     static npy_api& get() {
         static npy_api api = lookup();
         return api;
@@ -139,11 +146,13 @@ struct npy_api {
         return (bool) PyObject_TypeCheck(obj, PyArrayDescr_Type_);
     }
 
+    unsigned int (*PyArray_GetNDArrayCFeatureVersion_)();
     PyObject *(*PyArray_DescrFromType_)(int);
     PyObject *(*PyArray_NewFromDescr_)
         (PyTypeObject *, PyObject *, int, Py_intptr_t *,
          Py_intptr_t *, void *, int, PyObject *);
     PyObject *(*PyArray_DescrNewFromType_)(int);
+    int (*PyArray_CopyInto_)(PyObject *, PyObject *);
     PyObject *(*PyArray_NewCopy_)(PyObject *, int);
     PyTypeObject *PyArray_Type_;
     PyTypeObject *PyVoidArrType_Type_;
@@ -155,21 +164,27 @@ struct npy_api {
     int (*PyArray_GetArrayParamsFromObject_)(PyObject *, PyObject *, char, PyObject **, int *,
                                              Py_ssize_t *, PyObject **, PyObject *);
     PyObject *(*PyArray_Squeeze_)(PyObject *);
+    int (*PyArray_SetBaseObject_)(PyObject *, PyObject *);
+    PyObject* (*PyArray_Resize_)(PyObject*, PyArray_Dims*, int, int);
 private:
     enum functions {
+        API_PyArray_GetNDArrayCFeatureVersion = 211,
         API_PyArray_Type = 2,
         API_PyArrayDescr_Type = 3,
         API_PyVoidArrType_Type = 39,
         API_PyArray_DescrFromType = 45,
         API_PyArray_DescrFromScalar = 57,
         API_PyArray_FromAny = 69,
+        API_PyArray_Resize = 80,
+        API_PyArray_CopyInto = 82,
         API_PyArray_NewCopy = 85,
         API_PyArray_NewFromDescr = 94,
         API_PyArray_DescrNewFromType = 9,
         API_PyArray_DescrConverter = 174,
         API_PyArray_EquivTypes = 182,
         API_PyArray_GetArrayParamsFromObject = 278,
-        API_PyArray_Squeeze = 136
+        API_PyArray_Squeeze = 136,
+        API_PyArray_SetBaseObject = 282
     };
 
     static npy_api lookup() {
@@ -182,12 +197,17 @@ private:
 #endif
         npy_api api;
 #define DECL_NPY_API(Func) api.Func##_ = (decltype(api.Func##_)) api_ptr[API_##Func];
+        DECL_NPY_API(PyArray_GetNDArrayCFeatureVersion);
+        if (api.PyArray_GetNDArrayCFeatureVersion_() < 0x7)
+            pybind11_fail("pybind11 numpy support requires numpy >= 1.7.0");
         DECL_NPY_API(PyArray_Type);
         DECL_NPY_API(PyVoidArrType_Type);
         DECL_NPY_API(PyArrayDescr_Type);
         DECL_NPY_API(PyArray_DescrFromType);
         DECL_NPY_API(PyArray_DescrFromScalar);
         DECL_NPY_API(PyArray_FromAny);
+        DECL_NPY_API(PyArray_Resize);
+        DECL_NPY_API(PyArray_CopyInto);
         DECL_NPY_API(PyArray_NewCopy);
         DECL_NPY_API(PyArray_NewFromDescr);
         DECL_NPY_API(PyArray_DescrNewFromType);
@@ -195,6 +215,7 @@ private:
         DECL_NPY_API(PyArray_EquivTypes);
         DECL_NPY_API(PyArray_GetArrayParamsFromObject);
         DECL_NPY_API(PyArray_Squeeze);
+        DECL_NPY_API(PyArray_SetBaseObject);
 #undef DECL_NPY_API
         return api;
     }
@@ -220,6 +241,181 @@ inline bool check_flags(const void* ptr, int flag) {
     return (flag == (array_proxy(ptr)->flags & flag));
 }
 
+template <typename T> struct is_std_array : std::false_type { };
+template <typename T, size_t N> struct is_std_array<std::array<T, N>> : std::true_type { };
+template <typename T> struct is_complex : std::false_type { };
+template <typename T> struct is_complex<std::complex<T>> : std::true_type { };
+
+template <typename T> struct array_info_scalar {
+    typedef T type;
+    static constexpr bool is_array = false;
+    static constexpr bool is_empty = false;
+    static PYBIND11_DESCR extents() { return _(""); }
+    static void append_extents(list& /* shape */) { }
+};
+// Computes underlying type and a comma-separated list of extents for array
+// types (any mix of std::array and built-in arrays). An array of char is
+// treated as scalar because it gets special handling.
+template <typename T> struct array_info : array_info_scalar<T> { };
+template <typename T, size_t N> struct array_info<std::array<T, N>> {
+    using type = typename array_info<T>::type;
+    static constexpr bool is_array = true;
+    static constexpr bool is_empty = (N == 0) || array_info<T>::is_empty;
+    static constexpr size_t extent = N;
+
+    // appends the extents to shape
+    static void append_extents(list& shape) {
+        shape.append(N);
+        array_info<T>::append_extents(shape);
+    }
+
+    template<typename T2 = T, enable_if_t<!array_info<T2>::is_array, int> = 0>
+    static PYBIND11_DESCR extents() {
+        return _<N>();
+    }
+
+    template<typename T2 = T, enable_if_t<array_info<T2>::is_array, int> = 0>
+    static PYBIND11_DESCR extents() {
+        return concat(_<N>(), array_info<T>::extents());
+    }
+};
+// For numpy we have special handling for arrays of characters, so we don't include
+// the size in the array extents.
+template <size_t N> struct array_info<char[N]> : array_info_scalar<char[N]> { };
+template <size_t N> struct array_info<std::array<char, N>> : array_info_scalar<std::array<char, N>> { };
+template <typename T, size_t N> struct array_info<T[N]> : array_info<std::array<T, N>> { };
+template <typename T> using remove_all_extents_t = typename array_info<T>::type;
+
+template <typename T> using is_pod_struct = all_of<
+    std::is_standard_layout<T>,     // since we're accessing directly in memory we need a standard layout type
+#if !defined(__GNUG__) || defined(_LIBCPP_VERSION) || defined(_GLIBCXX_USE_CXX11_ABI)
+    // _GLIBCXX_USE_CXX11_ABI indicates that we're using libstdc++ from GCC 5 or newer, independent
+    // of the actual compiler (Clang can also use libstdc++, but it always defines __GNUC__ == 4).
+    std::is_trivially_copyable<T>,
+#else
+    // GCC 4 doesn't implement is_trivially_copyable, so approximate it
+    std::is_trivially_destructible<T>,
+    satisfies_any_of<T, std::has_trivial_copy_constructor, std::has_trivial_copy_assign>,
+#endif
+    satisfies_none_of<T, std::is_reference, std::is_array, is_std_array, std::is_arithmetic, is_complex, std::is_enum>
+>;
+
+template <ssize_t Dim = 0, typename Strides> ssize_t byte_offset_unsafe(const Strides &) { return 0; }
+template <ssize_t Dim = 0, typename Strides, typename... Ix>
+ssize_t byte_offset_unsafe(const Strides &strides, ssize_t i, Ix... index) {
+    return i * strides[Dim] + byte_offset_unsafe<Dim + 1>(strides, index...);
+}
+
+/**
+ * Proxy class providing unsafe, unchecked const access to array data.  This is constructed through
+ * the `unchecked<T, N>()` method of `array` or the `unchecked<N>()` method of `array_t<T>`.  `Dims`
+ * will be -1 for dimensions determined at runtime.
+ */
+template <typename T, ssize_t Dims>
+class unchecked_reference {
+protected:
+    static constexpr bool Dynamic = Dims < 0;
+    const unsigned char *data_;
+    // Storing the shape & strides in local variables (i.e. these arrays) allows the compiler to
+    // make large performance gains on big, nested loops, but requires compile-time dimensions
+    conditional_t<Dynamic, const ssize_t *, std::array<ssize_t, (size_t) Dims>>
+            shape_, strides_;
+    const ssize_t dims_;
+
+    friend class pybind11::array;
+    // Constructor for compile-time dimensions:
+    template <bool Dyn = Dynamic>
+    unchecked_reference(const void *data, const ssize_t *shape, const ssize_t *strides, enable_if_t<!Dyn, ssize_t>)
+    : data_{reinterpret_cast<const unsigned char *>(data)}, dims_{Dims} {
+        for (size_t i = 0; i < (size_t) dims_; i++) {
+            shape_[i] = shape[i];
+            strides_[i] = strides[i];
+        }
+    }
+    // Constructor for runtime dimensions:
+    template <bool Dyn = Dynamic>
+    unchecked_reference(const void *data, const ssize_t *shape, const ssize_t *strides, enable_if_t<Dyn, ssize_t> dims)
+    : data_{reinterpret_cast<const unsigned char *>(data)}, shape_{shape}, strides_{strides}, dims_{dims} {}
+
+public:
+    /**
+     * Unchecked const reference access to data at the given indices.  For a compile-time known
+     * number of dimensions, this requires the correct number of arguments; for run-time
+     * dimensionality, this is not checked (and so is up to the caller to use safely).
+     */
+    template <typename... Ix> const T &operator()(Ix... index) const {
+        static_assert(ssize_t{sizeof...(Ix)} == Dims || Dynamic,
+                "Invalid number of indices for unchecked array reference");
+        return *reinterpret_cast<const T *>(data_ + byte_offset_unsafe(strides_, ssize_t(index)...));
+    }
+    /**
+     * Unchecked const reference access to data; this operator only participates if the reference is
+     * to a 1-dimensional array (or has runtime dimensions).  When present, this is exactly equivalent to `obj(index)`.
+     */
+    template <ssize_t D = Dims, typename = enable_if_t<D == 1 || Dynamic>>
+    const T &operator[](ssize_t index) const { return operator()(index); }
+
+    /// Pointer access to the data at the given indices.
+    template <typename... Ix> const T *data(Ix... ix) const { return &operator()(ssize_t(ix)...); }
+
+    /// Returns the item size, i.e. sizeof(T)
+    constexpr static ssize_t itemsize() { return sizeof(T); }
+
+    /// Returns the shape (i.e. size) of dimension `dim`
+    ssize_t shape(ssize_t dim) const { return shape_[(size_t) dim]; }
+
+    /// Returns the number of dimensions of the array
+    ssize_t ndim() const { return dims_; }
+
+    /// Returns the total number of elements in the referenced array, i.e. the product of the shapes
+    template <bool Dyn = Dynamic>
+    enable_if_t<!Dyn, ssize_t> size() const {
+        return std::accumulate(shape_.begin(), shape_.end(), (ssize_t) 1, std::multiplies<ssize_t>());
+    }
+    template <bool Dyn = Dynamic>
+    enable_if_t<Dyn, ssize_t> size() const {
+        return std::accumulate(shape_, shape_ + ndim(), (ssize_t) 1, std::multiplies<ssize_t>());
+    }
+
+    /// Returns the total number of bytes used by the referenced data.  Note that the actual span in
+    /// memory may be larger if the referenced array has non-contiguous strides (e.g. for a slice).
+    ssize_t nbytes() const {
+        return size() * itemsize();
+    }
+};
+
+template <typename T, ssize_t Dims>
+class unchecked_mutable_reference : public unchecked_reference<T, Dims> {
+    friend class pybind11::array;
+    using ConstBase = unchecked_reference<T, Dims>;
+    using ConstBase::ConstBase;
+    using ConstBase::Dynamic;
+public:
+    /// Mutable, unchecked access to data at the given indices (delegates to the const accessor).
+    template <typename... Ix> T& operator()(Ix... index) {
+        static_assert(ssize_t{sizeof...(Ix)} == Dims || Dynamic,
+                "Invalid number of indices for unchecked array reference");
+        return const_cast<T &>(ConstBase::operator()(index...));
+    }
+    /**
+     * Mutable, unchecked access to data at the given index; this operator only participates if the
+     * reference is to a 1-dimensional array (or has runtime dimensions).  When present, this is
+     * exactly equivalent to `obj(index)`.
+     */
+    template <ssize_t D = Dims, typename = enable_if_t<D == 1 || Dynamic>>
+    T &operator[](ssize_t index) { return operator()(index); }
+
+    /// Mutable pointer access to the data at the given indices.
+    template <typename... Ix> T *mutable_data(Ix... ix) { return &operator()(ssize_t(ix)...); }
+};
+
+template <typename T, ssize_t Dim>
+struct type_caster<unchecked_reference<T, Dim>> {
+    static_assert(Dim == 0 && Dim > 0 /* always fail */, "unchecked array proxy object is not castable");
+};
+template <typename T, ssize_t Dim>
+struct type_caster<unchecked_mutable_reference<T, Dim>> : type_caster<unchecked_reference<T, Dim>> {};
+
 NAMESPACE_END(detail)
 
 class dtype : public object {
@@ -238,7 +434,7 @@ public:
 
     dtype(const char *format) : dtype(std::string(format)) { }
 
-    dtype(list names, list formats, list offsets, size_t itemsize) {
+    dtype(list names, list formats, list offsets, ssize_t itemsize) {
         dict args;
         args["names"] = names;
         args["formats"] = formats;
@@ -261,8 +457,8 @@ public:
     }
 
     /// Size of the data type in bytes.
-    size_t itemsize() const {
-        return (size_t) detail::array_descriptor_proxy(m_ptr)->elsize;
+    ssize_t itemsize() const {
+        return detail::array_descriptor_proxy(m_ptr)->elsize;
     }
 
     /// Returns true for structured data types.
@@ -282,7 +478,7 @@ private:
         return reinterpret_borrow<object>(obj);
     }
 
-    dtype strip_padding(size_t itemsize) {
+    dtype strip_padding(ssize_t itemsize) {
         // Recursively strip all void fields with empty names that are generated for
         // padding fields (as of NumPy v1.11).
         if (!has_fields())
@@ -321,40 +517,47 @@ public:
     PYBIND11_OBJECT_CVT(array, buffer, detail::npy_api::get().PyArray_Check_, raw_array)
 
     enum {
-        c_style = detail::npy_api::NPY_C_CONTIGUOUS_,
-        f_style = detail::npy_api::NPY_F_CONTIGUOUS_,
+        c_style = detail::npy_api::NPY_ARRAY_C_CONTIGUOUS_,
+        f_style = detail::npy_api::NPY_ARRAY_F_CONTIGUOUS_,
         forcecast = detail::npy_api::NPY_ARRAY_FORCECAST_
     };
 
-    array() : array(0, static_cast<const double *>(nullptr)) {}
+    array() : array({{0}}, static_cast<const double *>(nullptr)) {}
 
-    array(const pybind11::dtype &dt, const std::vector<size_t> &shape,
-          const std::vector<size_t> &strides, const void *ptr = nullptr,
-          handle base = handle()) {
-        auto& api = detail::npy_api::get();
-        auto ndim = shape.size();
-        if (shape.size() != strides.size())
+    using ShapeContainer = detail::any_container<ssize_t>;
+    using StridesContainer = detail::any_container<ssize_t>;
+
+    // Constructs an array taking shape/strides from arbitrary container types
+    array(const pybind11::dtype &dt, ShapeContainer shape, StridesContainer strides,
+          const void *ptr = nullptr, handle base = handle()) {
+
+        if (strides->empty())
+            *strides = c_strides(*shape, dt.itemsize());
+
+        auto ndim = shape->size();
+        if (ndim != strides->size())
             pybind11_fail("NumPy: shape ndim doesn't match strides ndim");
         auto descr = dt;
 
         int flags = 0;
         if (base && ptr) {
             if (isinstance<array>(base))
-                /* Copy flags from base (except baseship bit) */
+                /* Copy flags from base (except ownership bit) */
                 flags = reinterpret_borrow<array>(base).flags() & ~detail::npy_api::NPY_ARRAY_OWNDATA_;
             else
                 /* Writable by default, easy to downgrade later on if needed */
                 flags = detail::npy_api::NPY_ARRAY_WRITEABLE_;
         }
 
+        auto &api = detail::npy_api::get();
         auto tmp = reinterpret_steal<object>(api.PyArray_NewFromDescr_(
-            api.PyArray_Type_, descr.release().ptr(), (int) ndim, (Py_intptr_t *) shape.data(),
-            (Py_intptr_t *) strides.data(), const_cast<void *>(ptr), flags, nullptr));
+            api.PyArray_Type_, descr.release().ptr(), (int) ndim, shape->data(), strides->data(),
+            const_cast<void *>(ptr), flags, nullptr));
         if (!tmp)
-            pybind11_fail("NumPy: unable to create array!");
+            throw error_already_set();
         if (ptr) {
             if (base) {
-                detail::array_proxy(tmp.ptr())->base = base.inc_ref().ptr();
+                api.PyArray_SetBaseObject_(tmp.ptr(), base.inc_ref().ptr());
             } else {
                 tmp = reinterpret_steal<object>(api.PyArray_NewCopy_(tmp.ptr(), -1 /* any order */));
             }
@@ -362,27 +565,23 @@ public:
         m_ptr = tmp.release().ptr();
     }
 
-    array(const pybind11::dtype &dt, const std::vector<size_t> &shape,
-          const void *ptr = nullptr, handle base = handle())
-        : array(dt, shape, default_strides(shape, dt.itemsize()), ptr, base) { }
+    array(const pybind11::dtype &dt, ShapeContainer shape, const void *ptr = nullptr, handle base = handle())
+        : array(dt, std::move(shape), {}, ptr, base) { }
 
-    array(const pybind11::dtype &dt, size_t count, const void *ptr = nullptr,
-          handle base = handle())
-        : array(dt, std::vector<size_t>{ count }, ptr, base) { }
+    template <typename T, typename = detail::enable_if_t<std::is_integral<T>::value && !std::is_same<bool, T>::value>>
+    array(const pybind11::dtype &dt, T count, const void *ptr = nullptr, handle base = handle())
+        : array(dt, {{count}}, ptr, base) { }
 
-    template<typename T> array(const std::vector<size_t>& shape,
-                               const std::vector<size_t>& strides,
-                               const T* ptr, handle base = handle())
-    : array(pybind11::dtype::of<T>(), shape, strides, (void *) ptr, base) { }
+    template <typename T>
+    array(ShapeContainer shape, StridesContainer strides, const T *ptr, handle base = handle())
+        : array(pybind11::dtype::of<T>(), std::move(shape), std::move(strides), ptr, base) { }
 
     template <typename T>
-    array(const std::vector<size_t> &shape, const T *ptr,
-          handle base = handle())
-        : array(shape, default_strides(shape, sizeof(T)), ptr, base) { }
+    array(ShapeContainer shape, const T *ptr, handle base = handle())
+        : array(std::move(shape), {}, ptr, base) { }
 
     template <typename T>
-    array(size_t count, const T *ptr, handle base = handle())
-        : array(std::vector<size_t>{ count }, ptr, base) { }
+    explicit array(ssize_t count, const T *ptr, handle base = handle()) : array({count}, {}, ptr, base) { }
 
     explicit array(const buffer_info &info)
     : array(pybind11::dtype(info), info.shape, info.strides, info.ptr) { }
@@ -393,23 +592,23 @@ public:
     }
 
     /// Total number of elements
-    size_t size() const {
-        return std::accumulate(shape(), shape() + ndim(), (size_t) 1, std::multiplies<size_t>());
+    ssize_t size() const {
+        return std::accumulate(shape(), shape() + ndim(), (ssize_t) 1, std::multiplies<ssize_t>());
     }
 
     /// Byte size of a single element
-    size_t itemsize() const {
-        return (size_t) detail::array_descriptor_proxy(detail::array_proxy(m_ptr)->descr)->elsize;
+    ssize_t itemsize() const {
+        return detail::array_descriptor_proxy(detail::array_proxy(m_ptr)->descr)->elsize;
     }
 
     /// Total number of bytes
-    size_t nbytes() const {
+    ssize_t nbytes() const {
         return size() * itemsize();
     }
 
     /// Number of dimensions
-    size_t ndim() const {
-        return (size_t) detail::array_proxy(m_ptr)->nd;
+    ssize_t ndim() const {
+        return detail::array_proxy(m_ptr)->nd;
     }
 
     /// Base object
@@ -418,24 +617,24 @@ public:
     }
 
     /// Dimensions of the array
-    const size_t* shape() const {
-        return reinterpret_cast<const size_t *>(detail::array_proxy(m_ptr)->dimensions);
+    const ssize_t* shape() const {
+        return detail::array_proxy(m_ptr)->dimensions;
     }
 
     /// Dimension along a given axis
-    size_t shape(size_t dim) const {
+    ssize_t shape(ssize_t dim) const {
         if (dim >= ndim())
             fail_dim_check(dim, "invalid axis");
         return shape()[dim];
     }
 
     /// Strides of the array
-    const size_t* strides() const {
-        return reinterpret_cast<const size_t *>(detail::array_proxy(m_ptr)->strides);
+    const ssize_t* strides() const {
+        return detail::array_proxy(m_ptr)->strides;
     }
 
     /// Stride along a given axis
-    size_t strides(size_t dim) const {
+    ssize_t strides(ssize_t dim) const {
         if (dim >= ndim())
             fail_dim_check(dim, "invalid axis");
         return strides()[dim];
@@ -472,26 +671,68 @@ public:
 
     /// Byte offset from beginning of the array to a given index (full or partial).
     /// May throw if the index would lead to out of bounds access.
-    template<typename... Ix> size_t offset_at(Ix... index) const {
-        if (sizeof...(index) > ndim())
+    template<typename... Ix> ssize_t offset_at(Ix... index) const {
+        if ((ssize_t) sizeof...(index) > ndim())
             fail_dim_check(sizeof...(index), "too many indices for an array");
-        return byte_offset(size_t(index)...);
+        return byte_offset(ssize_t(index)...);
     }
 
-    size_t offset_at() const { return 0; }
+    ssize_t offset_at() const { return 0; }
 
     /// Item count from beginning of the array to a given index (full or partial).
     /// May throw if the index would lead to out of bounds access.
-    template<typename... Ix> size_t index_at(Ix... index) const {
+    template<typename... Ix> ssize_t index_at(Ix... index) const {
         return offset_at(index...) / itemsize();
     }
 
+    /**
+     * Returns a proxy object that provides access to the array's data without bounds or
+     * dimensionality checking.  Will throw if the array is missing the `writeable` flag.  Use with
+     * care: the array must not be destroyed or reshaped for the duration of the returned object,
+     * and the caller must take care not to access invalid dimensions or dimension indices.
+     */
+    template <typename T, ssize_t Dims = -1> detail::unchecked_mutable_reference<T, Dims> mutable_unchecked() & {
+        if (Dims >= 0 && ndim() != Dims)
+            throw std::domain_error("array has incorrect number of dimensions: " + std::to_string(ndim()) +
+                    "; expected " + std::to_string(Dims));
+        return detail::unchecked_mutable_reference<T, Dims>(mutable_data(), shape(), strides(), ndim());
+    }
+
+    /**
+     * Returns a proxy object that provides const access to the array's data without bounds or
+     * dimensionality checking (throws std::domain_error if `Dims >= 0` and `ndim() != Dims`).
+     * Unlike `mutable_unchecked()`, this does not require that the underlying array have the
+     * `writeable` flag.  Use with care: the array must not be destroyed or reshaped for the duration
+     * of the returned object, and the caller must not access invalid dimensions or dimension indices.
+     */
+    template <typename T, ssize_t Dims = -1> detail::unchecked_reference<T, Dims> unchecked() const & {
+        if (Dims >= 0 && ndim() != Dims)
+            throw std::domain_error("array has incorrect number of dimensions: " + std::to_string(ndim()) +
+                    "; expected " + std::to_string(Dims));
+        return detail::unchecked_reference<T, Dims>(data(), shape(), strides(), ndim());
+    }
+
     /// Return a new view with all of the dimensions of length 1 removed
     array squeeze() {
         auto& api = detail::npy_api::get();
         return reinterpret_steal<array>(api.PyArray_Squeeze_(m_ptr));
     }
 
+    /// Resize array to given shape; throws error_already_set if PyArray_Resize returns null.
+    /// If refcheck is true and more than one reference exists to this array,
+    /// then resize will succeed only if it makes a reshape, i.e. the original size doesn't change.
+    void resize(ShapeContainer new_shape, bool refcheck = true) {
+        detail::npy_api::PyArray_Dims d = {
+            new_shape->data(), int(new_shape->size())
+        };
+        // Try to resize; the ordering param is set to -1 because it's not used anyway
+        object new_array = reinterpret_steal<object>(
+            detail::npy_api::get().PyArray_Resize_(m_ptr, &d, int(refcheck), -1)
+        );
+        if (!new_array) throw error_already_set();
+        if (isinstance<array>(new_array)) { *this = std::move(new_array); }
+    }
+
     /// Ensure that the argument is a NumPy array
     /// In case of an error, nullptr is returned and the Python error is cleared.
     static array ensure(handle h, int ExtraFlags = 0) {
@@ -504,46 +745,46 @@ public:
 protected:
     template<typename, typename> friend struct detail::npy_format_descriptor;
 
-    void fail_dim_check(size_t dim, const std::string& msg) const {
+    void fail_dim_check(ssize_t dim, const std::string& msg) const {
         throw index_error(msg + ": " + std::to_string(dim) +
                           " (ndim = " + std::to_string(ndim()) + ")");
     }
 
-    template<typename... Ix> size_t byte_offset(Ix... index) const {
+    template<typename... Ix> ssize_t byte_offset(Ix... index) const {
         check_dimensions(index...);
-        return byte_offset_unsafe(index...);
-    }
-
-    template<size_t dim = 0, typename... Ix> size_t byte_offset_unsafe(size_t i, Ix... index) const {
-        return i * strides()[dim] + byte_offset_unsafe<dim + 1>(index...);
+        return detail::byte_offset_unsafe(strides(), ssize_t(index)...);
     }
 
-    template<size_t dim = 0> size_t byte_offset_unsafe() const { return 0; }
-
     void check_writeable() const {
         if (!writeable())
-            throw std::runtime_error("array is not writeable");
+            throw std::domain_error("array is not writeable");
     }
 
-    static std::vector<size_t> default_strides(const std::vector<size_t>& shape, size_t itemsize) {
+    // Default, C-style (row-major) strides: last axis = itemsize, each earlier axis = next stride * next extent
+    static std::vector<ssize_t> c_strides(const std::vector<ssize_t> &shape, ssize_t itemsize) {
         auto ndim = shape.size();
-        std::vector<size_t> strides(ndim);
-        if (ndim) {
-            std::fill(strides.begin(), strides.end(), itemsize);
-            for (size_t i = 0; i < ndim - 1; i++)
-                for (size_t j = 0; j < ndim - 1 - i; j++)
-                    strides[j] *= shape[ndim - 1 - i];
-        }
+        std::vector<ssize_t> strides(ndim, itemsize);
+        for (size_t i = ndim; i > 1; --i)  // start at ndim (not ndim - 1): an empty shape must not wrap size_t
+            strides[i - 2] = strides[i - 1] * shape[i - 1];
+        return strides;
+    }
+
+    // F-style strides; default when constructing an array_t with `ExtraFlags & f_style`
+    static std::vector<ssize_t> f_strides(const std::vector<ssize_t> &shape, ssize_t itemsize) {
+        auto ndim = shape.size();
+        std::vector<ssize_t> strides(ndim, itemsize);
+        for (size_t i = 1; i < ndim; ++i)
+            strides[i] = strides[i - 1] * shape[i - 1];
         return strides;
     }
 
     template<typename... Ix> void check_dimensions(Ix... index) const {
-        check_dimensions_impl(size_t(0), shape(), size_t(index)...);
+        check_dimensions_impl(ssize_t(0), shape(), ssize_t(index)...);
     }
 
-    void check_dimensions_impl(size_t, const size_t*) const { }
+    void check_dimensions_impl(ssize_t, const ssize_t*) const { }
 
-    template<typename... Ix> void check_dimensions_impl(size_t axis, const size_t* shape, size_t i, Ix... index) const {
+    template<typename... Ix> void check_dimensions_impl(ssize_t axis, const ssize_t* shape, ssize_t i, Ix... index) const {
         if (i >= *shape) {
             throw index_error(std::string("index ") + std::to_string(i) +
                               " is out of bounds for axis " + std::to_string(axis) +
@@ -554,48 +795,58 @@ protected:
 
     /// Create array from any object -- always returns a new reference
     static PyObject *raw_array(PyObject *ptr, int ExtraFlags = 0) {
-        if (ptr == nullptr)
+        if (ptr == nullptr) {
+            PyErr_SetString(PyExc_ValueError, "cannot create a pybind11::array from a nullptr");
             return nullptr;
+        }
         return detail::npy_api::get().PyArray_FromAny_(
-            ptr, nullptr, 0, 0, detail::npy_api::NPY_ENSURE_ARRAY_ | ExtraFlags, nullptr);
+            ptr, nullptr, 0, 0, detail::npy_api::NPY_ARRAY_ENSUREARRAY_ | ExtraFlags, nullptr);
     }
 };
 
 template <typename T, int ExtraFlags = array::forcecast> class array_t : public array {
+private:
+    struct private_ctor {};
+    // Delegating constructor needed when both moving and accessing in the same constructor
+    array_t(private_ctor, ShapeContainer &&shape, StridesContainer &&strides, const T *ptr, handle base)
+        : array(std::move(shape), std::move(strides), ptr, base) {}
 public:
+    static_assert(!detail::array_info<T>::is_array, "Array types cannot be used with array_t");
+
+    using value_type = T;
+
     array_t() : array(0, static_cast<const T *>(nullptr)) {}
-    array_t(handle h, borrowed_t) : array(h, borrowed) { }
-    array_t(handle h, stolen_t) : array(h, stolen) { }
+    array_t(handle h, borrowed_t) : array(h, borrowed_t{}) { }
+    array_t(handle h, stolen_t) : array(h, stolen_t{}) { }
 
     PYBIND11_DEPRECATED("Use array_t<T>::ensure() instead")
-    array_t(handle h, bool is_borrowed) : array(raw_array_t(h.ptr()), stolen) {
+    array_t(handle h, bool is_borrowed) : array(raw_array_t(h.ptr()), stolen_t{}) {
         if (!m_ptr) PyErr_Clear();
         if (!is_borrowed) Py_XDECREF(h.ptr());
     }
 
-    array_t(const object &o) : array(raw_array_t(o.ptr()), stolen) {
+    array_t(const object &o) : array(raw_array_t(o.ptr()), stolen_t{}) {
         if (!m_ptr) throw error_already_set();
     }
 
     explicit array_t(const buffer_info& info) : array(info) { }
 
-    array_t(const std::vector<size_t> &shape,
-            const std::vector<size_t> &strides, const T *ptr = nullptr,
-            handle base = handle())
-        : array(shape, strides, ptr, base) { }
+    array_t(ShapeContainer shape, StridesContainer strides, const T *ptr = nullptr, handle base = handle())
+        : array(std::move(shape), std::move(strides), ptr, base) { }
 
-    explicit array_t(const std::vector<size_t> &shape, const T *ptr = nullptr,
-            handle base = handle())
-        : array(shape, ptr, base) { }
+    explicit array_t(ShapeContainer shape, const T *ptr = nullptr, handle base = handle())
+        : array_t(private_ctor{}, std::move(shape),
+                ExtraFlags & f_style ? f_strides(*shape, itemsize()) : c_strides(*shape, itemsize()),
+                ptr, base) { }
 
     explicit array_t(size_t count, const T *ptr = nullptr, handle base = handle())
-        : array(count, ptr, base) { }
+        : array({count}, {}, ptr, base) { }
 
-    constexpr size_t itemsize() const {
+    constexpr ssize_t itemsize() const {
         return sizeof(T);
     }
 
-    template<typename... Ix> size_t index_at(Ix... index) const {
+    template<typename... Ix> ssize_t index_at(Ix... index) const {
         return offset_at(index...) / itemsize();
     }
 
@@ -611,18 +862,39 @@ public:
     template<typename... Ix> const T& at(Ix... index) const {
         if (sizeof...(index) != ndim())
             fail_dim_check(sizeof...(index), "index dimension mismatch");
-        return *(static_cast<const T*>(array::data()) + byte_offset(size_t(index)...) / itemsize());
+        return *(static_cast<const T*>(array::data()) + byte_offset(ssize_t(index)...) / itemsize());
     }
 
     // Mutable reference to element at a given index
     template<typename... Ix> T& mutable_at(Ix... index) {
         if (sizeof...(index) != ndim())
             fail_dim_check(sizeof...(index), "index dimension mismatch");
-        return *(static_cast<T*>(array::mutable_data()) + byte_offset(size_t(index)...) / itemsize());
+        return *(static_cast<T*>(array::mutable_data()) + byte_offset(ssize_t(index)...) / itemsize());
     }
 
-    /// Ensure that the argument is a NumPy array of the correct dtype.
-    /// In case of an error, nullptr is returned and the Python error is cleared.
+    /**
+     * Returns a proxy object that provides access to the array's data without bounds or
+     * dimensionality checking.  Will throw if the array is missing the `writeable` flag.  Use with
+     * care: the array must not be destroyed or reshaped for the duration of the returned object,
+     * and the caller must take care not to access invalid dimensions or dimension indices.
+     */
+    template <ssize_t Dims = -1> detail::unchecked_mutable_reference<T, Dims> mutable_unchecked() & {
+        return array::mutable_unchecked<T, Dims>();
+    }
+
+    /**
+     * Returns a proxy object that provides const access to the array's data without bounds or
+     * dimensionality checking.  Unlike `mutable_unchecked()`, this does not require that the
+     * underlying array have the `writeable` flag.  Use with care: the array must not be destroyed or
+     * reshaped for the duration of the returned object, and the caller must take care not to access
+     * invalid dimensions or dimension indices.
+     */
+    template <ssize_t Dims = -1> detail::unchecked_reference<T, Dims> unchecked() const & {
+        return array::unchecked<T, Dims>();
+    }
+
+    /// Ensure that the argument is a NumPy array of the correct dtype (and if not, try to convert
+    /// it).  In case of an error, nullptr is returned and the Python error is cleared.
     static array_t ensure(handle h) {
         auto result = reinterpret_steal<array_t>(raw_array_t(h.ptr()));
         if (!result)
@@ -630,7 +902,7 @@ public:
         return result;
     }
 
-    static bool _check(handle h) {
+    static bool check_(handle h) {
         const auto &api = detail::npy_api::get();
         return api.PyArray_Check_(h.ptr())
                && api.PyArray_EquivTypes_(detail::array_proxy(h.ptr())->descr, dtype::of<T>().ptr());
@@ -639,11 +911,13 @@ public:
 protected:
     /// Create array from any object -- always returns a new reference
     static PyObject *raw_array_t(PyObject *ptr) {
-        if (ptr == nullptr)
+        if (ptr == nullptr) {
+            PyErr_SetString(PyExc_ValueError, "cannot create a pybind11::array_t from a nullptr");
             return nullptr;
+        }
         return detail::npy_api::get().PyArray_FromAny_(
             ptr, dtype::of<T>().release().ptr(), 0, 0,
-            detail::npy_api::NPY_ENSURE_ARRAY_ | ExtraFlags, nullptr);
+            detail::npy_api::NPY_ARRAY_ENSUREARRAY_ | ExtraFlags, nullptr);
     }
 };
 
@@ -669,12 +943,23 @@ struct format_descriptor<T, detail::enable_if_t<std::is_enum<T>::value>> {
     }
 };
 
+template <typename T>
+struct format_descriptor<T, detail::enable_if_t<detail::array_info<T>::is_array>> {
+    static std::string format() {
+        using detail::_;
+        PYBIND11_DESCR extents = _("(") + detail::array_info<T>::extents() + _(")");
+        return extents.text() + format_descriptor<detail::remove_all_extents_t<T>>::format();
+    }
+};
+
 NAMESPACE_BEGIN(detail)
 template <typename T, int ExtraFlags>
 struct pyobject_caster<array_t<T, ExtraFlags>> {
     using type = array_t<T, ExtraFlags>;
 
-    bool load(handle src, bool /* convert */) {
+    bool load(handle src, bool convert) {
+        if (!convert && !type::check_(src))
+            return false;
         value = type::ensure(src);
         return static_cast<bool>(value);
     }
@@ -685,65 +970,69 @@ struct pyobject_caster<array_t<T, ExtraFlags>> {
     PYBIND11_TYPE_CASTER(type, handle_type_name<type>::name());
 };
 
-template <typename T> struct is_std_array : std::false_type { };
-template <typename T, size_t N> struct is_std_array<std::array<T, N>> : std::true_type { };
-
 template <typename T>
-struct is_pod_struct {
-    enum { value = std::is_pod<T>::value && // offsetof only works correctly for POD types
-           !std::is_reference<T>::value &&
-           !std::is_array<T>::value &&
-           !is_std_array<T>::value &&
-           !std::is_integral<T>::value &&
-           !std::is_enum<T>::value &&
-           !std::is_same<typename std::remove_cv<T>::type, float>::value &&
-           !std::is_same<typename std::remove_cv<T>::type, double>::value &&
-           !std::is_same<typename std::remove_cv<T>::type, bool>::value &&
-           !std::is_same<typename std::remove_cv<T>::type, std::complex<float>>::value &&
-           !std::is_same<typename std::remove_cv<T>::type, std::complex<double>>::value };
+struct compare_buffer_info<T, detail::enable_if_t<detail::is_pod_struct<T>::value>> {
+    static bool compare(const buffer_info& b) {
+        return npy_api::get().PyArray_EquivTypes_(dtype::of<T>().ptr(), dtype(b).ptr());
+    }
 };
 
-template <typename T> struct npy_format_descriptor<T, enable_if_t<std::is_integral<T>::value>> {
+template <typename T> struct npy_format_descriptor<T, enable_if_t<satisfies_any_of<T, std::is_arithmetic, is_complex>::value>> {
 private:
-    constexpr static const int values[8] = {
-        npy_api::NPY_BYTE_, npy_api::NPY_UBYTE_, npy_api::NPY_SHORT_,    npy_api::NPY_USHORT_,
-        npy_api::NPY_INT_,  npy_api::NPY_UINT_,  npy_api::NPY_LONGLONG_, npy_api::NPY_ULONGLONG_ };
+    // NB: the order here must match the one in common.h
+    constexpr static const int values[15] = {
+        npy_api::NPY_BOOL_,
+        npy_api::NPY_BYTE_,   npy_api::NPY_UBYTE_,   npy_api::NPY_SHORT_,    npy_api::NPY_USHORT_,
+        npy_api::NPY_INT_,    npy_api::NPY_UINT_,    npy_api::NPY_LONGLONG_, npy_api::NPY_ULONGLONG_,
+        npy_api::NPY_FLOAT_,  npy_api::NPY_DOUBLE_,  npy_api::NPY_LONGDOUBLE_,
+        npy_api::NPY_CFLOAT_, npy_api::NPY_CDOUBLE_, npy_api::NPY_CLONGDOUBLE_
+    };
+
 public:
-    enum { value = values[detail::log2(sizeof(T)) * 2 + (std::is_unsigned<T>::value ? 1 : 0)] };
+    static constexpr int value = values[detail::is_fmt_numeric<T>::index];
+
     static pybind11::dtype dtype() {
         if (auto ptr = npy_api::get().PyArray_DescrFromType_(value))
             return reinterpret_borrow<pybind11::dtype>(ptr);
         pybind11_fail("Unsupported buffer format!");
     }
-    template <typename T2 = T, enable_if_t<std::is_signed<T2>::value, int> = 0>
-    static PYBIND11_DESCR name() { return _("int") + _<sizeof(T)*8>(); }
-    template <typename T2 = T, enable_if_t<!std::is_signed<T2>::value, int> = 0>
-    static PYBIND11_DESCR name() { return _("uint") + _<sizeof(T)*8>(); }
+    template <typename T2 = T, enable_if_t<std::is_integral<T2>::value, int> = 0>
+    static PYBIND11_DESCR name() {
+        return _<std::is_same<T, bool>::value>(_("bool"),
+            _<std::is_signed<T>::value>("int", "uint") + _<sizeof(T)*8>());
+    }
+    template <typename T2 = T, enable_if_t<std::is_floating_point<T2>::value, int> = 0>
+    static PYBIND11_DESCR name() {
+        return _<std::is_same<T, float>::value || std::is_same<T, double>::value>(
+                _("float") + _<sizeof(T)*8>(), _("longdouble"));
+    }
+    template <typename T2 = T, enable_if_t<is_complex<T2>::value, int> = 0>
+    static PYBIND11_DESCR name() {
+        return _<std::is_same<typename T2::value_type, float>::value || std::is_same<typename T2::value_type, double>::value>(
+                _("complex") + _<sizeof(typename T2::value_type)*16>(), _("longcomplex"));
+    }
 };
-template <typename T> constexpr const int npy_format_descriptor<
-    T, enable_if_t<std::is_integral<T>::value>>::values[8];
-
-#define DECL_FMT(Type, NumPyName, Name) template<> struct npy_format_descriptor<Type> { \
-    enum { value = npy_api::NumPyName }; \
-    static pybind11::dtype dtype() { \
-        if (auto ptr = npy_api::get().PyArray_DescrFromType_(value)) \
-            return reinterpret_borrow<pybind11::dtype>(ptr); \
-        pybind11_fail("Unsupported buffer format!"); \
-    } \
-    static PYBIND11_DESCR name() { return _(Name); } }
-DECL_FMT(float, NPY_FLOAT_, "float32");
-DECL_FMT(double, NPY_DOUBLE_, "float64");
-DECL_FMT(bool, NPY_BOOL_, "bool");
-DECL_FMT(std::complex<float>, NPY_CFLOAT_, "complex64");
-DECL_FMT(std::complex<double>, NPY_CDOUBLE_, "complex128");
-#undef DECL_FMT
-
-#define DECL_CHAR_FMT \
+
+#define PYBIND11_DECL_CHAR_FMT \
     static PYBIND11_DESCR name() { return _("S") + _<N>(); } \
     static pybind11::dtype dtype() { return pybind11::dtype(std::string("S") + std::to_string(N)); }
-template <size_t N> struct npy_format_descriptor<char[N]> { DECL_CHAR_FMT };
-template <size_t N> struct npy_format_descriptor<std::array<char, N>> { DECL_CHAR_FMT };
-#undef DECL_CHAR_FMT
+template <size_t N> struct npy_format_descriptor<char[N]> { PYBIND11_DECL_CHAR_FMT };
+template <size_t N> struct npy_format_descriptor<std::array<char, N>> { PYBIND11_DECL_CHAR_FMT };
+#undef PYBIND11_DECL_CHAR_FMT
+
+template<typename T> struct npy_format_descriptor<T, enable_if_t<array_info<T>::is_array>> {
+private:
+    using base_descr = npy_format_descriptor<typename array_info<T>::type>;
+public:
+    static_assert(!array_info<T>::is_empty, "Zero-sized arrays are not supported");
+
+    static PYBIND11_DESCR name() { return _("(") + array_info<T>::extents() + _(")") + base_descr::name(); }
+    static pybind11::dtype dtype() {
+        list shape;
+        array_info<T>::append_extents(shape);
+        return pybind11::dtype::from_args(pybind11::make_tuple(base_descr::dtype(), shape));
+    }
+};
 
 template<typename T> struct npy_format_descriptor<T, enable_if_t<std::is_enum<T>::value>> {
 private:
@@ -755,16 +1044,15 @@ public:
 
 struct field_descriptor {
     const char *name;
-    size_t offset;
-    size_t size;
-    size_t alignment;
+    ssize_t offset;
+    ssize_t size;
     std::string format;
     dtype descr;
 };
 
 inline PYBIND11_NOINLINE void register_structured_dtype(
     const std::initializer_list<field_descriptor>& fields,
-    const std::type_info& tinfo, size_t itemsize,
+    const std::type_info& tinfo, ssize_t itemsize,
     bool (*direct_converter)(PyObject *, void *&)) {
 
     auto& numpy_internals = get_numpy_internals();
@@ -792,15 +1080,17 @@ inline PYBIND11_NOINLINE void register_structured_dtype(
     std::vector<field_descriptor> ordered_fields(fields);
     std::sort(ordered_fields.begin(), ordered_fields.end(),
         [](const field_descriptor &a, const field_descriptor &b) { return a.offset < b.offset; });
-    size_t offset = 0;
+    ssize_t offset = 0;
     std::ostringstream oss;
-    oss << "T{";
+    // mark the structure as unaligned with '^', because numpy and C++ don't
+    // always agree about alignment (particularly for complex), and we're
+    // explicitly listing all our padding. This depends on none of the fields
+    // overriding the endianness. Putting the ^ in front of individual fields
+    // isn't guaranteed to work due to https://github.com/numpy/numpy/issues/9049
+    oss << "^T{";
     for (auto& field : ordered_fields) {
         if (field.offset > offset)
             oss << (field.offset - offset) << 'x';
-        // mark unaligned fields with '='
-        if (field.offset % field.alignment)
-            oss << '=';
         oss << field.format << ':' << field.name << ':';
         offset = field.offset + field.size;
     }
@@ -820,9 +1110,10 @@ inline PYBIND11_NOINLINE void register_structured_dtype(
     get_internals().direct_conversions[tindex].push_back(direct_converter);
 }
 
-template <typename T>
-struct npy_format_descriptor<T, enable_if_t<is_pod_struct<T>::value>> {
-    static PYBIND11_DESCR name() { return _("struct"); }
+template <typename T, typename SFINAE> struct npy_format_descriptor {
+    static_assert(is_pod_struct<T>::value, "Attempt to use a non-POD or unimplemented POD type as a numpy dtype");
+
+    static PYBIND11_DESCR name() { return make_caster<T>::name(); }
 
     static pybind11::dtype dtype() {
         return reinterpret_borrow<pybind11::dtype>(dtype_ptr());
@@ -858,10 +1149,14 @@ private:
     }
 };
 
+#ifdef __CLION_IDE__ // replace heavy macro with dummy code for the IDE (doesn't affect code)
+# define PYBIND11_NUMPY_DTYPE(Type, ...) ((void)0)
+# define PYBIND11_NUMPY_DTYPE_EX(Type, ...) ((void)0)
+#else
+
 #define PYBIND11_FIELD_DESCRIPTOR_EX(T, Field, Name)                                          \
     ::pybind11::detail::field_descriptor {                                                    \
         Name, offsetof(T, Field), sizeof(decltype(std::declval<T>().Field)),                  \
-        alignof(decltype(std::declval<T>().Field)),                                           \
         ::pybind11::format_descriptor<decltype(std::declval<T>().Field)>::format(),           \
         ::pybind11::detail::npy_format_descriptor<decltype(std::declval<T>().Field)>::dtype() \
     }
@@ -926,6 +1221,8 @@ private:
     ::pybind11::detail::npy_format_descriptor<Type>::register_dtype \
         ({PYBIND11_MAP2_LIST (PYBIND11_FIELD_DESCRIPTOR_EX, Type, __VA_ARGS__)})
 
+#endif // __CLION_IDE__
+
 template  <class T>
 using array_iterator = typename std::add_pointer<T>::type;
 
@@ -941,13 +1238,13 @@ array_iterator<T> array_end(const buffer_info& buffer) {
 
 class common_iterator {
 public:
-    using container_type = std::vector<size_t>;
+    using container_type = std::vector<ssize_t>;
     using value_type = container_type::value_type;
     using size_type = container_type::size_type;
 
     common_iterator() : p_ptr(0), m_strides() {}
 
-    common_iterator(void* ptr, const container_type& strides, const std::vector<size_t>& shape)
+    common_iterator(void* ptr, const container_type& strides, const container_type& shape)
         : p_ptr(reinterpret_cast<char*>(ptr)), m_strides(strides.size()) {
         m_strides.back() = static_cast<value_type>(strides.back());
         for (size_type i = m_strides.size() - 1; i != 0; --i) {
@@ -972,16 +1269,16 @@ private:
 
 template <size_t N> class multi_array_iterator {
 public:
-    using container_type = std::vector<size_t>;
+    using container_type = std::vector<ssize_t>;
 
     multi_array_iterator(const std::array<buffer_info, N> &buffers,
-                         const std::vector<size_t> &shape)
+                         const container_type &shape)
         : m_shape(shape.size()), m_index(shape.size(), 0),
           m_common_iterator() {
 
         // Manual copy to avoid conversion warning if using std::copy
         for (size_t i = 0; i < shape.size(); ++i)
-            m_shape[i] = static_cast<container_type::value_type>(shape[i]);
+            m_shape[i] = shape[i];
 
         container_type strides(shape.size());
         for (size_t i = 0; i < N; ++i)
@@ -1001,8 +1298,8 @@ public:
         return *this;
     }
 
-    template <size_t K, class T> const T& data() const {
-        return *reinterpret_cast<T*>(m_common_iterator[K].data());
+    template <size_t K, class T = void> T* data() const {
+        return reinterpret_cast<T*>(m_common_iterator[K].data());
     }
 
 private:
@@ -1010,8 +1307,9 @@ private:
     using common_iter = common_iterator;
 
     void init_common_iterator(const buffer_info &buffer,
-                              const std::vector<size_t> &shape,
-                              common_iter &iterator, container_type &strides) {
+                              const container_type &shape,
+                              common_iter &iterator,
+                              container_type &strides) {
         auto buffer_shape_iter = buffer.shape.rbegin();
         auto buffer_strides_iter = buffer.strides.rbegin();
         auto shape_iter = shape.rbegin();
@@ -1019,7 +1317,7 @@ private:
 
         while (buffer_shape_iter != buffer.shape.rend()) {
             if (*shape_iter == *buffer_shape_iter)
-                *strides_iter = static_cast<size_t>(*buffer_strides_iter);
+                *strides_iter = *buffer_strides_iter;
             else
                 *strides_iter = 0;
 
@@ -1043,128 +1341,259 @@ private:
     std::array<common_iter, N> m_common_iterator;
 };
 
+enum class broadcast_trivial { non_trivial, c_trivial, f_trivial };
+
+// Populates the shape and number of dimensions for the set of buffers.  Returns a broadcast_trivial
+// enum value indicating whether the broadcast is "trivial"--that is, has each buffer being either a
+// singleton or a full-size, C-contiguous (`c_trivial`) or Fortran-contiguous (`f_trivial`) storage
+// buffer; returns `non_trivial` otherwise.
 template <size_t N>
-bool broadcast(const std::array<buffer_info, N>& buffers, size_t& ndim, std::vector<size_t>& shape) {
-    ndim = std::accumulate(buffers.begin(), buffers.end(), size_t(0), [](size_t res, const buffer_info& buf) {
+broadcast_trivial broadcast(const std::array<buffer_info, N> &buffers, ssize_t &ndim, std::vector<ssize_t> &shape) {
+    ndim = std::accumulate(buffers.begin(), buffers.end(), ssize_t(0), [](ssize_t res, const buffer_info &buf) {
         return std::max(res, buf.ndim);
     });
 
-    shape = std::vector<size_t>(ndim, 1);
-    bool trivial_broadcast = true;
+    shape.clear();
+    shape.resize((size_t) ndim, 1);
+
+    // Figure out the output size, and make sure all input arrays conform (i.e. are either size 1 or
+    // the full size).
     for (size_t i = 0; i < N; ++i) {
         auto res_iter = shape.rbegin();
-        bool i_trivial_broadcast = (buffers[i].size == 1) || (buffers[i].ndim == ndim);
-        for (auto shape_iter = buffers[i].shape.rbegin();
-             shape_iter != buffers[i].shape.rend(); ++shape_iter, ++res_iter) {
-
-            if (*res_iter == 1)
-                *res_iter = *shape_iter;
-            else if ((*shape_iter != 1) && (*res_iter != *shape_iter))
+        auto end = buffers[i].shape.rend();
+        for (auto shape_iter = buffers[i].shape.rbegin(); shape_iter != end; ++shape_iter, ++res_iter) {
+            const auto &dim_size_in = *shape_iter;
+            auto &dim_size_out = *res_iter;
+
+            // Each input dimension can either be 1 or `n`, but `n` values must match across buffers
+            if (dim_size_out == 1)
+                dim_size_out = dim_size_in;
+            else if (dim_size_in != 1 && dim_size_in != dim_size_out)
                 pybind11_fail("pybind11::vectorize: incompatible size/dimension of inputs!");
+        }
+    }
+
+    bool trivial_broadcast_c = true;
+    bool trivial_broadcast_f = true;
+    for (size_t i = 0; i < N && (trivial_broadcast_c || trivial_broadcast_f); ++i) {
+        if (buffers[i].size == 1)
+            continue;
+
+        // Require the same number of dimensions:
+        if (buffers[i].ndim != ndim)
+            return broadcast_trivial::non_trivial;
+
+        // Require all dimensions be full-size:
+        if (!std::equal(buffers[i].shape.cbegin(), buffers[i].shape.cend(), shape.cbegin()))
+            return broadcast_trivial::non_trivial;
+
+        // Check for C contiguity (but only if previous inputs were also C contiguous)
+        if (trivial_broadcast_c) {
+            ssize_t expect_stride = buffers[i].itemsize;
+            auto end = buffers[i].shape.crend();
+            for (auto shape_iter = buffers[i].shape.crbegin(), stride_iter = buffers[i].strides.crbegin();
+                    trivial_broadcast_c && shape_iter != end; ++shape_iter, ++stride_iter) {
+                if (expect_stride == *stride_iter)
+                    expect_stride *= *shape_iter;
+                else
+                    trivial_broadcast_c = false;
+            }
+        }
 
-            i_trivial_broadcast = i_trivial_broadcast && (*res_iter == *shape_iter);
+        // Check for Fortran contiguity (if previous inputs were also F contiguous)
+        if (trivial_broadcast_f) {
+            ssize_t expect_stride = buffers[i].itemsize;
+            auto end = buffers[i].shape.cend();
+            for (auto shape_iter = buffers[i].shape.cbegin(), stride_iter = buffers[i].strides.cbegin();
+                    trivial_broadcast_f && shape_iter != end; ++shape_iter, ++stride_iter) {
+                if (expect_stride == *stride_iter)
+                    expect_stride *= *shape_iter;
+                else
+                    trivial_broadcast_f = false;
+            }
         }
-        trivial_broadcast = trivial_broadcast && i_trivial_broadcast;
     }
-    return trivial_broadcast;
+
+    return
+        trivial_broadcast_c ? broadcast_trivial::c_trivial :
+        trivial_broadcast_f ? broadcast_trivial::f_trivial :
+        broadcast_trivial::non_trivial;
 }
 
+template <typename T>
+struct vectorize_arg {
+    static_assert(!std::is_rvalue_reference<T>::value, "Functions with rvalue reference arguments cannot be vectorized");
+    // The wrapped function gets called with this type:
+    using call_type = remove_reference_t<T>;
+    // Is this a vectorized argument?
+    static constexpr bool vectorize =
+        satisfies_any_of<call_type, std::is_arithmetic, is_complex, std::is_pod>::value &&
+        satisfies_none_of<call_type, std::is_pointer, std::is_array, is_std_array, std::is_enum>::value &&
+        (!std::is_reference<T>::value ||
+         (std::is_lvalue_reference<T>::value && std::is_const<call_type>::value));
+    // Accept this type: an array for vectorized types, otherwise the type as-is:
+    using type = conditional_t<vectorize, array_t<remove_cv_t<call_type>, array::forcecast>, T>;
+};
+
 template <typename Func, typename Return, typename... Args>
 struct vectorize_helper {
-    typename std::remove_reference<Func>::type f;
+private:
+    static constexpr size_t N = sizeof...(Args);
+    static constexpr size_t NVectorized = constexpr_sum(vectorize_arg<Args>::vectorize...);
+    static_assert(NVectorized >= 1,
+            "pybind11::vectorize(...) requires a function with at least one vectorizable argument");
 
+public:
     template <typename T>
-    explicit vectorize_helper(T&&f) : f(std::forward<T>(f)) { }
+    explicit vectorize_helper(T &&f) : f(std::forward<T>(f)) { }
 
-    object operator()(array_t<Args, array::c_style | array::forcecast>... args) {
-        return run(args..., make_index_sequence<sizeof...(Args)>());
+    object operator()(typename vectorize_arg<Args>::type... args) {
+        return run(args...,
+                   make_index_sequence<N>(),
+                   select_indices<vectorize_arg<Args>::vectorize...>(),
+                   make_index_sequence<NVectorized>());
     }
 
-    template <size_t ... Index> object run(array_t<Args, array::c_style | array::forcecast>&... args, index_sequence<Index...> index) {
-        /* Request buffers from all parameters */
-        const size_t N = sizeof...(Args);
+private:
+    remove_reference_t<Func> f;
+
+    template <size_t Index> using param_n_t = typename pack_element<Index, typename vectorize_arg<Args>::call_type...>::type;
+
+    // Runs a vectorized function given arguments tuple and three index sequences:
+    //     - Index is the full set of 0 ... (N-1) argument indices;
+    //     - VIndex is the subset of argument indices with vectorized parameters, letting us access
+    //       vectorized arguments (anything not in this sequence is passed through)
+    //     - BIndex is an incremental sequence (beginning at 0) of the same size as VIndex, so that
+    //       we can store vectorized buffer_infos in an array (argument VIndex has its buffer at
+    //       index BIndex in the array).
+    template <size_t... Index, size_t... VIndex, size_t... BIndex> object run(
+            typename vectorize_arg<Args>::type &...args,
+            index_sequence<Index...> i_seq, index_sequence<VIndex...> vi_seq, index_sequence<BIndex...> bi_seq) {
 
-        std::array<buffer_info, N> buffers {{ args.request()... }};
+        // Pointers to values the function was called with; the vectorized ones set here will start
+        // out as array_t<T> pointers, but they will be changed to T pointers before we call the
+        // wrapped function.  Non-vectorized pointers are left as-is.
+        std::array<void *, N> params{{ &args... }};
+
+        // The array of `buffer_info`s of vectorized arguments:
+        std::array<buffer_info, NVectorized> buffers{{ reinterpret_cast<array *>(params[VIndex])->request()... }};
 
         /* Determine dimensions parameters of output array */
-        size_t ndim = 0;
-        std::vector<size_t> shape(0);
-        bool trivial_broadcast = broadcast(buffers, ndim, shape);
-
-        size_t size = 1;
-        std::vector<size_t> strides(ndim);
-        if (ndim > 0) {
-            strides[ndim-1] = sizeof(Return);
-            for (size_t i = ndim - 1; i > 0; --i) {
-                strides[i - 1] = strides[i] * shape[i];
-                size *= shape[i];
-            }
-            size *= shape[0];
+        ssize_t nd = 0;
+        std::vector<ssize_t> shape(0);
+        auto trivial = broadcast(buffers, nd, shape);
+        size_t ndim = (size_t) nd;
+
+        size_t size = std::accumulate(shape.begin(), shape.end(), (size_t) 1, std::multiplies<size_t>());
+
+        // If all arguments are 0-dimension arrays (i.e. single values) return a plain value (i.e.
+        // not wrapped in an array).
+        if (size == 1 && ndim == 0) {
+            PYBIND11_EXPAND_SIDE_EFFECTS(params[VIndex] = buffers[BIndex].ptr);
+            return cast(f(*reinterpret_cast<param_n_t<Index> *>(params[Index])...));
         }
 
-        if (size == 1)
-            return cast(f(*((Args *) buffers[Index].ptr)...));
+        array_t<Return> result;
+        if (trivial == broadcast_trivial::f_trivial) result = array_t<Return, array::f_style>(shape);
+        else result = array_t<Return>(shape);
 
-        array_t<Return> result(shape, strides);
-        auto buf = result.request();
-        auto output = (Return *) buf.ptr;
+        if (size == 0) return result;
 
-        if (trivial_broadcast) {
-            /* Call the function */
-            for (size_t i = 0; i < size; ++i) {
-                output[i] = f((buffers[Index].size == 1
-                               ? *((Args *) buffers[Index].ptr)
-                               : ((Args *) buffers[Index].ptr)[i])...);
-            }
-        } else {
-            apply_broadcast<N, Index...>(buffers, buf, index);
-        }
+        /* Call the function */
+        if (trivial == broadcast_trivial::non_trivial)
+            apply_broadcast(buffers, params, result, i_seq, vi_seq, bi_seq);
+        else
+            apply_trivial(buffers, params, result.mutable_data(), size, i_seq, vi_seq, bi_seq);
 
         return result;
     }
 
-    template <size_t N, size_t... Index>
-    void apply_broadcast(const std::array<buffer_info, N> &buffers,
-                         buffer_info &output, index_sequence<Index...>) {
-        using input_iterator = multi_array_iterator<N>;
-        using output_iterator = array_iterator<Return>;
+    template <size_t... Index, size_t... VIndex, size_t... BIndex>
+    void apply_trivial(std::array<buffer_info, NVectorized> &buffers,
+                       std::array<void *, N> &params,
+                       Return *out,
+                       size_t size,
+                       index_sequence<Index...>, index_sequence<VIndex...>, index_sequence<BIndex...>) {
+
+        // Initialize an array of mutable byte references and sizes with references set to the
+        // appropriate pointer in `params`; as we iterate, we'll increment each pointer by its size
+        // (except for singletons, which get an increment of 0).
+        std::array<std::pair<unsigned char *&, const size_t>, NVectorized> vecparams{{
+            std::pair<unsigned char *&, const size_t>(
+                    reinterpret_cast<unsigned char *&>(params[VIndex] = buffers[BIndex].ptr),
+                    buffers[BIndex].size == 1 ? 0 : sizeof(param_n_t<VIndex>)
+            )...
+        }};
+
+        for (size_t i = 0; i < size; ++i) {
+            out[i] = f(*reinterpret_cast<param_n_t<Index> *>(params[Index])...);
+            for (auto &x : vecparams) x.first += x.second;
+        }
+    }
+
+    template <size_t... Index, size_t... VIndex, size_t... BIndex>
+    void apply_broadcast(std::array<buffer_info, NVectorized> &buffers,
+                         std::array<void *, N> &params,
+                         array_t<Return> &output_array,
+                         index_sequence<Index...>, index_sequence<VIndex...>, index_sequence<BIndex...>) {
 
-        input_iterator input_iter(buffers, output.shape);
-        output_iterator output_end = array_end<Return>(output);
+        buffer_info output = output_array.request();
+        multi_array_iterator<NVectorized> input_iter(buffers, output.shape);
 
-        for (output_iterator iter = array_begin<Return>(output);
-             iter != output_end; ++iter, ++input_iter) {
-            *iter = f((input_iter.template data<Index, Args>())...);
+        for (array_iterator<Return> iter = array_begin<Return>(output), end = array_end<Return>(output);
+             iter != end;
+             ++iter, ++input_iter) {
+            PYBIND11_EXPAND_SIDE_EFFECTS((
+                params[VIndex] = input_iter.template data<BIndex>()
+            ));
+            *iter = f(*reinterpret_cast<param_n_t<Index> *>(std::get<Index>(params))...);
         }
     }
 };
 
+template <typename Func, typename Return, typename... Args>
+vectorize_helper<Func, Return, Args...>
+vectorize_extractor(const Func &f, Return (*) (Args ...)) {
+    return detail::vectorize_helper<Func, Return, Args...>(f);
+}
+
 template <typename T, int Flags> struct handle_type_name<array_t<T, Flags>> {
-    static PYBIND11_DESCR name() { return _("numpy.ndarray[") + make_caster<T>::name() + _("]"); }
+    static PYBIND11_DESCR name() {
+        return _("numpy.ndarray[") + npy_format_descriptor<T>::name() + _("]");
+    }
 };
 
 NAMESPACE_END(detail)
 
-template <typename Func, typename Return, typename... Args /*,*/ PYBIND11_NOEXCEPT_TPL_ARG>
-detail::vectorize_helper<Func, Return, Args...>
-vectorize(const Func &f, Return (*) (Args ...) PYBIND11_NOEXCEPT_SPECIFIER) {
-    return detail::vectorize_helper<Func, Return, Args...>(f);
+// Vanilla pointer vectorizer:
+template <typename Return, typename... Args>
+detail::vectorize_helper<Return (*)(Args...), Return, Args...>
+vectorize(Return (*f) (Args ...)) {
+    return detail::vectorize_helper<Return (*)(Args...), Return, Args...>(f);
 }
 
-template <typename Return, typename... Args /*,*/ PYBIND11_NOEXCEPT_TPL_ARG>
-detail::vectorize_helper<Return (*) (Args ...) PYBIND11_NOEXCEPT_SPECIFIER, Return, Args...>
-vectorize(Return (*f) (Args ...) PYBIND11_NOEXCEPT_SPECIFIER) {
-    return vectorize<Return (*) (Args ...), Return, Args...>(f, f);
+// lambda vectorizer:
+template <typename Func, detail::enable_if_t<detail::is_lambda<Func>::value, int> = 0>
+auto vectorize(Func &&f) -> decltype(
+        detail::vectorize_extractor(std::forward<Func>(f), (detail::function_signature_t<Func> *) nullptr)) {
+    return detail::vectorize_extractor(std::forward<Func>(f), (detail::function_signature_t<Func> *) nullptr);
 }
 
-template <typename Func>
-auto vectorize(Func &&f) -> decltype(
-        vectorize(std::forward<Func>(f), (typename detail::remove_class<decltype(&std::remove_reference<Func>::type::operator())>::type *) nullptr)) {
-    return vectorize(std::forward<Func>(f), (typename detail::remove_class<decltype(
-                   &std::remove_reference<Func>::type::operator())>::type *) nullptr);
+// Vectorize a class method (non-const):
+template <typename Return, typename Class, typename... Args,
+          typename Helper = detail::vectorize_helper<decltype(std::mem_fn(std::declval<Return (Class::*)(Args...)>())), Return, Class *, Args...>>
+Helper vectorize(Return (Class::*f)(Args...)) {
+    return Helper(std::mem_fn(f));
+}
+
+// Vectorize a class method (const):
+template <typename Return, typename Class, typename... Args,
+          typename Helper = detail::vectorize_helper<decltype(std::mem_fn(std::declval<Return (Class::*)(Args...) const>())), Return, const Class *, Args...>>
+Helper vectorize(Return (Class::*f)(Args...) const) {
+    return Helper(std::mem_fn(f));
 }
 
-NAMESPACE_END(pybind11)
+NAMESPACE_END(PYBIND11_NAMESPACE)
 
 #if defined(_MSC_VER)
 #pragma warning(pop)
diff --git a/pybind11/include/pybind11/operators.h b/pybind11/include/pybind11/operators.h
index 2e78c01a3..b3dd62c3b 100644
--- a/pybind11/include/pybind11/operators.h
+++ b/pybind11/include/pybind11/operators.h
@@ -13,9 +13,12 @@
 
 #if defined(__clang__) && !defined(__INTEL_COMPILER)
 #  pragma clang diagnostic ignored "-Wunsequenced" // multiple unsequenced modifications to 'self' (when using def(py::self OP Type()))
+#elif defined(_MSC_VER)
+#  pragma warning(push)
+#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
 #endif
 
-NAMESPACE_BEGIN(pybind11)
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 NAMESPACE_BEGIN(detail)
 
 /// Enumeration with all supported operator types
@@ -25,7 +28,7 @@ enum op_id : int {
     op_int, op_long, op_float, op_str, op_cmp, op_gt, op_ge, op_lt, op_le,
     op_eq, op_ne, op_iadd, op_isub, op_imul, op_idiv, op_imod, op_ilshift,
     op_irshift, op_iand, op_ixor, op_ior, op_complex, op_bool, op_nonzero,
-    op_repr, op_truediv
+    op_repr, op_truediv, op_itruediv, op_hash
 };
 
 enum op_type : int {
@@ -49,22 +52,32 @@ template <op_id, op_type, typename B, typename L, typename R> struct op_impl { }
 /// Operator implementation generator
 template <op_id id, op_type ot, typename L, typename R> struct op_ {
     template <typename Class, typename... Extra> void execute(Class &cl, const Extra&... extra) const {
-        typedef typename Class::type Base;
-        typedef typename std::conditional<std::is_same<L, self_t>::value, Base, L>::type L_type;
-        typedef typename std::conditional<std::is_same<R, self_t>::value, Base, R>::type R_type;
-        typedef op_impl<id, ot, Base, L_type, R_type> op;
+        using Base = typename Class::type;
+        using L_type = conditional_t<std::is_same<L, self_t>::value, Base, L>;
+        using R_type = conditional_t<std::is_same<R, self_t>::value, Base, R>;
+        using op = op_impl<id, ot, Base, L_type, R_type>;
         cl.def(op::name(), &op::execute, is_operator(), extra...);
+        #if PY_MAJOR_VERSION < 3
+        if (id == op_truediv || id == op_itruediv)
+            cl.def(id == op_itruediv ? "__idiv__" : ot == op_l ? "__div__" : "__rdiv__",
+                    &op::execute, is_operator(), extra...);
+        #endif
     }
     template <typename Class, typename... Extra> void execute_cast(Class &cl, const Extra&... extra) const {
-        typedef typename Class::type Base;
-        typedef typename std::conditional<std::is_same<L, self_t>::value, Base, L>::type L_type;
-        typedef typename std::conditional<std::is_same<R, self_t>::value, Base, R>::type R_type;
-        typedef op_impl<id, ot, Base, L_type, R_type> op;
+        using Base = typename Class::type;
+        using L_type = conditional_t<std::is_same<L, self_t>::value, Base, L>;
+        using R_type = conditional_t<std::is_same<R, self_t>::value, Base, R>;
+        using op = op_impl<id, ot, Base, L_type, R_type>;
         cl.def(op::name(), &op::execute_cast, is_operator(), extra...);
+        #if PY_MAJOR_VERSION < 3
+        if (id == op_truediv || id == op_itruediv)
+            cl.def(id == op_itruediv ? "__idiv__" : ot == op_l ? "__div__" : "__rdiv__",
+                    &op::execute, is_operator(), extra...);
+        #endif
     }
 };
 
-#define PYBIND11_BINARY_OPERATOR(id, rid, op, expr)                                      \
+#define PYBIND11_BINARY_OPERATOR(id, rid, op, expr)                                    \
 template <typename B, typename L, typename R> struct op_impl<op_##id, op_l, B, L, R> { \
     static char const* name() { return "__" #id "__"; }                                \
     static auto execute(const L &l, const R &r) -> decltype(expr) { return (expr); }   \
@@ -85,7 +98,7 @@ template <typename T> op_<op_##id, op_r, T, self_t> op(const T &, const self_t &
     return op_<op_##id, op_r, T, self_t>();                                            \
 }
 
-#define PYBIND11_INPLACE_OPERATOR(id, op, expr)                                          \
+#define PYBIND11_INPLACE_OPERATOR(id, op, expr)                                        \
 template <typename B, typename L, typename R> struct op_impl<op_##id, op_l, B, L, R> { \
     static char const* name() { return "__" #id "__"; }                                \
     static auto execute(L &l, const R &r) -> decltype(expr) { return expr; }           \
@@ -95,7 +108,7 @@ template <typename T> op_<op_##id, op_l, self_t, T> op(const self_t &, const T &
     return op_<op_##id, op_l, self_t, T>();                                            \
 }
 
-#define PYBIND11_UNARY_OPERATOR(id, op, expr)                                            \
+#define PYBIND11_UNARY_OPERATOR(id, op, expr)                                          \
 template <typename B, typename L> struct op_impl<op_##id, op_u, B, L, undefined_t> {   \
     static char const* name() { return "__" #id "__"; }                                \
     static auto execute(const L &l) -> decltype(expr) { return expr; }                 \
@@ -108,11 +121,7 @@ inline op_<op_##id, op_u, self_t, undefined_t> op(const self_t &) {
 PYBIND11_BINARY_OPERATOR(sub,       rsub,         operator-,    l - r)
 PYBIND11_BINARY_OPERATOR(add,       radd,         operator+,    l + r)
 PYBIND11_BINARY_OPERATOR(mul,       rmul,         operator*,    l * r)
-#if PY_MAJOR_VERSION >= 3
 PYBIND11_BINARY_OPERATOR(truediv,   rtruediv,     operator/,    l / r)
-#else
-PYBIND11_BINARY_OPERATOR(div,       rdiv,         operator/,    l / r)
-#endif
 PYBIND11_BINARY_OPERATOR(mod,       rmod,         operator%,    l % r)
 PYBIND11_BINARY_OPERATOR(lshift,    rlshift,      operator<<,   l << r)
 PYBIND11_BINARY_OPERATOR(rshift,    rrshift,      operator>>,   l >> r)
@@ -129,7 +138,7 @@ PYBIND11_BINARY_OPERATOR(le,        ge,           operator<=,   l <= r)
 PYBIND11_INPLACE_OPERATOR(iadd,     operator+=,   l += r)
 PYBIND11_INPLACE_OPERATOR(isub,     operator-=,   l -= r)
 PYBIND11_INPLACE_OPERATOR(imul,     operator*=,   l *= r)
-PYBIND11_INPLACE_OPERATOR(idiv,     operator/=,   l /= r)
+PYBIND11_INPLACE_OPERATOR(itruediv, operator/=,   l /= r)
 PYBIND11_INPLACE_OPERATOR(imod,     operator%=,   l %= r)
 PYBIND11_INPLACE_OPERATOR(ilshift,  operator<<=,  l <<= r)
 PYBIND11_INPLACE_OPERATOR(irshift,  operator>>=,  l >>= r)
@@ -139,6 +148,7 @@ PYBIND11_INPLACE_OPERATOR(ior,      operator|=,   l |= r)
 PYBIND11_UNARY_OPERATOR(neg,        operator-,    -l)
 PYBIND11_UNARY_OPERATOR(pos,        operator+,    +l)
 PYBIND11_UNARY_OPERATOR(abs,        abs,          std::abs(l))
+PYBIND11_UNARY_OPERATOR(hash,       hash,         std::hash<L>()(l))
 PYBIND11_UNARY_OPERATOR(invert,     operator~,    (~l))
 PYBIND11_UNARY_OPERATOR(bool,       operator!,    !!l)
 PYBIND11_UNARY_OPERATOR(int,        int_,         (int) l)
@@ -151,4 +161,8 @@ NAMESPACE_END(detail)
 
 using detail::self;
 
-NAMESPACE_END(pybind11)
+NAMESPACE_END(PYBIND11_NAMESPACE)
+
+#if defined(_MSC_VER)
+#  pragma warning(pop)
+#endif
diff --git a/pybind11/include/pybind11/options.h b/pybind11/include/pybind11/options.h
index 3105551dd..cc1e1f6f0 100644
--- a/pybind11/include/pybind11/options.h
+++ b/pybind11/include/pybind11/options.h
@@ -9,9 +9,9 @@
 
 #pragma once
 
-#include "common.h"
+#include "detail/common.h"
 
-NAMESPACE_BEGIN(pybind11)
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 
 class options {
 public:
@@ -62,4 +62,4 @@ private:
     state previous_state;
 };
 
-NAMESPACE_END(pybind11)
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/pybind11.h b/pybind11/include/pybind11/pybind11.h
index addcce74b..613135a7a 100644
--- a/pybind11/include/pybind11/pybind11.h
+++ b/pybind11/include/pybind11/pybind11.h
@@ -21,8 +21,12 @@
 #  pragma warning(disable: 4522) // warning C4522: multiple assignment operators specified
 #elif defined(__INTEL_COMPILER)
 #  pragma warning(push)
+#  pragma warning(disable: 68)    // integer conversion resulted in a change of sign
 #  pragma warning(disable: 186)   // pointless comparison of unsigned integer with zero
+#  pragma warning(disable: 878)   // incompatible exception specifications
 #  pragma warning(disable: 1334)  // the "template" keyword used for syntactic disambiguation may only be used within a template
+#  pragma warning(disable: 1682)  // implicit conversion of a 64-bit integral type to a smaller integral type (potential portability problem)
+#  pragma warning(disable: 1875)  // offsetof applied to non-POD (Plain Old Data) types is nonstandard
 #  pragma warning(disable: 2196)  // warning #2196: routine is both "inline" and "noinline"
 #elif defined(__GNUG__) && !defined(__clang__)
 #  pragma GCC diagnostic push
@@ -31,12 +35,17 @@
 #  pragma GCC diagnostic ignored "-Wmissing-field-initializers"
 #  pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #  pragma GCC diagnostic ignored "-Wattributes"
+#  if __GNUC__ >= 7
+#    pragma GCC diagnostic ignored "-Wnoexcept-type"
+#  endif
 #endif
 
 #include "attr.h"
 #include "options.h"
+#include "detail/class.h"
+#include "detail/init.h"
 
-NAMESPACE_BEGIN(pybind11)
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 
 /// Wraps an arbitrary C++ function/method/lambda function/.. into a callable Python object
 class cpp_function : public function {
@@ -44,30 +53,31 @@ public:
     cpp_function() { }
 
     /// Construct a cpp_function from a vanilla function pointer
-    template <typename Return, typename... Args, typename... Extra /*,*/ PYBIND11_NOEXCEPT_TPL_ARG>
-    cpp_function(Return (*f)(Args...) PYBIND11_NOEXCEPT_SPECIFIER, const Extra&... extra) {
+    template <typename Return, typename... Args, typename... Extra>
+    cpp_function(Return (*f)(Args...), const Extra&... extra) {
         initialize(f, f, extra...);
     }
 
     /// Construct a cpp_function from a lambda function (possibly with internal state)
-    template <typename Func, typename... Extra> cpp_function(Func &&f, const Extra&... extra) {
+    template <typename Func, typename... Extra,
+              typename = detail::enable_if_t<detail::is_lambda<Func>::value>>
+    cpp_function(Func &&f, const Extra&... extra) {
         initialize(std::forward<Func>(f),
-                   (typename detail::remove_class<decltype(
-                       &std::remove_reference<Func>::type::operator())>::type *) nullptr, extra...);
+                   (detail::function_signature_t<Func> *) nullptr, extra...);
     }
 
     /// Construct a cpp_function from a class method (non-const)
-    template <typename Return, typename Class, typename... Arg, typename... Extra /*,*/ PYBIND11_NOEXCEPT_TPL_ARG>
-    cpp_function(Return (Class::*f)(Arg...) PYBIND11_NOEXCEPT_SPECIFIER, const Extra&... extra) {
+    template <typename Return, typename Class, typename... Arg, typename... Extra>
+    cpp_function(Return (Class::*f)(Arg...), const Extra&... extra) {
         initialize([f](Class *c, Arg... args) -> Return { return (c->*f)(args...); },
-                   (Return (*) (Class *, Arg...) PYBIND11_NOEXCEPT_SPECIFIER) nullptr, extra...);
+                   (Return (*) (Class *, Arg...)) nullptr, extra...);
     }
 
     /// Construct a cpp_function from a class method (const)
-    template <typename Return, typename Class, typename... Arg, typename... Extra /*,*/ PYBIND11_NOEXCEPT_TPL_ARG>
-    cpp_function(Return (Class::*f)(Arg...) const PYBIND11_NOEXCEPT_SPECIFIER, const Extra&... extra) {
+    template <typename Return, typename Class, typename... Arg, typename... Extra>
+    cpp_function(Return (Class::*f)(Arg...) const, const Extra&... extra) {
         initialize([f](const Class *c, Arg... args) -> Return { return (c->*f)(args...); },
-                   (Return (*)(const Class *, Arg ...) PYBIND11_NOEXCEPT_SPECIFIER) nullptr, extra...);
+                   (Return (*)(const Class *, Arg ...)) nullptr, extra...);
     }
 
     /// Return the function name
@@ -80,12 +90,10 @@ protected:
     }
 
     /// Special internal constructor for functors, lambda functions, etc.
-    template <typename Func, typename Return, typename... Args, typename... Extra /*,*/ PYBIND11_NOEXCEPT_TPL_ARG>
-    void initialize(Func &&f, Return (*)(Args...) PYBIND11_NOEXCEPT_SPECIFIER, const Extra&... extra) {
-        static_assert(detail::expected_num_args<Extra...>(sizeof...(Args)),
-                      "The number of named arguments does not match the function signature");
+    template <typename Func, typename Return, typename... Args, typename... Extra>
+    void initialize(Func &&f, Return (*)(Args...), const Extra&... extra) {
 
-        struct capture { typename std::remove_reference<Func>::type f; };
+        struct capture { detail::remove_reference_t<Func> f; };
 
         /* Store the function including any extra state it might have (e.g. a lambda capture object) */
         auto rec = make_function_record();
@@ -116,32 +124,37 @@ protected:
             detail::conditional_t<std::is_void<Return>::value, detail::void_type, Return>
         >;
 
+        static_assert(detail::expected_num_args<Extra...>(sizeof...(Args), cast_in::has_args, cast_in::has_kwargs),
+                      "The number of argument annotations does not match the number of function arguments");
+
         /* Dispatch code which converts function arguments and performs the actual function call */
-        rec->impl = [](detail::function_record *rec, handle args, handle kwargs, handle parent) -> handle {
+        rec->impl = [](detail::function_call &call) -> handle {
             cast_in args_converter;
 
             /* Try to cast the function arguments into the C++ domain */
-            if (!args_converter.load_args(args, kwargs))
+            if (!args_converter.load_args(call))
                 return PYBIND11_TRY_NEXT_OVERLOAD;
 
             /* Invoke call policy pre-call hook */
-            detail::process_attributes<Extra...>::precall(args);
+            detail::process_attributes<Extra...>::precall(call);
 
             /* Get a pointer to the capture object */
-            capture *cap = (capture *) (sizeof(capture) <= sizeof(rec->data)
-                                        ? &rec->data : rec->data[0]);
+            auto data = (sizeof(capture) <= sizeof(call.func.data)
+                         ? &call.func.data : call.func.data[0]);
+            capture *cap = const_cast<capture *>(reinterpret_cast<const capture *>(data));
+
+            /* Override policy for rvalues -- usually to enforce rvp::move on an rvalue */
+            const auto policy = detail::return_value_policy_override<Return>::policy(call.func.policy);
 
-            /* Override policy for rvalues -- always move */
-            constexpr auto is_rvalue = !std::is_pointer<Return>::value
-                                       && !std::is_lvalue_reference<Return>::value;
-            const auto policy = is_rvalue ? return_value_policy::move : rec->policy;
+            /* Function scope guard -- defaults to the compile-to-nothing `void_type` */
+            using Guard = detail::extract_guard_t<Extra...>;
 
             /* Perform the function call */
-            handle result = cast_out::cast(args_converter.template call<Return>(cap->f),
-                                           policy, parent);
+            handle result = cast_out::cast(
+                std::move(args_converter).template call<Return, Guard>(cap->f), policy, call.parent);
 
             /* Invoke call policy post-call hook */
-            detail::process_attributes<Extra...>::postcall(args, result);
+            detail::process_attributes<Extra...>::postcall(call, result);
 
             return result;
         };
@@ -160,13 +173,13 @@ protected:
         if (cast_in::has_kwargs) rec->has_kwargs = true;
 
         /* Stash some additional information used by an important optimization in 'functional.h' */
-        using FunctionType = Return (*)(Args...) PYBIND11_NOEXCEPT_SPECIFIER;
+        using FunctionType = Return (*)(Args...);
         constexpr bool is_function_ptr =
             std::is_convertible<Func, FunctionType>::value &&
             sizeof(capture) == sizeof(void *);
         if (is_function_ptr) {
             rec->is_stateless = true;
-            rec->data[1] = (void *) &typeid(FunctionType);
+            rec->data[1] = const_cast<void *>(reinterpret_cast<const void *>(&typeid(FunctionType)));
         }
     }
 
@@ -186,6 +199,22 @@ protected:
                 a.descr = strdup(a.value.attr("__repr__")().cast<std::string>().c_str());
         }
 
+        rec->is_constructor = !strcmp(rec->name, "__init__") || !strcmp(rec->name, "__setstate__");
+
+#if !defined(NDEBUG) && !defined(PYBIND11_DISABLE_NEW_STYLE_INIT_WARNING)
+        if (rec->is_constructor && !rec->is_new_style_constructor) {
+            const auto class_name = std::string(((PyTypeObject *) rec->scope.ptr())->tp_name);
+            const auto func_name = std::string(rec->name);
+            PyErr_WarnEx(
+                PyExc_FutureWarning,
+                ("pybind11-bound class '" + class_name + "' is using an old-style "
+                 "placement-new '" + func_name + "' which has been deprecated. See "
+                 "the upgrade guide in pybind11's docs. This message is only visible "
+                 "when compiled in debug mode.").c_str(), 0
+            );
+        }
+#endif
+
         /* Generate a proper function signature */
         std::string signature;
         size_t type_depth = 0, char_index = 0, type_index = 0, arg_index = 0;
@@ -197,7 +226,7 @@ protected:
             if (c == '{') {
                 // Write arg name for everything except *args, **kwargs and return type.
                 if (type_depth == 0 && text[char_index] != '*' && arg_index < args) {
-                    if (!rec->args.empty()) {
+                    if (!rec->args.empty() && rec->args[arg_index].name) {
                         signature += rec->args[arg_index].name;
                     } else if (arg_index == 0 && rec->is_method) {
                         signature += "self";
@@ -227,6 +256,13 @@ protected:
                                      .cast<std::string>() + ".";
 #endif
                     signature += tinfo->type->tp_name;
+                } else if (rec->is_new_style_constructor && arg_index == 0) {
+                    // A new-style `__init__` takes `self` as `value_and_holder`.
+                    // Rewrite it to the proper class type.
+#if defined(PYPY_VERSION)
+                    signature += rec->scope.attr("__module__").cast<std::string>() + ".";
+#endif
+                    signature += ((PyTypeObject *) rec->scope.ptr())->tp_name;
                 } else {
                     std::string tname(t->name());
                     detail::clean_type_id(tname);
@@ -239,7 +275,7 @@ protected:
         if (type_depth != 0 || types[type_index] != nullptr)
             pybind11_fail("Internal error while parsing type signature (2)");
 
-        #if !defined(PYBIND11_CPP14)
+        #if !defined(PYBIND11_CONSTEXPR_DESCR)
             delete[] types;
             delete[] text;
         #endif
@@ -255,13 +291,10 @@ protected:
 #endif
         rec->signature = strdup(signature.c_str());
         rec->args.shrink_to_fit();
-        rec->is_constructor = !strcmp(rec->name, "__init__") || !strcmp(rec->name, "__setstate__");
-        rec->nargs = (uint16_t) args;
+        rec->nargs = (std::uint16_t) args;
 
-#if PY_MAJOR_VERSION < 3
-        if (rec->sibling && PyMethod_Check(rec->sibling.ptr()))
-            rec->sibling = PyMethod_GET_FUNCTION(rec->sibling.ptr());
-#endif
+        if (rec->sibling && PYBIND11_INSTANCE_METHOD_CHECK(rec->sibling.ptr()))
+            rec->sibling = PYBIND11_INSTANCE_METHOD_GET_FUNCTION(rec->sibling.ptr());
 
         detail::function_record *chain = nullptr, *chain_start = rec;
         if (rec->sibling) {
@@ -270,7 +303,7 @@ protected:
                 chain = (detail::function_record *) rec_capsule;
                 /* Never append a method to an overload chain of a parent class;
                    instead, hide the parent's overloads in this case */
-                if (chain->scope != rec->scope)
+                if (!chain->scope.is(rec->scope))
                     chain = nullptr;
             }
             // Don't trigger for things like the default __init__, which are wrapper_descriptors that we are intentionally replacing
@@ -282,13 +315,13 @@ protected:
         if (!chain) {
             /* No existing overload was found, create a new function object */
             rec->def = new PyMethodDef();
-            memset(rec->def, 0, sizeof(PyMethodDef));
+            std::memset(rec->def, 0, sizeof(PyMethodDef));
             rec->def->ml_name = rec->name;
             rec->def->ml_meth = reinterpret_cast<PyCFunction>(*dispatcher);
             rec->def->ml_flags = METH_VARARGS | METH_KEYWORDS;
 
-            capsule rec_capsule(rec, [](PyObject *o) {
-                destruct((detail::function_record *) PyCapsule_GetPointer(o, nullptr));
+            capsule rec_capsule(rec, [](void *ptr) {
+                destruct((detail::function_record *) ptr);
             });
 
             object scope_module;
@@ -308,6 +341,15 @@ protected:
             m_ptr = rec->sibling.ptr();
             inc_ref();
             chain_start = chain;
+            if (chain->is_method != rec->is_method)
+                pybind11_fail("overloading a method with both static and instance methods is not supported; "
+                    #if defined(NDEBUG)
+                        "compile in debug mode for more details"
+                    #else
+                        "error while attempting to bind " + std::string(rec->is_method ? "instance" : "static") + " method " +
+                        std::string(pybind11::str(rec->scope.attr("__name__"))) + "." + std::string(rec->name) + signature
+                    #endif
+                );
             while (chain->next)
                 chain = chain->next;
             chain->next = rec;
@@ -324,8 +366,10 @@ protected:
             signatures += "Overloaded function.\n\n";
         }
         // Then specific overload signatures
+        bool first_user_def = true;
         for (auto it = chain_start; it != nullptr; it = it->next) {
             if (options::show_function_signatures()) {
+                if (index > 0) signatures += "\n";
                 if (chain)
                     signatures += std::to_string(++index) + ". ";
                 signatures += rec->name;
@@ -333,18 +377,22 @@ protected:
                 signatures += "\n";
             }
             if (it->doc && strlen(it->doc) > 0 && options::show_user_defined_docstrings()) {
+                // If we're appending another docstring, and aren't printing function signatures, we
+                // need to append a newline first:
+                if (!options::show_function_signatures()) {
+                    if (first_user_def) first_user_def = false;
+                    else signatures += "\n";
+                }
                 if (options::show_function_signatures()) signatures += "\n";
                 signatures += it->doc;
                 if (options::show_function_signatures()) signatures += "\n";
             }
-            if (it->next)
-                signatures += "\n";
         }
 
         /* Install docstring */
         PyCFunctionObject *func = (PyCFunctionObject *) m_ptr;
         if (func->m_ml->ml_doc)
-            std::free((char *) func->m_ml->ml_doc);
+            std::free(const_cast<char *>(func->m_ml->ml_doc));
         func->m_ml->ml_doc = strdup(signatures.c_str());
 
         if (rec->is_method) {
@@ -365,12 +413,12 @@ protected:
             std::free((char *) rec->doc);
             std::free((char *) rec->signature);
             for (auto &arg: rec->args) {
-                std::free((char *) arg.name);
-                std::free((char *) arg.descr);
+                std::free(const_cast<char *>(arg.name));
+                std::free(const_cast<char *>(arg.descr));
                 arg.value.dec_ref();
             }
             if (rec->def) {
-                std::free((char *) rec->def->ml_doc);
+                std::free(const_cast<char *>(rec->def->ml_doc));
                 delete rec->def;
             }
             delete rec;
@@ -379,72 +427,242 @@ protected:
     }
 
     /// Main dispatch logic for calls to functions bound using pybind11
-    static PyObject *dispatcher(PyObject *self, PyObject *args, PyObject *kwargs) {
+    static PyObject *dispatcher(PyObject *self, PyObject *args_in, PyObject *kwargs_in) {
+        using namespace detail;
+
         /* Iterator over the list of potentially admissible overloads */
-        detail::function_record *overloads = (detail::function_record *) PyCapsule_GetPointer(self, nullptr),
-                                *it = overloads;
+        function_record *overloads = (function_record *) PyCapsule_GetPointer(self, nullptr),
+                        *it = overloads;
 
         /* Need to know how many arguments + keyword arguments there are to pick the right overload */
-        size_t nargs = (size_t) PyTuple_GET_SIZE(args),
-               nkwargs = kwargs ? (size_t) PyDict_Size(kwargs) : 0;
+        const size_t n_args_in = (size_t) PyTuple_GET_SIZE(args_in);
 
-        handle parent = nargs > 0 ? PyTuple_GET_ITEM(args, 0) : nullptr,
+        handle parent = n_args_in > 0 ? PyTuple_GET_ITEM(args_in, 0) : nullptr,
                result = PYBIND11_TRY_NEXT_OVERLOAD;
+
+        auto self_value_and_holder = value_and_holder();
+        if (overloads->is_constructor) {
+            const auto tinfo = get_type_info((PyTypeObject *) overloads->scope.ptr());
+            const auto pi = reinterpret_cast<instance *>(parent.ptr());
+            self_value_and_holder = pi->get_value_and_holder(tinfo, false);
+
+            if (!self_value_and_holder.type || !self_value_and_holder.inst) {
+                PyErr_SetString(PyExc_TypeError, "__init__(self, ...) called with invalid `self` argument");
+                return nullptr;
+            }
+
+            // If this value is already registered it must mean __init__ is invoked multiple times;
+            // we really can't support that in C++, so just ignore the second __init__.
+            if (self_value_and_holder.instance_registered())
+                return none().release().ptr();
+        }
+
         try {
+            // We do this in two passes: in the first pass, we load arguments with `convert=false`;
+            // in the second, we allow conversion (except for arguments with an explicit
+            // py::arg().noconvert()).  This lets us prefer calls without conversion, with
+            // conversion as a fallback.
+            std::vector<function_call> second_pass;
+
+            // However, if there are no overloads, we can just skip the no-convert pass entirely
+            const bool overloaded = it != nullptr && it->next != nullptr;
+
             for (; it != nullptr; it = it->next) {
-                auto args_ = reinterpret_borrow<tuple>(args);
-                size_t kwargs_consumed = 0;
 
                 /* For each overload:
-                   1. If the required list of arguments is longer than the
-                      actually provided amount, create a copy of the argument
-                      list and fill in any available keyword/default arguments.
-                   2. Ensure that all keyword arguments were "consumed"
-                   3. Call the function call dispatcher (function_record::impl)
+                   1. Copy all positional arguments we were given, also checking to make sure that
+                      named positional arguments weren't *also* specified via kwarg.
+                   2. If we weren't given enough, try to make up the omitted ones by checking
+                      whether they were provided by a kwarg matching the `py::arg("name")` name.  If
+                      so, use it (and remove it from kwargs); if not, see if the function binding
+                      provided a default that we can use.
+                   3. Ensure that either all keyword arguments were "consumed", or that the function
+                      takes a kwargs argument to accept unconsumed kwargs.
+                   4. Any positional arguments still left get put into a tuple (for args), and any
+                      leftover kwargs get put into a dict.
+                   5. Pack everything into a vector; if we have py::args or py::kwargs, they are an
+                      extra tuple or dict at the end of the positional arguments.
+                   6. Call the function call dispatcher (function_record::impl)
+
+                   If one of these fails, move on to the next overload and keep trying until we get a
+                   result other than PYBIND11_TRY_NEXT_OVERLOAD.
                  */
-                size_t nargs_ = nargs;
-                if (nargs < it->args.size()) {
-                    nargs_ = it->args.size();
-                    args_ = tuple(nargs_);
-                    for (size_t i = 0; i < nargs; ++i) {
-                        handle item = PyTuple_GET_ITEM(args, i);
-                        PyTuple_SET_ITEM(args_.ptr(), i, item.inc_ref().ptr());
+
+                function_record &func = *it;
+                size_t pos_args = func.nargs;    // Number of positional arguments that we need
+                if (func.has_args) --pos_args;   // (but don't count py::args
+                if (func.has_kwargs) --pos_args; //  or py::kwargs)
+
+                if (!func.has_args && n_args_in > pos_args)
+                    continue; // Too many arguments for this overload
+
+                if (n_args_in < pos_args && func.args.size() < pos_args)
+                    continue; // Not enough arguments given, and not enough defaults to fill in the blanks
+
+                function_call call(func, parent);
+
+                size_t args_to_copy = std::min(pos_args, n_args_in);
+                size_t args_copied = 0;
+
+                // 0. Inject new-style `self` argument
+                if (func.is_new_style_constructor) {
+                    // The `value` may have been preallocated by an old-style `__init__`
+                    // if it was a preceding candidate for overload resolution.
+                    if (self_value_and_holder)
+                        self_value_and_holder.type->dealloc(self_value_and_holder);
+
+                    call.init_self = PyTuple_GET_ITEM(args_in, 0);
+                    call.args.push_back(reinterpret_cast<PyObject *>(&self_value_and_holder));
+                    call.args_convert.push_back(false);
+                    ++args_copied;
+                }
+
+                // 1. Copy any positional arguments given.
+                bool bad_arg = false;
+                for (; args_copied < args_to_copy; ++args_copied) {
+                    argument_record *arg_rec = args_copied < func.args.size() ? &func.args[args_copied] : nullptr;
+                    if (kwargs_in && arg_rec && arg_rec->name && PyDict_GetItemString(kwargs_in, arg_rec->name)) {
+                        bad_arg = true;
+                        break;
+                    }
+
+                    handle arg(PyTuple_GET_ITEM(args_in, args_copied));
+                    if (arg_rec && !arg_rec->none && arg.is_none()) {
+                        bad_arg = true;
+                        break;
                     }
+                    call.args.push_back(arg);
+                    call.args_convert.push_back(arg_rec ? arg_rec->convert : true);
+                }
+                if (bad_arg)
+                    continue; // Maybe it was meant for another overload (issue #688)
+
+                // We'll need to copy this if we steal some kwargs for defaults
+                dict kwargs = reinterpret_borrow<dict>(kwargs_in);
 
-                    int arg_ctr = 0;
-                    for (auto const &it2 : it->args) {
-                        int index = arg_ctr++;
-                        if (PyTuple_GET_ITEM(args_.ptr(), index))
-                            continue;
+                // 2. Check kwargs and, failing that, defaults that may help complete the list
+                if (args_copied < pos_args) {
+                    bool copied_kwargs = false;
+
+                    for (; args_copied < pos_args; ++args_copied) {
+                        const auto &arg = func.args[args_copied];
 
                         handle value;
-                        if (kwargs)
-                            value = PyDict_GetItemString(kwargs, it2.name);
+                        if (kwargs_in && arg.name)
+                            value = PyDict_GetItemString(kwargs.ptr(), arg.name);
 
-                        if (value)
-                            kwargs_consumed++;
-                        else if (it2.value)
-                            value = it2.value;
+                        if (value) {
+                            // Consume a kwargs value
+                            if (!copied_kwargs) {
+                                kwargs = reinterpret_steal<dict>(PyDict_Copy(kwargs.ptr()));
+                                copied_kwargs = true;
+                            }
+                            PyDict_DelItemString(kwargs.ptr(), arg.name);
+                        } else if (arg.value) {
+                            value = arg.value;
+                        }
 
                         if (value) {
-                            PyTuple_SET_ITEM(args_.ptr(), index, value.inc_ref().ptr());
-                        } else {
-                            kwargs_consumed = (size_t) -1; /* definite failure */
+                            call.args.push_back(value);
+                            call.args_convert.push_back(arg.convert);
+                        }
+                        else
                             break;
+                    }
+
+                    if (args_copied < pos_args)
+                        continue; // Not enough arguments, defaults, or kwargs to fill the positional arguments
+                }
+
+                // 3. Check everything was consumed (unless we have a kwargs arg)
+                if (kwargs && kwargs.size() > 0 && !func.has_kwargs)
+                    continue; // Unconsumed kwargs, but no py::kwargs argument to accept them
+
+                // 4a. If we have a py::args argument, create a new tuple with leftovers
+                tuple extra_args;
+                if (func.has_args) {
+                    if (args_to_copy == 0) {
+                        // We didn't copy out any positional arguments from the args_in tuple, so we
+                        // can reuse it directly without copying:
+                        extra_args = reinterpret_borrow<tuple>(args_in);
+                    } else if (args_copied >= n_args_in) {
+                        extra_args = tuple(0);
+                    } else {
+                        size_t args_size = n_args_in - args_copied;
+                        extra_args = tuple(args_size);
+                        for (size_t i = 0; i < args_size; ++i) {
+                            handle item = PyTuple_GET_ITEM(args_in, args_copied + i);
+                            extra_args[i] = item.inc_ref().ptr();
                         }
                     }
+                    call.args.push_back(extra_args);
+                    call.args_convert.push_back(false);
                 }
 
+                // 4b. If we have a py::kwargs, pass on any remaining kwargs
+                if (func.has_kwargs) {
+                    if (!kwargs.ptr())
+                        kwargs = dict(); // If we didn't get one, send an empty one
+                    call.args.push_back(kwargs);
+                    call.args_convert.push_back(false);
+                }
+
+                // 5. Put everything in a vector.  Not technically step 5, we've been building it
+                // in `call.args` all along.
+                #if !defined(NDEBUG)
+                if (call.args.size() != func.nargs || call.args_convert.size() != func.nargs)
+                    pybind11_fail("Internal error: function call dispatcher inserted wrong number of arguments!");
+                #endif
+
+                std::vector<bool> second_pass_convert;
+                if (overloaded) {
+                    // We're in the first no-convert pass, so swap out the conversion flags for a
+                    // set of all-false flags.  If the call fails, we'll swap the flags back in for
+                    // the conversion-allowed call below.
+                    second_pass_convert.resize(func.nargs, false);
+                    call.args_convert.swap(second_pass_convert);
+                }
+
+                // 6. Call the function.
                 try {
-                    if ((kwargs_consumed == nkwargs || it->has_kwargs) &&
-                        (nargs_ == it->nargs || it->has_args))
-                        result = it->impl(it, args_, kwargs, parent);
+                    loader_life_support guard{};
+                    result = func.impl(call);
                 } catch (reference_cast_error &) {
                     result = PYBIND11_TRY_NEXT_OVERLOAD;
                 }
 
                 if (result.ptr() != PYBIND11_TRY_NEXT_OVERLOAD)
                     break;
+
+                if (overloaded) {
+                    // The (overloaded) call failed; if the call has at least one argument that
+                    // permits conversion (i.e. it hasn't been explicitly specified `.noconvert()`)
+                    // then add this call to the list of second pass overloads to try.
+                    for (size_t i = func.is_method ? 1 : 0; i < pos_args; i++) {
+                        if (second_pass_convert[i]) {
+                            // Found one: swap the converting flags back in and store the call for
+                            // the second pass.
+                            call.args_convert.swap(second_pass_convert);
+                            second_pass.push_back(std::move(call));
+                            break;
+                        }
+                    }
+                }
+            }
+
+            if (overloaded && !second_pass.empty() && result.ptr() == PYBIND11_TRY_NEXT_OVERLOAD) {
+                // The no-conversion pass finished without success, try again with conversion allowed
+                for (auto &call : second_pass) {
+                    try {
+                        loader_life_support guard{};
+                        result = call.func.impl(call);
+                    } catch (reference_cast_error &) {
+                        result = PYBIND11_TRY_NEXT_OVERLOAD;
+                    }
+
+                    if (result.ptr() != PYBIND11_TRY_NEXT_OVERLOAD)
+                        break;
+                }
             }
         } catch (error_already_set &e) {
             e.restore();
@@ -462,7 +680,7 @@ protected:
                 - delegate translation to the next translator by throwing a new type of exception. */
 
             auto last_exception = std::current_exception();
-            auto &registered_exception_translators = pybind11::detail::get_internals().registered_exception_translators;
+            auto &registered_exception_translators = get_internals().registered_exception_translators;
             for (auto& translator : registered_exception_translators) {
                 try {
                     translator(last_exception);
@@ -476,6 +694,16 @@ protected:
             return nullptr;
         }
 
+        auto append_note_if_missing_header_is_suspected = [](std::string &msg) {
+            if (msg.find("std::") != std::string::npos) {
+                msg += "\n\n"
+                       "Did you forget to `#include <pybind11/stl.h>`? Or <pybind11/complex.h>,\n"
+                       "<pybind11/functional.h>, <pybind11/chrono.h>, etc. Some automatic\n"
+                       "conversions are optional and require extra headers to be included\n"
+                       "when compiling your pybind11 module.";
+            }
+        };
+
         if (result.ptr() == PYBIND11_TRY_NEXT_OVERLOAD) {
             if (overloads->is_operator)
                 return handle(Py_NotImplemented).inc_ref().ptr();
@@ -485,7 +713,7 @@ protected:
                 " arguments. The following argument types are supported:\n";
 
             int ctr = 0;
-            for (detail::function_record *it2 = overloads; it2 != nullptr; it2 = it2->next) {
+            for (function_record *it2 = overloads; it2 != nullptr; it2 = it2->next) {
                 msg += "    "+ std::to_string(++ctr) + ". ";
 
                 bool wrote_sig = false;
@@ -512,27 +740,41 @@ protected:
                 msg += "\n";
             }
             msg += "\nInvoked with: ";
-            auto args_ = reinterpret_borrow<tuple>(args);
+            auto args_ = reinterpret_borrow<tuple>(args_in);
+            bool some_args = false;
             for (size_t ti = overloads->is_constructor ? 1 : 0; ti < args_.size(); ++ti) {
+                if (!some_args) some_args = true;
+                else msg += ", ";
                 msg += pybind11::repr(args_[ti]);
-                if ((ti + 1) != args_.size() )
-                    msg += ", ";
             }
+            if (kwargs_in) {
+                auto kwargs = reinterpret_borrow<dict>(kwargs_in);
+                if (kwargs.size() > 0) {
+                    if (some_args) msg += "; ";
+                    msg += "kwargs: ";
+                    bool first = true;
+                    for (auto kwarg : kwargs) {
+                        if (first) first = false;
+                        else msg += ", ";
+                        msg += pybind11::str("{}={!r}").format(kwarg.first, kwarg.second);
+                    }
+                }
+            }
+
+            append_note_if_missing_header_is_suspected(msg);
             PyErr_SetString(PyExc_TypeError, msg.c_str());
             return nullptr;
         } else if (!result) {
             std::string msg = "Unable to convert function return value to a "
                               "Python type! The signature was\n\t";
             msg += it->signature;
+            append_note_if_missing_header_is_suspected(msg);
             PyErr_SetString(PyExc_TypeError, msg.c_str());
             return nullptr;
         } else {
-            if (overloads->is_constructor) {
-                /* When a constructor ran successfully, the corresponding
-                   holder type (e.g. std::unique_ptr) must still be initialized. */
-                PyObject *inst = PyTuple_GET_ITEM(args, 0);
-                auto tinfo = detail::get_type_info(Py_TYPE(inst));
-                tinfo->init_holder(inst, nullptr);
+            if (overloads->is_constructor && !self_value_and_holder.holder_constructed()) {
+                auto *pi = reinterpret_cast<instance *>(parent.ptr());
+                self_value_and_holder.type->init_instance(pi, nullptr);
             }
             return result.ptr();
         }
@@ -544,11 +786,12 @@ class module : public object {
 public:
     PYBIND11_OBJECT_DEFAULT(module, object, PyModule_Check)
 
+    /// Create a new top-level Python module with the given name and docstring
     explicit module(const char *name, const char *doc = nullptr) {
         if (!options::show_user_defined_docstrings()) doc = nullptr;
 #if PY_MAJOR_VERSION >= 3
         PyModuleDef *def = new PyModuleDef();
-        memset(def, 0, sizeof(PyModuleDef));
+        std::memset(def, 0, sizeof(PyModuleDef));
         def->m_name = name;
         def->m_doc = doc;
         def->m_size = -1;
@@ -562,6 +805,11 @@ public:
         inc_ref();
     }
 
+    /** \rst
+        Create Python binding for a new function within the module scope. ``Func``
+        can be a plain C++ function, a function pointer, or a lambda function. For
+        details on the ``const Extra& ... extra`` argument, see section :ref:`extras`.
+    \endrst */
     template <typename Func, typename... Extra>
     module &def(const char *name_, Func &&f, const Extra& ... extra) {
         cpp_function func(std::forward<Func>(f), name(name_), scope(*this),
@@ -572,6 +820,16 @@ public:
         return *this;
     }
 
+    /** \rst
+        Create and return a new Python submodule with the given name and docstring.
+        This also works recursively, i.e.
+
+        .. code-block:: cpp
+
+            py::module m("example", "pybind11 example plugin");
+            py::module m2 = m.def_submodule("sub", "A submodule of 'example'");
+            py::module m3 = m2.def_submodule("subsub", "A submodule of 'example.sub'");
+    \endrst */
     module def_submodule(const char *name, const char *doc = nullptr) {
         std::string full_name = std::string(PyModule_GetName(m_ptr))
             + std::string(".") + std::string(name);
@@ -582,6 +840,7 @@ public:
         return result;
     }
 
+    /// Import and return a module, or throw `error_already_set` if the import fails.
     static module import(const char *name) {
         PyObject *obj = PyImport_ImportModule(name);
         if (!obj)
@@ -589,243 +848,91 @@ public:
         return reinterpret_steal<module>(obj);
     }
 
+    /// Reload the module, or throw `error_already_set` if reloading fails.
+    void reload() {
+        PyObject *obj = PyImport_ReloadModule(ptr());
+        if (!obj)
+            throw error_already_set();
+        *this = reinterpret_steal<module>(obj);
+    }
+
     // Adds an object to the module using the given name.  Throws if an object with the given name
     // already exists.
     //
     // overwrite should almost always be false: attempting to overwrite objects that pybind11 has
     // established will, in most cases, break things.
-    PYBIND11_NOINLINE void add_object(const char *name, object &obj, bool overwrite = false) {
+    PYBIND11_NOINLINE void add_object(const char *name, handle obj, bool overwrite = false) {
         if (!overwrite && hasattr(*this, name))
             pybind11_fail("Error during initialization: multiple incompatible definitions with name \"" +
                     std::string(name) + "\"");
 
-        obj.inc_ref(); // PyModule_AddObject() steals a reference
-        PyModule_AddObject(ptr(), name, obj.ptr());
+        PyModule_AddObject(ptr(), name, obj.inc_ref().ptr() /* steals a reference */);
     }
 };
 
-NAMESPACE_BEGIN(detail)
-extern "C" inline PyObject *get_dict(PyObject *op, void *) {
-    PyObject *&dict = *_PyObject_GetDictPtr(op);
-    if (!dict)
-        dict = PyDict_New();
-    Py_XINCREF(dict);
-    return dict;
-}
-
-extern "C" inline int set_dict(PyObject *op, PyObject *new_dict, void *) {
-    if (!PyDict_Check(new_dict)) {
-        PyErr_Format(PyExc_TypeError, "__dict__ must be set to a dictionary, not a '%.200s'",
-                     Py_TYPE(new_dict)->tp_name);
-        return -1;
-    }
-    PyObject *&dict = *_PyObject_GetDictPtr(op);
-    Py_INCREF(new_dict);
-    Py_CLEAR(dict);
-    dict = new_dict;
-    return 0;
+/// \ingroup python_builtins
+/// Return a dictionary representing the global variables in the current execution frame,
+/// or ``__main__.__dict__`` if there is no frame (usually when the interpreter is embedded).
+inline dict globals() {
+    PyObject *p = PyEval_GetGlobals();
+    return reinterpret_borrow<dict>(p ? p : module::import("__main__").attr("__dict__").ptr());
 }
 
-static PyGetSetDef generic_getset[] = {
-    {const_cast<char*>("__dict__"), get_dict, set_dict, nullptr, nullptr},
-    {nullptr, nullptr, nullptr, nullptr, nullptr}
-};
-
+NAMESPACE_BEGIN(detail)
 /// Generic support for creating new Python heap types
 class generic_type : public object {
     template <typename...> friend class class_;
 public:
     PYBIND11_OBJECT_DEFAULT(generic_type, object, PyType_Check)
 protected:
-    void initialize(type_record *rec) {
-        auto &internals = get_internals();
-        auto tindex = std::type_index(*(rec->type));
+    void initialize(const type_record &rec) {
+        if (rec.scope && hasattr(rec.scope, rec.name))
+            pybind11_fail("generic_type: cannot initialize type \"" + std::string(rec.name) +
+                          "\": an object with that name is already defined");
 
-        if (get_type_info(*(rec->type)))
-            pybind11_fail("generic_type: type \"" + std::string(rec->name) +
+        if (rec.module_local ? get_local_type_info(*rec.type) : get_global_type_info(*rec.type))
+            pybind11_fail("generic_type: type \"" + std::string(rec.name) +
                           "\" is already registered!");
 
-        auto name = reinterpret_steal<object>(PYBIND11_FROM_STRING(rec->name));
-        object scope_module;
-        if (rec->scope) {
-            if (hasattr(rec->scope, rec->name))
-                pybind11_fail("generic_type: cannot initialize type \"" + std::string(rec->name) +
-                        "\": an object with that name is already defined");
-
-            if (hasattr(rec->scope, "__module__")) {
-                scope_module = rec->scope.attr("__module__");
-            } else if (hasattr(rec->scope, "__name__")) {
-                scope_module = rec->scope.attr("__name__");
-            }
-        }
-
-#if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 3
-        /* Qualified names for Python >= 3.3 */
-        object scope_qualname;
-        if (rec->scope && hasattr(rec->scope, "__qualname__"))
-            scope_qualname = rec->scope.attr("__qualname__");
-        object ht_qualname, ht_qualname_meta;
-        if (scope_qualname)
-            ht_qualname = reinterpret_steal<object>(PyUnicode_FromFormat(
-                "%U.%U", scope_qualname.ptr(), name.ptr()));
-        else
-            ht_qualname = name;
-        if (rec->metaclass)
-            ht_qualname_meta = reinterpret_steal<object>(
-                PyUnicode_FromFormat("%U__Meta", ht_qualname.ptr()));
-#endif
-
-#if !defined(PYPY_VERSION)
-        std::string full_name = (scope_module ? ((std::string) pybind11::str(scope_module) + "." + rec->name)
-                                              : std::string(rec->name));
-#else
-        std::string full_name = std::string(rec->name);
-#endif
-
-        /* Create a custom metaclass if requested (used for static properties) */
-        object metaclass;
-        if (rec->metaclass) {
-            std::string meta_name_ = full_name + "__Meta";
-            object meta_name = reinterpret_steal<object>(PYBIND11_FROM_STRING(meta_name_.c_str()));
-            metaclass = reinterpret_steal<object>(PyType_Type.tp_alloc(&PyType_Type, 0));
-            if (!metaclass || !name)
-                pybind11_fail("generic_type::generic_type(): unable to create metaclass!");
-
-            /* Danger zone: from now (and until PyType_Ready), make sure to
-               issue no Python C API calls which could potentially invoke the
-               garbage collector (the GC will call type_traverse(), which will in
-               turn find the newly constructed type in an invalid state) */
-
-            auto type = (PyHeapTypeObject*) metaclass.ptr();
-            type->ht_name = meta_name.release().ptr();
-
-#if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 3
-            /* Qualified names for Python >= 3.3 */
-            type->ht_qualname = ht_qualname_meta.release().ptr();
-#endif
-            type->ht_type.tp_name = strdup(meta_name_.c_str());
-            type->ht_type.tp_base = &PyType_Type;
-            type->ht_type.tp_flags |= (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HEAPTYPE) &
-                                      ~Py_TPFLAGS_HAVE_GC;
-
-            if (PyType_Ready(&type->ht_type) < 0)
-                pybind11_fail("generic_type::generic_type(): failure in PyType_Ready() for metaclass!");
-        }
-
-        size_t num_bases = rec->bases.size();
-        auto bases = tuple(rec->bases);
-
-        char *tp_doc = nullptr;
-        if (rec->doc && options::show_user_defined_docstrings()) {
-            /* Allocate memory for docstring (using PyObject_MALLOC, since
-               Python will free this later on) */
-            size_t size = strlen(rec->doc) + 1;
-            tp_doc = (char *) PyObject_MALLOC(size);
-            memcpy((void *) tp_doc, rec->doc, size);
-        }
-
-        /* Danger zone: from now (and until PyType_Ready), make sure to
-           issue no Python C API calls which could potentially invoke the
-           garbage collector (the GC will call type_traverse(), which will in
-           turn find the newly constructed type in an invalid state) */
-
-        auto type_holder = reinterpret_steal<object>(PyType_Type.tp_alloc(&PyType_Type, 0));
-        auto type = (PyHeapTypeObject*) type_holder.ptr();
-
-        if (!type_holder || !name)
-            pybind11_fail(std::string(rec->name) + ": Unable to create type object!");
+        m_ptr = make_new_python_type(rec);
 
         /* Register supplemental type information in C++ dict */
-        detail::type_info *tinfo = new detail::type_info();
-        tinfo->type = (PyTypeObject *) type;
-        tinfo->type_size = rec->type_size;
-        tinfo->init_holder = rec->init_holder;
-        tinfo->direct_conversions = &internals.direct_conversions[tindex];
-        internals.registered_types_cpp[tindex] = tinfo;
-        internals.registered_types_py[type] = tinfo;
+        auto *tinfo = new detail::type_info();
+        tinfo->type = (PyTypeObject *) m_ptr;
+        tinfo->cpptype = rec.type;
+        tinfo->type_size = rec.type_size;
+        tinfo->operator_new = rec.operator_new;
+        tinfo->holder_size_in_ptrs = size_in_ptrs(rec.holder_size);
+        tinfo->init_instance = rec.init_instance;
+        tinfo->dealloc = rec.dealloc;
+        tinfo->simple_type = true;
+        tinfo->simple_ancestors = true;
+        tinfo->default_holder = rec.default_holder;
+        tinfo->module_local = rec.module_local;
 
-        /* Basic type attributes */
-        type->ht_type.tp_name = strdup(full_name.c_str());
-        type->ht_type.tp_basicsize = (ssize_t) rec->instance_size;
+        auto &internals = get_internals();
+        auto tindex = std::type_index(*rec.type);
+        tinfo->direct_conversions = &internals.direct_conversions[tindex];
+        if (rec.module_local)
+            registered_local_types_cpp()[tindex] = tinfo;
+        else
+            internals.registered_types_cpp[tindex] = tinfo;
+        internals.registered_types_py[(PyTypeObject *) m_ptr] = { tinfo };
 
-        if (num_bases > 0) {
-            type->ht_type.tp_base = (PyTypeObject *) ((object) bases[0]).inc_ref().ptr();
-            type->ht_type.tp_bases = bases.release().ptr();
-            rec->multiple_inheritance |= num_bases > 1;
+        if (rec.bases.size() > 1 || rec.multiple_inheritance) {
+            mark_parents_nonsimple(tinfo->type);
+            tinfo->simple_ancestors = false;
         }
-
-        type->ht_name = name.release().ptr();
-
-#if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 3
-        type->ht_qualname = ht_qualname.release().ptr();
-#endif
-
-        /* Metaclass */
-        PYBIND11_OB_TYPE(type->ht_type) = (PyTypeObject *) metaclass.release().ptr();
-
-        /* Supported protocols */
-        type->ht_type.tp_as_number = &type->as_number;
-        type->ht_type.tp_as_sequence = &type->as_sequence;
-        type->ht_type.tp_as_mapping = &type->as_mapping;
-
-        /* Supported elementary operations */
-        type->ht_type.tp_init = (initproc) init;
-        type->ht_type.tp_new = (newfunc) new_instance;
-        type->ht_type.tp_dealloc = rec->dealloc;
-
-        /* Support weak references (needed for the keep_alive feature) */
-        type->ht_type.tp_weaklistoffset = offsetof(instance_essentials<void>, weakrefs);
-
-        /* Flags */
-        type->ht_type.tp_flags |= Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
-#if PY_MAJOR_VERSION < 3
-        type->ht_type.tp_flags |= Py_TPFLAGS_CHECKTYPES;
-#endif
-        type->ht_type.tp_flags &= ~Py_TPFLAGS_HAVE_GC;
-
-        /* Support dynamic attributes */
-        if (rec->dynamic_attr) {
-            #if defined(PYPY_VERSION)
-                pybind11_fail(std::string(rec->name) + ": dynamic attributes are "
-                                                       "currently not supported in "
-                                                       "conunction with PyPy!");
-            #endif
-            type->ht_type.tp_flags |= Py_TPFLAGS_HAVE_GC;
-            type->ht_type.tp_dictoffset = type->ht_type.tp_basicsize; // place the dict at the end
-            type->ht_type.tp_basicsize += sizeof(PyObject *); // and allocate enough space for it
-            type->ht_type.tp_getset = generic_getset;
-            type->ht_type.tp_traverse = traverse;
-            type->ht_type.tp_clear = clear;
+        else if (rec.bases.size() == 1) {
+            auto parent_tinfo = get_type_info((PyTypeObject *) rec.bases[0].ptr());
+            tinfo->simple_ancestors = parent_tinfo->simple_ancestors;
         }
 
-        if (rec->buffer_protocol) {
-            type->ht_type.tp_as_buffer = &type->as_buffer;
-#if PY_MAJOR_VERSION < 3
-            type->ht_type.tp_flags |= Py_TPFLAGS_HAVE_NEWBUFFER;
-#endif
-            type->as_buffer.bf_getbuffer = getbuffer;
-            type->as_buffer.bf_releasebuffer = releasebuffer;
+        if (rec.module_local) {
+            // Stash the local typeinfo and loader so that external modules can access it.
+            tinfo->module_local_load = &type_caster_generic::local_load;
+            setattr(m_ptr, PYBIND11_MODULE_LOCAL_ID, capsule(tinfo));
         }
-
-        type->ht_type.tp_doc = tp_doc;
-
-        m_ptr = type_holder.ptr();
-
-        if (PyType_Ready(&type->ht_type) < 0)
-            pybind11_fail(std::string(rec->name) + ": PyType_Ready failed (" +
-                          detail::error_string() + ")!");
-
-        if (scope_module) // Needed by pydoc
-            attr("__module__") = scope_module;
-
-        /* Register type with the parent scope */
-        if (rec->scope)
-            rec->scope.attr(handle(type->ht_name)) = *this;
-
-        if (rec->multiple_inheritance)
-            mark_parents_nonsimple(&type->ht_type);
-
-        type_holder.release();
     }
 
     /// Helper function which tags all parents of a type using mult. inheritance
@@ -839,66 +946,6 @@ protected:
         }
     }
 
-    static int init(void *self, PyObject *, PyObject *) {
-        PyTypeObject *type = Py_TYPE(self);
-        std::string msg;
-#if defined(PYPY_VERSION)
-        msg += handle((PyObject *) type).attr("__module__").cast<std::string>() + ".";
-#endif
-        msg += type->tp_name;
-        msg += ": No constructor defined!";
-        PyErr_SetString(PyExc_TypeError, msg.c_str());
-        return -1;
-    }
-
-    static PyObject *new_instance(PyTypeObject *type, PyObject *, PyObject *) {
-        instance<void> *self = (instance<void> *) PyType_GenericAlloc((PyTypeObject *) type, 0);
-        auto tinfo = detail::get_type_info(type);
-        self->value = ::operator new(tinfo->type_size);
-        self->owned = true;
-        self->holder_constructed = false;
-        detail::get_internals().registered_instances.emplace(self->value, (PyObject *) self);
-        return (PyObject *) self;
-    }
-
-    static void dealloc(instance<void> *self) {
-        if (self->value) {
-            auto instance_type = Py_TYPE(self);
-            auto &registered_instances = detail::get_internals().registered_instances;
-            auto range = registered_instances.equal_range(self->value);
-            bool found = false;
-            for (auto it = range.first; it != range.second; ++it) {
-                if (instance_type == Py_TYPE(it->second)) {
-                    registered_instances.erase(it);
-                    found = true;
-                    break;
-                }
-            }
-            if (!found)
-                pybind11_fail("generic_type::dealloc(): Tried to deallocate unregistered instance!");
-
-            if (self->weakrefs)
-                PyObject_ClearWeakRefs((PyObject *) self);
-
-            PyObject **dict_ptr = _PyObject_GetDictPtr((PyObject *) self);
-            if (dict_ptr)
-                Py_CLEAR(*dict_ptr);
-        }
-        Py_TYPE(self)->tp_free((PyObject*) self);
-    }
-
-    static int traverse(PyObject *op, visitproc visit, void *arg) {
-        PyObject *&dict = *_PyObject_GetDictPtr(op);
-        Py_VISIT(dict);
-        return 0;
-    }
-
-    static int clear(PyObject *op) {
-        PyObject *&dict = *_PyObject_GetDictPtr(op);
-        Py_CLEAR(dict);
-        return 0;
-    }
-
     void install_buffer_funcs(
             buffer_info *(*get_buffer)(PyObject *, void *),
             void *get_buffer_data) {
@@ -916,115 +963,118 @@ protected:
         tinfo->get_buffer_data = get_buffer_data;
     }
 
-    static int getbuffer(PyObject *obj, Py_buffer *view, int flags) {
-        auto tinfo = detail::get_type_info(Py_TYPE(obj));
-        if (view == nullptr || obj == nullptr || !tinfo || !tinfo->get_buffer) {
-            if (view)
-                view->obj = nullptr;
-            PyErr_SetString(PyExc_BufferError, "generic_type::getbuffer(): Internal error");
-            return -1;
-        }
-        memset(view, 0, sizeof(Py_buffer));
-        buffer_info *info = tinfo->get_buffer(obj, tinfo->get_buffer_data);
-        view->obj = obj;
-        view->ndim = 1;
-        view->internal = info;
-        view->buf = info->ptr;
-        view->itemsize = (ssize_t) info->itemsize;
-        view->len = view->itemsize;
-        for (auto s : info->shape)
-            view->len *= s;
-        if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT)
-            view->format = const_cast<char *>(info->format.c_str());
-        if ((flags & PyBUF_STRIDES) == PyBUF_STRIDES) {
-            view->ndim = (int) info->ndim;
-            view->strides = (ssize_t *) &info->strides[0];
-            view->shape = (ssize_t *) &info->shape[0];
-        }
-        Py_INCREF(view->obj);
-        return 0;
-    }
-
-    static void releasebuffer(PyObject *, Py_buffer *view) { delete (buffer_info *) view->internal; }
-
     void def_property_static_impl(const char *name,
                                   handle fget, handle fset,
                                   detail::function_record *rec_fget) {
-        pybind11::str doc_obj = pybind11::str(
-            (rec_fget->doc && pybind11::options::show_user_defined_docstrings())
-                ? rec_fget->doc : "");
-        const auto property = reinterpret_steal<object>(
-            PyObject_CallFunctionObjArgs((PyObject *) &PyProperty_Type, fget.ptr() ? fget.ptr() : Py_None,
-                                         fset.ptr() ? fset.ptr() : Py_None, Py_None, doc_obj.ptr(), nullptr));
-        if (rec_fget->is_method && rec_fget->scope) {
-            attr(name) = property;
-        } else {
-            auto mclass = handle((PyObject *) PYBIND11_OB_TYPE(*((PyTypeObject *) m_ptr)));
-
-            if ((PyTypeObject *) mclass.ptr() == &PyType_Type)
-                pybind11_fail(
-                    "Adding static properties to the type '" +
-                    std::string(((PyTypeObject *) m_ptr)->tp_name) +
-                    "' requires the type to have a custom metaclass. Please "
-                    "ensure that one is created by supplying the pybind11::metaclass() "
-                    "annotation to the associated class_<>(..) invocation.");
-            mclass.attr(name) = property;
-        }
+        const auto is_static = !(rec_fget->is_method && rec_fget->scope);
+        const auto has_doc = rec_fget->doc && pybind11::options::show_user_defined_docstrings();
+
+        auto property = handle((PyObject *) (is_static ? get_internals().static_property_type
+                                                       : &PyProperty_Type));
+        attr(name) = property(fget.ptr() ? fget : none(),
+                              fset.ptr() ? fset : none(),
+                              /*deleter*/none(),
+                              pybind11::str(has_doc ? rec_fget->doc : ""));
     }
 };
 
+/// Set the pointer to operator new if it exists. The cast is needed because it can be overloaded.
+template <typename T, typename = void_t<decltype(static_cast<void *(*)(size_t)>(T::operator new))>>
+void set_operator_new(type_record *r) { r->operator_new = &T::operator new; }
+
+template <typename> void set_operator_new(...) { }
+
+template <typename T, typename SFINAE = void> struct has_operator_delete : std::false_type { };
+template <typename T> struct has_operator_delete<T, void_t<decltype(static_cast<void (*)(void *)>(T::operator delete))>>
+    : std::true_type { };
+template <typename T, typename SFINAE = void> struct has_operator_delete_size : std::false_type { };
+template <typename T> struct has_operator_delete_size<T, void_t<decltype(static_cast<void (*)(void *, size_t)>(T::operator delete))>>
+    : std::true_type { };
+/// Call the class-specific `operator delete` if one exists, or the global one otherwise. The class-specific operator may be an overload set.
+template <typename T, enable_if_t<has_operator_delete<T>::value, int> = 0>
+void call_operator_delete(T *p, size_t) { T::operator delete(p); }
+template <typename T, enable_if_t<!has_operator_delete<T>::value && has_operator_delete_size<T>::value, int> = 0>
+void call_operator_delete(T *p, size_t s) { T::operator delete(p, s); }
+
+inline void call_operator_delete(void *p, size_t) { ::operator delete(p); }
+
 NAMESPACE_END(detail)
 
+/// Given a pointer to a member function, cast it to its `Derived` version.
+/// Forward everything else unchanged.
+template <typename /*Derived*/, typename F>
+auto method_adaptor(F &&f) -> decltype(std::forward<F>(f)) { return std::forward<F>(f); }
+
+template <typename Derived, typename Return, typename Class, typename... Args>
+auto method_adaptor(Return (Class::*pmf)(Args...)) -> Return (Derived::*)(Args...) { return pmf; }
+
+template <typename Derived, typename Return, typename Class, typename... Args>
+auto method_adaptor(Return (Class::*pmf)(Args...) const) -> Return (Derived::*)(Args...) const { return pmf; }
+
 template <typename type_, typename... options>
 class class_ : public detail::generic_type {
     template <typename T> using is_holder = detail::is_holder_type<type_, T>;
-    template <typename T> using is_subtype = detail::bool_constant<std::is_base_of<type_, T>::value && !std::is_same<T, type_>::value>;
-    template <typename T> using is_base = detail::bool_constant<std::is_base_of<T, type_>::value && !std::is_same<T, type_>::value>;
+    template <typename T> using is_subtype = detail::is_strict_base_of<type_, T>;
+    template <typename T> using is_base = detail::is_strict_base_of<T, type_>;
     // struct instead of using here to help MSVC:
     template <typename T> struct is_valid_class_option :
         detail::any_of<is_holder<T>, is_subtype<T>, is_base<T>> {};
 
 public:
     using type = type_;
-    using type_alias = detail::first_of_t<is_subtype, void, options...>;
+    using type_alias = detail::exactly_one_t<is_subtype, void, options...>;
     constexpr static bool has_alias = !std::is_void<type_alias>::value;
-    using holder_type = detail::first_of_t<is_holder, std::unique_ptr<type>, options...>;
-    using instance_type = detail::instance<type, holder_type>;
+    using holder_type = detail::exactly_one_t<is_holder, std::unique_ptr<type>, options...>;
 
     static_assert(detail::all_of<is_valid_class_option<options>...>::value,
             "Unknown/invalid class_ template parameters provided");
 
+    static_assert(!has_alias || std::is_polymorphic<type>::value,
+            "Cannot use an alias class with a non-polymorphic type");
+
     PYBIND11_OBJECT(class_, generic_type, PyType_Check)
 
     template <typename... Extra>
     class_(handle scope, const char *name, const Extra &... extra) {
-        detail::type_record record;
+        using namespace detail;
+
+        // MI can only be specified via class_ template options, not constructor parameters
+        static_assert(
+            none_of<is_pyobject<Extra>...>::value || // no base class arguments, or:
+            (   constexpr_sum(is_pyobject<Extra>::value...) == 1 && // Exactly one base
+                constexpr_sum(is_base<options>::value...)   == 0 && // no template option bases
+                none_of<std::is_same<multiple_inheritance, Extra>...>::value), // no multiple_inheritance attr
+            "Error: multiple inheritance bases must be specified via class_ template options");
+
+        type_record record;
         record.scope = scope;
         record.name = name;
         record.type = &typeid(type);
-        record.type_size = sizeof(detail::conditional_t<has_alias, type_alias, type>);
-        record.instance_size = sizeof(instance_type);
-        record.init_holder = init_holder;
+        record.type_size = sizeof(conditional_t<has_alias, type_alias, type>);
+        record.holder_size = sizeof(holder_type);
+        record.init_instance = init_instance;
         record.dealloc = dealloc;
+        record.default_holder = std::is_same<holder_type, std::unique_ptr<type>>::value;
+
+        set_operator_new<type>(&record);
 
         /* Register base classes specified via template arguments to class_, if any */
-        bool unused[] = { (add_base<options>(record), false)..., false };
-        (void) unused;
+        PYBIND11_EXPAND_SIDE_EFFECTS(add_base<options>(record));
 
         /* Process optional arguments, if any */
-        detail::process_attributes<Extra...>::init(extra..., &record);
+        process_attributes<Extra...>::init(extra..., &record);
 
-        detail::generic_type::initialize(&record);
+        generic_type::initialize(record);
 
         if (has_alias) {
-            auto &instances = pybind11::detail::get_internals().registered_types_cpp;
+            auto &instances = record.module_local ? registered_local_types_cpp() : get_internals().registered_types_cpp;
             instances[std::type_index(typeid(type_alias))] = instances[std::type_index(typeid(type))];
         }
     }
 
     template <typename Base, detail::enable_if_t<is_base<Base>::value, int> = 0>
     static void add_base(detail::type_record &rec) {
-        rec.add_base(&typeid(Base), [](void *src) -> void * {
+        rec.add_base(typeid(Base), [](void *src) -> void * {
             return static_cast<Base *>(reinterpret_cast<type *>(src));
         });
     }
@@ -1034,14 +1084,16 @@ public:
 
     template <typename Func, typename... Extra>
     class_ &def(const char *name_, Func&& f, const Extra&... extra) {
-        cpp_function cf(std::forward<Func>(f), name(name_), is_method(*this),
+        cpp_function cf(method_adaptor<type>(std::forward<Func>(f)), name(name_), is_method(*this),
                         sibling(getattr(*this, name_, none())), extra...);
         attr(cf.name()) = cf;
         return *this;
     }
 
     template <typename Func, typename... Extra> class_ &
-    def_static(const char *name_, Func f, const Extra&... extra) {
+    def_static(const char *name_, Func &&f, const Extra&... extra) {
+        static_assert(!std::is_member_function_pointer<Func>::value,
+                "def_static(...) called with a non-static member function pointer");
         cpp_function cf(std::forward<Func>(f), name(name_), scope(*this),
                         sibling(getattr(*this, name_, none())), extra...);
         attr(cf.name()) = cf;
@@ -1061,17 +1113,29 @@ public:
     }
 
     template <typename... Args, typename... Extra>
-    class_ &def(const detail::init<Args...> &init, const Extra&... extra) {
+    class_ &def(const detail::initimpl::constructor<Args...> &init, const Extra&... extra) {
         init.execute(*this, extra...);
         return *this;
     }
 
     template <typename... Args, typename... Extra>
-    class_ &def(const detail::init_alias<Args...> &init, const Extra&... extra) {
+    class_ &def(const detail::initimpl::alias_constructor<Args...> &init, const Extra&... extra) {
         init.execute(*this, extra...);
         return *this;
     }
 
+    template <typename... Args, typename... Extra>
+    class_ &def(detail::initimpl::factory<Args...> &&init, const Extra&... extra) {
+        std::move(init).execute(*this, extra...);
+        return *this;
+    }
+
+    template <typename... Args, typename... Extra>
+    class_ &def(detail::initimpl::pickle_factory<Args...> &&pf, const Extra &...extra) {
+        std::move(pf).execute(*this, extra...);
+        return *this;
+    }
+
     template <typename Func> class_& def_buffer(Func &&func) {
         struct capture { Func func; };
         capture *ptr = new capture { std::forward<Func>(func) };
@@ -1084,17 +1148,29 @@ public:
         return *this;
     }
 
+    template <typename Return, typename Class, typename... Args>
+    class_ &def_buffer(Return (Class::*func)(Args...)) {
+        return def_buffer([func] (type &obj) { return (obj.*func)(); });
+    }
+
+    template <typename Return, typename Class, typename... Args>
+    class_ &def_buffer(Return (Class::*func)(Args...) const) {
+        return def_buffer([func] (const type &obj) { return (obj.*func)(); });
+    }
+
     template <typename C, typename D, typename... Extra>
     class_ &def_readwrite(const char *name, D C::*pm, const Extra&... extra) {
-        cpp_function fget([pm](const C &c) -> const D &{ return c.*pm; }, is_method(*this)),
-                     fset([pm](C &c, const D &value) { c.*pm = value; }, is_method(*this));
+        static_assert(std::is_base_of<C, type>::value, "def_readwrite() requires a class member (or base class member)");
+        cpp_function fget([pm](const type &c) -> const D &{ return c.*pm; }, is_method(*this)),
+                     fset([pm](type &c, const D &value) { c.*pm = value; }, is_method(*this));
         def_property(name, fget, fset, return_value_policy::reference_internal, extra...);
         return *this;
     }
 
     template <typename C, typename D, typename... Extra>
     class_ &def_readonly(const char *name, const D C::*pm, const Extra& ...extra) {
-        cpp_function fget([pm](const C &c) -> const D &{ return c.*pm; }, is_method(*this));
+        static_assert(std::is_base_of<C, type>::value, "def_readonly() requires a class member (or base class member)");
+        cpp_function fget([pm](const type &c) -> const D &{ return c.*pm; }, is_method(*this));
         def_property_readonly(name, fget, return_value_policy::reference_internal, extra...);
         return *this;
     }
@@ -1117,7 +1193,8 @@ public:
     /// Uses return_value_policy::reference_internal by default
     template <typename Getter, typename... Extra>
     class_ &def_property_readonly(const char *name, const Getter &fget, const Extra& ...extra) {
-        return def_property_readonly(name, cpp_function(fget), return_value_policy::reference_internal, extra...);
+        return def_property_readonly(name, cpp_function(method_adaptor<type>(fget)),
+                                     return_value_policy::reference_internal, extra...);
     }
 
     /// Uses cpp_function's return_value_policy by default
@@ -1139,9 +1216,14 @@ public:
     }
 
     /// Uses return_value_policy::reference_internal by default
+    template <typename Getter, typename Setter, typename... Extra>
+    class_ &def_property(const char *name, const Getter &fget, const Setter &fset, const Extra& ...extra) {
+        return def_property(name, fget, cpp_function(method_adaptor<type>(fset)), extra...);
+    }
     template <typename Getter, typename... Extra>
     class_ &def_property(const char *name, const Getter &fget, const cpp_function &fset, const Extra& ...extra) {
-        return def_property(name, cpp_function(fget), fset, return_value_policy::reference_internal, extra...);
+        return def_property(name, cpp_function(method_adaptor<type>(fget)), fset,
+                            return_value_policy::reference_internal, extra...);
     }
 
     /// Uses cpp_function's return_value_policy by default
@@ -1181,55 +1263,68 @@ public:
 private:
     /// Initialize holder object, variant 1: object derives from enable_shared_from_this
     template <typename T>
-    static void init_holder_helper(instance_type *inst, const holder_type * /* unused */, const std::enable_shared_from_this<T> * /* dummy */) {
+    static void init_holder(detail::instance *inst, detail::value_and_holder &v_h,
+            const holder_type * /* unused */, const std::enable_shared_from_this<T> * /* dummy */) {
         try {
-            new (&inst->holder) holder_type(std::static_pointer_cast<typename holder_type::element_type>(inst->value->shared_from_this()));
-            inst->holder_constructed = true;
-        } catch (const std::bad_weak_ptr &) {
-            if (inst->owned) {
-                new (&inst->holder) holder_type(inst->value);
-                inst->holder_constructed = true;
+            auto sh = std::dynamic_pointer_cast<typename holder_type::element_type>(
+                    v_h.value_ptr<type>()->shared_from_this());
+            if (sh) {
+                new (&v_h.holder<holder_type>()) holder_type(std::move(sh));
+                v_h.set_holder_constructed();
             }
+        } catch (const std::bad_weak_ptr &) {}
+
+        if (!v_h.holder_constructed() && inst->owned) {
+            new (&v_h.holder<holder_type>()) holder_type(v_h.value_ptr<type>());
+            v_h.set_holder_constructed();
         }
     }
 
+    static void init_holder_from_existing(const detail::value_and_holder &v_h,
+            const holder_type *holder_ptr, std::true_type /*is_copy_constructible*/) {
+        new (&v_h.holder<holder_type>()) holder_type(*reinterpret_cast<const holder_type *>(holder_ptr));
+    }
+
+    static void init_holder_from_existing(const detail::value_and_holder &v_h,
+            const holder_type *holder_ptr, std::false_type /*is_copy_constructible*/) {
+        new (&v_h.holder<holder_type>()) holder_type(std::move(*const_cast<holder_type *>(holder_ptr)));
+    }
+
     /// Initialize holder object, variant 2: try to construct from existing holder object, if possible
-    template <typename T = holder_type,
-              detail::enable_if_t<std::is_copy_constructible<T>::value, int> = 0>
-    static void init_holder_helper(instance_type *inst, const holder_type *holder_ptr, const void * /* dummy */) {
+    static void init_holder(detail::instance *inst, detail::value_and_holder &v_h,
+            const holder_type *holder_ptr, const void * /* dummy -- not enable_shared_from_this<T>) */) {
         if (holder_ptr) {
-            new (&inst->holder) holder_type(*holder_ptr);
-            inst->holder_constructed = true;
+            init_holder_from_existing(v_h, holder_ptr, std::is_copy_constructible<holder_type>());
+            v_h.set_holder_constructed();
         } else if (inst->owned || detail::always_construct_holder<holder_type>::value) {
-            new (&inst->holder) holder_type(inst->value);
-            inst->holder_constructed = true;
+            new (&v_h.holder<holder_type>()) holder_type(v_h.value_ptr<type>());
+            v_h.set_holder_constructed();
         }
     }
 
-    /// Initialize holder object, variant 3: holder is not copy constructible (e.g. unique_ptr), always initialize from raw pointer
-    template <typename T = holder_type,
-              detail::enable_if_t<!std::is_copy_constructible<T>::value, int> = 0>
-    static void init_holder_helper(instance_type *inst, const holder_type * /* unused */, const void * /* dummy */) {
-        if (inst->owned || detail::always_construct_holder<holder_type>::value) {
-            new (&inst->holder) holder_type(inst->value);
-            inst->holder_constructed = true;
+    /// Performs instance initialization including constructing a holder and registering the known
+    /// instance.  Should be called as soon as the `type` value_ptr is set for an instance.  Takes an
+    /// optional pointer to an existing holder to use; if not specified and the instance is
+    /// `.owned`, a new holder will be constructed to manage the value pointer.
+    static void init_instance(detail::instance *inst, const void *holder_ptr) {
+        auto v_h = inst->get_value_and_holder(detail::get_type_info(typeid(type)));
+        if (!v_h.instance_registered()) {
+            register_instance(inst, v_h.value_ptr(), v_h.type);
+            v_h.set_instance_registered();
         }
+        init_holder(inst, v_h, (const holder_type *) holder_ptr, v_h.value_ptr<type>());
     }
 
-    /// Initialize holder object of an instance, possibly given a pointer to an existing holder
-    static void init_holder(PyObject *inst_, const void *holder_ptr) {
-        auto inst = (instance_type *) inst_;
-        init_holder_helper(inst, (const holder_type *) holder_ptr, inst->value);
-    }
-
-    static void dealloc(PyObject *inst_) {
-        instance_type *inst = (instance_type *) inst_;
-        if (inst->holder_constructed)
-            inst->holder.~holder_type();
-        else if (inst->owned)
-            ::operator delete(inst->value);
-
-        generic_type::dealloc((detail::instance<void> *) inst);
+    /// Deallocates an instance; via holder, if constructed; otherwise via operator delete.
+    static void dealloc(detail::value_and_holder &v_h) {
+        if (v_h.holder_constructed()) {
+            v_h.holder<holder_type>().~holder_type();
+            v_h.set_holder_constructed(false);
+        }
+        else {
+            detail::call_operator_delete(v_h.value_ptr<type>(), v_h.type->type_size);
+        }
+        v_h.value_ptr() = nullptr;
     }
 
     static detail::function_record *get_function_record(handle h) {
@@ -1239,31 +1334,62 @@ private:
     }
 };
 
+/// Binds an existing constructor taking arguments Args...
+template <typename... Args> detail::initimpl::constructor<Args...> init() { return {}; }
+/// Like `init<Args...>()`, but the instance is always constructed through the alias class (even
+/// when not inheriting on the Python side).
+template <typename... Args> detail::initimpl::alias_constructor<Args...> init_alias() { return {}; }
+
+/// Binds a factory function as a constructor
+template <typename Func, typename Ret = detail::initimpl::factory<Func>>
+Ret init(Func &&f) { return {std::forward<Func>(f)}; }
+
+/// Dual-argument factory function: the first function is called when no alias is needed, the second
+/// when an alias is needed (i.e. due to python-side inheritance).  Arguments must be identical.
+template <typename CFunc, typename AFunc, typename Ret = detail::initimpl::factory<CFunc, AFunc>>
+Ret init(CFunc &&c, AFunc &&a) {
+    return {std::forward<CFunc>(c), std::forward<AFunc>(a)};
+}
+
+/// Binds pickling functions `__getstate__` and `__setstate__` and ensures that the type
+/// returned by `__getstate__` is the same as the argument accepted by `__setstate__`.
+template <typename GetState, typename SetState>
+detail::initimpl::pickle_factory<GetState, SetState> pickle(GetState &&g, SetState &&s) {
+    return {std::forward<GetState>(g), std::forward<SetState>(s)};
+}
+
 /// Binds C++ enumerations and enumeration classes to Python
 template <typename Type> class enum_ : public class_<Type> {
 public:
     using class_<Type>::def;
+    using class_<Type>::def_property_readonly_static;
     using Scalar = typename std::underlying_type<Type>::type;
-    template <typename T> using arithmetic_tag = std::is_same<T, arithmetic>;
 
     template <typename... Extra>
     enum_(const handle &scope, const char *name, const Extra&... extra)
-      : class_<Type>(scope, name, extra...), m_parent(scope) {
-
-        constexpr bool is_arithmetic =
-            !std::is_same<detail::first_of_t<arithmetic_tag, void, Extra...>,
-                          void>::value;
-
-        auto entries = new std::unordered_map<Scalar, const char *>();
-        def("__repr__", [name, entries](Type value) -> std::string {
-            auto it = entries->find((Scalar) value);
-            return std::string(name) + "." +
-                ((it == entries->end()) ? std::string("???")
-                                        : std::string(it->second));
+      : class_<Type>(scope, name, extra...), m_entries(), m_parent(scope) {
+
+        constexpr bool is_arithmetic = detail::any_of<std::is_same<arithmetic, Extra>...>::value;
+
+        auto m_entries_ptr = m_entries.inc_ref().ptr();
+        def("__repr__", [name, m_entries_ptr](Type value) -> pybind11::str {
+            for (const auto &kv : reinterpret_borrow<dict>(m_entries_ptr)) {
+                if (pybind11::cast<Type>(kv.second) == value)
+                    return pybind11::str("{}.{}").format(name, kv.first);
+            }
+            return pybind11::str("{}.???").format(name);
         });
-        def("__init__", [](Type& value, Scalar i) { value = (Type)i; });
-        def("__init__", [](Type& value, Scalar i) { new (&value) Type((Type) i); });
+        def_property_readonly_static("__members__", [m_entries_ptr](object /* self */) {
+            dict m;
+            for (const auto &kv : reinterpret_borrow<dict>(m_entries_ptr))
+                m[kv.first] = kv.second;
+            return m;
+        }, return_value_policy::copy);
+        def(init([](Scalar i) { return static_cast<Type>(i); }));
         def("__int__", [](Type value) { return (Scalar) value; });
+        #if PY_MAJOR_VERSION < 3
+            def("__long__", [](Type value) { return (Scalar) value; });
+        #endif
         def("__eq__", [](const Type &value, Type *value2) { return value2 && value == *value2; });
         def("__ne__", [](const Type &value, Type *value2) { return !value2 || value != *value2; });
         if (is_arithmetic) {
@@ -1297,127 +1423,103 @@ public:
         }
         def("__hash__", [](const Type &value) { return (Scalar) value; });
         // Pickling and unpickling -- needed for use with the 'multiprocessing' module
-        def("__getstate__", [](const Type &value) { return pybind11::make_tuple((Scalar) value); });
-        def("__setstate__", [](Type &p, tuple t) { new (&p) Type((Type) t[0].cast<Scalar>()); });
-        m_entries = entries;
+        def(pickle([](const Type &value) { return pybind11::make_tuple((Scalar) value); },
+                   [](tuple t) { return static_cast<Type>(t[0].cast<Scalar>()); }));
     }
 
     /// Export enumeration entries into the parent scope
-    enum_ &export_values() {
-#if !defined(PYPY_VERSION)
-        PyObject *dict = ((PyTypeObject *) this->m_ptr)->tp_dict;
-        PyObject *key, *value;
-        ssize_t pos = 0;
-
-        while (PyDict_Next(dict, &pos, &key, &value)) {
-            if (PyObject_IsInstance(value, this->m_ptr))
-                m_parent.attr(key) = value;
-        }
-#else
-        /* PyPy's cpyext still has difficulties with the above
-           CPython API calls; emulate using Python code. */
-        dict d; d["t"] = *this; d["p"] = m_parent;
-        PyObject *result = PyRun_String(
-            "for k, v in t.__dict__.items():\n"
-            "    if isinstance(v, t):\n"
-            "        setattr(p, k, v)\n",
-            Py_file_input, d.ptr(), d.ptr());
-        if (result == nullptr)
-            throw error_already_set();
-        Py_DECREF(result);
-#endif
-
+    enum_& export_values() {
+        for (const auto &kv : m_entries)
+            m_parent.attr(kv.first) = kv.second;
         return *this;
     }
 
     /// Add an enumeration entry
     enum_& value(char const* name, Type value) {
-        this->attr(name) = pybind11::cast(value, return_value_policy::copy);
-        (*m_entries)[(Scalar) value] = name;
+        auto v = pybind11::cast(value, return_value_policy::copy);
+        this->attr(name) = v;
+        m_entries[pybind11::str(name)] = v;
         return *this;
     }
+
 private:
-    std::unordered_map<Scalar, const char *> *m_entries;
+    dict m_entries;
     handle m_parent;
 };
 
 NAMESPACE_BEGIN(detail)
-template <typename... Args> struct init {
-    template <typename Class, typename... Extra, enable_if_t<!Class::has_alias, int> = 0>
-    static void execute(Class &cl, const Extra&... extra) {
-        using Base = typename Class::type;
-        /// Function which calls a specific C++ in-place constructor
-        cl.def("__init__", [](Base *self_, Args... args) { new (self_) Base(args...); }, extra...);
-    }
-
-    template <typename Class, typename... Extra,
-              enable_if_t<Class::has_alias &&
-                          std::is_constructible<typename Class::type, Args...>::value, int> = 0>
-    static void execute(Class &cl, const Extra&... extra) {
-        using Base = typename Class::type;
-        using Alias = typename Class::type_alias;
-        handle cl_type = cl;
-        cl.def("__init__", [cl_type](handle self_, Args... args) {
-                if (self_.get_type() == cl_type)
-                    new (self_.cast<Base *>()) Base(args...);
-                else
-                    new (self_.cast<Alias *>()) Alias(args...);
-            }, extra...);
-    }
-
-    template <typename Class, typename... Extra,
-              enable_if_t<Class::has_alias &&
-                          !std::is_constructible<typename Class::type, Args...>::value, int> = 0>
-    static void execute(Class &cl, const Extra&... extra) {
-        init_alias<Args...>::execute(cl, extra...);
-    }
-};
-template <typename... Args> struct init_alias {
-    template <typename Class, typename... Extra,
-              enable_if_t<Class::has_alias && std::is_constructible<typename Class::type_alias, Args...>::value, int> = 0>
-    static void execute(Class &cl, const Extra&... extra) {
-        using Alias = typename Class::type_alias;
-        cl.def("__init__", [](Alias *self_, Args... args) { new (self_) Alias(args...); }, extra...);
-    }
-};
 
 
 inline void keep_alive_impl(handle nurse, handle patient) {
-    /* Clever approach based on weak references taken from Boost.Python */
     if (!nurse || !patient)
         pybind11_fail("Could not activate keep_alive!");
 
     if (patient.is_none() || nurse.is_none())
         return; /* Nothing to keep alive or nothing to be kept alive by */
 
-    cpp_function disable_lifesupport(
-        [patient](handle weakref) { patient.dec_ref(); weakref.dec_ref(); });
+    auto tinfo = all_type_info(Py_TYPE(nurse.ptr()));
+    if (!tinfo.empty()) {
+        /* It's a pybind-registered type, so we can store the patient in the
+         * internal list. */
+        add_patient(nurse.ptr(), patient.ptr());
+    }
+    else {
+        /* Fall back to clever approach based on weak references taken from
+         * Boost.Python. This is not used for pybind-registered types because
+         * the objects can be destroyed out-of-order in a GC pass. */
+        cpp_function disable_lifesupport(
+            [patient](handle weakref) { patient.dec_ref(); weakref.dec_ref(); });
 
-    weakref wr(nurse, disable_lifesupport);
+        weakref wr(nurse, disable_lifesupport);
 
-    patient.inc_ref(); /* reference patient and leak the weak reference */
-    (void) wr.release();
+        patient.inc_ref(); /* reference patient and leak the weak reference */
+        (void) wr.release();
+    }
 }
 
-PYBIND11_NOINLINE inline void keep_alive_impl(int Nurse, int Patient, handle args, handle ret) {
-    handle nurse  (Nurse   > 0 ? PyTuple_GetItem(args.ptr(), Nurse   - 1) : ret.ptr());
-    handle patient(Patient > 0 ? PyTuple_GetItem(args.ptr(), Patient - 1) : ret.ptr());
+PYBIND11_NOINLINE inline void keep_alive_impl(size_t Nurse, size_t Patient, function_call &call, handle ret) {
+    auto get_arg = [&](size_t n) {
+        if (n == 0)
+            return ret;
+        else if (n == 1 && call.init_self)
+            return call.init_self;
+        else if (n <= call.args.size())
+            return call.args[n - 1];
+        return handle();
+    };
+
+    keep_alive_impl(get_arg(Nurse), get_arg(Patient));
+}
+
+inline std::pair<decltype(internals::registered_types_py)::iterator, bool> all_type_info_get_cache(PyTypeObject *type) {
+    auto res = get_internals().registered_types_py
+#ifdef __cpp_lib_unordered_map_try_emplace
+        .try_emplace(type);
+#else
+        .emplace(type, std::vector<detail::type_info *>());
+#endif
+    if (res.second) {
+        // New cache entry created; set up a weak reference to automatically remove it if the type
+        // gets destroyed:
+        weakref((PyObject *) type, cpp_function([type](handle wr) {
+            get_internals().registered_types_py.erase(type);
+            wr.dec_ref();
+        })).release();
+    }
 
-    keep_alive_impl(nurse, patient);
+    return res;
 }
 
 template <typename Iterator, typename Sentinel, bool KeyIterator, return_value_policy Policy>
 struct iterator_state {
     Iterator it;
     Sentinel end;
-    bool first;
+    bool first_or_done;
 };
 
 NAMESPACE_END(detail)
 
-template <typename... Args> detail::init<Args...> init() { return detail::init<Args...>(); }
-template <typename... Args> detail::init_alias<Args...> init_alias() { return detail::init_alias<Args...>(); }
-
+/// Makes a python iterator from a first and past-the-end C++ InputIterator.
 template <return_value_policy Policy = return_value_policy::reference_internal,
           typename Iterator,
           typename Sentinel,
@@ -1427,22 +1529,26 @@ iterator make_iterator(Iterator first, Sentinel last, Extra &&... extra) {
     typedef detail::iterator_state<Iterator, Sentinel, false, Policy> state;
 
     if (!detail::get_type_info(typeid(state), false)) {
-        class_<state>(handle(), "iterator")
+        class_<state>(handle(), "iterator", pybind11::module_local())
             .def("__iter__", [](state &s) -> state& { return s; })
             .def("__next__", [](state &s) -> ValueType {
-                if (!s.first)
+                if (!s.first_or_done)
                     ++s.it;
                 else
-                    s.first = false;
-                if (s.it == s.end)
+                    s.first_or_done = false;
+                if (s.it == s.end) {
+                    s.first_or_done = true;
                     throw stop_iteration();
+                }
                 return *s.it;
             }, std::forward<Extra>(extra)..., Policy);
     }
 
-    return (iterator) cast(state { first, last, true });
+    return cast(state{first, last, true});
 }
 
+/// Makes a python iterator over the keys (`.first`) of an iterator over pairs from a
+/// first and past-the-end InputIterator.
 template <return_value_policy Policy = return_value_policy::reference_internal,
           typename Iterator,
           typename Sentinel,
@@ -1452,34 +1558,49 @@ iterator make_key_iterator(Iterator first, Sentinel last, Extra &&... extra) {
     typedef detail::iterator_state<Iterator, Sentinel, true, Policy> state;
 
     if (!detail::get_type_info(typeid(state), false)) {
-        class_<state>(handle(), "iterator")
+        class_<state>(handle(), "iterator", pybind11::module_local())
             .def("__iter__", [](state &s) -> state& { return s; })
             .def("__next__", [](state &s) -> KeyType {
-                if (!s.first)
+                if (!s.first_or_done)
                     ++s.it;
                 else
-                    s.first = false;
-                if (s.it == s.end)
+                    s.first_or_done = false;
+                if (s.it == s.end) {
+                    s.first_or_done = true;
                     throw stop_iteration();
+                }
                 return (*s.it).first;
             }, std::forward<Extra>(extra)..., Policy);
     }
 
-    return (iterator) cast(state { first, last, true });
+    return cast(state{first, last, true});
 }
 
+/// Makes an iterator over values of an stl container or other container supporting
+/// `std::begin()`/`std::end()`
 template <return_value_policy Policy = return_value_policy::reference_internal,
           typename Type, typename... Extra> iterator make_iterator(Type &value, Extra&&... extra) {
     return make_iterator<Policy>(std::begin(value), std::end(value), extra...);
 }
 
+/// Makes an iterator over the keys (`.first`) of an stl map-like container supporting
+/// `std::begin()`/`std::end()`
 template <return_value_policy Policy = return_value_policy::reference_internal,
           typename Type, typename... Extra> iterator make_key_iterator(Type &value, Extra&&... extra) {
     return make_key_iterator<Policy>(std::begin(value), std::end(value), extra...);
 }
 
 template <typename InputType, typename OutputType> void implicitly_convertible() {
+    struct set_flag {
+        bool &flag;
+        set_flag(bool &flag) : flag(flag) { flag = true; }
+        ~set_flag() { flag = false; }
+    };
     auto implicit_caster = [](PyObject *obj, PyTypeObject *type) -> PyObject * {
+        static bool currently_used = false;
+        if (currently_used) // implicit conversions are non-reentrant
+            return nullptr;
+        set_flag flag_helper(currently_used);
         if (!detail::make_caster<InputType>().load(obj, false))
             return nullptr;
         tuple args(1);
@@ -1502,7 +1623,8 @@ void register_exception_translator(ExceptionTranslator&& translator) {
         std::forward<ExceptionTranslator>(translator));
 }
 
-/* Wrapper to generate a new Python exception type.
+/**
+ * Wrapper to generate a new Python exception type.
  *
  * This should only be used with PyErr_SetString for now.
  * It is not (yet) possible to use as a py::base.
@@ -1514,7 +1636,7 @@ public:
     exception(handle scope, const char *name, PyObject *base = PyExc_Exception) {
         std::string full_name = scope.attr("__name__").cast<std::string>() +
                                 std::string(".") + name;
-        m_ptr = PyErr_NewException((char *) full_name.c_str(), base, NULL);
+        m_ptr = PyErr_NewException(const_cast<char *>(full_name.c_str()), base, NULL);
         if (hasattr(scope, name))
             pybind11_fail("Error during initialization: multiple incompatible "
                           "definitions with name \"" + std::string(name) + "\"");
@@ -1527,7 +1649,8 @@ public:
     }
 };
 
-/** Registers a Python exception in `m` of the given `name` and installs an exception translator to
+/**
+ * Registers a Python exception in `m` of the given `name` and installs an exception translator to
  * translate the C++ exception to the created Python exception using the exceptions what() method.
  * This is intended for simple exception translations; for more complex translation, register the
  * exception object and translator directly.
@@ -1684,9 +1807,13 @@ private:
 class gil_scoped_release {
 public:
     explicit gil_scoped_release(bool disassoc = false) : disassoc(disassoc) {
+        // `get_internals()` must be called here unconditionally in order to initialize
+        // `internals.tstate` for subsequent `gil_scoped_acquire` calls. Otherwise, an
+        // initialization race could occur as multiple threads try `gil_scoped_acquire`.
+        const auto &internals = detail::get_internals();
         tstate = PyEval_SaveThread();
         if (disassoc) {
-            auto key = detail::get_internals().tstate;
+            auto key = internals.tstate;
             #if PY_MAJOR_VERSION < 3
                 PyThread_delete_key_value(key);
             #else
@@ -1730,10 +1857,11 @@ class gil_scoped_release { };
 #endif
 
 error_already_set::~error_already_set() {
-    if (value) {
+    if (type) {
         gil_scoped_acquire gil;
-        PyErr_Restore(type, value, trace);
-        PyErr_Clear();
+        type.release().dec_ref();
+        value.release().dec_ref();
+        trace.release().dec_ref();
     }
 }
 
@@ -1786,7 +1914,7 @@ inline function get_type_overload(const void *this_ptr, const detail::type_info
         Py_file_input, d.ptr(), d.ptr());
     if (result == nullptr)
         throw error_already_set();
-    if ((handle) d["self"] == Py_None)
+    if (d["self"].is_none())
         return function();
     Py_DECREF(result);
 #endif
@@ -1826,7 +1954,7 @@ template <class T> function get_overload(const T *this_ptr, const char *name) {
 #define PYBIND11_OVERLOAD_PURE(ret_type, cname, fn, ...) \
     PYBIND11_OVERLOAD_PURE_NAME(ret_type, cname, #fn, fn, __VA_ARGS__)
 
-NAMESPACE_END(pybind11)
+NAMESPACE_END(PYBIND11_NAMESPACE)
 
 #if defined(_MSC_VER)
 #  pragma warning(pop)
diff --git a/pybind11/include/pybind11/pytypes.h b/pybind11/include/pybind11/pytypes.h
index a89aad78d..d7fa17775 100644
--- a/pybind11/include/pybind11/pytypes.h
+++ b/pybind11/include/pybind11/pytypes.h
@@ -1,5 +1,5 @@
 /*
-    pybind11/typeid.h: Convenience wrapper classes for basic Python types
+    pybind11/pytypes.h: Convenience wrapper classes for basic Python types
 
     Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
 
@@ -9,11 +9,12 @@
 
 #pragma once
 
-#include "common.h"
+#include "detail/common.h"
+#include "buffer_info.h"
 #include <utility>
 #include <type_traits>
 
-NAMESPACE_BEGIN(pybind11)
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 
 /* A few forward declarations */
 class handle; class object;
@@ -43,54 +44,140 @@ using tuple_accessor = accessor<accessor_policies::tuple_item>;
 
 /// Tag and check to identify a class which implements the Python object API
 class pyobject_tag { };
-template <typename T> using is_pyobject = std::is_base_of<pyobject_tag, typename std::remove_reference<T>::type>;
+template <typename T> using is_pyobject = std::is_base_of<pyobject_tag, remove_reference_t<T>>;
 
-/// Mixin which adds common functions to handle, object and various accessors.
-/// The only requirement for `Derived` is to implement `PyObject *Derived::ptr() const`.
+/** \rst
+    A mixin class which adds common functions to `handle`, `object` and various accessors.
+    The only requirement for `Derived` is to implement ``PyObject *Derived::ptr() const``.
+\endrst */
 template <typename Derived>
 class object_api : public pyobject_tag {
     const Derived &derived() const { return static_cast<const Derived &>(*this); }
 
 public:
+    /** \rst
+        Return an iterator equivalent to calling ``iter()`` in Python. The object
+        must be a collection which supports the iteration protocol.
+    \endrst */
     iterator begin() const;
+    /// Return a sentinel which ends iteration.
     iterator end() const;
+
+    /** \rst
+        Return an internal functor to invoke the object's sequence protocol. Casting
+        the returned ``detail::item_accessor`` instance to a `handle` or `object`
+        subclass causes a corresponding call to ``__getitem__``. Assigning a `handle`
+        or `object` subclass causes a call to ``__setitem__``.
+    \endrst */
     item_accessor operator[](handle key) const;
+    /// See above (the only difference is that the key is provided as a string literal)
     item_accessor operator[](const char *key) const;
+
+    /** \rst
+        Return an internal functor to access the object's attributes. Casting the
+        returned ``detail::obj_attr_accessor`` instance to a `handle` or `object`
+        subclass causes a corresponding call to ``getattr``. Assigning a `handle`
+        or `object` subclass causes a call to ``setattr``.
+    \endrst */
     obj_attr_accessor attr(handle key) const;
+    /// See above (the only difference is that the key is provided as a string literal)
     str_attr_accessor attr(const char *key) const;
+
+    /** \rst
+        Matches * unpacking in Python, e.g. to unpack arguments out of a ``tuple``
+        or ``list`` for a function call. Applying another * to the result yields
+        ** unpacking, e.g. to unpack a dict as function keyword arguments.
+        See :ref:`calling_python_functions`.
+    \endrst */
     args_proxy operator*() const;
-    template <typename T> bool contains(T &&key) const;
 
+    /// Check if the given item is contained within this object, i.e. ``item in obj``.
+    template <typename T> bool contains(T &&item) const;
+
+    /** \rst
+        Assuming the Python object is a function or implements the ``__call__``
+        protocol, ``operator()`` invokes the underlying function, passing an
+        arbitrary set of parameters. The result is returned as an `object` and
+        may need to be converted back into a C++ object using `handle::cast()`.
+
+        When some of the arguments cannot be converted to Python objects, the
+        function will throw a `cast_error` exception. When the Python function
+        call fails, an `error_already_set` exception is thrown.
+    \endrst */
     template <return_value_policy policy = return_value_policy::automatic_reference, typename... Args>
     object operator()(Args &&...args) const;
     template <return_value_policy policy = return_value_policy::automatic_reference, typename... Args>
     PYBIND11_DEPRECATED("call(...) was deprecated in favor of operator()(...)")
         object call(Args&&... args) const;
 
+    /// Equivalent to ``obj is other`` in Python.
+    bool is(object_api const& other) const { return derived().ptr() == other.derived().ptr(); }
+    /// Equivalent to ``obj is None`` in Python.
     bool is_none() const { return derived().ptr() == Py_None; }
-    PYBIND11_DEPRECATED("Instead of obj.str(), use py::str(obj)")
+    PYBIND11_DEPRECATED("Use py::str(obj) instead")
     pybind11::str str() const;
 
+    /// Get or set the object's docstring, i.e. ``obj.__doc__``.
+    str_attr_accessor doc() const;
+
+    /// Return the object's current reference count
     int ref_count() const { return static_cast<int>(Py_REFCNT(derived().ptr())); }
+    /// Return a handle to the Python type object underlying the instance
     handle get_type() const;
 };
 
 NAMESPACE_END(detail)
 
-/// Holds a reference to a Python object (no reference counting)
+/** \rst
+    Holds a reference to a Python object (no reference counting)
+
+    The `handle` class is a thin wrapper around an arbitrary Python object (i.e. a
+    ``PyObject *`` in Python's C API). It does not perform any automatic reference
+    counting and merely provides a basic C++ interface to various Python API functions.
+
+    .. seealso::
+        The `object` class inherits from `handle` and adds automatic reference
+        counting features.
+\endrst */
 class handle : public detail::object_api<handle> {
 public:
+    /// The default constructor creates a handle with a ``nullptr``-valued pointer
     handle() = default;
+    /// Creates a ``handle`` from the given raw Python object pointer
     handle(PyObject *ptr) : m_ptr(ptr) { } // Allow implicit conversion from PyObject*
 
+    /// Return the underlying ``PyObject *`` pointer
     PyObject *ptr() const { return m_ptr; }
     PyObject *&ptr() { return m_ptr; }
-    const handle& inc_ref() const { Py_XINCREF(m_ptr); return *this; }
-    const handle& dec_ref() const { Py_XDECREF(m_ptr); return *this; }
 
+    /** \rst
+        Manually increase the reference count of the Python object. Usually, it is
+        preferable to use the `object` class which derives from `handle` and calls
+        this function automatically. Returns a reference to itself.
+    \endrst */
+    const handle& inc_ref() const & { Py_XINCREF(m_ptr); return *this; }
+
+    /** \rst
+        Manually decrease the reference count of the Python object. Usually, it is
+        preferable to use the `object` class which derives from `handle` and calls
+        this function automatically. Returns a reference to itself.
+    \endrst */
+    const handle& dec_ref() const & { Py_XDECREF(m_ptr); return *this; }
+
+    /** \rst
+        Attempt to cast the Python object into the given C++ type. A `cast_error`
+        will be thrown upon failure.
+    \endrst */
     template <typename T> T cast() const;
+    /// Return ``true`` when the `handle` wraps a valid Python object
     explicit operator bool() const { return m_ptr != nullptr; }
+    /** \rst
+        Deprecated: Check that the underlying pointers are the same.
+        Equivalent to ``obj1 is obj2`` in Python.
+    \endrst */
+    PYBIND11_DEPRECATED("Use obj1.is(obj2) instead")
     bool operator==(const handle &h) const { return m_ptr == h.m_ptr; }
+    PYBIND11_DEPRECATED("Use !obj1.is(obj2) instead")
     bool operator!=(const handle &h) const { return m_ptr != h.m_ptr; }
     PYBIND11_DEPRECATED("Use handle::operator bool() instead")
     bool check() const { return m_ptr != nullptr; }
@@ -98,16 +185,33 @@ protected:
     PyObject *m_ptr = nullptr;
 };
 
-/// Holds a reference to a Python object (with reference counting)
+/** \rst
+    Holds a reference to a Python object (with reference counting)
+
+    Like `handle`, the `object` class is a thin wrapper around an arbitrary Python
+    object (i.e. a ``PyObject *`` in Python's C API). In contrast to `handle`, it
+    optionally increases the object's reference count upon construction, and it
+    *always* decreases the reference count when the `object` instance goes out of
+    scope and is destructed. When using `object` instances consistently, it is much
+    easier to get reference counting right at the first attempt.
+\endrst */
 class object : public handle {
 public:
     object() = default;
     PYBIND11_DEPRECATED("Use reinterpret_borrow<object>() or reinterpret_steal<object>()")
     object(handle h, bool is_borrowed) : handle(h) { if (is_borrowed) inc_ref(); }
+    /// Copy constructor; always increases the reference count
     object(const object &o) : handle(o) { inc_ref(); }
+    /// Move constructor; steals the object from ``other`` and preserves its reference count
     object(object &&other) noexcept { m_ptr = other.m_ptr; other.m_ptr = nullptr; }
+    /// Destructor; automatically calls `handle::dec_ref()`
     ~object() { dec_ref(); }
 
+    /** \rst
+        Resets the internal pointer to ``nullptr`` without decreasing the
+        object's reference count. The function returns a raw handle to the original
+        Python object.
+    \endrst */
     handle release() {
       PyObject *tmp = m_ptr;
       m_ptr = nullptr;
@@ -138,8 +242,8 @@ public:
 
 protected:
     // Tags for choosing constructors from raw PyObject *
-    struct borrowed_t { }; static constexpr borrowed_t borrowed{};
-    struct stolen_t { }; static constexpr stolen_t stolen{};
+    struct borrowed_t { };
+    struct stolen_t { };
 
     template <typename T> friend T reinterpret_borrow(handle);
     template <typename T> friend T reinterpret_steal(handle);
@@ -150,14 +254,79 @@ public:
     object(handle h, stolen_t) : handle(h) { }
 };
 
-/** The following functions don't do any kind of conversion, they simply declare
-    that a PyObject is a certain type and borrow or steal the reference. */
-template <typename T> T reinterpret_borrow(handle h) { return {h, object::borrowed}; }
-template <typename T> T reinterpret_steal(handle h) { return {h, object::stolen}; }
+/** \rst
+    Declare that a `handle` or ``PyObject *`` is a certain type and borrow the reference.
+    The target type ``T`` must be `object` or one of its derived classes. The function
+    doesn't do any conversions or checks. It's up to the user to make sure that the
+    target type is correct.
+
+    .. code-block:: cpp
+
+        PyObject *p = PyList_GetItem(obj, index);
+        py::object o = reinterpret_borrow<py::object>(p);
+        // or
+        py::tuple t = reinterpret_borrow<py::tuple>(p); // <-- `p` must already be a `tuple`
+\endrst */
+template <typename T> T reinterpret_borrow(handle h) { return {h, object::borrowed_t{}}; }
+
+/** \rst
+    Like `reinterpret_borrow`, but steals the reference.
 
-/// Check if `obj` is an instance of type `T`
+    .. code-block:: cpp
+
+        PyObject *p = PyObject_Str(obj);
+        py::str s = reinterpret_steal<py::str>(p); // <-- `p` must already be a `str`
+\endrst */
+template <typename T> T reinterpret_steal(handle h) { return {h, object::stolen_t{}}; }
+
+NAMESPACE_BEGIN(detail)
+inline std::string error_string();
+NAMESPACE_END(detail)
+
+/// Fetch and hold an error which was already set in Python.  An instance of this is typically
+/// thrown to propagate python-side errors back through C++ which can either be caught manually or
+/// else falls back to the function dispatcher (which then raises the captured error back to
+/// python).
+class error_already_set : public std::runtime_error {
+public:
+    /// Constructs a new exception from the current Python error indicator, if any.  The current
+    /// Python error indicator will be cleared.
+    error_already_set() : std::runtime_error(detail::error_string()) {
+        PyErr_Fetch(&type.ptr(), &value.ptr(), &trace.ptr());
+    }
+
+    inline ~error_already_set();
+
+    /// Give the currently-held error back to Python, if any.  If there is currently a Python error
+    /// already set it is cleared first.  After this call, the current object no longer stores the
+    /// error variables (but the `.what()` string is still available).
+    void restore() { PyErr_Restore(type.release().ptr(), value.release().ptr(), trace.release().ptr()); }
+
+    // Does nothing; provided for backwards compatibility.
+    PYBIND11_DEPRECATED("Use of error_already_set.clear() is deprecated")
+    void clear() {}
+
+    /// Check if the currently trapped error type matches the given Python exception class (or a
+    /// subclass thereof).  May also be passed a tuple to search for any exception class matches in
+    /// the given tuple.
+    bool matches(handle ex) const { return PyErr_GivenExceptionMatches(ex.ptr(), type.ptr()); }
+
+private:
+    object type, value, trace;
+};
+
+/** \defgroup python_builtins _
+    Unless stated otherwise, the following C++ functions behave the same
+    as their Python counterparts.
+ */
+
+/** \ingroup python_builtins
+    \rst
+    Return true if ``obj`` is an instance of ``T``. Type ``T`` must be a subclass of
+    `object` or a class which was exposed to Python as ``py::class_<T>``.
+\endrst */
 template <typename T, detail::enable_if_t<std::is_base_of<object, T>::value, int> = 0>
-bool isinstance(handle obj) { return T::_check(obj); }
+bool isinstance(handle obj) { return T::check_(obj); }
 
 template <typename T, detail::enable_if_t<!std::is_base_of<object, T>::value, int> = 0>
 bool isinstance(handle obj) { return detail::isinstance_generic(obj, typeid(T)); }
@@ -165,6 +334,8 @@ bool isinstance(handle obj) { return detail::isinstance_generic(obj, typeid(T));
 template <> inline bool isinstance<handle>(handle obj) = delete;
 template <> inline bool isinstance<object>(handle obj) { return obj.ptr() != nullptr; }
 
+/// \ingroup python_builtins
+/// Return true if ``obj`` is an instance of ``type``.
 inline bool isinstance(handle obj, handle type) {
     const auto result = PyObject_IsInstance(obj.ptr(), type.ptr());
     if (result == -1)
@@ -172,6 +343,8 @@ inline bool isinstance(handle obj, handle type) {
     return result != 0;
 }
 
+/// \addtogroup python_builtins
+/// @{
 inline bool hasattr(handle obj, handle name) {
     return PyObject_HasAttr(obj.ptr(), name.ptr()) == 1;
 }
@@ -218,12 +391,21 @@ inline void setattr(handle obj, const char *name, handle value) {
     if (PyObject_SetAttrString(obj.ptr(), name, value.ptr()) != 0) { throw error_already_set(); }
 }
 
+inline ssize_t hash(handle obj) {
+    auto h = PyObject_Hash(obj.ptr());
+    if (h == -1) { throw error_already_set(); }
+    return h;
+}
+
+/// @} python_builtins
+
 NAMESPACE_BEGIN(detail)
 inline handle get_function(handle value) {
     if (value) {
 #if PY_MAJOR_VERSION >= 3
         if (PyInstanceMethod_Check(value.ptr()))
             value = PyInstanceMethod_GET_FUNCTION(value.ptr());
+        else
 #endif
         if (PyMethod_Check(value.ptr()))
             value = PyMethod_GET_FUNCTION(value.ptr());
@@ -249,6 +431,8 @@ class accessor : public object_api<accessor<Policy>> {
 
 public:
     accessor(handle obj, key_type key) : obj(obj), key(std::move(key)) { }
+    accessor(const accessor &) = default;
+    accessor(accessor &&) = default;
 
     // accessor overload required to override default assignment operator (templates are not allowed
     // to replace default compiler-generated assignments).
@@ -323,7 +507,7 @@ struct sequence_item {
     static object get(handle obj, size_t index) {
         PyObject *result = PySequence_GetItem(obj.ptr(), static_cast<ssize_t>(index));
         if (!result) { throw error_already_set(); }
-        return reinterpret_borrow<object>(result);
+        return reinterpret_steal<object>(result);
     }
 
     static void set(handle obj, size_t index, handle val) {
@@ -369,24 +553,131 @@ struct tuple_item {
 };
 NAMESPACE_END(accessor_policies)
 
-struct dict_iterator {
+/// STL iterator template used for tuple, list, sequence and dict
+template <typename Policy>
+class generic_iterator : public Policy {
+    using It = generic_iterator;
+
 public:
-    explicit dict_iterator(handle dict = handle(), ssize_t pos = -1) : dict(dict), pos(pos) { }
-    dict_iterator& operator++() {
-        if (!PyDict_Next(dict.ptr(), &pos, &key.ptr(), &value.ptr()))
-            pos = -1;
-        return *this;
-    }
-    std::pair<handle, handle> operator*() const {
-        return std::make_pair(key, value);
-    }
-    bool operator==(const dict_iterator &it) const { return it.pos == pos; }
-    bool operator!=(const dict_iterator &it) const { return it.pos != pos; }
+    using difference_type = ssize_t;
+    using iterator_category = typename Policy::iterator_category;
+    using value_type = typename Policy::value_type;
+    using reference = typename Policy::reference;
+    using pointer = typename Policy::pointer;
+
+    generic_iterator() = default;
+    generic_iterator(handle seq, ssize_t index) : Policy(seq, index) { }
+
+    reference operator*() const { return Policy::dereference(); }
+    reference operator[](difference_type n) const { return *(*this + n); }
+    pointer operator->() const { return **this; }
+
+    It &operator++() { Policy::increment(); return *this; }
+    It operator++(int) { auto copy = *this; Policy::increment(); return copy; }
+    It &operator--() { Policy::decrement(); return *this; }
+    It operator--(int) { auto copy = *this; Policy::decrement(); return copy; }
+    It &operator+=(difference_type n) { Policy::advance(n); return *this; }
+    It &operator-=(difference_type n) { Policy::advance(-n); return *this; }
+
+    friend It operator+(const It &a, difference_type n) { auto copy = a; return copy += n; }
+    friend It operator+(difference_type n, const It &b) { return b + n; }
+    friend It operator-(const It &a, difference_type n) { auto copy = a; return copy -= n; }
+    friend difference_type operator-(const It &a, const It &b) { return a.distance_to(b); }
+
+    friend bool operator==(const It &a, const It &b) { return a.equal(b); }
+    friend bool operator!=(const It &a, const It &b) { return !(a == b); }
+    friend bool operator< (const It &a, const It &b) { return b - a > 0; }
+    friend bool operator> (const It &a, const It &b) { return b < a; }
+    friend bool operator>=(const It &a, const It &b) { return !(a < b); }
+    friend bool operator<=(const It &a, const It &b) { return !(a > b); }
+};
+
+NAMESPACE_BEGIN(iterator_policies)
+/// Quick proxy class needed to implement ``operator->`` for iterators which can't return pointers
+template <typename T>
+struct arrow_proxy {
+    T value;
+
+    arrow_proxy(T &&value) : value(std::move(value)) { }
+    T *operator->() const { return &value; }
+};
+
+/// Lightweight iterator policy using just a simple pointer: see ``PySequence_Fast_ITEMS``
+class sequence_fast_readonly {
+protected:
+    using iterator_category = std::random_access_iterator_tag;
+    using value_type = handle;
+    using reference = const handle;
+    using pointer = arrow_proxy<const handle>;
+
+    sequence_fast_readonly(handle obj, ssize_t n) : ptr(PySequence_Fast_ITEMS(obj.ptr()) + n) { }
+
+    reference dereference() const { return *ptr; }
+    void increment() { ++ptr; }
+    void decrement() { --ptr; }
+    void advance(ssize_t n) { ptr += n; }
+    bool equal(const sequence_fast_readonly &b) const { return ptr == b.ptr; }
+    ssize_t distance_to(const sequence_fast_readonly &b) const { return ptr - b.ptr; }
+
+private:
+    PyObject **ptr;
+};
+
+/// Full read and write access using the sequence protocol: see ``detail::sequence_accessor``
+class sequence_slow_readwrite {
+protected:
+    using iterator_category = std::random_access_iterator_tag;
+    using value_type = object;
+    using reference = sequence_accessor;
+    using pointer = arrow_proxy<const sequence_accessor>;
+
+    sequence_slow_readwrite(handle obj, ssize_t index) : obj(obj), index(index) { }
+
+    reference dereference() const { return {obj, static_cast<size_t>(index)}; }
+    void increment() { ++index; }
+    void decrement() { --index; }
+    void advance(ssize_t n) { index += n; }
+    bool equal(const sequence_slow_readwrite &b) const { return index == b.index; }
+    ssize_t distance_to(const sequence_slow_readwrite &b) const { return index - b.index; }
+
 private:
-    handle dict, key, value;
-    ssize_t pos = 0;
+    handle obj;
+    ssize_t index;
 };
 
+/// Python's dictionary protocol permits this to be a forward iterator
+class dict_readonly {
+protected:
+    using iterator_category = std::forward_iterator_tag;
+    using value_type = std::pair<handle, handle>;
+    using reference = const value_type;
+    using pointer = arrow_proxy<const value_type>;
+
+    dict_readonly() = default;
+    dict_readonly(handle obj, ssize_t pos) : obj(obj), pos(pos) { increment(); }
+
+    reference dereference() const { return {key, value}; }
+    void increment() { if (!PyDict_Next(obj.ptr(), &pos, &key, &value)) { pos = -1; } }
+    bool equal(const dict_readonly &b) const { return pos == b.pos; }
+
+private:
+    handle obj;
+    PyObject *key, *value;
+    ssize_t pos = -1;
+};
+NAMESPACE_END(iterator_policies)
+
+#if !defined(PYPY_VERSION)
+using tuple_iterator = generic_iterator<iterator_policies::sequence_fast_readonly>;
+using list_iterator = generic_iterator<iterator_policies::sequence_fast_readonly>;
+#else
+using tuple_iterator = generic_iterator<iterator_policies::sequence_slow_readwrite>;
+using list_iterator = generic_iterator<iterator_policies::sequence_slow_readwrite>;
+#endif
+
+using sequence_iterator = generic_iterator<iterator_policies::sequence_slow_readwrite>;
+using dict_iterator = generic_iterator<iterator_policies::dict_readonly>;
+
 inline bool PyIterable_Check(PyObject *obj) {
     PyObject *iter = PyObject_GetIter(obj);
     if (iter) {
@@ -417,10 +708,10 @@ public:
 template <typename T> using is_keyword = std::is_base_of<arg, T>;
 template <typename T> using is_s_unpacking = std::is_same<args_proxy, T>; // * unpacking
 template <typename T> using is_ds_unpacking = std::is_same<kwargs_proxy, T>; // ** unpacking
-template <typename T> using is_positional = none_of<
-    is_keyword<T>, is_s_unpacking<T>, is_ds_unpacking<T>
+template <typename T> using is_positional = satisfies_none_of<T,
+    is_keyword, is_s_unpacking, is_ds_unpacking
 >;
-template <typename T> using is_keyword_or_ds = any_of<is_keyword<T>, is_ds_unpacking<T>>;
+template <typename T> using is_keyword_or_ds = satisfies_any_of<T, is_keyword, is_ds_unpacking>;
 
 // Call argument collector forward declarations
 template <return_value_policy policy = return_value_policy::automatic_reference>
@@ -437,17 +728,24 @@ NAMESPACE_END(detail)
 #define PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun) \
     public: \
         PYBIND11_DEPRECATED("Use reinterpret_borrow<"#Name">() or reinterpret_steal<"#Name">()") \
-        Name(handle h, bool is_borrowed) : Parent(is_borrowed ? Parent(h, borrowed) : Parent(h, stolen)) { } \
-        Name(handle h, borrowed_t) : Parent(h, borrowed) { } \
-        Name(handle h, stolen_t) : Parent(h, stolen) { } \
+        Name(handle h, bool is_borrowed) : Parent(is_borrowed ? Parent(h, borrowed_t{}) : Parent(h, stolen_t{})) { } \
+        Name(handle h, borrowed_t) : Parent(h, borrowed_t{}) { } \
+        Name(handle h, stolen_t) : Parent(h, stolen_t{}) { } \
         PYBIND11_DEPRECATED("Use py::isinstance<py::python_type>(obj) instead") \
         bool check() const { return m_ptr != nullptr && (bool) CheckFun(m_ptr); } \
-        static bool _check(handle h) { return h.ptr() != nullptr && CheckFun(h.ptr()); }
+        static bool check_(handle h) { return h.ptr() != nullptr && CheckFun(h.ptr()); }
 
 #define PYBIND11_OBJECT_CVT(Name, Parent, CheckFun, ConvertFun) \
     PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun) \
     /* This is deliberately not 'explicit' to allow implicit conversion from object: */ \
-    Name(const object &o) : Parent(ConvertFun(o.ptr()), stolen) { if (!m_ptr) throw error_already_set(); }
+    Name(const object &o) \
+    : Parent(check_(o) ? o.inc_ref().ptr() : ConvertFun(o.ptr()), stolen_t{}) \
+    { if (!m_ptr) throw error_already_set(); } \
+    Name(object &&o) \
+    : Parent(check_(o) ? o.release().ptr() : ConvertFun(o.ptr()), stolen_t{}) \
+    { if (!m_ptr) throw error_already_set(); } \
+    template <typename Policy_> \
+    Name(const ::pybind11::detail::accessor<Policy_> &a) : Name(object(a)) { }
 
 #define PYBIND11_OBJECT(Name, Parent, CheckFun) \
     PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun) \
@@ -459,47 +757,74 @@ NAMESPACE_END(detail)
     PYBIND11_OBJECT(Name, Parent, CheckFun) \
     Name() : Parent() { }
 
+/// \addtogroup pytypes
+/// @{
+
+/** \rst
+    Wraps a Python iterator so that it can also be used as a C++ input iterator
+
+    Caveat: copying an iterator does not (and cannot) clone the internal
+    state of the Python iterable. This also applies to the post-increment
+    operator. This iterator should only be used to retrieve the current
+    value using ``operator*()``.
+\endrst */
 class iterator : public object {
 public:
-    /** Caveat: copying an iterator does not (and cannot) clone the internal
-        state of the Python iterable */
+    using iterator_category = std::input_iterator_tag;
+    using difference_type = ssize_t;
+    using value_type = handle;
+    using reference = const handle;
+    using pointer = const handle *;
+
     PYBIND11_OBJECT_DEFAULT(iterator, object, PyIter_Check)
 
     iterator& operator++() {
-        if (m_ptr)
-            advance();
+        advance();
         return *this;
     }
 
-    /** Caveat: this postincrement operator does not (and cannot) clone the
-        internal state of the Python iterable. It should only be used to
-        retrieve the current iterate using <tt>operator*()</tt> */
     iterator operator++(int) {
-        iterator rv(*this);
-        rv.value = value;
-        if (m_ptr)
-            advance();
+        auto rv = *this;
+        advance();
         return rv;
     }
 
-    bool operator==(const iterator &it) const { return *it == **this; }
-    bool operator!=(const iterator &it) const { return *it != **this; }
-
-    handle operator*() const {
-        if (!ready && m_ptr) {
+    reference operator*() const {
+        if (m_ptr && !value.ptr()) {
             auto& self = const_cast<iterator &>(*this);
             self.advance();
-            self.ready = true;
         }
         return value;
     }
 
+    pointer operator->() const { operator*(); return &value; }
+
+    /** \rst
+         The value which marks the end of the iteration. ``it == iterator::sentinel()``
+         is equivalent to catching ``StopIteration`` in Python.
+
+         .. code-block:: cpp
+
+             void foo(py::iterator it) {
+                 while (it != py::iterator::sentinel()) {
+                    // use `*it`
+                    ++it;
+                 }
+             }
+    \endrst */
+    static iterator sentinel() { return {}; }
+
+    friend bool operator==(const iterator &a, const iterator &b) { return a->ptr() == b->ptr(); }
+    friend bool operator!=(const iterator &a, const iterator &b) { return a->ptr() != b->ptr(); }
+
 private:
-    void advance() { value = reinterpret_steal<object>(PyIter_Next(m_ptr)); }
+    void advance() {
+        value = reinterpret_steal<object>(PyIter_Next(m_ptr));
+        if (PyErr_Occurred()) { throw error_already_set(); }
+    }
 
 private:
     object value = {};
-    bool ready = false;
 };
 
 class iterable : public object {
@@ -514,13 +839,13 @@ public:
     PYBIND11_OBJECT_CVT(str, object, detail::PyUnicode_Check_Permissive, raw_str)
 
     str(const char *c, size_t n)
-        : object(PyUnicode_FromStringAndSize(c, (ssize_t) n), stolen) {
+        : object(PyUnicode_FromStringAndSize(c, (ssize_t) n), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate string object!");
     }
 
     // 'explicit' is explicitly omitted from the following constructors to allow implicit conversion to py::str from C++ string-like objects
     str(const char *c = "")
-        : object(PyUnicode_FromString(c), stolen) {
+        : object(PyUnicode_FromString(c), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate string object!");
     }
 
@@ -528,7 +853,11 @@ public:
 
     explicit str(const bytes &b);
 
-    explicit str(handle h) : object(raw_str(h.ptr()), stolen) { }
+    /** \rst
+        Return a string representation of the object. This is analogous to
+        the ``str()`` function in Python.
+    \endrst */
+    explicit str(handle h) : object(raw_str(h.ptr()), stolen_t{}) { }
 
     operator std::string() const {
         object temp = *this;
@@ -561,24 +890,29 @@ private:
         return str_value;
     }
 };
+/// @} pytypes
 
 inline namespace literals {
-/// String literal version of str
+/** \rst
+    String literal version of `str`
+ \endrst */
 inline str operator"" _s(const char *s, size_t size) { return {s, size}; }
 }
 
+/// \addtogroup pytypes
+/// @{
 class bytes : public object {
 public:
     PYBIND11_OBJECT(bytes, object, PYBIND11_BYTES_CHECK)
 
     // Allow implicit conversion:
     bytes(const char *c = "")
-        : object(PYBIND11_BYTES_FROM_STRING(c), stolen) {
+        : object(PYBIND11_BYTES_FROM_STRING(c), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate bytes object!");
     }
 
     bytes(const char *c, size_t n)
-        : object(PYBIND11_BYTES_FROM_STRING_AND_SIZE(c, (ssize_t) n), stolen) {
+        : object(PYBIND11_BYTES_FROM_STRING_AND_SIZE(c, (ssize_t) n), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate bytes object!");
     }
 
@@ -627,15 +961,15 @@ inline str::str(const bytes& b) {
 class none : public object {
 public:
     PYBIND11_OBJECT(none, object, detail::PyNone_Check)
-    none() : object(Py_None, borrowed) { }
+    none() : object(Py_None, borrowed_t{}) { }
 };
 
 class bool_ : public object {
 public:
     PYBIND11_OBJECT_CVT(bool_, object, PyBool_Check, raw_bool)
-    bool_() : object(Py_False, borrowed) { }
+    bool_() : object(Py_False, borrowed_t{}) { }
     // Allow implicit conversion from and to `bool`:
-    bool_(bool value) : object(value ? Py_True : Py_False, borrowed) { }
+    bool_(bool value) : object(value ? Py_True : Py_False, borrowed_t{}) { }
     operator bool() const { return m_ptr && PyLong_AsLong(m_ptr) != 0; }
 
 private:
@@ -647,10 +981,32 @@ private:
     }
 };
 
+NAMESPACE_BEGIN(detail)
+// Converts a value to the given unsigned type.  If an error occurs, you get back (Unsigned) -1;
+// otherwise you get back the unsigned long or unsigned long long value cast to (Unsigned).
+// (The distinction is critically important when casting a returned -1 error value to some other
+// unsigned type: (A)-1 != (B)-1 when A and B are unsigned types of different sizes).
+template <typename Unsigned>
+Unsigned as_unsigned(PyObject *o) {
+    if (sizeof(Unsigned) <= sizeof(unsigned long)
+#if PY_VERSION_HEX < 0x03000000
+            || PyInt_Check(o)
+#endif
+    ) {
+        unsigned long v = PyLong_AsUnsignedLong(o);
+        return v == (unsigned long) -1 && PyErr_Occurred() ? (Unsigned) -1 : (Unsigned) v;
+    }
+    else {
+        unsigned long long v = PyLong_AsUnsignedLongLong(o);
+        return v == (unsigned long long) -1 && PyErr_Occurred() ? (Unsigned) -1 : (Unsigned) v;
+    }
+}
+NAMESPACE_END(detail)
+
 class int_ : public object {
 public:
     PYBIND11_OBJECT_CVT(int_, object, PYBIND11_LONG_CHECK, PyNumber_Long)
-    int_() : object(PyLong_FromLong(0), stolen) { }
+    int_() : object(PyLong_FromLong(0), stolen_t{}) { }
     // Allow implicit conversion from C++ integral types:
     template <typename T,
               detail::enable_if_t<std::is_integral<T>::value, int> = 0>
@@ -672,17 +1028,11 @@ public:
     template <typename T,
               detail::enable_if_t<std::is_integral<T>::value, int> = 0>
     operator T() const {
-        if (sizeof(T) <= sizeof(long)) {
-            if (std::is_signed<T>::value)
-                return (T) PyLong_AsLong(m_ptr);
-            else
-                return (T) PyLong_AsUnsignedLong(m_ptr);
-        } else {
-            if (std::is_signed<T>::value)
-                return (T) PYBIND11_LONG_AS_LONGLONG(m_ptr);
-            else
-                return (T) PYBIND11_LONG_AS_UNSIGNED_LONGLONG(m_ptr);
-        }
+        return std::is_unsigned<T>::value
+            ? detail::as_unsigned<T>(m_ptr)
+            : sizeof(T) <= sizeof(long)
+              ? (T) PyLong_AsLong(m_ptr)
+              : (T) PYBIND11_LONG_AS_LONGLONG(m_ptr);
     }
 };
 
@@ -690,10 +1040,10 @@ class float_ : public object {
 public:
     PYBIND11_OBJECT_CVT(float_, object, PyFloat_Check, PyNumber_Float)
     // Allow implicit conversion from float/double:
-    float_(float value) : object(PyFloat_FromDouble((double) value), stolen) {
+    float_(float value) : object(PyFloat_FromDouble((double) value), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate float object!");
     }
-    float_(double value = .0) : object(PyFloat_FromDouble((double) value), stolen) {
+    float_(double value = .0) : object(PyFloat_FromDouble((double) value), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate float object!");
     }
     operator float() const { return (float) PyFloat_AsDouble(m_ptr); }
@@ -704,7 +1054,7 @@ class weakref : public object {
 public:
     PYBIND11_OBJECT_DEFAULT(weakref, object, PyWeakref_Check)
     explicit weakref(handle obj, handle callback = {})
-        : object(PyWeakref_NewRef(obj.ptr(), callback.ptr()), stolen) {
+        : object(PyWeakref_NewRef(obj.ptr(), callback.ptr()), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate weak reference!");
     }
 };
@@ -730,32 +1080,71 @@ class capsule : public object {
 public:
     PYBIND11_OBJECT_DEFAULT(capsule, object, PyCapsule_CheckExact)
     PYBIND11_DEPRECATED("Use reinterpret_borrow<capsule>() or reinterpret_steal<capsule>()")
-    capsule(PyObject *ptr, bool is_borrowed) : object(is_borrowed ? object(ptr, borrowed) : object(ptr, stolen)) { }
-    explicit capsule(const void *value, void (*destruct)(PyObject *) = nullptr)
-        : object(PyCapsule_New(const_cast<void*>(value), nullptr, destruct), stolen) {
-        if (!m_ptr) pybind11_fail("Could not allocate capsule object!");
+    capsule(PyObject *ptr, bool is_borrowed) : object(is_borrowed ? object(ptr, borrowed_t{}) : object(ptr, stolen_t{})) { }
+
+    explicit capsule(const void *value, const char *name = nullptr, void (*destructor)(PyObject *) = nullptr)
+        : object(PyCapsule_New(const_cast<void *>(value), name, destructor), stolen_t{}) {
+        if (!m_ptr)
+            pybind11_fail("Could not allocate capsule object!");
+    }
+
+    PYBIND11_DEPRECATED("Please pass a destructor that takes a void pointer as input")
+    capsule(const void *value, void (*destruct)(PyObject *))
+        : object(PyCapsule_New(const_cast<void*>(value), nullptr, destruct), stolen_t{}) {
+        if (!m_ptr)
+            pybind11_fail("Could not allocate capsule object!");
+    }
+
+    capsule(const void *value, void (*destructor)(void *)) { // wraps `value`; `destructor` receives the raw pointer
+        m_ptr = PyCapsule_New(const_cast<void *>(value), nullptr, [](PyObject *o) { // captureless trampoline passed as the capsule destructor
+            auto destructor = reinterpret_cast<void (*)(void *)>(PyCapsule_GetContext(o)); // user callback, stored as context below
+            void *ptr = PyCapsule_GetPointer(o, nullptr); // nullptr name matches the unnamed capsule created here
+            destructor(ptr);
+        });
+
+        if (!m_ptr)
+            pybind11_fail("Could not allocate capsule object!");
+
+        if (PyCapsule_SetContext(m_ptr, (void *) destructor) != 0) // stash the callback for the trampoline to read back
+            pybind11_fail("Could not set capsule context!");
     }
+
+    capsule(void (*destructor)()) { // capsule with no data: it only runs `destructor` from its capsule destructor
+        m_ptr = PyCapsule_New(reinterpret_cast<void *>(destructor), nullptr, [](PyObject *o) { // the callback itself is the stored pointer
+            auto destructor = reinterpret_cast<void (*)()>(PyCapsule_GetPointer(o, nullptr)); // recover the callback from the stored pointer
+            destructor();
+        });
+
+        if (!m_ptr)
+            pybind11_fail("Could not allocate capsule object!");
+    }
+
     template <typename T> operator T *() const {
-        T * result = static_cast<T *>(PyCapsule_GetPointer(m_ptr, nullptr));
+        auto name = this->name();
+        T * result = static_cast<T *>(PyCapsule_GetPointer(m_ptr, name));
         if (!result) pybind11_fail("Unable to extract capsule contents!");
         return result;
     }
+
+    const char *name() const { return PyCapsule_GetName(m_ptr); }
 };
 
 class tuple : public object {
 public:
     PYBIND11_OBJECT_CVT(tuple, object, PyTuple_Check, PySequence_Tuple)
-    explicit tuple(size_t size = 0) : object(PyTuple_New((ssize_t) size), stolen) {
+    explicit tuple(size_t size = 0) : object(PyTuple_New((ssize_t) size), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate tuple object!");
     }
     size_t size() const { return (size_t) PyTuple_Size(m_ptr); }
     detail::tuple_accessor operator[](size_t index) const { return {*this, index}; }
+    detail::tuple_iterator begin() const { return {*this, 0}; }
+    detail::tuple_iterator end() const { return {*this, PyTuple_GET_SIZE(m_ptr)}; }
 };
 
 class dict : public object {
 public:
     PYBIND11_OBJECT_CVT(dict, object, PyDict_Check, raw_dict)
-    dict() : object(PyDict_New(), stolen) {
+    dict() : object(PyDict_New(), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate dict object!");
     }
     template <typename... Args,
@@ -765,8 +1154,8 @@ public:
     explicit dict(Args &&...args) : dict(collector(std::forward<Args>(args)...).kwargs()) { }
 
     size_t size() const { return (size_t) PyDict_Size(m_ptr); }
-    detail::dict_iterator begin() const { return (++detail::dict_iterator(*this, 0)); }
-    detail::dict_iterator end() const { return detail::dict_iterator(); }
+    detail::dict_iterator begin() const { return {*this, 0}; }
+    detail::dict_iterator end() const { return {}; }
     void clear() const { PyDict_Clear(ptr()); }
     bool contains(handle key) const { return PyDict_Contains(ptr(), key.ptr()) == 1; }
     bool contains(const char *key) const { return PyDict_Contains(ptr(), pybind11::str(key).ptr()) == 1; }
@@ -782,19 +1171,23 @@ private:
 
 class sequence : public object {
 public:
-    PYBIND11_OBJECT(sequence, object, PySequence_Check)
+    PYBIND11_OBJECT_DEFAULT(sequence, object, PySequence_Check)
     size_t size() const { return (size_t) PySequence_Size(m_ptr); }
     detail::sequence_accessor operator[](size_t index) const { return {*this, index}; }
+    detail::sequence_iterator begin() const { return {*this, 0}; }
+    detail::sequence_iterator end() const { return {*this, PySequence_Size(m_ptr)}; }
 };
 
 class list : public object {
 public:
     PYBIND11_OBJECT_CVT(list, object, PyList_Check, PySequence_List)
-    explicit list(size_t size = 0) : object(PyList_New((ssize_t) size), stolen) {
+    explicit list(size_t size = 0) : object(PyList_New((ssize_t) size), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate list object!");
     }
     size_t size() const { return (size_t) PyList_Size(m_ptr); }
     detail::list_accessor operator[](size_t index) const { return {*this, index}; }
+    detail::list_iterator begin() const { return {*this, 0}; }
+    detail::list_iterator end() const { return {*this, PyList_GET_SIZE(m_ptr)}; }
     template <typename T> void append(T &&val) const {
         PyList_Append(m_ptr, detail::object_or_cast(std::forward<T>(val)).ptr());
     }
@@ -806,7 +1199,7 @@ class kwargs : public dict { PYBIND11_OBJECT_DEFAULT(kwargs, dict, PyDict_Check)
 class set : public object {
 public:
     PYBIND11_OBJECT_CVT(set, object, PySet_Check, PySet_New)
-    set() : object(PySet_New(nullptr), stolen) {
+    set() : object(PySet_New(nullptr), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate set object!");
     }
     size_t size() const { return (size_t) PySet_Size(m_ptr); }
@@ -819,10 +1212,13 @@ public:
 class function : public object {
 public:
     PYBIND11_OBJECT_DEFAULT(function, object, PyCallable_Check)
-    bool is_cpp_function() const {
+    handle cpp_function() const {
         handle fun = detail::get_function(m_ptr);
-        return fun && PyCFunction_Check(fun.ptr());
+        if (fun && PyCFunction_Check(fun.ptr()))
+            return fun;
+        return handle();
     }
+    bool is_cpp_function() const { return (bool) cpp_function(); }
 };
 
 class buffer : public object {
@@ -833,8 +1229,10 @@ public:
         int flags = PyBUF_STRIDES | PyBUF_FORMAT;
         if (writable) flags |= PyBUF_WRITABLE;
         Py_buffer *view = new Py_buffer();
-        if (PyObject_GetBuffer(m_ptr, view, flags) != 0)
+        if (PyObject_GetBuffer(m_ptr, view, flags) != 0) {
+            delete view;
             throw error_already_set();
+        }
         return buffer_info(view);
     }
 };
@@ -847,15 +1245,15 @@ public:
         static std::vector<Py_ssize_t> py_strides { };
         static std::vector<Py_ssize_t> py_shape { };
         buf.buf = info.ptr;
-        buf.itemsize = (Py_ssize_t) info.itemsize;
+        buf.itemsize = info.itemsize;
         buf.format = const_cast<char *>(info.format.c_str());
         buf.ndim = (int) info.ndim;
-        buf.len = (Py_ssize_t) info.size;
+        buf.len = info.size;
         py_strides.clear();
         py_shape.clear();
-        for (size_t i = 0; i < info.ndim; ++i) {
-            py_strides.push_back((Py_ssize_t) info.strides[i]);
-            py_shape.push_back((Py_ssize_t) info.shape[i]);
+        for (size_t i = 0; i < (size_t) info.ndim; ++i) {
+            py_strides.push_back(info.strides[i]);
+            py_shape.push_back(info.shape[i]);
         }
         buf.strides = py_strides.data();
         buf.shape = py_shape.data();
@@ -870,7 +1268,10 @@ public:
 
     PYBIND11_OBJECT_CVT(memoryview, object, PyMemoryView_Check, PyMemoryView_FromObject)
 };
+/// @} pytypes
 
+/// \addtogroup python_builtins
+/// @{
 inline size_t len(handle h) {
     ssize_t result = PyObject_Length(h.ptr());
     if (result < 0)
@@ -889,13 +1290,16 @@ inline str repr(handle h) {
     return reinterpret_steal<str>(str_value);
 }
 
-NAMESPACE_BEGIN(detail)
-template <typename D> iterator object_api<D>::begin() const {
-    return reinterpret_steal<iterator>(PyObject_GetIter(derived().ptr()));
-}
-template <typename D> iterator object_api<D>::end() const {
-    return {};
+inline iterator iter(handle obj) { // returns an iterator over `obj`; throws error_already_set on failure
+    PyObject *result = PyObject_GetIter(obj.ptr());
+    if (!result) { throw error_already_set(); } // null result: surface the failure as a C++ exception
+    return reinterpret_steal<iterator>(result); // the returned iterator takes over `result` without an extra incref
 }
+/// @} python_builtins
+
+NAMESPACE_BEGIN(detail)
+template <typename D> iterator object_api<D>::begin() const { return iter(derived()); }
+template <typename D> iterator object_api<D>::end() const { return iterator::sentinel(); }
 template <typename D> item_accessor object_api<D>::operator[](handle key) const {
     return {derived(), reinterpret_borrow<object>(key)};
 }
@@ -911,15 +1315,18 @@ template <typename D> str_attr_accessor object_api<D>::attr(const char *key) con
 template <typename D> args_proxy object_api<D>::operator*() const {
     return args_proxy(derived().ptr());
 }
-template <typename D> template <typename T> bool object_api<D>::contains(T &&key) const {
-    return attr("__contains__")(std::forward<T>(key)).template cast<bool>();
+template <typename D> template <typename T> bool object_api<D>::contains(T &&item) const {
+    return attr("__contains__")(std::forward<T>(item)).template cast<bool>();
 }
 
 template <typename D>
 pybind11::str object_api<D>::str() const { return pybind11::str(derived()); }
 
+template <typename D>
+str_attr_accessor object_api<D>::doc() const { return attr("__doc__"); }
+
 template <typename D>
 handle object_api<D>::get_type() const { return (PyObject *) Py_TYPE(derived().ptr()); }
 
 NAMESPACE_END(detail)
-NAMESPACE_END(pybind11)
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/include/pybind11/stl.h b/pybind11/include/pybind11/stl.h
index 4b557bd16..db900e674 100644
--- a/pybind11/include/pybind11/stl.h
+++ b/pybind11/include/pybind11/stl.h
@@ -32,15 +32,36 @@
 // std::experimental::optional (but not allowed in c++11 mode)
 #  if defined(PYBIND11_CPP14) && __has_include(<experimental/optional>)
 #    include <experimental/optional>
-#    if __cpp_lib_experimental_optional  // just in case
-#      define PYBIND11_HAS_EXP_OPTIONAL 1
-#    endif
+#    define PYBIND11_HAS_EXP_OPTIONAL 1
 #  endif
+// std::variant
+#  if defined(PYBIND11_CPP17) && __has_include(<variant>)
+#    include <variant>
+#    define PYBIND11_HAS_VARIANT 1
+#  endif
+#elif defined(_MSC_VER) && defined(PYBIND11_CPP17)
+#  include <optional>
+#  include <variant>
+#  define PYBIND11_HAS_OPTIONAL 1
+#  define PYBIND11_HAS_VARIANT 1
 #endif
 
-NAMESPACE_BEGIN(pybind11)
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 NAMESPACE_BEGIN(detail)
 
+/// Yields an lvalue reference to U when T is an lvalue reference, and an rvalue reference to U
+/// otherwise (e.g. for forwarding a container element).  Typically used indirectly via forward_like(), below.
+template <typename T, typename U>
+using forwarded_type = conditional_t<
+    std::is_lvalue_reference<T>::value, remove_reference_t<U> &, remove_reference_t<U> &&>;
+
+/// Forwards a value `u` as an lvalue if T is an lvalue reference and as an rvalue otherwise;
+/// typically used for forwarding a container's elements with the container's own value category.
+template <typename T, typename U>
+forwarded_type<T, U> forward_like(U &&u) {
+    return std::forward<detail::forwarded_type<T, U>>(std::forward<U>(u));
+}
+
 template <typename Type, typename Key> struct set_caster {
     using type = Type;
     using key_conv = make_caster<Key>;
@@ -50,19 +71,20 @@ template <typename Type, typename Key> struct set_caster {
             return false;
         auto s = reinterpret_borrow<pybind11::set>(src);
         value.clear();
-        key_conv conv;
         for (auto entry : s) {
+            key_conv conv;
             if (!conv.load(entry, convert))
                 return false;
-            value.insert(cast_op<Key>(conv));
+            value.insert(cast_op<Key &&>(std::move(conv)));
         }
         return true;
     }
 
-    static handle cast(const type &src, return_value_policy policy, handle parent) {
+    template <typename T>
+    static handle cast(T &&src, return_value_policy policy, handle parent) {
         pybind11::set s;
-        for (auto const &value: src) {
-            auto value_ = reinterpret_steal<object>(key_conv::cast(value, policy, parent));
+        for (auto &&value : src) {
+            auto value_ = reinterpret_steal<object>(key_conv::cast(forward_like<T>(value), policy, parent));
             if (!value_ || !s.add(value_))
                 return handle();
         }
@@ -80,23 +102,24 @@ template <typename Type, typename Key, typename Value> struct map_caster {
         if (!isinstance<dict>(src))
             return false;
         auto d = reinterpret_borrow<dict>(src);
-        key_conv kconv;
-        value_conv vconv;
         value.clear();
         for (auto it : d) {
+            key_conv kconv;
+            value_conv vconv;
             if (!kconv.load(it.first.ptr(), convert) ||
                 !vconv.load(it.second.ptr(), convert))
                 return false;
-            value.emplace(cast_op<Key>(kconv), cast_op<Value>(vconv));
+            value.emplace(cast_op<Key &&>(std::move(kconv)), cast_op<Value &&>(std::move(vconv)));
         }
         return true;
     }
 
-    static handle cast(const Type &src, return_value_policy policy, handle parent) {
+    template <typename T>
+    static handle cast(T &&src, return_value_policy policy, handle parent) {
         dict d;
-        for (auto const &kv: src) {
-            auto key = reinterpret_steal<object>(key_conv::cast(kv.first, policy, parent));
-            auto value = reinterpret_steal<object>(value_conv::cast(kv.second, policy, parent));
+        for (auto &&kv : src) {
+            auto key = reinterpret_steal<object>(key_conv::cast(forward_like<T>(kv.first), policy, parent));
+            auto value = reinterpret_steal<object>(value_conv::cast(forward_like<T>(kv.second), policy, parent));
             if (!key || !value)
                 return handle();
             d[key] = value;
@@ -114,13 +137,13 @@ template <typename Type, typename Value> struct list_caster {
         if (!isinstance<sequence>(src))
             return false;
         auto s = reinterpret_borrow<sequence>(src);
-        value_conv conv;
         value.clear();
         reserve_maybe(s, &value);
         for (auto it : s) {
+            value_conv conv;
             if (!conv.load(it, convert))
                 return false;
-            value.push_back(cast_op<Value>(conv));
+            value.push_back(cast_op<Value &&>(std::move(conv)));
         }
         return true;
     }
@@ -132,11 +155,12 @@ private:
     void reserve_maybe(sequence, void *) { }
 
 public:
-    static handle cast(const Type &src, return_value_policy policy, handle parent) {
+    template <typename T>
+    static handle cast(T &&src, return_value_policy policy, handle parent) {
         list l(src.size());
         size_t index = 0;
-        for (auto const &value: src) {
-            auto value_ = reinterpret_steal<object>(value_conv::cast(value, policy, parent));
+        for (auto &&value : src) {
+            auto value_ = reinterpret_steal<object>(value_conv::cast(forward_like<T>(value), policy, parent));
             if (!value_)
                 return handle();
             PyList_SET_ITEM(l.ptr(), (ssize_t) index++, value_.release().ptr()); // steals a reference
@@ -175,21 +199,22 @@ public:
         auto l = reinterpret_borrow<list>(src);
         if (!require_size(l.size()))
             return false;
-        value_conv conv;
         size_t ctr = 0;
         for (auto it : l) {
+            value_conv conv;
             if (!conv.load(it, convert))
                 return false;
-            value[ctr++] = cast_op<Value>(conv);
+            value[ctr++] = cast_op<Value &&>(std::move(conv));
         }
         return true;
     }
 
-    static handle cast(const ArrayType &src, return_value_policy policy, handle parent) {
+    template <typename T>
+    static handle cast(T &&src, return_value_policy policy, handle parent) {
         list l(src.size());
         size_t index = 0;
-        for (auto const &value: src) {
-            auto value_ = reinterpret_steal<object>(value_conv::cast(value, policy, parent));
+        for (auto &&value : src) {
+            auto value_ = reinterpret_steal<object>(value_conv::cast(forward_like<T>(value), policy, parent));
             if (!value_)
                 return handle();
             PyList_SET_ITEM(l.ptr(), (ssize_t) index++, value_.release().ptr()); // steals a reference
@@ -222,24 +247,24 @@ template <typename Key, typename Value, typename Hash, typename Equal, typename
 template<typename T> struct optional_caster {
     using value_conv = make_caster<typename T::value_type>;
 
-    static handle cast(const T& src, return_value_policy policy, handle parent) {
+    template <typename T_>
+    static handle cast(T_ &&src, return_value_policy policy, handle parent) {
         if (!src)
             return none().inc_ref();
-        return value_conv::cast(*src, policy, parent);
+        return value_conv::cast(*std::forward<T_>(src), policy, parent);
     }
 
     bool load(handle src, bool convert) {
         if (!src) {
             return false;
         } else if (src.is_none()) {
-            value = {};  // nullopt
-            return true;
+            return true;  // default-constructed value is already empty
         }
         value_conv inner_caster;
         if (!inner_caster.load(src, convert))
             return false;
 
-        value.emplace(cast_op<typename T::value_type>(inner_caster));
+        value.emplace(cast_op<typename T::value_type &&>(std::move(inner_caster)));
         return true;
     }
 
@@ -262,6 +287,74 @@ template<> struct type_caster<std::experimental::nullopt_t>
     : public void_caster<std::experimental::nullopt_t> {};
 #endif
 
+/// Visitor that casts whichever alternative it is invoked with to Python via make_caster<T>::cast
+struct variant_caster_visitor {
+    return_value_policy policy; // passed through unchanged to make_caster<T>::cast
+    handle parent; // passed through unchanged to make_caster<T>::cast
+
+    using result_type = handle; // required by boost::variant in C++11
+
+    template <typename T>
+    result_type operator()(T &&src) const {
+        return make_caster<T>::cast(std::forward<T>(src), policy, parent);
+    }
+};
+
+/// Helper class which abstracts away variant's `visit` function. `std::variant` and similar
+/// `namespace::variant` types which provide a `namespace::visit()` function are handled here
+/// automatically using argument-dependent lookup. Users can provide specializations for other
+/// variant-like classes, e.g. `boost::variant` and `boost::apply_visitor`.
+template <template<typename...> class Variant>
+struct visit_helper {
+    template <typename... Args>
+    static auto call(Args &&...args) -> decltype(visit(std::forward<Args>(args)...)) {
+        return visit(std::forward<Args>(args)...); // unqualified on purpose so ADL finds the variant's own visit()
+    }
+};
+
+/// Generic variant caster; the primary template is only declared, see the V<Ts...> specialization
+template <typename Variant> struct variant_caster;
+
+template <template<typename...> class V, typename... Ts>
+struct variant_caster<V<Ts...>> {
+    static_assert(sizeof...(Ts) > 0, "Variant must consist of at least one alternative.");
+
+    template <typename U, typename... Us>
+    bool load_alternative(handle src, bool convert, type_list<U, Us...>) { // try U first, then recurse on Us...
+        auto caster = make_caster<U>();
+        if (caster.load(src, convert)) {
+            value = cast_op<U>(caster); // `value` is presumably the member declared by PYBIND11_TYPE_CASTER below
+            return true;
+        }
+        return load_alternative(src, convert, type_list<Us...>{}); // U did not load: move on to the remaining alternatives
+    }
+
+    bool load_alternative(handle, bool, type_list<>) { return false; } // recursion end: no alternative accepted src
+
+    bool load(handle src, bool convert) {
+        // Do a first pass without conversions to improve constructor resolution.
+        // E.g. `py::int_(1).cast<variant<double, int>>()` needs to fill the `int`
+        // slot of the variant. Without two-pass loading `double` would be filled
+        // because it appears first and a conversion is possible.
+        if (convert && load_alternative(src, false, type_list<Ts...>{}))
+            return true;
+        return load_alternative(src, convert, type_list<Ts...>{}); // second pass (or the only pass when convert is false)
+    }
+
+    template <typename Variant>
+    static handle cast(Variant &&src, return_value_policy policy, handle parent) {
+        return visit_helper<V>::call(variant_caster_visitor{policy, parent}, // the visitor casts the alternative it is given
+                                     std::forward<Variant>(src));
+    }
+
+    using Type = V<Ts...>;
+    PYBIND11_TYPE_CASTER(Type, _("Union[") + detail::concat(make_caster<Ts>::name()...) + _("]"));
+};
+
+#if PYBIND11_HAS_VARIANT
+template <typename... Ts>
+struct type_caster<std::variant<Ts...>> : variant_caster<std::variant<Ts...>> { };
+#endif
 NAMESPACE_END(detail)
 
 inline std::ostream &operator<<(std::ostream &os, const handle &obj) {
@@ -269,7 +362,7 @@ inline std::ostream &operator<<(std::ostream &os, const handle &obj) {
     return os;
 }
 
-NAMESPACE_END(pybind11)
+NAMESPACE_END(PYBIND11_NAMESPACE)
 
 #if defined(_MSC_VER)
 #pragma warning(pop)
diff --git a/pybind11/include/pybind11/stl_bind.h b/pybind11/include/pybind11/stl_bind.h
index d1d45e2c0..7ef687878 100644
--- a/pybind11/include/pybind11/stl_bind.h
+++ b/pybind11/include/pybind11/stl_bind.h
@@ -9,13 +9,13 @@
 
 #pragma once
 
-#include "common.h"
+#include "detail/common.h"
 #include "operators.h"
 
 #include <algorithm>
 #include <sstream>
 
-NAMESPACE_BEGIN(pybind11)
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
 NAMESPACE_BEGIN(detail)
 
 /* SFINAE helper class used by 'is_comparable */
@@ -66,11 +66,8 @@ template <typename, typename, typename... Args> void vector_if_insertion_operato
 template <typename, typename, typename... Args> void vector_modifiers(const Args &...) { }
 
 template<typename Vector, typename Class_>
-void vector_if_copy_constructible(enable_if_t<
-    std::is_copy_constructible<Vector>::value &&
-    std::is_copy_constructible<typename Vector::value_type>::value, Class_> &cl) {
-
-    cl.def(pybind11::init<const Vector &>(), "Copy constructor");
+void vector_if_copy_constructible(enable_if_t<is_copy_constructible<Vector>::value, Class_> &cl) {
+    cl.def(init<const Vector &>(), "Copy constructor");
 }
 
 template<typename Vector, typename Class_>
@@ -93,7 +90,7 @@ void vector_if_equal_operator(enable_if_t<is_comparable<Vector>::value, Class_>
             if (p != v.end())
                 v.erase(p);
             else
-                throw pybind11::value_error();
+                throw value_error();
         },
         arg("x"),
         "Remove the first item from the list whose value is x. "
@@ -113,7 +110,7 @@ void vector_if_equal_operator(enable_if_t<is_comparable<Vector>::value, Class_>
 // (Technically, some of these (pop and __delitem__) don't actually require copyability, but it seems
 // silly to allow deletion but not insertion, so include them here too.)
 template <typename Vector, typename Class_>
-void vector_modifiers(enable_if_t<std::is_copy_constructible<typename Vector::value_type>::value, Class_> &cl) {
+void vector_modifiers(enable_if_t<is_copy_constructible<typename Vector::value_type>::value, Class_> &cl) {
     using T = typename Vector::value_type;
     using SizeType = typename Vector::size_type;
     using DiffType = typename Vector::difference_type;
@@ -123,21 +120,16 @@ void vector_modifiers(enable_if_t<std::is_copy_constructible<typename Vector::va
            arg("x"),
            "Add an item to the end of the list");
 
-    cl.def("__init__", [](Vector &v, iterable it) {
-        new (&v) Vector();
-        try {
-            v.reserve(len(it));
-            for (handle h : it)
-               v.push_back(h.cast<T>());
-        } catch (...) {
-            v.~Vector();
-            throw;
-        }
-    });
+    cl.def(init([](iterable it) {
+        auto v = std::unique_ptr<Vector>(new Vector());
+        v->reserve(len(it));
+        for (handle h : it)
+           v->push_back(h.cast<T>());
+        return v.release();
+    }));
 
     cl.def("extend",
        [](Vector &v, const Vector &src) {
-           v.reserve(v.size() + src.size());
            v.insert(v.end(), src.begin(), src.end());
        },
        arg("L"),
@@ -146,6 +138,8 @@ void vector_modifiers(enable_if_t<std::is_copy_constructible<typename Vector::va
 
     cl.def("insert",
         [](Vector &v, SizeType i, const T &x) {
+            if (i > v.size())
+                throw index_error();
             v.insert(v.begin() + (DiffType) i, x);
         },
         arg("i") , arg("x"),
@@ -155,7 +149,7 @@ void vector_modifiers(enable_if_t<std::is_copy_constructible<typename Vector::va
     cl.def("pop",
         [](Vector &v) {
             if (v.empty())
-                throw pybind11::index_error();
+                throw index_error();
             T t = v.back();
             v.pop_back();
             return t;
@@ -166,7 +160,7 @@ void vector_modifiers(enable_if_t<std::is_copy_constructible<typename Vector::va
     cl.def("pop",
         [](Vector &v, SizeType i) {
             if (i >= v.size())
-                throw pybind11::index_error();
+                throw index_error();
             T t = v[i];
             v.erase(v.begin() + (DiffType) i);
             return t;
@@ -178,7 +172,7 @@ void vector_modifiers(enable_if_t<std::is_copy_constructible<typename Vector::va
     cl.def("__setitem__",
         [](Vector &v, SizeType i, const T &t) {
             if (i >= v.size())
-                throw pybind11::index_error();
+                throw index_error();
             v[i] = t;
         }
     );
@@ -189,7 +183,7 @@ void vector_modifiers(enable_if_t<std::is_copy_constructible<typename Vector::va
             size_t start, stop, step, slicelength;
 
             if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
-                throw pybind11::error_already_set();
+                throw error_already_set();
 
             Vector *seq = new Vector();
             seq->reserve((size_t) slicelength);
@@ -208,7 +202,7 @@ void vector_modifiers(enable_if_t<std::is_copy_constructible<typename Vector::va
         [](Vector &v, slice slice,  const Vector &value) {
             size_t start, stop, step, slicelength;
             if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
-                throw pybind11::error_already_set();
+                throw error_already_set();
 
             if (slicelength != value.size())
                 throw std::runtime_error("Left and right hand size of slice assignment have different sizes!");
@@ -224,7 +218,7 @@ void vector_modifiers(enable_if_t<std::is_copy_constructible<typename Vector::va
     cl.def("__delitem__",
         [](Vector &v, SizeType i) {
             if (i >= v.size())
-                throw pybind11::index_error();
+                throw index_error();
             v.erase(v.begin() + DiffType(i));
         },
         "Delete the list elements at index ``i``"
@@ -235,7 +229,7 @@ void vector_modifiers(enable_if_t<std::is_copy_constructible<typename Vector::va
             size_t start, stop, step, slicelength;
 
             if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
-                throw pybind11::error_already_set();
+                throw error_already_set();
 
             if (step == 1 && false) {
                 v.erase(v.begin() + (DiffType) start, v.begin() + DiffType(start + slicelength));
@@ -266,7 +260,7 @@ void vector_accessor(enable_if_t<!vector_needs_copy<Vector>::value, Class_> &cl)
     cl.def("__getitem__",
         [](Vector &v, SizeType i) -> T & {
             if (i >= v.size())
-                throw pybind11::index_error();
+                throw index_error();
             return v[i];
         },
         return_value_policy::reference_internal // ref + keepalive
@@ -274,7 +268,7 @@ void vector_accessor(enable_if_t<!vector_needs_copy<Vector>::value, Class_> &cl)
 
     cl.def("__iter__",
            [](Vector &v) {
-               return pybind11::make_iterator<
+               return make_iterator<
                    return_value_policy::reference_internal, ItType, ItType, T&>(
                    v.begin(), v.end());
            },
@@ -291,14 +285,14 @@ void vector_accessor(enable_if_t<vector_needs_copy<Vector>::value, Class_> &cl)
     cl.def("__getitem__",
         [](const Vector &v, SizeType i) -> T {
             if (i >= v.size())
-                throw pybind11::index_error();
+                throw index_error();
             return v[i];
         }
     );
 
     cl.def("__iter__",
            [](Vector &v) {
-               return pybind11::make_iterator<
+               return make_iterator<
                    return_value_policy::copy, ItType, ItType, T>(
                    v.begin(), v.end());
            },
@@ -326,18 +320,72 @@ template <typename Vector, typename Class_> auto vector_if_insertion_operator(Cl
     );
 }
 
+// Provide the buffer interface for vectors if we have data() and we have a format for it
+// GCC seems to have "void std::vector<bool>::data()" - doing SFINAE on the existence of data() is insufficient, we need to check it returns an appropriate pointer
+template <typename Vector, typename = void>
+struct vector_has_data_and_format : std::false_type {};
+template <typename Vector>
+struct vector_has_data_and_format<Vector, enable_if_t<std::is_same<decltype(format_descriptor<typename Vector::value_type>::format(), std::declval<Vector>().data()), typename Vector::value_type*>::value>> : std::true_type {};
+
+// Add the buffer interface to a vector
+template <typename Vector, typename Class_, typename... Args>
+enable_if_t<detail::any_of<std::is_same<Args, buffer_protocol>...>::value>
+vector_buffer(Class_& cl) {
+    using T = typename Vector::value_type;
+
+    static_assert(vector_has_data_and_format<Vector>::value, "There is not an appropriate format descriptor for this vector");
+
+    // numpy.h declares this for arbitrary types, but it may raise an exception and crash hard at runtime if PYBIND11_NUMPY_DTYPE hasn't been called, so check here
+    format_descriptor<T>::format();
+
+    cl.def_buffer([](Vector& v) -> buffer_info {
+        return buffer_info(v.data(), static_cast<ssize_t>(sizeof(T)), format_descriptor<T>::format(), 1, {v.size()}, {sizeof(T)});
+    });
+
+    cl.def(init([](buffer buf) {
+        auto info = buf.request();
+        if (info.ndim != 1 || info.strides[0] % static_cast<ssize_t>(sizeof(T)))
+            throw type_error("Only valid 1D buffers can be copied to a vector");
+        if (!detail::compare_buffer_info<T>::compare(info) || (ssize_t) sizeof(T) != info.itemsize)
+            throw type_error("Format mismatch (Python: " + info.format + " C++: " + format_descriptor<T>::format() + ")");
+
+        auto vec = std::unique_ptr<Vector>(new Vector());
+        vec->reserve((size_t) info.shape[0]);
+        T *p = static_cast<T*>(info.ptr);
+        ssize_t step = info.strides[0] / static_cast<ssize_t>(sizeof(T));
+        T *end = p + info.shape[0] * step;
+        for (; p != end; p += step)
+            vec->push_back(*p);
+        return vec.release();
+    }));
+
+    return;
+}
+
+template <typename Vector, typename Class_, typename... Args>
+enable_if_t<!detail::any_of<std::is_same<Args, buffer_protocol>...>::value> vector_buffer(Class_&) {}
+
 NAMESPACE_END(detail)
 
 //
 // std::vector
 //
 template <typename Vector, typename holder_type = std::unique_ptr<Vector>, typename... Args>
-pybind11::class_<Vector, holder_type> bind_vector(pybind11::module &m, std::string const &name, Args&&... args) {
-    using Class_ = pybind11::class_<Vector, holder_type>;
+class_<Vector, holder_type> bind_vector(handle scope, std::string const &name, Args&&... args) {
+    using Class_ = class_<Vector, holder_type>;
+
+    // If the value_type is unregistered (e.g. a converting type) or is itself registered
+    // module-local then make the vector binding module-local as well:
+    using vtype = typename Vector::value_type;
+    auto vtype_info = detail::get_type_info(typeid(vtype));
+    bool local = !vtype_info || vtype_info->module_local;
 
-    Class_ cl(m, name.c_str(), std::forward<Args>(args)...);
+    Class_ cl(scope, name.c_str(), pybind11::module_local(local), std::forward<Args>(args)...);
 
-    cl.def(pybind11::init<>());
+    // Declare the buffer interface if a buffer_protocol() is passed in
+    detail::vector_buffer<Vector, Class_, Args...>(cl);
+
+    cl.def(init<>());
 
     // Register copy constructor (if possible)
     detail::vector_if_copy_constructible<Vector, Class_>(cl);
@@ -368,7 +416,7 @@ pybind11::class_<Vector, holder_type> bind_vector(pybind11::module &m, std::stri
 
 #if 0
     // C++ style functions deprecated, leaving it here as an example
-    cl.def(pybind11::init<size_type>());
+    cl.def(init<size_type>());
 
     cl.def("resize",
          (void (Vector::*) (size_type count)) & Vector::resize,
@@ -377,7 +425,7 @@ pybind11::class_<Vector, holder_type> bind_vector(pybind11::module &m, std::stri
     cl.def("erase",
         [](Vector &v, SizeType i) {
         if (i >= v.size())
-            throw pybind11::index_error();
+            throw index_error();
         v.erase(v.begin() + i);
     }, "erases element at index ``i``");
 
@@ -396,12 +444,12 @@ pybind11::class_<Vector, holder_type> bind_vector(pybind11::module &m, std::stri
 
     cl.def("front", [](Vector &v) {
         if (v.size()) return v.front();
-        else throw pybind11::index_error();
+        else throw index_error();
     }, "access the first element");
 
     cl.def("back", [](Vector &v) {
         if (v.size()) return v.back();
-        else throw pybind11::index_error();
+        else throw index_error();
     }, "access the last element ");
 
 #endif
@@ -440,7 +488,7 @@ void map_assignment(enable_if_t<std::is_copy_assignable<typename Map::mapped_typ
 template<typename Map, typename Class_>
 void map_assignment(enable_if_t<
         !std::is_copy_assignable<typename Map::mapped_type>::value &&
-        std::is_copy_constructible<typename Map::mapped_type>::value,
+        is_copy_constructible<typename Map::mapped_type>::value,
         Class_> &cl) {
     using KeyType = typename Map::key_type;
     using MappedType = typename Map::mapped_type;
@@ -484,14 +532,24 @@ template <typename Map, typename Class_> auto map_if_insertion_operator(Class_ &
 NAMESPACE_END(detail)
 
 template <typename Map, typename holder_type = std::unique_ptr<Map>, typename... Args>
-pybind11::class_<Map, holder_type> bind_map(module &m, const std::string &name, Args&&... args) {
+class_<Map, holder_type> bind_map(handle scope, const std::string &name, Args&&... args) {
     using KeyType = typename Map::key_type;
     using MappedType = typename Map::mapped_type;
-    using Class_ = pybind11::class_<Map, holder_type>;
+    using Class_ = class_<Map, holder_type>;
+
+    // If either type is a non-module-local bound type then make the map binding non-local as well;
+    // otherwise (e.g. both types are either module-local or converting) the map will be
+    // module-local.
+    auto tinfo = detail::get_type_info(typeid(MappedType));
+    bool local = !tinfo || tinfo->module_local;
+    if (local) {
+        tinfo = detail::get_type_info(typeid(KeyType));
+        local = !tinfo || tinfo->module_local;
+    }
 
-    Class_ cl(m, name.c_str(), std::forward<Args>(args)...);
+    Class_ cl(scope, name.c_str(), pybind11::module_local(local), std::forward<Args>(args)...);
 
-    cl.def(pybind11::init<>());
+    cl.def(init<>());
 
     // Register stream insertion operator (if possible)
     detail::map_if_insertion_operator<Map, Class_>(cl, name);
@@ -502,20 +560,20 @@ pybind11::class_<Map, holder_type> bind_map(module &m, const std::string &name,
     );
 
     cl.def("__iter__",
-           [](Map &m) { return pybind11::make_key_iterator(m.begin(), m.end()); },
-           pybind11::keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
+           [](Map &m) { return make_key_iterator(m.begin(), m.end()); },
+           keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
     );
 
     cl.def("items",
-           [](Map &m) { return pybind11::make_iterator(m.begin(), m.end()); },
-           pybind11::keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
+           [](Map &m) { return make_iterator(m.begin(), m.end()); },
+           keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
     );
 
     cl.def("__getitem__",
         [](Map &m, const KeyType &k) -> MappedType & {
             auto it = m.find(k);
             if (it == m.end())
-              throw pybind11::key_error();
+              throw key_error();
            return it->second;
         },
         return_value_policy::reference_internal // ref + keepalive
@@ -528,7 +586,7 @@ pybind11::class_<Map, holder_type> bind_map(module &m, const std::string &name,
            [](Map &m, const KeyType &k) {
                auto it = m.find(k);
                if (it == m.end())
-                   throw pybind11::key_error();
+                   throw key_error();
                return m.erase(it);
            }
     );
@@ -538,4 +596,4 @@ pybind11::class_<Map, holder_type> bind_map(module &m, const std::string &name,
     return cl;
 }
 
-NAMESPACE_END(pybind11)
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/pybind11/pybind11/__main__.py b/pybind11/pybind11/__main__.py
new file mode 100644
index 000000000..9ef837802
--- /dev/null
+++ b/pybind11/pybind11/__main__.py
@@ -0,0 +1,37 @@
+from __future__ import print_function
+
+import argparse
+import sys
+import sysconfig
+
+from . import get_include
+
+
+def print_includes():
+    dirs = [sysconfig.get_path('include'),
+            sysconfig.get_path('platinclude'),
+            get_include(),
+            get_include(True)]
+
+    # Make unique but preserve order
+    unique_dirs = []
+    for d in dirs:
+        if d not in unique_dirs:
+            unique_dirs.append(d)
+
+    print(' '.join('-I' + d for d in unique_dirs))
+
+
+def main():
+    parser = argparse.ArgumentParser(prog='python -m pybind11')
+    parser.add_argument('--includes', action='store_true',
+                        help='Include flags for both pybind11 and Python headers.')
+    args = parser.parse_args()
+    if not sys.argv[1:]:
+        parser.print_help()
+    if args.includes:
+        print_includes()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/pybind11/pybind11/_version.py b/pybind11/pybind11/_version.py
index e67a37027..924115060 100644
--- a/pybind11/pybind11/_version.py
+++ b/pybind11/pybind11/_version.py
@@ -1,2 +1,2 @@
-version_info = (2, 0, 1)
+version_info = (2, 2, 1)
 __version__ = '.'.join(map(str, version_info))
diff --git a/pybind11/setup.py b/pybind11/setup.py
index a25f1af40..b76120573 100644
--- a/pybind11/setup.py
+++ b/pybind11/setup.py
@@ -3,28 +3,33 @@
 # Setup script for PyPI; use CMakeFile.txt to build extension modules
 
 from setuptools import setup
+from distutils.command.install_headers import install_headers
 from pybind11 import __version__
+import os
 
-setup(
-    name='pybind11',
-    version=__version__,
-    description='Seamless operability between C++11 and Python',
-    author='Wenzel Jakob',
-    author_email='wenzel.jakob@epfl.ch',
-    url='https://github.com/wjakob/pybind11',
-    download_url='https://github.com/wjakob/pybind11/tarball/v' + __version__,
-    packages=['pybind11'],
-    license='BSD',
-    headers=[
+# Prevent installation of pybind11 headers by setting
+# PYBIND11_USE_CMAKE.
+if os.environ.get('PYBIND11_USE_CMAKE'):
+    headers = []
+else:
+    headers = [
+        'include/pybind11/detail/class.h',
+        'include/pybind11/detail/common.h',
+        'include/pybind11/detail/descr.h',
+        'include/pybind11/detail/init.h',
+        'include/pybind11/detail/internals.h',
+        'include/pybind11/detail/typeid.h',
         'include/pybind11/attr.h',
+        'include/pybind11/buffer_info.h',
         'include/pybind11/cast.h',
         'include/pybind11/chrono.h',
         'include/pybind11/common.h',
         'include/pybind11/complex.h',
-        'include/pybind11/descr.h',
         'include/pybind11/eigen.h',
+        'include/pybind11/embed.h',
         'include/pybind11/eval.h',
         'include/pybind11/functional.h',
+        'include/pybind11/iostream.h',
         'include/pybind11/numpy.h',
         'include/pybind11/operators.h',
         'include/pybind11/options.h',
@@ -32,8 +37,36 @@ setup(
         'include/pybind11/pytypes.h',
         'include/pybind11/stl.h',
         'include/pybind11/stl_bind.h',
-        'include/pybind11/typeid.h'
-    ],
+    ]
+
+
+class InstallHeaders(install_headers):
+    """Use custom header installer because the default one flattens subdirectories"""
+    def run(self):
+        if not self.distribution.headers:
+            return
+
+        for header in self.distribution.headers:
+            subdir = os.path.dirname(os.path.relpath(header, 'include/pybind11'))
+            install_dir = os.path.join(self.install_dir, subdir)
+            self.mkpath(install_dir)
+
+            (out, _) = self.copy_file(header, install_dir)
+            self.outfiles.append(out)
+
+
+setup(
+    name='pybind11',
+    version=__version__,
+    description='Seamless operability between C++11 and Python',
+    author='Wenzel Jakob',
+    author_email='wenzel.jakob@epfl.ch',
+    url='https://github.com/wjakob/pybind11',
+    download_url='https://github.com/wjakob/pybind11/tarball/v' + __version__,
+    packages=['pybind11'],
+    license='BSD',
+    headers=headers,
+    cmdclass=dict(install_headers=InstallHeaders),
     classifiers=[
         'Development Status :: 5 - Production/Stable',
         'Intended Audience :: Developers',
diff --git a/pybind11/tests/CMakeLists.txt b/pybind11/tests/CMakeLists.txt
index 763ad54a0..25e06662c 100644
--- a/pybind11/tests/CMakeLists.txt
+++ b/pybind11/tests/CMakeLists.txt
@@ -1,3 +1,22 @@
+# CMakeLists.txt -- Build system for the pybind11 test suite
+#
+# Copyright (c) 2015 Wenzel Jakob <wenzel@inf.ethz.ch>
+#
+# All rights reserved. Use of this source code is governed by a
+# BSD-style license that can be found in the LICENSE file.
+
+cmake_minimum_required(VERSION 2.8.12)
+
+option(PYBIND11_WERROR  "Report all warnings as errors"  OFF)
+
+if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
+    # We're being loaded directly, i.e. not via add_subdirectory, so make this
+    # work as its own project and load the pybind11Config to get the tools we need
+    project(pybind11_tests CXX)
+
+    find_package(pybind11 REQUIRED CONFIG)
+endif()
+
 if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
   message(STATUS "Setting tests build type to MinSizeRel as none was specified")
   set(CMAKE_BUILD_TYPE MinSizeRel CACHE STRING "Choose the type of build." FORCE)
@@ -7,22 +26,23 @@ endif()
 
 # Full set of test files (you can override these; see below)
 set(PYBIND11_TEST_FILES
-  test_alias_initialization.cpp
   test_buffers.cpp
+  test_builtin_casters.cpp
+  test_call_policies.cpp
   test_callbacks.cpp
   test_chrono.cpp
-  test_class_args.cpp
+  test_class.cpp
   test_constants_and_functions.cpp
-  test_copy_move_policies.cpp
+  test_copy_move.cpp
   test_docstring_options.cpp
   test_eigen.cpp
   test_enum.cpp
   test_eval.cpp
   test_exceptions.cpp
-  test_inheritance.cpp
-  test_issues.cpp
-  test_keep_alive.cpp
+  test_factory_constructors.cpp
+  test_iostream.cpp
   test_kwargs_and_defaults.cpp
+  test_local_bindings.cpp
   test_methods_and_attributes.cpp
   test_modules.cpp
   test_multiple_inheritance.cpp
@@ -32,15 +52,16 @@ set(PYBIND11_TEST_FILES
   test_opaque_types.cpp
   test_operator_overloading.cpp
   test_pickling.cpp
-  test_python_types.cpp
+  test_pytypes.cpp
   test_sequences_and_iterators.cpp
   test_smart_ptr.cpp
+  test_stl.cpp
   test_stl_binders.cpp
   test_virtual_functions.cpp
 )
 
 # Invoking cmake with something like:
-#     cmake -DPYBIND11_TEST_OVERRIDE="test_issues.cpp;test_picking.cpp" ..
+#     cmake -DPYBIND11_TEST_OVERRIDE="test_callbacks.cpp;test_pickling.cpp" ..
 # lets you override the tests that get compiled and run.  You can restore to all tests with:
 #     cmake -DPYBIND11_TEST_OVERRIDE= ..
 if (PYBIND11_TEST_OVERRIDE)
@@ -49,14 +70,45 @@ endif()
 
 string(REPLACE ".cpp" ".py" PYBIND11_PYTEST_FILES "${PYBIND11_TEST_FILES}")
 
+# Contains the set of test files that require pybind11_cross_module_tests to be
+# built; if none of these are built (i.e. because TEST_OVERRIDE is used and
+# doesn't include them) the second module doesn't get built.
+set(PYBIND11_CROSS_MODULE_TESTS
+  test_exceptions.py
+  test_local_bindings.py
+  test_stl.py
+  test_stl_binders.py
+)
+
 # Check if Eigen is available; if not, remove from PYBIND11_TEST_FILES (but
 # keep it in PYBIND11_PYTEST_FILES, so that we get the "eigen is not installed"
 # skip message).
 list(FIND PYBIND11_TEST_FILES test_eigen.cpp PYBIND11_TEST_FILES_EIGEN_I)
 if(PYBIND11_TEST_FILES_EIGEN_I GREATER -1)
-  find_package(Eigen3 QUIET)
+  # Try loading via newer Eigen's Eigen3Config first (bypassing tools/FindEigen3.cmake).
+  # Eigen 3.3.1+ exports a cmake 3.0+ target for handling dependency requirements, but also
+  # produces a fatal error if loaded from a pre-3.0 cmake.
+  if (NOT CMAKE_VERSION VERSION_LESS 3.0)
+    find_package(Eigen3 QUIET CONFIG)
+    if (EIGEN3_FOUND)
+      if (EIGEN3_VERSION_STRING AND NOT EIGEN3_VERSION_STRING VERSION_LESS 3.3.1)
+        set(PYBIND11_EIGEN_VIA_TARGET 1)
+      endif()
+    endif()
+  endif()
+  if (NOT EIGEN3_FOUND)
+    # Couldn't load via target, so fall back to allowing module mode finding, which will pick up
+    # tools/FindEigen3.cmake
+    find_package(Eigen3 QUIET)
+  endif()
 
   if(EIGEN3_FOUND)
+    # Eigen 3.3.1+ cmake sets EIGEN3_VERSION_STRING (and hard codes the version when installed
+    # rather than looking it up in the cmake script); older versions, and the
+    # tools/FindEigen3.cmake, set EIGEN3_VERSION instead.
+    if(NOT EIGEN3_VERSION AND EIGEN3_VERSION_STRING)
+      set(EIGEN3_VERSION ${EIGEN3_VERSION_STRING})
+    endif()
     message(STATUS "Building tests with Eigen v${EIGEN3_VERSION}")
   else()
     list(REMOVE_AT PYBIND11_TEST_FILES ${PYBIND11_TEST_FILES_EIGEN_I})
@@ -64,101 +116,121 @@ if(PYBIND11_TEST_FILES_EIGEN_I GREATER -1)
   endif()
 endif()
 
-# Create the binding library
-pybind11_add_module(pybind11_tests pybind11_tests.cpp
-  ${PYBIND11_TEST_FILES} ${PYBIND11_HEADERS})
+# Optional dependency for some tests (boost::variant is only supported with version >= 1.56)
+find_package(Boost 1.56)
 
-pybind11_enable_warnings(pybind11_tests)
+# Compile with compiler warnings turned on
+function(pybind11_enable_warnings target_name)
+  if(MSVC)
+    target_compile_options(${target_name} PRIVATE /W4)
+  else()
+      target_compile_options(${target_name} PRIVATE -Wall -Wextra -Wconversion -Wcast-qual)
+  endif()
 
-if(EIGEN3_FOUND)
-  target_include_directories(pybind11_tests PRIVATE ${EIGEN3_INCLUDE_DIR})
-  target_compile_definitions(pybind11_tests PRIVATE -DPYBIND11_TEST_EIGEN)
-endif()
+  if(PYBIND11_WERROR)
+    if(MSVC)
+      target_compile_options(${target_name} PRIVATE /WX)
+    else()
+      target_compile_options(${target_name} PRIVATE -Werror)
+    endif()
+  endif()
+endfunction()
 
-set(testdir ${PROJECT_SOURCE_DIR}/tests)
+set(test_targets pybind11_tests)
 
-# Always write the output file directly into the 'tests' directory (even on MSVC)
-if(NOT CMAKE_LIBRARY_OUTPUT_DIRECTORY)
-  set_target_properties(pybind11_tests PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${testdir})
-  foreach(config ${CMAKE_CONFIGURATION_TYPES})
-    string(TOUPPER ${config} config)
-    set_target_properties(pybind11_tests PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${config} ${testdir})
-  endforeach()
-endif()
+# Build pybind11_cross_module_tests if any test_whatever.py are being built that require it
+foreach(t ${PYBIND11_CROSS_MODULE_TESTS})
+  list(FIND PYBIND11_PYTEST_FILES ${t} i)
+  if (i GREATER -1)
+    list(APPEND test_targets pybind11_cross_module_tests)
+    break()
+  endif()
+endforeach()
+
+set(testdir ${CMAKE_CURRENT_SOURCE_DIR})
+foreach(target ${test_targets})
+  set(test_files ${PYBIND11_TEST_FILES})
+  if(NOT target STREQUAL "pybind11_tests")
+    set(test_files "")
+  endif()
+
+  # Create the binding library
+  pybind11_add_module(${target} THIN_LTO ${target}.cpp ${test_files} ${PYBIND11_HEADERS})
+  pybind11_enable_warnings(${target})
+
+  if(MSVC)
+    target_compile_options(${target} PRIVATE /utf-8)
+  endif()
+
+  if(EIGEN3_FOUND)
+    if (PYBIND11_EIGEN_VIA_TARGET)
+      target_link_libraries(${target} PRIVATE Eigen3::Eigen)
+    else()
+      target_include_directories(${target} PRIVATE ${EIGEN3_INCLUDE_DIR})
+    endif()
+    target_compile_definitions(${target} PRIVATE -DPYBIND11_TEST_EIGEN)
+  endif()
+
+  if(Boost_FOUND)
+    target_include_directories(${target} PRIVATE ${Boost_INCLUDE_DIRS})
+    target_compile_definitions(${target} PRIVATE -DPYBIND11_TEST_BOOST)
+  endif()
+
+  # Always write the output file directly into the 'tests' directory (even on MSVC)
+  if(NOT CMAKE_LIBRARY_OUTPUT_DIRECTORY)
+    set_target_properties(${target} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${testdir})
+    foreach(config ${CMAKE_CONFIGURATION_TYPES})
+      string(TOUPPER ${config} config)
+      set_target_properties(${target} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${config} ${testdir})
+    endforeach()
+  endif()
+endforeach()
 
 # Make sure pytest is found or produce a fatal error
 if(NOT PYBIND11_PYTEST_FOUND)
-  execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import pytest" OUTPUT_QUIET ERROR_QUIET
-                  RESULT_VARIABLE PYBIND11_EXEC_PYTHON_ERR)
-  if(PYBIND11_EXEC_PYTHON_ERR)
-    message(FATAL_ERROR "Running the tests requires pytest.  Please install it manually (try: ${PYTHON_EXECUTABLE} -m pip install pytest)")
+  execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import pytest; print(pytest.__version__)"
+                  RESULT_VARIABLE pytest_not_found OUTPUT_VARIABLE pytest_version ERROR_QUIET)
+  if(pytest_not_found)
+    message(FATAL_ERROR "Running the tests requires pytest. Please install it manually"
+                        " (try: ${PYTHON_EXECUTABLE} -m pip install pytest)")
+  elseif(pytest_version VERSION_LESS 3.0)
+    message(FATAL_ERROR "Running the tests requires pytest >= 3.0. Found: ${pytest_version}"
+                        "Please update it (try: ${PYTHON_EXECUTABLE} -m pip install -U pytest)")
   endif()
   set(PYBIND11_PYTEST_FOUND TRUE CACHE INTERNAL "")
 endif()
 
+if(CMAKE_VERSION VERSION_LESS 3.2)
+  set(PYBIND11_USES_TERMINAL "")
+else()
+  set(PYBIND11_USES_TERMINAL "USES_TERMINAL")
+endif()
+
 # A single command to compile and run the tests
-add_custom_target(pytest COMMAND ${PYTHON_EXECUTABLE} -m pytest -rws ${PYBIND11_PYTEST_FILES}
-                  DEPENDS pybind11_tests WORKING_DIRECTORY ${testdir})
+add_custom_target(pytest COMMAND ${PYTHON_EXECUTABLE} -m pytest ${PYBIND11_PYTEST_FILES}
+                  DEPENDS ${test_targets} WORKING_DIRECTORY ${testdir} ${PYBIND11_USES_TERMINAL})
 
 if(PYBIND11_TEST_OVERRIDE)
   add_custom_command(TARGET pytest POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E echo "Note: not all tests run: -DPYBIND11_TEST_OVERRIDE is in effect")
 endif()
 
-# And another to show the .so size and, if a previous size, compare it:
-add_custom_command(TARGET pybind11_tests POST_BUILD
-  COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/libsize.py
-  $<TARGET_FILE:pybind11_tests> ${CMAKE_CURRENT_BINARY_DIR}/sosize-$<TARGET_FILE_NAME:pybind11_tests>.txt)
+# Add a check target to run all the tests, starting with pytest (we add dependencies to this below)
+add_custom_target(check DEPENDS pytest)
 
-# Test CMake build using functions and targets from subdirectory or installed location
-add_custom_target(test_cmake_build)
-if(NOT CMAKE_VERSION VERSION_LESS 3.1)
-  # 3.0 needed for interface library for subdirectory_target/installed_target
-  # 3.1 needed for cmake -E env for testing
-
-  include(CMakeParseArguments)
-  function(pybind11_add_build_test name)
-    cmake_parse_arguments(ARG "INSTALL" "" "" ${ARGN})
-
-    set(build_options "-DCMAKE_PREFIX_PATH=${PROJECT_BINARY_DIR}/mock_install"
-                      "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
-                      "-DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}"
-                      "-DPYBIND11_CPP_STANDARD=${PYBIND11_CPP_STANDARD}")
-    if(NOT ARG_INSTALL)
-      list(APPEND build_options "-DPYBIND11_PROJECT_DIR=${PROJECT_SOURCE_DIR}")
-    endif()
-
-    add_custom_target(test_${name} ${CMAKE_CTEST_COMMAND}
-      --quiet --output-log test_cmake_build/${name}.log
-      --build-and-test "${CMAKE_CURRENT_SOURCE_DIR}/test_cmake_build/${name}"
-                       "${CMAKE_CURRENT_BINARY_DIR}/test_cmake_build/${name}"
-      --build-config Release
-      --build-noclean
-      --build-generator ${CMAKE_GENERATOR}
-      $<$<BOOL:${CMAKE_GENERATOR_PLATFORM}>:--build-generator-platform> ${CMAKE_GENERATOR_PLATFORM}
-      --build-makeprogram ${CMAKE_MAKE_PROGRAM}
-      --build-target check
-      --build-options ${build_options}
-    )
-    if(ARG_INSTALL)
-      add_dependencies(test_${name} mock_install)
-    endif()
-    add_dependencies(test_cmake_build test_${name})
-  endfunction()
-
-  pybind11_add_build_test(subdirectory_function)
-  pybind11_add_build_test(subdirectory_target)
+# The remaining tests only apply when being built as part of the pybind11 project, but not if the
+# tests are being built independently.
+if (NOT PROJECT_NAME STREQUAL "pybind11")
+  return()
+endif()
 
-  if(PYBIND11_INSTALL)
-    add_custom_target(mock_install ${CMAKE_COMMAND}
-      "-DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR}/mock_install"
-      -P "${PROJECT_BINARY_DIR}/cmake_install.cmake"
-    )
+# Add a post-build command to show the primary test suite .so size and, if there is a previous size, compare it:
+add_custom_command(TARGET pybind11_tests POST_BUILD
+  COMMAND ${PYTHON_EXECUTABLE} ${PROJECT_SOURCE_DIR}/tools/libsize.py
+  $<TARGET_FILE:pybind11_tests> ${CMAKE_CURRENT_BINARY_DIR}/sosize-$<TARGET_FILE_NAME:pybind11_tests>.txt)
 
-    pybind11_add_build_test(installed_function INSTALL)
-    pybind11_add_build_test(installed_target INSTALL)
-  endif()
-endif()
+# Test embedding the interpreter. Provides the `cpptest` target.
+add_subdirectory(test_embed)
 
-# Run all the tests
-add_custom_target(check DEPENDS pytest test_cmake_build)
+# Test CMake build using functions and targets from subdirectory or installed location
+add_subdirectory(test_cmake_build)
diff --git a/pybind11/tests/conftest.py b/pybind11/tests/conftest.py
index b69fd6cb2..f4c228260 100644
--- a/pybind11/tests/conftest.py
+++ b/pybind11/tests/conftest.py
@@ -103,9 +103,9 @@ class Capture(object):
 
 
 @pytest.fixture
-def capture(capfd):
-    """Extended `capfd` with context manager and custom equality operators"""
-    return Capture(capfd)
+def capture(capsys):
+    """Extended `capsys` with context manager and custom equality operators"""
+    return Capture(capsys)
 
 
 class SanitizedString(object):
@@ -196,7 +196,7 @@ def pytest_namespace():
     except ImportError:
         scipy = None
     try:
-        from pybind11_tests import have_eigen
+        from pybind11_tests.eigen import have_eigen
     except ImportError:
         have_eigen = False
     pypy = platform.python_implementation() == "PyPy"
@@ -211,6 +211,8 @@ def pytest_namespace():
         'requires_eigen_and_scipy': skipif(not have_eigen or not scipy,
                                            reason="eigen and/or scipy are not installed"),
         'unsupported_on_pypy': skipif(pypy, reason="unsupported on PyPy"),
+        'unsupported_on_py2': skipif(sys.version_info.major < 3,
+                                     reason="unsupported on Python 2.x"),
         'gc_collect': gc_collect
     }
 
diff --git a/pybind11/tests/constructor_stats.h b/pybind11/tests/constructor_stats.h
index de5c133c1..babded032 100644
--- a/pybind11/tests/constructor_stats.h
+++ b/pybind11/tests/constructor_stats.h
@@ -24,7 +24,7 @@ function calls to constructors:
         ...
     }
 
-You can find various examples of these in several of the existing example .cpp files.  (Of course
+You can find various examples of these in several of the existing testing .cpp files.  (Of course
 you don't need to add any of the above constructors/operators that you don't actually have, except
 for the destructor).
 
@@ -41,7 +41,7 @@ value constructor) for all of the above methods which will be included in the ou
 For testing, each of these also keeps track the created instances and allows you to check how many
 of the various constructors have been invoked from the Python side via code such as:
 
-    from example import ConstructorStats
+    from pybind11_tests import ConstructorStats
     cstats = ConstructorStats.get(MyClass)
     print(cstats.alive())
     print(cstats.default_constructions)
@@ -169,7 +169,7 @@ public:
         auto &internals = py::detail::get_internals();
         const std::type_index *t1 = nullptr, *t2 = nullptr;
         try {
-            auto *type_info = internals.registered_types_py.at(class_.ptr());
+            auto *type_info = internals.registered_types_py.at((PyTypeObject *) class_.ptr()).at(0);
             for (auto &p : internals.registered_types_cpp) {
                 if (p.second == type_info) {
                     if (t1) {
diff --git a/pybind11/tests/local_bindings.h b/pybind11/tests/local_bindings.h
new file mode 100644
index 000000000..b6afb8086
--- /dev/null
+++ b/pybind11/tests/local_bindings.h
@@ -0,0 +1,64 @@
+#pragma once
+#include "pybind11_tests.h"
+
+/// Simple class used to test py::module_local:
+template <int> class LocalBase {
+public:
+    LocalBase(int i) : i(i) { }
+    int i = -1;
+};
+
+/// Registered with py::module_local in both main and secondary modules:
+using LocalType = LocalBase<0>;
+/// Registered without py::module_local in both modules:
+using NonLocalType = LocalBase<1>;
+/// A second non-local type (for stl_bind tests):
+using NonLocal2 = LocalBase<2>;
+/// Tests within-module, different-compilation-unit local definition conflict:
+using LocalExternal = LocalBase<3>;
+/// Mixed: registered local first, then global
+using MixedLocalGlobal = LocalBase<4>;
+/// Mixed: global first, then local
+using MixedGlobalLocal = LocalBase<5>;
+
+/// Registered with py::module_local only in the secondary module:
+using ExternalType1 = LocalBase<6>;
+using ExternalType2 = LocalBase<7>;
+
+using LocalVec = std::vector<LocalType>;
+using LocalVec2 = std::vector<NonLocal2>;
+using LocalMap = std::unordered_map<std::string, LocalType>;
+using NonLocalVec = std::vector<NonLocalType>;
+using NonLocalVec2 = std::vector<NonLocal2>;
+using NonLocalMap = std::unordered_map<std::string, NonLocalType>;
+using NonLocalMap2 = std::unordered_map<std::string, uint8_t>;
+
+PYBIND11_MAKE_OPAQUE(LocalVec);
+PYBIND11_MAKE_OPAQUE(LocalVec2);
+PYBIND11_MAKE_OPAQUE(LocalMap);
+PYBIND11_MAKE_OPAQUE(NonLocalVec);
+//PYBIND11_MAKE_OPAQUE(NonLocalVec2); // same type as LocalVec2
+PYBIND11_MAKE_OPAQUE(NonLocalMap);
+PYBIND11_MAKE_OPAQUE(NonLocalMap2);
+
+
+// Simple bindings (used with the above):
+template <typename T, int Adjust = 0, typename... Args>
+py::class_<T> bind_local(Args && ...args) {
+    return py::class_<T>(std::forward<Args>(args)...)
+        .def(py::init<int>())
+        .def("get", [](T &i) { return i.i + Adjust; });
+};
+
+// Simulate a foreign library base class (to match the example in the docs):
+namespace pets {
+class Pet {
+public:
+    Pet(std::string name) : name_(name) {}
+    std::string name_;
+    const std::string &name() { return name_; }
+};
+}
+
+struct MixGL { int i; MixGL(int i) : i{i} {} };
+struct MixGL2 { int i; MixGL2(int i) : i{i} {} };
diff --git a/pybind11/tests/object.h b/pybind11/tests/object.h
index 753f654b2..9235f19c2 100644
--- a/pybind11/tests/object.h
+++ b/pybind11/tests/object.h
@@ -164,10 +164,10 @@ public:
     operator T* () { return m_ptr; }
 
     /// Return a const pointer to the referenced object
-    T* get() { return m_ptr; }
+    T* get_ptr() { return m_ptr; }
 
     /// Return a pointer to the referenced object
-    const T* get() const { return m_ptr; }
+    const T* get_ptr() const { return m_ptr; }
 private:
     T *m_ptr;
 };
diff --git a/pybind11/tests/pybind11_cross_module_tests.cpp b/pybind11/tests/pybind11_cross_module_tests.cpp
new file mode 100644
index 000000000..f705e3106
--- /dev/null
+++ b/pybind11/tests/pybind11_cross_module_tests.cpp
@@ -0,0 +1,123 @@
+/*
+    tests/pybind11_cross_module_tests.cpp -- contains tests that require multiple modules
+
+    Copyright (c) 2017 Jason Rhinelander <jason@imaginary.ca>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "local_bindings.h"
+#include <pybind11/stl_bind.h>
+#include <numeric>
+
+PYBIND11_MODULE(pybind11_cross_module_tests, m) {
+    m.doc() = "pybind11 cross-module test module";
+
+    // test_local_bindings.py tests:
+    //
+    // Definitions here are tested by importing both this module and the
+    // relevant pybind11_tests submodule from a test_whatever.py
+
+    // test_load_external
+    bind_local<ExternalType1>(m, "ExternalType1", py::module_local());
+    bind_local<ExternalType2>(m, "ExternalType2", py::module_local());
+
+    // test_exceptions.py
+    m.def("raise_runtime_error", []() { PyErr_SetString(PyExc_RuntimeError, "My runtime error"); throw py::error_already_set(); });
+    m.def("raise_value_error", []() { PyErr_SetString(PyExc_ValueError, "My value error"); throw py::error_already_set(); });
+    m.def("throw_pybind_value_error", []() { throw py::value_error("pybind11 value error"); });
+    m.def("throw_pybind_type_error", []() { throw py::type_error("pybind11 type error"); });
+    m.def("throw_stop_iteration", []() { throw py::stop_iteration(); });
+
+    // test_local_bindings.py
+    // Local to both:
+    bind_local<LocalType, 1>(m, "LocalType", py::module_local())
+        .def("get2", [](LocalType &t) { return t.i + 2; })
+        ;
+
+    // Can only be called with our python type:
+    m.def("local_value", [](LocalType &l) { return l.i; });
+
+    // test_nonlocal_failure
+    // This registration will fail (global registration when NonLocalType is already registered
+    // globally in the main test module):
+    m.def("register_nonlocal", [m]() {
+        bind_local<NonLocalType, 0>(m, "NonLocalType");
+    });
+
+    // test_stl_bind_local
+    // stl_bind.h binders default to py::module_local if the types are local or converting:
+    py::bind_vector<LocalVec>(m, "LocalVec");
+    py::bind_map<LocalMap>(m, "LocalMap");
+
+    // test_stl_bind_global
+    // and global if the type (or one of the types, for the map) is global (so these will fail,
+    // assuming pybind11_tests is already loaded):
+    m.def("register_nonlocal_vec", [m]() {
+        py::bind_vector<NonLocalVec>(m, "NonLocalVec");
+    });
+    m.def("register_nonlocal_map", [m]() {
+        py::bind_map<NonLocalMap>(m, "NonLocalMap");
+    });
+    // The default can, however, be overridden: made local with `py::module_local()` or made
+    // global with `py::module_local(false)`.
+    // Explicitly made local:
+    py::bind_vector<NonLocalVec2>(m, "NonLocalVec2", py::module_local());
+    // Explicitly made global (and so will fail to bind):
+    m.def("register_nonlocal_map2", [m]() {
+        py::bind_map<NonLocalMap2>(m, "NonLocalMap2", py::module_local(false));
+    });
+
+    // test_mixed_local_global
+    // We try this both with the global type registered first and vice versa (the order shouldn't
+    // matter).
+    m.def("register_mixed_global_local", [m]() {
+        bind_local<MixedGlobalLocal, 200>(m, "MixedGlobalLocal", py::module_local());
+    });
+    m.def("register_mixed_local_global", [m]() {
+        bind_local<MixedLocalGlobal, 2000>(m, "MixedLocalGlobal", py::module_local(false));
+    });
+    m.def("get_mixed_gl", [](int i) { return MixedGlobalLocal(i); });
+    m.def("get_mixed_lg", [](int i) { return MixedLocalGlobal(i); });
+
+    // test_internal_locals_differ
+    m.def("local_cpp_types_addr", []() { return (uintptr_t) &py::detail::registered_local_types_cpp(); });
+
+    // test_stl_caster_vs_stl_bind
+    py::bind_vector<std::vector<int>>(m, "VectorInt");
+
+    m.def("load_vector_via_binding", [](std::vector<int> &v) {
+        return std::accumulate(v.begin(), v.end(), 0);
+    });
+
+    // test_cross_module_calls
+    m.def("return_self", [](LocalVec *v) { return v; });
+    m.def("return_copy", [](const LocalVec &v) { return LocalVec(v); });
+
+    class Dog : public pets::Pet { public: Dog(std::string name) : Pet(name) {}; };
+    py::class_<pets::Pet>(m, "Pet", py::module_local())
+        .def("name", &pets::Pet::name);
+    // Binding for local extending class:
+    py::class_<Dog, pets::Pet>(m, "Dog")
+        .def(py::init<std::string>());
+    m.def("pet_name", [](pets::Pet &p) { return p.name(); });
+
+    py::class_<MixGL>(m, "MixGL", py::module_local()).def(py::init<int>());
+    m.def("get_gl_value", [](MixGL &o) { return o.i + 100; });
+
+    py::class_<MixGL2>(m, "MixGL2", py::module_local()).def(py::init<int>());
+
+    // test_vector_bool
+    // We can't test both stl.h and stl_bind.h conversions of `std::vector<bool>` within
+    // the same module (it would be an ODR violation). Therefore `bind_vector` of `bool`
+    // is defined here and tested in `test_stl_binders.py`.
+    py::bind_vector<std::vector<bool>>(m, "VectorBool");
+
+    // test_missing_header_message
+    // The main module already includes stl.h, but we need to test the error message
+    // which appears when this header is missing.
+    m.def("missing_header_arg", [](std::vector<float>) { });
+    m.def("missing_header_return", []() { return std::vector<float>(); });
+}
diff --git a/pybind11/tests/pybind11_tests.cpp b/pybind11/tests/pybind11_tests.cpp
index 9c593eee1..bc7d2c3e7 100644
--- a/pybind11/tests/pybind11_tests.cpp
+++ b/pybind11/tests/pybind11_tests.cpp
@@ -10,13 +10,36 @@
 #include "pybind11_tests.h"
 #include "constructor_stats.h"
 
+#include <functional>
+#include <list>
+
+/*
+For testing purposes, we define a static global variable here in a function that each individual
+test .cpp calls with its initialization lambda.  It's convenient here because we can just not
+compile some test files to disable/ignore some of the test code.
+
+It is NOT recommended as a way to use pybind11 in practice, however: the initialization order will
+be essentially random, which is okay for our test scripts (there are no dependencies between the
+individual pybind11 test .cpp files), but most likely not what you want when using pybind11
+productively.
+
+Instead, see the "How can I reduce the build time?" question in the "Frequently asked questions"
+section of the documentation for good practice on splitting binding code over multiple files.
+*/
 std::list<std::function<void(py::module &)>> &initializers() {
     static std::list<std::function<void(py::module &)>> inits;
     return inits;
 }
 
-test_initializer::test_initializer(std::function<void(py::module &)> initializer) {
-    initializers().push_back(std::move(initializer));
+test_initializer::test_initializer(Initializer init) {
+    initializers().push_back(init);
+}
+
+test_initializer::test_initializer(const char *submodule_name, Initializer init) {
+    initializers().push_back([=](py::module &parent) {
+        auto m = parent.def_submodule(submodule_name);
+        init(m);
+    });
 }
 
 void bind_ConstructorStats(py::module &m) {
@@ -28,18 +51,43 @@ void bind_ConstructorStats(py::module &m) {
         .def_readwrite("move_assignments", &ConstructorStats::move_assignments)
         .def_readwrite("copy_constructions", &ConstructorStats::copy_constructions)
         .def_readwrite("move_constructions", &ConstructorStats::move_constructions)
-        .def_static("get", (ConstructorStats &(*)(py::object)) &ConstructorStats::get, py::return_value_policy::reference_internal);
+        .def_static("get", (ConstructorStats &(*)(py::object)) &ConstructorStats::get, py::return_value_policy::reference_internal)
+
+        // Not exactly ConstructorStats, but related: expose the internal pybind number of registered instances
+        // to allow instance cleanup checks (invokes a GC first)
+        .def_static("detail_reg_inst", []() {
+            ConstructorStats::gc();
+            return py::detail::get_internals().registered_instances.size();
+        })
+        ;
 }
 
-PYBIND11_PLUGIN(pybind11_tests) {
-    py::module m("pybind11_tests", "pybind example plugin");
+PYBIND11_MODULE(pybind11_tests, m) {
+    m.doc() = "pybind11 test module";
 
     bind_ConstructorStats(m);
 
+#if !defined(NDEBUG)
+    m.attr("debug_enabled") = true;
+#else
+    m.attr("debug_enabled") = false;
+#endif
+
+    py::class_<UserType>(m, "UserType", "A `py::class_` type for testing")
+        .def(py::init<>())
+        .def(py::init<int>())
+        .def("get_value", &UserType::value, "Get value using a method")
+        .def("set_value", &UserType::set, "Set value using a method")
+        .def_property("value", &UserType::value, &UserType::set, "Get/set value using a property")
+        .def("__repr__", [](const UserType& u) { return "UserType({})"_s.format(u.value()); });
+
+    py::class_<IncType, UserType>(m, "IncType")
+        .def(py::init<>())
+        .def(py::init<int>())
+        .def("__repr__", [](const IncType& u) { return "IncType({})"_s.format(u.value()); });
+
     for (const auto &initializer : initializers())
         initializer(m);
 
     if (!py::hasattr(m, "have_eigen")) m.attr("have_eigen") = false;
-
-    return m.ptr();
 }
diff --git a/pybind11/tests/pybind11_tests.h b/pybind11/tests/pybind11_tests.h
index c11b687b2..90963a5de 100644
--- a/pybind11/tests/pybind11_tests.h
+++ b/pybind11/tests/pybind11_tests.h
@@ -1,12 +1,65 @@
 #pragma once
 #include <pybind11/pybind11.h>
-#include <functional>
-#include <list>
+
+#if defined(_MSC_VER) && _MSC_VER < 1910
+// We get some really long type names here which causes MSVC 2015 to emit warnings
+#  pragma warning(disable: 4503) // warning C4503: decorated name length exceeded, name was truncated
+#endif
 
 namespace py = pybind11;
 using namespace pybind11::literals;
 
 class test_initializer {
+    using Initializer = void (*)(py::module &);
+
+public:
+    test_initializer(Initializer init);
+    test_initializer(const char *submodule_name, Initializer init);
+};
+
+#define TEST_SUBMODULE(name, variable)                   \
+    void test_submodule_##name(py::module &);            \
+    test_initializer name(#name, test_submodule_##name); \
+    void test_submodule_##name(py::module &variable)
+
+
+/// Dummy type which is not exported anywhere -- something to trigger a conversion error
+struct UnregisteredType { };
+
+/// A user-defined type which is exported and can be used by any test
+class UserType {
+public:
+    UserType() = default;
+    UserType(int i) : i(i) { }
+
+    int value() const { return i; }
+    void set(int set) { i = set; }
+
+private:
+    int i = -1;
+};
+
+/// Like UserType, but increments `value` on copy for quick reference vs. copy tests
+class IncType : public UserType {
+public:
+    using UserType::UserType;
+    IncType() = default;
+    IncType(const IncType &other) : IncType(other.value() + 1) { }
+    IncType(IncType &&) = delete;
+    IncType &operator=(const IncType &) = delete;
+    IncType &operator=(IncType &&) = delete;
+};
+
+/// Custom cast-only type that casts to a string "rvalue" or "lvalue" depending on the cast context.
+/// Used to test recursive casters (e.g. std::tuple, stl containers).
+struct RValueCaster {};
+NAMESPACE_BEGIN(pybind11)
+NAMESPACE_BEGIN(detail)
+template<> class type_caster<RValueCaster> {
 public:
-    test_initializer(std::function<void(py::module &)> initializer);
+    PYBIND11_TYPE_CASTER(RValueCaster, _("RValueCaster"));
+    static handle cast(RValueCaster &&, return_value_policy, handle) { return py::str("rvalue").release(); }
+    static handle cast(const RValueCaster &, return_value_policy, handle) { return py::str("lvalue").release(); }
 };
+NAMESPACE_END(detail)
+NAMESPACE_END(pybind11)
diff --git a/pybind11/tests/pytest.ini b/pybind11/tests/pytest.ini
new file mode 100644
index 000000000..1e44f0a05
--- /dev/null
+++ b/pybind11/tests/pytest.ini
@@ -0,0 +1,15 @@
+[pytest]
+minversion = 3.0
+norecursedirs = test_cmake_build test_embed
+addopts =
+    # show summary of skipped tests
+    -rs
+    # capture only Python print and C++ py::print, but not C output (low-level Python errors)
+    --capture=sys
+filterwarnings =
+    # make warnings into errors but ignore certain third-party extension issues
+    error
+    # importing scipy submodules on some versions of Python
+    ignore::ImportWarning
+    # bogus numpy ABI warning (see numpy/#432)
+    ignore:.*numpy.dtype size changed.*:RuntimeWarning
diff --git a/pybind11/tests/test_alias_initialization.cpp b/pybind11/tests/test_alias_initialization.cpp
deleted file mode 100644
index 48e595695..000000000
--- a/pybind11/tests/test_alias_initialization.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
-    tests/test_alias_initialization.cpp -- test cases and example of different trampoline
-    initialization modes
-
-    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, Jason Rhinelander <jason@imaginary.ca>
-
-    All rights reserved. Use of this source code is governed by a
-    BSD-style license that can be found in the LICENSE file.
-*/
-
-#include "pybind11_tests.h"
-
-test_initializer alias_initialization([](py::module &m) {
-    // don't invoke Python dispatch classes by default when instantiating C++ classes that were not
-    // extended on the Python side
-    struct A {
-        virtual ~A() {}
-        virtual void f() { py::print("A.f()"); }
-    };
-
-    struct PyA : A {
-        PyA() { py::print("PyA.PyA()"); }
-        ~PyA() { py::print("PyA.~PyA()"); }
-
-        void f() override {
-            py::print("PyA.f()");
-            PYBIND11_OVERLOAD(void, A, f);
-        }
-    };
-
-    auto call_f = [](A *a) { a->f(); };
-
-    py::class_<A, PyA>(m, "A")
-        .def(py::init<>())
-        .def("f", &A::f);
-
-    m.def("call_f", call_f);
-
-
-    // ... unless we explicitly request it, as in this example:
-    struct A2 {
-        virtual ~A2() {}
-        virtual void f() { py::print("A2.f()"); }
-    };
-
-    struct PyA2 : A2 {
-        PyA2() { py::print("PyA2.PyA2()"); }
-        ~PyA2() { py::print("PyA2.~PyA2()"); }
-        void f() override {
-            py::print("PyA2.f()");
-            PYBIND11_OVERLOAD(void, A2, f);
-        }
-    };
-
-    py::class_<A2, PyA2>(m, "A2")
-        .def(py::init_alias<>())
-        .def("f", &A2::f);
-
-    m.def("call_f", [](A2 *a2) { a2->f(); });
-
-});
-
diff --git a/pybind11/tests/test_alias_initialization.py b/pybind11/tests/test_alias_initialization.py
deleted file mode 100644
index fb90cfc7b..000000000
--- a/pybind11/tests/test_alias_initialization.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import pytest
-
-
-def test_alias_delay_initialization1(capture):
-    """
-    A only initializes its trampoline class when we inherit from it; if we just
-    create and use an A instance directly, the trampoline initialization is
-    bypassed and we only initialize an A() instead (for performance reasons).
-    """
-    from pybind11_tests import A, call_f
-
-    class B(A):
-        def __init__(self):
-            super(B, self).__init__()
-
-        def f(self):
-            print("In python f()")
-
-    # C++ version
-    with capture:
-        a = A()
-        call_f(a)
-        del a
-        pytest.gc_collect()
-    assert capture == "A.f()"
-
-    # Python version
-    with capture:
-        b = B()
-        call_f(b)
-        del b
-        pytest.gc_collect()
-    assert capture == """
-        PyA.PyA()
-        PyA.f()
-        In python f()
-        PyA.~PyA()
-    """
-
-
-def test_alias_delay_initialization2(capture):
-    """A2, unlike the above, is configured to always initialize the alias; while
-    the extra initialization and extra class layer has small virtual dispatch
-    performance penalty, it also allows us to do more things with the trampoline
-    class such as defining local variables and performing construction/destruction.
-    """
-    from pybind11_tests import A2, call_f
-
-    class B2(A2):
-        def __init__(self):
-            super(B2, self).__init__()
-
-        def f(self):
-            print("In python B2.f()")
-
-    # No python subclass version
-    with capture:
-        a2 = A2()
-        call_f(a2)
-        del a2
-        pytest.gc_collect()
-    assert capture == """
-        PyA2.PyA2()
-        PyA2.f()
-        A2.f()
-        PyA2.~PyA2()
-    """
-
-    # Python subclass version
-    with capture:
-        b2 = B2()
-        call_f(b2)
-        del b2
-        pytest.gc_collect()
-    assert capture == """
-        PyA2.PyA2()
-        PyA2.f()
-        In python B2.f()
-        PyA2.~PyA2()
-    """
diff --git a/pybind11/tests/test_buffers.cpp b/pybind11/tests/test_buffers.cpp
index 057250d29..5be717730 100644
--- a/pybind11/tests/test_buffers.cpp
+++ b/pybind11/tests/test_buffers.cpp
@@ -10,93 +10,94 @@
 #include "pybind11_tests.h"
 #include "constructor_stats.h"
 
-class Matrix {
-public:
-    Matrix(size_t rows, size_t cols) : m_rows(rows), m_cols(cols) {
-        print_created(this, std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
-        m_data = new float[rows*cols];
-        memset(m_data, 0, sizeof(float) * rows * cols);
-    }
-
-    Matrix(const Matrix &s) : m_rows(s.m_rows), m_cols(s.m_cols) {
-        print_copy_created(this, std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
-        m_data = new float[m_rows * m_cols];
-        memcpy(m_data, s.m_data, sizeof(float) * m_rows * m_cols);
-    }
-
-    Matrix(Matrix &&s) : m_rows(s.m_rows), m_cols(s.m_cols), m_data(s.m_data) {
-        print_move_created(this);
-        s.m_rows = 0;
-        s.m_cols = 0;
-        s.m_data = nullptr;
-    }
-
-    ~Matrix() {
-        print_destroyed(this, std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
-        delete[] m_data;
-    }
-
-    Matrix &operator=(const Matrix &s) {
-        print_copy_assigned(this, std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
-        delete[] m_data;
-        m_rows = s.m_rows;
-        m_cols = s.m_cols;
-        m_data = new float[m_rows * m_cols];
-        memcpy(m_data, s.m_data, sizeof(float) * m_rows * m_cols);
-        return *this;
-    }
-
-    Matrix &operator=(Matrix &&s) {
-        print_move_assigned(this, std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
-        if (&s != this) {
-            delete[] m_data;
-            m_rows = s.m_rows; m_cols = s.m_cols; m_data = s.m_data;
-            s.m_rows = 0; s.m_cols = 0; s.m_data = nullptr;
+TEST_SUBMODULE(buffers, m) {
+    // test_from_python / test_to_python:
+    class Matrix {
+    public:
+        Matrix(ssize_t rows, ssize_t cols) : m_rows(rows), m_cols(cols) {
+            print_created(this, std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
+            m_data = new float[(size_t) (rows*cols)];
+            memset(m_data, 0, sizeof(float) * (size_t) (rows * cols));
+        }
+
+        Matrix(const Matrix &s) : m_rows(s.m_rows), m_cols(s.m_cols) {
+            print_copy_created(this, std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
+            m_data = new float[(size_t) (m_rows * m_cols)];
+            memcpy(m_data, s.m_data, sizeof(float) * (size_t) (m_rows * m_cols));
         }
-        return *this;
-    }
 
-    float operator()(size_t i, size_t j) const {
-        return m_data[i*m_cols + j];
-    }
+        Matrix(Matrix &&s) : m_rows(s.m_rows), m_cols(s.m_cols), m_data(s.m_data) {
+            print_move_created(this);
+            s.m_rows = 0;
+            s.m_cols = 0;
+            s.m_data = nullptr;
+        }
+
+        ~Matrix() {
+            print_destroyed(this, std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
+            delete[] m_data;
+        }
 
-    float &operator()(size_t i, size_t j) {
-        return m_data[i*m_cols + j];
-    }
+        Matrix &operator=(const Matrix &s) {
+            print_copy_assigned(this, std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
+            delete[] m_data;
+            m_rows = s.m_rows;
+            m_cols = s.m_cols;
+            m_data = new float[(size_t) (m_rows * m_cols)];
+            memcpy(m_data, s.m_data, sizeof(float) * (size_t) (m_rows * m_cols));
+            return *this;
+        }
 
-    float *data() { return m_data; }
+        Matrix &operator=(Matrix &&s) {
+            print_move_assigned(this, std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
+            if (&s != this) {
+                delete[] m_data;
+                m_rows = s.m_rows; m_cols = s.m_cols; m_data = s.m_data;
+                s.m_rows = 0; s.m_cols = 0; s.m_data = nullptr;
+            }
+            return *this;
+        }
 
-    size_t rows() const { return m_rows; }
-    size_t cols() const { return m_cols; }
-private:
-    size_t m_rows;
-    size_t m_cols;
-    float *m_data;
-};
+        float operator()(ssize_t i, ssize_t j) const {
+            return m_data[(size_t) (i*m_cols + j)];
+        }
 
-test_initializer buffers([](py::module &m) {
-    py::class_<Matrix> mtx(m, "Matrix", py::buffer_protocol());
+        float &operator()(ssize_t i, ssize_t j) {
+            return m_data[(size_t) (i*m_cols + j)];
+        }
 
-    mtx.def(py::init<size_t, size_t>())
+        float *data() { return m_data; }
+
+        ssize_t rows() const { return m_rows; }
+        ssize_t cols() const { return m_cols; }
+    private:
+        ssize_t m_rows;
+        ssize_t m_cols;
+        float *m_data;
+    };
+    py::class_<Matrix>(m, "Matrix", py::buffer_protocol())
+        .def(py::init<ssize_t, ssize_t>())
         /// Construct from a buffer
-        .def("__init__", [](Matrix &v, py::buffer b) {
+        .def(py::init([](py::buffer b) {
             py::buffer_info info = b.request();
             if (info.format != py::format_descriptor<float>::format() || info.ndim != 2)
                 throw std::runtime_error("Incompatible buffer format!");
-            new (&v) Matrix(info.shape[0], info.shape[1]);
-            memcpy(v.data(), info.ptr, sizeof(float) * v.rows() * v.cols());
-        })
+
+            auto v = new Matrix(info.shape[0], info.shape[1]);
+            memcpy(v->data(), info.ptr, sizeof(float) * (size_t) (v->rows() * v->cols()));
+            return v;
+        }))
 
        .def("rows", &Matrix::rows)
        .def("cols", &Matrix::cols)
 
         /// Bare bones interface
-       .def("__getitem__", [](const Matrix &m, std::pair<size_t, size_t> i) {
+       .def("__getitem__", [](const Matrix &m, std::pair<ssize_t, ssize_t> i) {
             if (i.first >= m.rows() || i.second >= m.cols())
                 throw py::index_error();
             return m(i.first, i.second);
         })
-       .def("__setitem__", [](Matrix &m, std::pair<size_t, size_t> i, float v) {
+       .def("__setitem__", [](Matrix &m, std::pair<ssize_t, ssize_t> i, float v) {
             if (i.first >= m.rows() || i.second >= m.cols())
                 throw py::index_error();
             m(i.first, i.second) = v;
@@ -105,13 +106,64 @@ test_initializer buffers([](py::module &m) {
        .def_buffer([](Matrix &m) -> py::buffer_info {
             return py::buffer_info(
                 m.data(),                               /* Pointer to buffer */
-                sizeof(float),                          /* Size of one scalar */
-                py::format_descriptor<float>::format(), /* Python struct-style format descriptor */
-                2,                                      /* Number of dimensions */
                 { m.rows(), m.cols() },                 /* Buffer dimensions */
-                { sizeof(float) * m.rows(),             /* Strides (in bytes) for each index */
+                { sizeof(float) * size_t(m.rows()),     /* Strides (in bytes) for each index */
                   sizeof(float) }
             );
         })
         ;
-});
+
+
+    // test_inherited_protocol
+    class SquareMatrix : public Matrix {
+    public:
+        SquareMatrix(ssize_t n) : Matrix(n, n) { }
+    };
+    // Derived classes inherit the buffer protocol and the buffer access function
+    py::class_<SquareMatrix, Matrix>(m, "SquareMatrix")
+        .def(py::init<ssize_t>());
+
+
+    // test_pointer_to_member_fn
+    // Tests that passing a pointer to member to the base class works in
+    // the derived class.
+    struct Buffer {
+        int32_t value = 0;
+
+        py::buffer_info get_buffer_info() {
+            return py::buffer_info(&value, sizeof(value),
+                                   py::format_descriptor<int32_t>::format(), 1);
+        }
+    };
+    py::class_<Buffer>(m, "Buffer", py::buffer_protocol())
+        .def(py::init<>())
+        .def_readwrite("value", &Buffer::value)
+        .def_buffer(&Buffer::get_buffer_info);
+
+
+    class ConstBuffer {
+        std::unique_ptr<int32_t> value;
+
+    public:
+        int32_t get_value() const { return *value; }
+        void set_value(int32_t v) { *value = v; }
+
+        py::buffer_info get_buffer_info() const {
+            return py::buffer_info(value.get(), sizeof(*value),
+                                   py::format_descriptor<int32_t>::format(), 1);
+        }
+
+        ConstBuffer() : value(new int32_t{0}) { };
+    };
+    py::class_<ConstBuffer>(m, "ConstBuffer", py::buffer_protocol())
+        .def(py::init<>())
+        .def_property("value", &ConstBuffer::get_value, &ConstBuffer::set_value)
+        .def_buffer(&ConstBuffer::get_buffer_info);
+
+    struct DerivedBuffer : public Buffer { };
+    py::class_<DerivedBuffer>(m, "DerivedBuffer", py::buffer_protocol())
+        .def(py::init<>())
+        .def_readwrite("value", (int32_t DerivedBuffer::*) &DerivedBuffer::value)
+        .def_buffer(&DerivedBuffer::get_buffer_info);
+
+}
diff --git a/pybind11/tests/test_buffers.py b/pybind11/tests/test_buffers.py
index 956839c1c..c348be5dd 100644
--- a/pybind11/tests/test_buffers.py
+++ b/pybind11/tests/test_buffers.py
@@ -1,24 +1,27 @@
+import struct
 import pytest
-from pybind11_tests import Matrix, ConstructorStats
+from pybind11_tests import buffers as m
+from pybind11_tests import ConstructorStats
+
+pytestmark = pytest.requires_numpy
 
 with pytest.suppress(ImportError):
     import numpy as np
 
 
-@pytest.requires_numpy
 def test_from_python():
     with pytest.raises(RuntimeError) as excinfo:
-        Matrix(np.array([1, 2, 3]))  # trying to assign a 1D array
+        m.Matrix(np.array([1, 2, 3]))  # trying to assign a 1D array
     assert str(excinfo.value) == "Incompatible buffer format!"
 
     m3 = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
-    m4 = Matrix(m3)
+    m4 = m.Matrix(m3)
 
     for i in range(m4.rows()):
         for j in range(m4.cols()):
             assert m3[i, j] == m4[i, j]
 
-    cstats = ConstructorStats.get(Matrix)
+    cstats = ConstructorStats.get(m.Matrix)
     assert cstats.alive() == 1
     del m3, m4
     assert cstats.alive() == 0
@@ -32,27 +35,27 @@ def test_from_python():
 # PyPy: Memory leak in the "np.array(m, copy=False)" call
 # https://bitbucket.org/pypy/pypy/issues/2444
 @pytest.unsupported_on_pypy
-@pytest.requires_numpy
 def test_to_python():
-    m = Matrix(5, 5)
+    mat = m.Matrix(5, 5)
+    assert memoryview(mat).shape == (5, 5)
 
-    assert m[2, 3] == 0
-    m[2, 3] = 4
-    assert m[2, 3] == 4
+    assert mat[2, 3] == 0
+    mat[2, 3] = 4
+    assert mat[2, 3] == 4
 
-    m2 = np.array(m, copy=False)
-    assert m2.shape == (5, 5)
-    assert abs(m2).sum() == 4
-    assert m2[2, 3] == 4
-    m2[2, 3] = 5
-    assert m2[2, 3] == 5
+    mat2 = np.array(mat, copy=False)
+    assert mat2.shape == (5, 5)
+    assert abs(mat2).sum() == 4
+    assert mat2[2, 3] == 4
+    mat2[2, 3] = 5
+    assert mat2[2, 3] == 5
 
-    cstats = ConstructorStats.get(Matrix)
+    cstats = ConstructorStats.get(m.Matrix)
     assert cstats.alive() == 1
-    del m
+    del mat
     pytest.gc_collect()
     assert cstats.alive() == 1
-    del m2  # holds an m reference
+    del mat2  # holds a mat reference
     pytest.gc_collect()
     assert cstats.alive() == 0
     assert cstats.values() == ["5x5 matrix"]
@@ -60,3 +63,21 @@ def test_to_python():
     # assert cstats.move_constructions >= 0  # Don't invoke any
     assert cstats.copy_assignments == 0
     assert cstats.move_assignments == 0
+
+
+@pytest.unsupported_on_pypy
+def test_inherited_protocol():
+    """SquareMatrix is derived from Matrix and inherits the buffer protocol"""
+
+    matrix = m.SquareMatrix(5)
+    assert memoryview(matrix).shape == (5, 5)
+    assert np.asarray(matrix).shape == (5, 5)
+
+
+@pytest.unsupported_on_pypy
+def test_pointer_to_member_fn():
+    for cls in [m.Buffer, m.ConstBuffer, m.DerivedBuffer]:
+        buf = cls()
+        buf.value = 0x12345678
+        value = struct.unpack('i', bytearray(buf))[0]
+        assert value == 0x12345678
diff --git a/pybind11/tests/test_builtin_casters.cpp b/pybind11/tests/test_builtin_casters.cpp
new file mode 100644
index 000000000..b73e96ea5
--- /dev/null
+++ b/pybind11/tests/test_builtin_casters.cpp
@@ -0,0 +1,156 @@
+/*
+    tests/test_builtin_casters.cpp -- Casters available without any additional headers
+
+    Copyright (c) 2017 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include <pybind11/complex.h>
+
+#if defined(_MSC_VER)
+#  pragma warning(push)
+#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
+#endif
+
+TEST_SUBMODULE(builtin_casters, m) {
+    // test_simple_string
+    m.def("string_roundtrip", [](const char *s) { return s; });
+
+    // test_unicode_conversion
+    // Some test characters in utf16 and utf32 encodings.  The last one (the 𝐀: 0x1d400, low surrogate 0xdc00) contains a null byte
+    char32_t a32 = 0x61 /*a*/, z32 = 0x7a /*z*/, ib32 = 0x203d /*‽*/, cake32 = 0x1f382 /*🎂*/,              mathbfA32 = 0x1d400 /*𝐀*/;
+    char16_t b16 = 0x62 /*b*/, z16 = 0x7a,       ib16 = 0x203d,       cake16_1 = 0xd83c, cake16_2 = 0xdf82, mathbfA16_1 = 0xd835, mathbfA16_2 = 0xdc00;
+    std::wstring wstr;
+    wstr.push_back(0x61); // a
+    wstr.push_back(0x2e18); // ⸘
+    if (sizeof(wchar_t) == 2) { wstr.push_back(mathbfA16_1); wstr.push_back(mathbfA16_2); } // 𝐀, utf16
+    else { wstr.push_back((wchar_t) mathbfA32); } // 𝐀, utf32
+    wstr.push_back(0x7a); // z
+
+    m.def("good_utf8_string", []() { return std::string(u8"Say utf8\u203d \U0001f382 \U0001d400"); }); // Say utf8‽ 🎂 𝐀
+    m.def("good_utf16_string", [=]() { return std::u16string({ b16, ib16, cake16_1, cake16_2, mathbfA16_1, mathbfA16_2, z16 }); }); // b‽🎂𝐀z
+    m.def("good_utf32_string", [=]() { return std::u32string({ a32, mathbfA32, cake32, ib32, z32 }); }); // a𝐀🎂‽z
+    m.def("good_wchar_string", [=]() { return wstr; }); // a⸘𝐀z
+    m.def("bad_utf8_string", []()  { return std::string("abc\xd0" "def"); });
+    m.def("bad_utf16_string", [=]() { return std::u16string({ b16, char16_t(0xd800), z16 }); });
+    // Under Python 2.7, invalid unicode UTF-32 characters don't appear to trigger UnicodeDecodeError
+    if (PY_MAJOR_VERSION >= 3)
+        m.def("bad_utf32_string", [=]() { return std::u32string({ a32, char32_t(0xd800), z32 }); });
+    if (PY_MAJOR_VERSION >= 3 || sizeof(wchar_t) == 2)
+        m.def("bad_wchar_string", [=]() { return std::wstring({ wchar_t(0x61), wchar_t(0xd800) }); });
+    m.def("u8_Z", []() -> char { return 'Z'; });
+    m.def("u8_eacute", []() -> char { return '\xe9'; });
+    m.def("u16_ibang", [=]() -> char16_t { return ib16; });
+    m.def("u32_mathbfA", [=]() -> char32_t { return mathbfA32; });
+    m.def("wchar_heart", []() -> wchar_t { return 0x2665; });
+
+    // test_single_char_arguments
+    m.attr("wchar_size") = py::cast(sizeof(wchar_t));
+    m.def("ord_char", [](char c) -> int { return static_cast<unsigned char>(c); });
+    m.def("ord_char16", [](char16_t c) -> uint16_t { return c; });
+    m.def("ord_char32", [](char32_t c) -> uint32_t { return c; });
+    m.def("ord_wchar", [](wchar_t c) -> int { return c; });
+
+    // test_bytes_to_string
+    m.def("strlen", [](char *s) { return strlen(s); });
+    m.def("string_length", [](std::string s) { return s.length(); });
+
+    // test_string_view
+#ifdef PYBIND11_HAS_STRING_VIEW
+    m.attr("has_string_view") = true;
+    m.def("string_view_print",   [](std::string_view s)    { py::print(s, s.size()); });
+    m.def("string_view16_print", [](std::u16string_view s) { py::print(s, s.size()); });
+    m.def("string_view32_print", [](std::u32string_view s) { py::print(s, s.size()); });
+    m.def("string_view_chars",   [](std::string_view s)    { py::list l; for (auto c : s) l.append((std::uint8_t) c); return l; });
+    m.def("string_view16_chars", [](std::u16string_view s) { py::list l; for (auto c : s) l.append((int) c); return l; });
+    m.def("string_view32_chars", [](std::u32string_view s) { py::list l; for (auto c : s) l.append((int) c); return l; });
+    m.def("string_view_return",   []() { return std::string_view(u8"utf8 secret \U0001f382"); });
+    m.def("string_view16_return", []() { return std::u16string_view(u"utf16 secret \U0001f382"); });
+    m.def("string_view32_return", []() { return std::u32string_view(U"utf32 secret \U0001f382"); });
+#endif
+
+    // test_integer_casting
+    m.def("i32_str", [](std::int32_t v) { return std::to_string(v); });
+    m.def("u32_str", [](std::uint32_t v) { return std::to_string(v); });
+    m.def("i64_str", [](std::int64_t v) { return std::to_string(v); });
+    m.def("u64_str", [](std::uint64_t v) { return std::to_string(v); });
+
+    // test_tuple
+    m.def("pair_passthrough", [](std::pair<bool, std::string> input) {
+        return std::make_pair(input.second, input.first);
+    }, "Return a pair in reversed order");
+    m.def("tuple_passthrough", [](std::tuple<bool, std::string, int> input) {
+        return std::make_tuple(std::get<2>(input), std::get<1>(input), std::get<0>(input));
+    }, "Return a triple in reversed order");
+    m.def("empty_tuple", []() { return std::tuple<>(); });
+    static std::pair<RValueCaster, RValueCaster> lvpair;
+    static std::tuple<RValueCaster, RValueCaster, RValueCaster> lvtuple;
+    static std::pair<RValueCaster, std::tuple<RValueCaster, std::pair<RValueCaster, RValueCaster>>> lvnested;
+    m.def("rvalue_pair", []() { return std::make_pair(RValueCaster{}, RValueCaster{}); });
+    m.def("lvalue_pair", []() -> const decltype(lvpair) & { return lvpair; });
+    m.def("rvalue_tuple", []() { return std::make_tuple(RValueCaster{}, RValueCaster{}, RValueCaster{}); });
+    m.def("lvalue_tuple", []() -> const decltype(lvtuple) & { return lvtuple; });
+    m.def("rvalue_nested", []() {
+        return std::make_pair(RValueCaster{}, std::make_tuple(RValueCaster{}, std::make_pair(RValueCaster{}, RValueCaster{}))); });
+    m.def("lvalue_nested", []() -> const decltype(lvnested) & { return lvnested; });
+
+    // test_builtins_cast_return_none
+    m.def("return_none_string", []() -> std::string * { return nullptr; });
+    m.def("return_none_char",   []() -> const char *  { return nullptr; });
+    m.def("return_none_bool",   []() -> bool *        { return nullptr; });
+    m.def("return_none_int",    []() -> int *         { return nullptr; });
+    m.def("return_none_float",  []() -> float *       { return nullptr; });
+
+    // test_none_deferred
+    m.def("defer_none_cstring", [](char *) { return false; });
+    m.def("defer_none_cstring", [](py::none) { return true; });
+    m.def("defer_none_custom", [](UserType *) { return false; });
+    m.def("defer_none_custom", [](py::none) { return true; });
+    m.def("nodefer_none_void", [](void *) { return true; }); // the Python test expects None to land here (no deferral)
+    m.def("nodefer_none_void", [](py::none) { return false; });
+
+    // test_void_caster
+    m.def("load_nullptr_t", [](std::nullptr_t) {}); // not useful, but it should still compile
+    m.def("cast_nullptr_t", []() { return std::nullptr_t{}; });
+
+    // test_bool_caster
+    m.def("bool_passthrough", [](bool arg) { return arg; });
+    m.def("bool_passthrough_noconvert", [](bool arg) { return arg; }, py::arg().noconvert());
+
+    // test_reference_wrapper
+    m.def("refwrap_builtin", [](std::reference_wrapper<int> p) { return 10 * p.get(); });
+    m.def("refwrap_usertype", [](std::reference_wrapper<UserType> p) { return p.get().value(); });
+    // Not currently supported (std::pair caster has return-by-value cast operator);
+    // triggers static_assert failure.
+    //m.def("refwrap_pair", [](std::reference_wrapper<std::pair<int, int>>) { });
+
+    m.def("refwrap_list", [](bool copy) {
+        static IncType x1(1), x2(2);
+        py::list l;
+        for (auto &f : {std::ref(x1), std::ref(x2)}) {
+            l.append(py::cast(f, copy ? py::return_value_policy::copy
+                                      : py::return_value_policy::reference));
+        }
+        return l;
+    }, "copy"_a);
+
+    m.def("refwrap_iiw", [](const IncType &w) { return w.value(); });
+    m.def("refwrap_call_iiw", [](IncType &w, py::function f) {
+        py::list l;
+        l.append(f(std::ref(w)));
+        l.append(f(std::cref(w)));
+        IncType x(w.value());
+        l.append(f(std::ref(x)));
+        IncType y(w.value());
+        auto r3 = std::ref(y);
+        l.append(f(r3));
+        return l;
+    });
+
+    // test_complex_cast
+    m.def("complex_cast", [](float x) { return "{}"_s.format(x); });
+    m.def("complex_cast", [](std::complex<float> x) { return "({}, {})"_s.format(x.real(), x.imag()); });
+}
diff --git a/pybind11/tests/test_builtin_casters.py b/pybind11/tests/test_builtin_casters.py
new file mode 100644
index 000000000..bc094a381
--- /dev/null
+++ b/pybind11/tests/test_builtin_casters.py
@@ -0,0 +1,322 @@
+# Python < 3 needs this: coding=utf-8
+import pytest
+
+from pybind11_tests import builtin_casters as m
+from pybind11_tests import UserType, IncType
+
+
+def test_simple_string():
+    assert m.string_roundtrip("const char *") == "const char *"
+
+
+def test_unicode_conversion():
+    """Tests unicode conversion and error reporting."""
+    assert m.good_utf8_string() == u"Say utf8‽ 🎂 𝐀"
+    assert m.good_utf16_string() == u"b‽🎂𝐀z"
+    assert m.good_utf32_string() == u"a𝐀🎂‽z"
+    assert m.good_wchar_string() == u"a⸘𝐀z"
+
+    with pytest.raises(UnicodeDecodeError):
+        m.bad_utf8_string()
+
+    with pytest.raises(UnicodeDecodeError):
+        m.bad_utf16_string()
+
+    # Only bound where decoding actually fails (it doesn't for 32-bit chars under Python 2.7)
+    if hasattr(m, "bad_utf32_string"):
+        with pytest.raises(UnicodeDecodeError):
+            m.bad_utf32_string()
+    if hasattr(m, "bad_wchar_string"):
+        with pytest.raises(UnicodeDecodeError):
+            m.bad_wchar_string()
+
+    assert m.u8_Z() == 'Z'
+    assert m.u8_eacute() == u'é'
+    assert m.u16_ibang() == u'‽'
+    assert m.u32_mathbfA() == u'𝐀'
+    assert m.wchar_heart() == u'♥'
+
+
+def test_single_char_arguments():
+    """Tests failures for passing invalid inputs to char-accepting functions"""
+    def toobig_message(r):
+        return "Character code point not in range({0:#x})".format(r)
+    toolong_message = "Expected a character, but multi-character string found"
+
+    assert m.ord_char(u'a') == 0x61  # simple ASCII
+    assert m.ord_char(u'é') == 0xE9  # requires 2 bytes in utf-8, but can be stuffed in a char
+    with pytest.raises(ValueError) as excinfo:
+        assert m.ord_char(u'Ā') == 0x100  # requires 2 bytes, doesn't fit in a char
+    assert str(excinfo.value) == toobig_message(0x100)
+    with pytest.raises(ValueError) as excinfo:
+        assert m.ord_char(u'ab')
+    assert str(excinfo.value) == toolong_message
+
+    assert m.ord_char16(u'a') == 0x61
+    assert m.ord_char16(u'é') == 0xE9
+    assert m.ord_char16(u'Ā') == 0x100  # single code point U+0100
+    assert m.ord_char16(u'‽') == 0x203d
+    assert m.ord_char16(u'♥') == 0x2665
+    with pytest.raises(ValueError) as excinfo:
+        assert m.ord_char16(u'🎂') == 0x1F382  # requires surrogate pair
+    assert str(excinfo.value) == toobig_message(0x10000)
+    with pytest.raises(ValueError) as excinfo:
+        assert m.ord_char16(u'aa')
+    assert str(excinfo.value) == toolong_message
+
+    assert m.ord_char32(u'a') == 0x61
+    assert m.ord_char32(u'é') == 0xE9
+    assert m.ord_char32(u'Ā') == 0x100  # single code point U+0100
+    assert m.ord_char32(u'‽') == 0x203d
+    assert m.ord_char32(u'♥') == 0x2665
+    assert m.ord_char32(u'🎂') == 0x1F382
+    with pytest.raises(ValueError) as excinfo:
+        assert m.ord_char32(u'aa')
+    assert str(excinfo.value) == toolong_message
+
+    assert m.ord_wchar(u'a') == 0x61
+    assert m.ord_wchar(u'é') == 0xE9
+    assert m.ord_wchar(u'Ā') == 0x100  # single code point U+0100
+    assert m.ord_wchar(u'‽') == 0x203d
+    assert m.ord_wchar(u'♥') == 0x2665
+    if m.wchar_size == 2:
+        with pytest.raises(ValueError) as excinfo:
+            assert m.ord_wchar(u'🎂') == 0x1F382  # requires surrogate pair
+        assert str(excinfo.value) == toobig_message(0x10000)
+    else:
+        assert m.ord_wchar(u'🎂') == 0x1F382
+    with pytest.raises(ValueError) as excinfo:
+        assert m.ord_wchar(u'aa')
+    assert str(excinfo.value) == toolong_message
+
+
+def test_bytes_to_string():
+    """Tests the ability to pass bytes to C++ string-accepting functions.  Note that this is
+    one-way: the only way to return bytes to Python is via the pybind11::bytes class."""
+    # Issue #816.  NOTE(review): on Python 3 `byte` is str; only the encode() call passes bytes
+    import sys
+    byte = bytes if sys.version_info[0] < 3 else str
+
+    assert m.strlen(byte("hi")) == 2
+    assert m.string_length(byte("world")) == 5
+    assert m.string_length(byte("a\x00b")) == 3
+    assert m.strlen(byte("a\x00b")) == 1  # C-string limitation
+
+    # passing in a utf8 encoded string should work
+    assert m.string_length(u'💩'.encode("utf8")) == 4
+
+
+@pytest.mark.skipif(not hasattr(m, "has_string_view"), reason="no <string_view>")
+def test_string_view(capture):
+    """Tests support for C++17 string_view arguments and return values"""
+    assert m.string_view_chars("Hi") == [72, 105]
+    assert m.string_view_chars("Hi 🎂") == [72, 105, 32, 0xf0, 0x9f, 0x8e, 0x82]
+    assert m.string_view16_chars("Hi 🎂") == [72, 105, 32, 0xd83c, 0xdf82]
+    assert m.string_view32_chars("Hi 🎂") == [72, 105, 32, 127874]
+
+    assert m.string_view_return() == "utf8 secret 🎂"
+    assert m.string_view16_return() == "utf16 secret 🎂"
+    assert m.string_view32_return() == "utf32 secret 🎂"
+
+    with capture:
+        m.string_view_print("Hi")
+        m.string_view_print("utf8 🎂")
+        m.string_view16_print("utf16 🎂")
+        m.string_view32_print("utf32 🎂")
+    assert capture == """
+        Hi 2
+        utf8 🎂 9
+        utf16 🎂 8
+        utf32 🎂 7
+    """
+
+    with capture:
+        m.string_view_print("Hi, ascii")
+        m.string_view_print("Hi, utf8 🎂")
+        m.string_view16_print("Hi, utf16 🎂")
+        m.string_view32_print("Hi, utf32 🎂")
+    assert capture == """
+        Hi, ascii 9
+        Hi, utf8 🎂 13
+        Hi, utf16 🎂 12
+        Hi, utf32 🎂 11
+    """
+
+
+def test_integer_casting():
+    """Issue #929 - out-of-range integer values shouldn't be accepted"""
+    import sys
+    assert m.i32_str(-1) == "-1"
+    assert m.i64_str(-1) == "-1"
+    assert m.i32_str(2000000000) == "2000000000"
+    assert m.u32_str(2000000000) == "2000000000"
+    if sys.version_info < (3,):
+        assert m.i32_str(long(-1)) == "-1"  # noqa: F821 undefined name 'long'
+        assert m.i64_str(long(-1)) == "-1"  # noqa: F821 undefined name 'long'
+        assert m.i64_str(long(-999999999999)) == "-999999999999"  # noqa: F821 undefined name
+        assert m.u64_str(long(999999999999)) == "999999999999"  # noqa: F821 undefined name 'long'
+    else:
+        assert m.i64_str(-999999999999) == "-999999999999"
+        assert m.u64_str(999999999999) == "999999999999"
+
+    with pytest.raises(TypeError) as excinfo:
+        m.u32_str(-1)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.u64_str(-1)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.i32_str(-3000000000)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.i32_str(3000000000)
+    assert "incompatible function arguments" in str(excinfo.value)
+
+    if sys.version_info < (3,):
+        with pytest.raises(TypeError) as excinfo:
+            m.u32_str(long(-1))  # noqa: F821 undefined name 'long'
+        assert "incompatible function arguments" in str(excinfo.value)
+        with pytest.raises(TypeError) as excinfo:
+            m.u64_str(long(-1))  # noqa: F821 undefined name 'long'
+        assert "incompatible function arguments" in str(excinfo.value)
+
+
+def test_tuple(doc):
+    """std::pair <-> tuple & std::tuple <-> tuple"""
+    assert m.pair_passthrough((True, "test")) == ("test", True)
+    assert m.tuple_passthrough((True, "test", 5)) == (5, "test", True)
+    # Any sequence can be cast to a std::pair or std::tuple
+    assert m.pair_passthrough([True, "test"]) == ("test", True)
+    assert m.tuple_passthrough([True, "test", 5]) == (5, "test", True)
+    assert m.empty_tuple() == ()
+
+    assert doc(m.pair_passthrough) == """
+        pair_passthrough(arg0: Tuple[bool, str]) -> Tuple[str, bool]
+
+        Return a pair in reversed order
+    """
+    assert doc(m.tuple_passthrough) == """
+        tuple_passthrough(arg0: Tuple[bool, str, int]) -> Tuple[int, str, bool]
+
+        Return a triple in reversed order
+    """
+
+    assert m.rvalue_pair() == ("rvalue", "rvalue")
+    assert m.lvalue_pair() == ("lvalue", "lvalue")
+    assert m.rvalue_tuple() == ("rvalue", "rvalue", "rvalue")
+    assert m.lvalue_tuple() == ("lvalue", "lvalue", "lvalue")
+    assert m.rvalue_nested() == ("rvalue", ("rvalue", ("rvalue", "rvalue")))
+    assert m.lvalue_nested() == ("lvalue", ("lvalue", ("lvalue", "lvalue")))
+
+
+def test_builtins_cast_return_none():
+    """Casters produced with PYBIND11_TYPE_CASTER() should convert nullptr to None"""
+    assert m.return_none_string() is None
+    assert m.return_none_char() is None
+    assert m.return_none_bool() is None
+    assert m.return_none_int() is None
+    assert m.return_none_float() is None
+
+
+def test_none_deferred():
+    """None passed as various argument types should defer to other overloads"""
+    assert not m.defer_none_cstring("abc")
+    assert m.defer_none_cstring(None)
+    assert not m.defer_none_custom(UserType())
+    assert m.defer_none_custom(None)
+    assert m.nodefer_none_void(None)
+
+
+def test_void_caster():
+    assert m.load_nullptr_t(None) is None
+    assert m.cast_nullptr_t() is None
+
+
+def test_reference_wrapper():
+    """std::reference_wrapper for builtin and user types"""
+    assert m.refwrap_builtin(42) == 420
+    assert m.refwrap_usertype(UserType(42)) == 42
+
+    with pytest.raises(TypeError) as excinfo:
+        m.refwrap_builtin(None)  # None must not bind to a reference_wrapper
+    assert "incompatible function arguments" in str(excinfo.value)
+
+    with pytest.raises(TypeError) as excinfo:
+        m.refwrap_usertype(None)
+    assert "incompatible function arguments" in str(excinfo.value)
+
+    a1 = m.refwrap_list(copy=True)
+    a2 = m.refwrap_list(copy=True)
+    assert [x.value for x in a1] == [2, 3]  # copies report 2, 3; the C++ statics stay 1, 2
+    assert [x.value for x in a2] == [2, 3]
+    assert a1[0] is not a2[0] and a1[1] is not a2[1]  # every call yields fresh copies
+
+    b1 = m.refwrap_list(copy=False)
+    b2 = m.refwrap_list(copy=False)
+    assert [x.value for x in b1] == [1, 2]
+    assert [x.value for x in b2] == [1, 2]
+    assert b1[0] is b2[0] and b1[1] is b2[1]  # reference policy: same wrapped objects
+
+    assert m.refwrap_iiw(IncType(5)) == 5
+    assert m.refwrap_call_iiw(IncType(10), m.refwrap_iiw) == [10, 10, 10, 10]
+
+
+def test_complex_cast():
+    """std::complex casts"""
+    assert m.complex_cast(1) == "1.0"
+    assert m.complex_cast(2j) == "(0.0, 2.0)"
+
+
+def test_bool_caster():
+    """Test bool caster implicit conversions."""
+    convert, noconvert = m.bool_passthrough, m.bool_passthrough_noconvert
+
+    def require_implicit(v):
+        pytest.raises(TypeError, noconvert, v)
+
+    def cant_convert(v):
+        pytest.raises(TypeError, convert, v)
+
+    # straight up bool
+    assert convert(True) is True
+    assert convert(False) is False
+    assert noconvert(True) is True
+    assert noconvert(False) is False
+
+    # None requires implicit conversion
+    require_implicit(None)
+    assert convert(None) is False
+
+    class A(object):
+        def __init__(self, x):
+            self.x = x
+
+        def __nonzero__(self):
+            return self.x
+
+        def __bool__(self):
+            return self.x
+
+    class B(object):
+        pass
+
+    # Arbitrary objects are not accepted
+    cant_convert(object())
+    cant_convert(B())
+
+    # Objects with __nonzero__ / __bool__ defined can be converted
+    require_implicit(A(True))
+    assert convert(A(True)) is True
+    assert convert(A(False)) is False
+
+
+@pytest.requires_numpy
+def test_numpy_bool():
+    import numpy as np
+    convert, noconvert = m.bool_passthrough, m.bool_passthrough_noconvert
+
+    # np.bool_ is not considered implicit
+    assert convert(np.bool_(True)) is True
+    assert convert(np.bool_(False)) is False
+    assert noconvert(np.bool_(True)) is True
+    assert noconvert(np.bool_(False)) is False
diff --git a/pybind11/tests/test_call_policies.cpp b/pybind11/tests/test_call_policies.cpp
new file mode 100644
index 000000000..8642188f9
--- /dev/null
+++ b/pybind11/tests/test_call_policies.cpp
@@ -0,0 +1,98 @@
+/*
+    tests/test_call_policies.cpp -- keep_alive and call_guard
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+
+struct CustomGuard {  // call_guard test helper: reports "guarded" only while an instance is alive
+    static bool enabled;  // shared flag read by report_status() and by DependentGuard
+
+    CustomGuard() { enabled = true; }  // switched on when the guard is constructed
+    ~CustomGuard() { enabled = false; }  // and off again when it is destroyed
+
+    static const char *report_status() { return enabled ? "guarded" : "unguarded"; }
+};
+bool CustomGuard::enabled = false;  // starts out unguarded
+
+struct DependentGuard {  // call_guard test helper whose state depends on CustomGuard's at construction
+    static bool enabled;
+
+    DependentGuard() { enabled = CustomGuard::enabled; }  // snapshot: true only if CustomGuard is already active
+    ~DependentGuard() { enabled = false; }
+
+    static const char *report_status() { return enabled ? "guarded" : "unguarded"; }
+};
+bool DependentGuard::enabled = false;  // starts out unguarded
+
+TEST_SUBMODULE(call_policies, m) {
+    // Parent/Child are used in:
+    // test_keep_alive_argument, test_keep_alive_return_value, test_alive_gc_derived,
+    // test_alive_gc_multi_derived, test_return_none, test_keep_alive_constructor
+    class Child {
+    public:
+        Child() { py::print("Allocating child."); }  // printed output is matched by the Python tests
+        ~Child() { py::print("Releasing child."); }
+    };
+    py::class_<Child>(m, "Child")
+        .def(py::init<>());
+
+    class Parent {
+    public:
+        Parent() { py::print("Allocating parent."); }
+        ~Parent() { py::print("Releasing parent."); }
+        void addChild(Child *) { }  // intentionally ignores the child; only lifetime is tested
+        Child *returnChild() { return new Child(); }
+        Child *returnNullChild() { return nullptr; }  // becomes None in Python (test_return_none)
+    };
+    py::class_<Parent>(m, "Parent")
+        .def(py::init<>())
+        .def(py::init([](Child *) { return new Parent(); }), py::keep_alive<1, 2>()) // keep_alive<Nurse, Patient>: Child arg (2) kept alive by the new Parent (1)
+        .def("addChild", &Parent::addChild)
+        .def("addChildKeepAlive", &Parent::addChild, py::keep_alive<1, 2>()) // Child arg (2) kept alive by self (1)
+        .def("returnChild", &Parent::returnChild)
+        .def("returnChildKeepAlive", &Parent::returnChild, py::keep_alive<1, 0>()) // return value (0) kept alive by self (1)
+        .def("returnNullChildKeepAliveChild", &Parent::returnNullChild, py::keep_alive<1, 0>()) // patient is a null return value
+        .def("returnNullChildKeepAliveParent", &Parent::returnNullChild, py::keep_alive<0, 1>()); // nurse is a null return value
+
+#if !defined(PYPY_VERSION)
+    // test_alive_gc
+    class ParentGC : public Parent {
+    public:
+        using Parent::Parent;
+    };
+    py::class_<ParentGC, Parent>(m, "ParentGC", py::dynamic_attr())
+        .def(py::init<>());
+#endif
+
+    // test_call_guard
+    m.def("unguarded_call", &CustomGuard::report_status);
+    m.def("guarded_call", &CustomGuard::report_status, py::call_guard<CustomGuard>());
+
+    m.def("multiple_guards_correct_order", []() {
+        return CustomGuard::report_status() + std::string(" & ") + DependentGuard::report_status();
+    }, py::call_guard<CustomGuard, DependentGuard>()); // Python test expects "guarded & guarded"
+
+    m.def("multiple_guards_wrong_order", []() {
+        return DependentGuard::report_status() + std::string(" & ") + CustomGuard::report_status();
+    }, py::call_guard<DependentGuard, CustomGuard>()); // Python test expects "unguarded & guarded"
+
+#if defined(WITH_THREAD) && !defined(PYPY_VERSION)
+    // `py::call_guard<py::gil_scoped_release>()` should work in PyPy as well,
+    // but it's unclear how to test it without `PyGILState_GetThisThreadState`.
+    auto report_gil_status = []() {
+        auto is_gil_held = false;
+        if (auto tstate = py::detail::get_thread_state_unchecked())
+            is_gil_held = (tstate == PyGILState_GetThisThreadState());  // held iff the current state is this thread's
+
+        return is_gil_held ? "GIL held" : "GIL released";
+    };
+
+    m.def("with_gil", report_gil_status);
+    m.def("without_gil", report_gil_status, py::call_guard<py::gil_scoped_release>());
+#endif
+}
diff --git a/pybind11/tests/test_call_policies.py b/pybind11/tests/test_call_policies.py
new file mode 100644
index 000000000..7c835599c
--- /dev/null
+++ b/pybind11/tests/test_call_policies.py
@@ -0,0 +1,187 @@
+import pytest
+from pybind11_tests import call_policies as m
+from pybind11_tests import ConstructorStats
+
+
+def test_keep_alive_argument(capture):
+    n_inst = ConstructorStats.detail_reg_inst()
+    with capture:
+        p = m.Parent()
+    assert capture == "Allocating parent."
+    with capture:
+        p.addChild(m.Child())
+        assert ConstructorStats.detail_reg_inst() == n_inst + 1
+    assert capture == """
+        Allocating child.
+        Releasing child.
+    """
+    with capture:
+        del p
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == "Releasing parent."
+
+    with capture:
+        p = m.Parent()
+    assert capture == "Allocating parent."
+    with capture:
+        p.addChildKeepAlive(m.Child())
+        assert ConstructorStats.detail_reg_inst() == n_inst + 2
+    assert capture == "Allocating child."
+    with capture:
+        del p
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == """
+        Releasing parent.
+        Releasing child.
+    """
+
+
+def test_keep_alive_return_value(capture):
+    n_inst = ConstructorStats.detail_reg_inst()
+    with capture:
+        p = m.Parent()
+    assert capture == "Allocating parent."
+    with capture:
+        p.returnChild()
+        assert ConstructorStats.detail_reg_inst() == n_inst + 1
+    assert capture == """
+        Allocating child.
+        Releasing child.
+    """
+    with capture:
+        del p
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == "Releasing parent."
+
+    with capture:
+        p = m.Parent()
+    assert capture == "Allocating parent."
+    with capture:
+        p.returnChildKeepAlive()
+        assert ConstructorStats.detail_reg_inst() == n_inst + 2
+    assert capture == "Allocating child."
+    with capture:
+        del p
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == """
+        Releasing parent.
+        Releasing child.
+    """
+
+
+# https://bitbucket.org/pypy/pypy/issues/2447
+@pytest.unsupported_on_pypy
+def test_alive_gc(capture):
+    n_inst = ConstructorStats.detail_reg_inst()
+    p = m.ParentGC()
+    p.addChildKeepAlive(m.Child())
+    assert ConstructorStats.detail_reg_inst() == n_inst + 2
+    lst = [p]
+    lst.append(lst)   # creates a circular reference
+    with capture:
+        del p, lst
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == """
+        Releasing parent.
+        Releasing child.
+    """
+
+
+def test_alive_gc_derived(capture):
+    class Derived(m.Parent):
+        pass
+
+    n_inst = ConstructorStats.detail_reg_inst()
+    p = Derived()
+    p.addChildKeepAlive(m.Child())
+    assert ConstructorStats.detail_reg_inst() == n_inst + 2
+    lst = [p]
+    lst.append(lst)   # creates a circular reference
+    with capture:
+        del p, lst
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == """
+        Releasing parent.
+        Releasing child.
+    """
+
+
+def test_alive_gc_multi_derived(capture):
+    class Derived(m.Parent, m.Child):
+        def __init__(self):
+            m.Parent.__init__(self)
+            m.Child.__init__(self)
+
+    n_inst = ConstructorStats.detail_reg_inst()
+    p = Derived()
+    p.addChildKeepAlive(m.Child())
+    # +3 rather than +2 because Derived corresponds to two registered instances
+    assert ConstructorStats.detail_reg_inst() == n_inst + 3
+    lst = [p]
+    lst.append(lst)   # creates a circular reference
+    with capture:
+        del p, lst
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == """
+        Releasing parent.
+        Releasing child.
+        Releasing child.
+    """
+
+
+def test_return_none(capture):
+    n_inst = ConstructorStats.detail_reg_inst()
+    with capture:
+        p = m.Parent()
+    assert capture == "Allocating parent."
+    with capture:
+        p.returnNullChildKeepAliveChild()
+        assert ConstructorStats.detail_reg_inst() == n_inst + 1
+    assert capture == ""
+    with capture:
+        del p
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == "Releasing parent."
+
+    with capture:
+        p = m.Parent()
+    assert capture == "Allocating parent."
+    with capture:
+        p.returnNullChildKeepAliveParent()
+        assert ConstructorStats.detail_reg_inst() == n_inst + 1
+    assert capture == ""
+    with capture:
+        del p
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == "Releasing parent."
+
+
+def test_keep_alive_constructor(capture):
+    n_inst = ConstructorStats.detail_reg_inst()
+
+    with capture:
+        p = m.Parent(m.Child())
+        assert ConstructorStats.detail_reg_inst() == n_inst + 2
+    assert capture == """
+        Allocating child.
+        Allocating parent.
+    """
+    with capture:
+        del p
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert capture == """
+        Releasing parent.
+        Releasing child.
+    """
+
+
+def test_call_guard():  # py::call_guard<T...> wraps the bound call in guard objects
+    assert m.unguarded_call() == "unguarded"
+    assert m.guarded_call() == "guarded"
+
+    assert m.multiple_guards_correct_order() == "guarded & guarded"
+    assert m.multiple_guards_wrong_order() == "unguarded & guarded"  # DependentGuard built first
+
+    if hasattr(m, "with_gil"):  # only bound when built with threads and not on PyPy
+        assert m.with_gil() == "GIL held"
+        assert m.without_gil() == "GIL released"
diff --git a/pybind11/tests/test_callbacks.cpp b/pybind11/tests/test_callbacks.cpp
index 31d4e39aa..273eacc30 100644
--- a/pybind11/tests/test_callbacks.cpp
+++ b/pybind11/tests/test_callbacks.cpp
@@ -12,76 +12,20 @@
 #include <pybind11/functional.h>
 
 
-py::object test_callback1(py::object func) {
-    return func();
-}
-
-py::tuple test_callback2(py::object func) {
-    return func("Hello", 'x', true, 5);
-}
-
-std::string test_callback3(const std::function<int(int)> &func) {
-    return "func(43) = " + std::to_string(func(43));
-}
-
-std::function<int(int)> test_callback4() {
-    return [](int i) { return i+1; };
-}
-
-py::cpp_function test_callback5() {
-    return py::cpp_function([](int i) { return i+1; },
-       py::arg("number"));
-}
-
 int dummy_function(int i) { return i + 1; }
-int dummy_function2(int i, int j) { return i + j; }
-std::function<int(int)> roundtrip(std::function<int(int)> f, bool expect_none = false) {
-    if (expect_none && f) {
-        throw std::runtime_error("Expected None to be converted to empty std::function");
-    }
-    return f;
-}
 
-std::string test_dummy_function(const std::function<int(int)> &f) {
-    using fn_type = int (*)(int);
-    auto result = f.target<fn_type>();
-    if (!result) {
-        auto r = f(1);
-        return "can't convert to function pointer: eval(1) = " + std::to_string(r);
-    } else if (*result == dummy_function) {
-        auto r = (*result)(1);
-        return "matches dummy_function: eval(1) = " + std::to_string(r);
-    } else {
-        return "argument does NOT match dummy_function. This should never happen!";
-    }
-}
+TEST_SUBMODULE(callbacks, m) {
+    // test_callbacks, test_function_signatures
+    m.def("test_callback1", [](py::object func) { return func(); });
+    m.def("test_callback2", [](py::object func) { return func("Hello", 'x', true, 5); });
+    m.def("test_callback3", [](const std::function<int(int)> &func) {
+        return "func(43) = " + std::to_string(func(43)); });
+    m.def("test_callback4", []() -> std::function<int(int)> { return [](int i) { return i+1; }; });
+    m.def("test_callback5", []() {
+        return py::cpp_function([](int i) { return i+1; }, py::arg("number"));
+    });
 
-struct Payload {
-    Payload() {
-        print_default_created(this);
-    }
-    ~Payload() {
-        print_destroyed(this);
-    }
-    Payload(const Payload &) {
-        print_copy_created(this);
-    }
-    Payload(Payload &&) {
-        print_move_created(this);
-    }
-};
-
-/// Something to trigger a conversion error
-struct Unregistered {};
-
-test_initializer callbacks([](py::module &m) {
-    m.def("test_callback1", &test_callback1);
-    m.def("test_callback2", &test_callback2);
-    m.def("test_callback3", &test_callback3);
-    m.def("test_callback4", &test_callback4);
-    m.def("test_callback5", &test_callback5);
-
-    // Test keyword args and generalized unpacking
+    // test_keyword_args_and_generalized_unpacking
     m.def("test_tuple_unpacking", [](py::function f) {
         auto t1 = py::make_tuple(2, 3);
         auto t2 = py::make_tuple(5, 6);
@@ -123,27 +67,83 @@ test_initializer callbacks([](py::module &m) {
     });
 
     m.def("test_arg_conversion_error1", [](py::function f) {
-        f(234, Unregistered(), "kw"_a=567);
+        f(234, UnregisteredType(), "kw"_a=567);
     });
 
     m.def("test_arg_conversion_error2", [](py::function f) {
-        f(234, "expected_name"_a=Unregistered(), "kw"_a=567);
+        f(234, "expected_name"_a=UnregisteredType(), "kw"_a=567);
     });
 
+    // test_lambda_closure_cleanup
+    struct Payload {
+        Payload() { print_default_created(this); }
+        ~Payload() { print_destroyed(this); }
+        Payload(const Payload &) { print_copy_created(this); }
+        Payload(Payload &&) { print_move_created(this); }
+    };
+    // Export the payload constructor statistics for testing purposes:
+    m.def("payload_cstats", &ConstructorStats::get<Payload>);
     /* Test cleanup of lambda closure */
     m.def("test_cleanup", []() -> std::function<void(void)> {
         Payload p;
 
         return [p]() {
             /* p should be cleaned up when the returned function is garbage collected */
+            (void) p;
         };
     });
 
+    // test_cpp_function_roundtrip
     /* Test if passing a function pointer from C++ -> Python -> C++ yields the original pointer */
     m.def("dummy_function", &dummy_function);
-    m.def("dummy_function2", &dummy_function2);
-    m.def("roundtrip", &roundtrip, py::arg("f"), py::arg("expect_none")=false);
-    m.def("test_dummy_function", &test_dummy_function);
-    // Export the payload constructor statistics for testing purposes:
-    m.def("payload_cstats", &ConstructorStats::get<Payload>);
-});
+    m.def("dummy_function2", [](int i, int j) { return i + j; });
+    m.def("roundtrip", [](std::function<int(int)> f, bool expect_none = false) {
+        if (expect_none && f)
+            throw std::runtime_error("Expected None to be converted to empty std::function");
+        return f;
+    }, py::arg("f"), py::arg("expect_none")=false);
+    m.def("test_dummy_function", [](const std::function<int(int)> &f) -> std::string {
+        using fn_type = int (*)(int);
+        auto result = f.target<fn_type>();
+        if (!result) {
+            auto r = f(1);
+            return "can't convert to function pointer: eval(1) = " + std::to_string(r);
+        } else if (*result == dummy_function) {
+            auto r = (*result)(1);
+            return "matches dummy_function: eval(1) = " + std::to_string(r);
+        } else {
+            return "argument does NOT match dummy_function. This should never happen!";
+        }
+    });
+
+    class AbstractBase { public: virtual unsigned int func() = 0; };
+    m.def("func_accepting_func_accepting_base", [](std::function<double(AbstractBase&)>) { });
+
+    struct MovableObject {
+        bool valid = true;
+
+        MovableObject() = default;
+        MovableObject(const MovableObject &) = default;
+        MovableObject &operator=(const MovableObject &) = default;
+        MovableObject(MovableObject &&o) : valid(o.valid) { o.valid = false; }
+        MovableObject &operator=(MovableObject &&o) {
+            valid = o.valid;
+            o.valid = false;
+            return *this;
+        }
+    };
+    py::class_<MovableObject>(m, "MovableObject");
+
+    // test_movable_object
+    m.def("callback_with_movable", [](std::function<void(MovableObject &)> f) {
+        auto x = MovableObject();
+        f(x); // lvalue reference shouldn't move out object
+        return x.valid; // must still return `true`
+    });
+
+    // test_bound_method_callback
+    struct CppBoundMethodTest {};
+    py::class_<CppBoundMethodTest>(m, "CppBoundMethodTest")
+        .def(py::init<>())
+        .def("triple", [](CppBoundMethodTest &, int val) { return 3 * val; });
+}
diff --git a/pybind11/tests/test_callbacks.py b/pybind11/tests/test_callbacks.py
index c2668aa95..93c42c22b 100644
--- a/pybind11/tests/test_callbacks.py
+++ b/pybind11/tests/test_callbacks.py
@@ -1,10 +1,9 @@
 import pytest
+from pybind11_tests import callbacks as m
 
 
 def test_callbacks():
     from functools import partial
-    from pybind11_tests import (test_callback1, test_callback2, test_callback3,
-                                test_callback4, test_callback5)
 
     def func1():
         return "func1"
@@ -15,58 +14,65 @@ def test_callbacks():
     def func3(a):
         return "func3({})".format(a)
 
-    assert test_callback1(func1) == "func1"
-    assert test_callback2(func2) == ("func2", "Hello", "x", True, 5)
-    assert test_callback1(partial(func2, 1, 2, 3, 4)) == ("func2", 1, 2, 3, 4)
-    assert test_callback1(partial(func3, "partial")) == "func3(partial)"
-    assert test_callback3(lambda i: i + 1) == "func(43) = 44"
+    assert m.test_callback1(func1) == "func1"
+    assert m.test_callback2(func2) == ("func2", "Hello", "x", True, 5)
+    assert m.test_callback1(partial(func2, 1, 2, 3, 4)) == ("func2", 1, 2, 3, 4)
+    assert m.test_callback1(partial(func3, "partial")) == "func3(partial)"
+    assert m.test_callback3(lambda i: i + 1) == "func(43) = 44"
 
-    f = test_callback4()
+    f = m.test_callback4()
     assert f(43) == 44
-    f = test_callback5()
+    f = m.test_callback5()
     assert f(number=43) == 44
 
 
+def test_bound_method_callback():
+    # Bound Python method:
+    class MyClass:
+        def double(self, val):
+            return 2 * val
+
+    z = MyClass()
+    assert m.test_callback3(z.double) == "func(43) = 86"
+
+    z = m.CppBoundMethodTest()
+    assert m.test_callback3(z.triple) == "func(43) = 129"
+
+
 def test_keyword_args_and_generalized_unpacking():
-    from pybind11_tests import (test_tuple_unpacking, test_dict_unpacking, test_keyword_args,
-                                test_unpacking_and_keywords1, test_unpacking_and_keywords2,
-                                test_unpacking_error1, test_unpacking_error2,
-                                test_arg_conversion_error1, test_arg_conversion_error2)
 
     def f(*args, **kwargs):
         return args, kwargs
 
-    assert test_tuple_unpacking(f) == (("positional", 1, 2, 3, 4, 5, 6), {})
-    assert test_dict_unpacking(f) == (("positional", 1), {"key": "value", "a": 1, "b": 2})
-    assert test_keyword_args(f) == ((), {"x": 10, "y": 20})
-    assert test_unpacking_and_keywords1(f) == ((1, 2), {"c": 3, "d": 4})
-    assert test_unpacking_and_keywords2(f) == (
+    assert m.test_tuple_unpacking(f) == (("positional", 1, 2, 3, 4, 5, 6), {})
+    assert m.test_dict_unpacking(f) == (("positional", 1), {"key": "value", "a": 1, "b": 2})
+    assert m.test_keyword_args(f) == ((), {"x": 10, "y": 20})
+    assert m.test_unpacking_and_keywords1(f) == ((1, 2), {"c": 3, "d": 4})
+    assert m.test_unpacking_and_keywords2(f) == (
         ("positional", 1, 2, 3, 4, 5),
         {"key": "value", "a": 1, "b": 2, "c": 3, "d": 4, "e": 5}
     )
 
     with pytest.raises(TypeError) as excinfo:
-        test_unpacking_error1(f)
+        m.test_unpacking_error1(f)
     assert "Got multiple values for keyword argument" in str(excinfo.value)
 
     with pytest.raises(TypeError) as excinfo:
-        test_unpacking_error2(f)
+        m.test_unpacking_error2(f)
     assert "Got multiple values for keyword argument" in str(excinfo.value)
 
     with pytest.raises(RuntimeError) as excinfo:
-        test_arg_conversion_error1(f)
+        m.test_arg_conversion_error1(f)
     assert "Unable to convert call argument" in str(excinfo.value)
 
     with pytest.raises(RuntimeError) as excinfo:
-        test_arg_conversion_error2(f)
+        m.test_arg_conversion_error2(f)
     assert "Unable to convert call argument" in str(excinfo.value)
 
 
 def test_lambda_closure_cleanup():
-    from pybind11_tests import test_cleanup, payload_cstats
-
-    test_cleanup()
-    cstats = payload_cstats()
+    m.test_cleanup()
+    cstats = m.payload_cstats()
     assert cstats.alive() == 0
     assert cstats.copy_constructions == 1
     assert cstats.move_constructions >= 1
@@ -74,25 +80,28 @@ def test_lambda_closure_cleanup():
 
 def test_cpp_function_roundtrip():
     """Test if passing a function pointer from C++ -> Python -> C++ yields the original pointer"""
-    from pybind11_tests import dummy_function, dummy_function2, test_dummy_function, roundtrip
 
-    assert test_dummy_function(dummy_function) == "matches dummy_function: eval(1) = 2"
-    assert test_dummy_function(roundtrip(dummy_function)) == "matches dummy_function: eval(1) = 2"
-    assert roundtrip(None, expect_none=True) is None
-    assert test_dummy_function(lambda x: x + 2) == "can't convert to function pointer: eval(1) = 3"
+    assert m.test_dummy_function(m.dummy_function) == "matches dummy_function: eval(1) = 2"
+    assert (m.test_dummy_function(m.roundtrip(m.dummy_function)) ==
+            "matches dummy_function: eval(1) = 2")
+    assert m.roundtrip(None, expect_none=True) is None
+    assert (m.test_dummy_function(lambda x: x + 2) ==
+            "can't convert to function pointer: eval(1) = 3")
 
     with pytest.raises(TypeError) as excinfo:
-        test_dummy_function(dummy_function2)
+        m.test_dummy_function(m.dummy_function2)
     assert "incompatible function arguments" in str(excinfo.value)
 
     with pytest.raises(TypeError) as excinfo:
-        test_dummy_function(lambda x, y: x + y)
+        m.test_dummy_function(lambda x, y: x + y)
     assert any(s in str(excinfo.value) for s in ("missing 1 required positional argument",
                                                  "takes exactly 2 arguments"))
 
 
 def test_function_signatures(doc):
-    from pybind11_tests import test_callback3, test_callback4
+    assert doc(m.test_callback3) == "test_callback3(arg0: Callable[[int], int]) -> str"
+    assert doc(m.test_callback4) == "test_callback4() -> Callable[[int], int]"
+
 
-    assert doc(test_callback3) == "test_callback3(arg0: Callable[[int], int]) -> str"
-    assert doc(test_callback4) == "test_callback4() -> Callable[[int], int]"
+def test_movable_object():
+    assert m.callback_with_movable(lambda _: None) is True
diff --git a/pybind11/tests/test_chrono.cpp b/pybind11/tests/test_chrono.cpp
index b86f57adf..195a93bba 100644
--- a/pybind11/tests/test_chrono.cpp
+++ b/pybind11/tests/test_chrono.cpp
@@ -8,52 +8,40 @@
     BSD-style license that can be found in the LICENSE file.
 */
 
-
 #include "pybind11_tests.h"
-#include "constructor_stats.h"
 #include <pybind11/chrono.h>
 
-// Return the current time off the wall clock
-std::chrono::system_clock::time_point test_chrono1() {
-    return std::chrono::system_clock::now();
-}
-
-// Round trip the passed in system clock time
-std::chrono::system_clock::time_point test_chrono2(std::chrono::system_clock::time_point t) {
-    return t;
-}
-
-// Round trip the passed in duration
-std::chrono::system_clock::duration test_chrono3(std::chrono::system_clock::duration d) {
-    return d;
-}
-
-// Difference between two passed in time_points
-std::chrono::system_clock::duration test_chrono4(std::chrono::system_clock::time_point a, std::chrono::system_clock::time_point b) {
-    return a - b;
+TEST_SUBMODULE(chrono, m) {
+    using system_time = std::chrono::system_clock::time_point;
+    using steady_time = std::chrono::steady_clock::time_point;
+    // test_chrono_system_clock
+    // Return the current time off the wall clock
+    m.def("test_chrono1", []() { return std::chrono::system_clock::now(); });
+
+    // test_chrono_system_clock_roundtrip
+    // Round trip the passed in system clock time
+    m.def("test_chrono2", [](system_time t) { return t; });
+
+    // test_chrono_duration_roundtrip
+    // Round trip the passed in duration
+    m.def("test_chrono3", [](std::chrono::system_clock::duration d) { return d; });
+
+    // test_chrono_duration_subtraction_equivalence
+    // Difference between two passed in time_points
+    m.def("test_chrono4", [](system_time a, system_time b) { return a - b; });
+
+    // test_chrono_steady_clock
+    // Return the current time off the steady_clock
+    m.def("test_chrono5", []() { return std::chrono::steady_clock::now(); });
+
+    // test_chrono_steady_clock_roundtrip
+    // Round trip a steady clock timepoint
+    m.def("test_chrono6", [](steady_time t) { return t; });
+
+    // test_floating_point_duration
+    // Roundtrip a duration in microseconds from a float argument
+    m.def("test_chrono7", [](std::chrono::microseconds t) { return t; });
+    // Float durations (issue #719)
+    m.def("test_chrono_float_diff", [](std::chrono::duration<float> a, std::chrono::duration<float> b) {
+        return a - b; });
 }
-
-// Return the current time off the steady_clock
-std::chrono::steady_clock::time_point test_chrono5() {
-    return std::chrono::steady_clock::now();
-}
-
-// Round trip a steady clock timepoint
-std::chrono::steady_clock::time_point test_chrono6(std::chrono::steady_clock::time_point t) {
-    return t;
-}
-
-// Roundtrip a duration in microseconds from a float argument
-std::chrono::microseconds test_chrono7(std::chrono::microseconds t) {
-    return t;
-}
-
-test_initializer chrono([] (py::module &m) {
-    m.def("test_chrono1", &test_chrono1);
-    m.def("test_chrono2", &test_chrono2);
-    m.def("test_chrono3", &test_chrono3);
-    m.def("test_chrono4", &test_chrono4);
-    m.def("test_chrono5", &test_chrono5);
-    m.def("test_chrono6", &test_chrono6);
-    m.def("test_chrono7", &test_chrono7);
-});
diff --git a/pybind11/tests/test_chrono.py b/pybind11/tests/test_chrono.py
index 94ca55c76..2b75bd191 100644
--- a/pybind11/tests/test_chrono.py
+++ b/pybind11/tests/test_chrono.py
@@ -1,11 +1,11 @@
+from pybind11_tests import chrono as m
+import datetime
 
 
 def test_chrono_system_clock():
-    from pybind11_tests import test_chrono1
-    import datetime
 
     # Get the time from both c++ and datetime
-    date1 = test_chrono1()
+    date1 = m.test_chrono1()
     date2 = datetime.datetime.today()
 
     # The returned value should be a datetime
@@ -25,13 +25,10 @@ def test_chrono_system_clock():
 
 
 def test_chrono_system_clock_roundtrip():
-    from pybind11_tests import test_chrono2
-    import datetime
-
     date1 = datetime.datetime.today()
 
     # Roundtrip the time
-    date2 = test_chrono2(date1)
+    date2 = m.test_chrono2(date1)
 
     # The returned value should be a datetime
     assert isinstance(date2, datetime.datetime)
@@ -44,8 +41,6 @@ def test_chrono_system_clock_roundtrip():
 
 
 def test_chrono_duration_roundtrip():
-    from pybind11_tests import test_chrono3
-    import datetime
 
     # Get the difference between two times (a timedelta)
     date1 = datetime.datetime.today()
@@ -55,7 +50,7 @@ def test_chrono_duration_roundtrip():
     # Make sure this is a timedelta
     assert isinstance(diff, datetime.timedelta)
 
-    cpp_diff = test_chrono3(diff)
+    cpp_diff = m.test_chrono3(diff)
 
     assert cpp_diff.days == diff.days
     assert cpp_diff.seconds == diff.seconds
@@ -63,14 +58,12 @@ def test_chrono_duration_roundtrip():
 
 
 def test_chrono_duration_subtraction_equivalence():
-    from pybind11_tests import test_chrono4
-    import datetime
 
     date1 = datetime.datetime.today()
     date2 = datetime.datetime.today()
 
     diff = date2 - date1
-    cpp_diff = test_chrono4(date2, date1)
+    cpp_diff = m.test_chrono4(date2, date1)
 
     assert cpp_diff.days == diff.days
     assert cpp_diff.seconds == diff.seconds
@@ -78,22 +71,13 @@ def test_chrono_duration_subtraction_equivalence():
 
 
 def test_chrono_steady_clock():
-    from pybind11_tests import test_chrono5
-    import datetime
-
-    time1 = test_chrono5()
-    time2 = test_chrono5()
-
+    time1 = m.test_chrono5()
     assert isinstance(time1, datetime.timedelta)
-    assert isinstance(time2, datetime.timedelta)
 
 
 def test_chrono_steady_clock_roundtrip():
-    from pybind11_tests import test_chrono6
-    import datetime
-
     time1 = datetime.timedelta(days=10, seconds=10, microseconds=100)
-    time2 = test_chrono6(time1)
+    time2 = m.test_chrono6(time1)
 
     assert isinstance(time2, datetime.timedelta)
 
@@ -104,13 +88,14 @@ def test_chrono_steady_clock_roundtrip():
 
 
 def test_floating_point_duration():
-    from pybind11_tests import test_chrono7
-    import datetime
-
-    # Test using 35.525123 seconds as an example floating point number in seconds
-    time = test_chrono7(35.525123)
+    # Test using a floating point number in seconds
+    time = m.test_chrono7(35.525123)
 
     assert isinstance(time, datetime.timedelta)
 
     assert time.seconds == 35
     assert 525122 <= time.microseconds <= 525123
+
+    diff = m.test_chrono_float_diff(43.789012, 1.123456)
+    assert diff.seconds == 42
+    assert 665556 <= diff.microseconds <= 665557
diff --git a/pybind11/tests/test_class.cpp b/pybind11/tests/test_class.cpp
new file mode 100644
index 000000000..222190617
--- /dev/null
+++ b/pybind11/tests/test_class.cpp
@@ -0,0 +1,357 @@
+/*
+    tests/test_class.cpp -- test py::class_ definitions and basic functionality
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+#include "local_bindings.h"
+
+TEST_SUBMODULE(class_, m) {
+    // test_instance
+    struct NoConstructor {
+        static NoConstructor *new_instance() {
+            auto *ptr = new NoConstructor();
+            print_created(ptr, "via new_instance");
+            return ptr;
+        }
+        ~NoConstructor() { print_destroyed(this); }
+    };
+
+    py::class_<NoConstructor>(m, "NoConstructor")
+        .def_static("new_instance", &NoConstructor::new_instance, "Return an instance");
+
+    // test_inheritance
+    class Pet {
+    public:
+        Pet(const std::string &name, const std::string &species)
+            : m_name(name), m_species(species) {}
+        std::string name() const { return m_name; }
+        std::string species() const { return m_species; }
+    private:
+        std::string m_name;
+        std::string m_species;
+    };
+
+    class Dog : public Pet {
+    public:
+        Dog(const std::string &name) : Pet(name, "dog") {}
+        std::string bark() const { return "Woof!"; }
+    };
+
+    class Rabbit : public Pet {
+    public:
+        Rabbit(const std::string &name) : Pet(name, "parrot") {}
+    };
+
+    class Hamster : public Pet {
+    public:
+        Hamster(const std::string &name) : Pet(name, "rodent") {}
+    };
+
+    class Chimera : public Pet {
+        Chimera() : Pet("Kimmy", "chimera") {}
+    };
+
+    py::class_<Pet> pet_class(m, "Pet");
+    pet_class
+        .def(py::init<std::string, std::string>())
+        .def("name", &Pet::name)
+        .def("species", &Pet::species);
+
+    /* One way of declaring a subclass relationship: reference parent's class_ object */
+    py::class_<Dog>(m, "Dog", pet_class)
+        .def(py::init<std::string>());
+
+    /* Another way of declaring a subclass relationship: reference parent's C++ type */
+    py::class_<Rabbit, Pet>(m, "Rabbit")
+        .def(py::init<std::string>());
+
+    /* And another: list parent in class template arguments */
+    py::class_<Hamster, Pet>(m, "Hamster")
+        .def(py::init<std::string>());
+
+    /* Constructors are not inherited by default */
+    py::class_<Chimera, Pet>(m, "Chimera");
+
+    m.def("pet_name_species", [](const Pet &pet) { return pet.name() + " is a " + pet.species(); });
+    m.def("dog_bark", [](const Dog &dog) { return dog.bark(); });
+
+    // test_automatic_upcasting
+    struct BaseClass { virtual ~BaseClass() {} };
+    struct DerivedClass1 : BaseClass { };
+    struct DerivedClass2 : BaseClass { };
+
+    py::class_<BaseClass>(m, "BaseClass").def(py::init<>());
+    py::class_<DerivedClass1>(m, "DerivedClass1").def(py::init<>());
+    py::class_<DerivedClass2>(m, "DerivedClass2").def(py::init<>());
+
+    m.def("return_class_1", []() -> BaseClass* { return new DerivedClass1(); });
+    m.def("return_class_2", []() -> BaseClass* { return new DerivedClass2(); });
+    m.def("return_class_n", [](int n) -> BaseClass* {
+        if (n == 1) return new DerivedClass1();
+        if (n == 2) return new DerivedClass2();
+        return new BaseClass();
+    });
+    m.def("return_none", []() -> BaseClass* { return nullptr; });
+
+    // test_isinstance
+    m.def("check_instances", [](py::list l) {
+        return py::make_tuple(
+            py::isinstance<py::tuple>(l[0]),
+            py::isinstance<py::dict>(l[1]),
+            py::isinstance<Pet>(l[2]),
+            py::isinstance<Pet>(l[3]),
+            py::isinstance<Dog>(l[4]),
+            py::isinstance<Rabbit>(l[5]),
+            py::isinstance<UnregisteredType>(l[6])
+        );
+    });
+
+    // test_mismatched_holder
+    struct MismatchBase1 { };
+    struct MismatchDerived1 : MismatchBase1 { };
+
+    struct MismatchBase2 { };
+    struct MismatchDerived2 : MismatchBase2 { };
+
+    m.def("mismatched_holder_1", []() {
+        auto mod = py::module::import("__main__");
+        py::class_<MismatchBase1, std::shared_ptr<MismatchBase1>>(mod, "MismatchBase1");
+        py::class_<MismatchDerived1, MismatchBase1>(mod, "MismatchDerived1");
+    });
+    m.def("mismatched_holder_2", []() {
+        auto mod = py::module::import("__main__");
+        py::class_<MismatchBase2>(mod, "MismatchBase2");
+        py::class_<MismatchDerived2, std::shared_ptr<MismatchDerived2>,
+                   MismatchBase2>(mod, "MismatchDerived2");
+    });
+
+    // test_override_static
+    // #511: problem with inheritance + overwritten def_static
+    struct MyBase {
+        static std::unique_ptr<MyBase> make() {
+            return std::unique_ptr<MyBase>(new MyBase());
+        }
+    };
+
+    struct MyDerived : MyBase {
+        static std::unique_ptr<MyDerived> make() {
+            return std::unique_ptr<MyDerived>(new MyDerived());
+        }
+    };
+
+    py::class_<MyBase>(m, "MyBase")
+        .def_static("make", &MyBase::make);
+
+    py::class_<MyDerived, MyBase>(m, "MyDerived")
+        .def_static("make", &MyDerived::make)
+        .def_static("make2", &MyDerived::make);
+
+    // test_implicit_conversion_life_support
+    struct ConvertibleFromUserType {
+        int i;
+
+        ConvertibleFromUserType(UserType u) : i(u.value()) { }
+    };
+
+    py::class_<ConvertibleFromUserType>(m, "AcceptsUserType")
+        .def(py::init<UserType>());
+    py::implicitly_convertible<UserType, ConvertibleFromUserType>();
+
+    m.def("implicitly_convert_argument", [](const ConvertibleFromUserType &r) { return r.i; });
+    m.def("implicitly_convert_variable", [](py::object o) {
+        // `o` is `UserType` and `r` is a reference to a temporary created by implicit
+        // conversion. This is valid when called inside a bound function because the temp
+        // object is attached to the same life support system as the arguments.
+        const auto &r = o.cast<const ConvertibleFromUserType &>();
+        return r.i;
+    });
+    m.add_object("implicitly_convert_variable_fail", [&] {
+        auto f = [](PyObject *, PyObject *args) -> PyObject * {
+            auto o = py::reinterpret_borrow<py::tuple>(args)[0];
+            try { // It should fail here because there is no life support.
+                o.cast<const ConvertibleFromUserType &>();
+            } catch (const py::cast_error &e) {
+                return py::str(e.what()).release().ptr();
+            }
+            return py::str().release().ptr();
+        };
+
+        auto def = new PyMethodDef{"f", f, METH_VARARGS, nullptr};
+        return py::reinterpret_steal<py::object>(PyCFunction_NewEx(def, nullptr, m.ptr()));
+    }());
+
+    // test_operator_new_delete
+    struct HasOpNewDel {
+        std::uint64_t i;
+        static void *operator new(size_t s) { py::print("A new", s); return ::operator new(s); }
+        static void *operator new(size_t s, void *ptr) { py::print("A placement-new", s); return ptr; }
+        static void operator delete(void *p) { py::print("A delete"); return ::operator delete(p); }
+    };
+    struct HasOpNewDelSize {
+        std::uint32_t i;
+        static void *operator new(size_t s) { py::print("B new", s); return ::operator new(s); }
+        static void *operator new(size_t s, void *ptr) { py::print("B placement-new", s); return ptr; }
+        static void operator delete(void *p, size_t s) { py::print("B delete", s); return ::operator delete(p); }
+    };
+    struct AliasedHasOpNewDelSize {
+        std::uint64_t i;
+        static void *operator new(size_t s) { py::print("C new", s); return ::operator new(s); }
+        static void *operator new(size_t s, void *ptr) { py::print("C placement-new", s); return ptr; }
+        static void operator delete(void *p, size_t s) { py::print("C delete", s); return ::operator delete(p); }
+        virtual ~AliasedHasOpNewDelSize() = default;
+    };
+    struct PyAliasedHasOpNewDelSize : AliasedHasOpNewDelSize {
+        PyAliasedHasOpNewDelSize() = default;
+        PyAliasedHasOpNewDelSize(int) { }
+        std::uint64_t j;
+    };
+    struct HasOpNewDelBoth {
+        std::uint32_t i[8];
+        static void *operator new(size_t s) { py::print("D new", s); return ::operator new(s); }
+        static void *operator new(size_t s, void *ptr) { py::print("D placement-new", s); return ptr; }
+        static void operator delete(void *p) { py::print("D delete"); return ::operator delete(p); }
+        static void operator delete(void *p, size_t s) { py::print("D wrong delete", s); return ::operator delete(p); }
+    };
+    py::class_<HasOpNewDel>(m, "HasOpNewDel").def(py::init<>());
+    py::class_<HasOpNewDelSize>(m, "HasOpNewDelSize").def(py::init<>());
+    py::class_<HasOpNewDelBoth>(m, "HasOpNewDelBoth").def(py::init<>());
+    py::class_<AliasedHasOpNewDelSize, PyAliasedHasOpNewDelSize> aliased(m, "AliasedHasOpNewDelSize");
+    aliased.def(py::init<>());
+    aliased.attr("size_noalias") = py::int_(sizeof(AliasedHasOpNewDelSize));
+    aliased.attr("size_alias") = py::int_(sizeof(PyAliasedHasOpNewDelSize));
+
+    // This test is actually part of test_local_bindings (test_duplicate_local), but we need a
+    // definition in a different compilation unit within the same module:
+    bind_local<LocalExternal, 17>(m, "LocalExternal", py::module_local());
+
+    // test_bind_protected_functions
+    class ProtectedA {
+    protected:
+        int foo() const { return value; }
+
+    private:
+        int value = 42;
+    };
+
+    class PublicistA : public ProtectedA {
+    public:
+        using ProtectedA::foo;
+    };
+
+    py::class_<ProtectedA>(m, "ProtectedA")
+        .def(py::init<>())
+#if !defined(_MSC_VER) || _MSC_VER >= 1910
+        .def("foo", &PublicistA::foo);
+#else
+        .def("foo", static_cast<int (ProtectedA::*)() const>(&PublicistA::foo));
+#endif
+
+    class ProtectedB {
+    public:
+        virtual ~ProtectedB() = default;
+
+    protected:
+        virtual int foo() const { return value; }
+
+    private:
+        int value = 42;
+    };
+
+    class TrampolineB : public ProtectedB {
+    public:
+        int foo() const override { PYBIND11_OVERLOAD(int, ProtectedB, foo, ); }
+    };
+
+    class PublicistB : public ProtectedB {
+    public:
+        using ProtectedB::foo;
+    };
+
+    py::class_<ProtectedB, TrampolineB>(m, "ProtectedB")
+        .def(py::init<>())
+#if !defined(_MSC_VER) || _MSC_VER >= 1910
+        .def("foo", &PublicistB::foo);
+#else
+        .def("foo", static_cast<int (ProtectedB::*)() const>(&PublicistB::foo));
+#endif
+
+    // test_brace_initialization
+    struct BraceInitialization {
+        int field1;
+        std::string field2;
+    };
+
+    py::class_<BraceInitialization>(m, "BraceInitialization")
+        .def(py::init<int, const std::string &>())
+        .def_readwrite("field1", &BraceInitialization::field1)
+        .def_readwrite("field2", &BraceInitialization::field2);
+
+    // test_reentrant_implicit_conversion_failure
+    // #1035: issue with runaway reentrant implicit conversion
+    struct BogusImplicitConversion {
+        BogusImplicitConversion(const BogusImplicitConversion &) { }
+    };
+
+    py::class_<BogusImplicitConversion>(m, "BogusImplicitConversion")
+        .def(py::init<const BogusImplicitConversion &>());
+
+    py::implicitly_convertible<int, BogusImplicitConversion>();
+}
+
+template <int N> class BreaksBase { public: virtual ~BreaksBase() = default; };
+template <int N> class BreaksTramp : public BreaksBase<N> {};
+// These should all compile just fine:
+typedef py::class_<BreaksBase<1>, std::unique_ptr<BreaksBase<1>>, BreaksTramp<1>> DoesntBreak1;
+typedef py::class_<BreaksBase<2>, BreaksTramp<2>, std::unique_ptr<BreaksBase<2>>> DoesntBreak2;
+typedef py::class_<BreaksBase<3>, std::unique_ptr<BreaksBase<3>>> DoesntBreak3;
+typedef py::class_<BreaksBase<4>, BreaksTramp<4>> DoesntBreak4;
+typedef py::class_<BreaksBase<5>> DoesntBreak5;
+typedef py::class_<BreaksBase<6>, std::shared_ptr<BreaksBase<6>>, BreaksTramp<6>> DoesntBreak6;
+typedef py::class_<BreaksBase<7>, BreaksTramp<7>, std::shared_ptr<BreaksBase<7>>> DoesntBreak7;
+typedef py::class_<BreaksBase<8>, std::shared_ptr<BreaksBase<8>>> DoesntBreak8;
+#define CHECK_BASE(N) static_assert(std::is_same<typename DoesntBreak##N::type, BreaksBase<N>>::value, \
+        "DoesntBreak" #N " has wrong type!")
+CHECK_BASE(1); CHECK_BASE(2); CHECK_BASE(3); CHECK_BASE(4); CHECK_BASE(5); CHECK_BASE(6); CHECK_BASE(7); CHECK_BASE(8);
+#define CHECK_ALIAS(N) static_assert(DoesntBreak##N::has_alias && std::is_same<typename DoesntBreak##N::type_alias, BreaksTramp<N>>::value, \
+        "DoesntBreak" #N " has wrong type_alias!")
+#define CHECK_NOALIAS(N) static_assert(!DoesntBreak##N::has_alias && std::is_void<typename DoesntBreak##N::type_alias>::value, \
+        "DoesntBreak" #N " has type alias, but shouldn't!")
+CHECK_ALIAS(1); CHECK_ALIAS(2); CHECK_NOALIAS(3); CHECK_ALIAS(4); CHECK_NOALIAS(5); CHECK_ALIAS(6); CHECK_ALIAS(7); CHECK_NOALIAS(8);
+#define CHECK_HOLDER(N, TYPE) static_assert(std::is_same<typename DoesntBreak##N::holder_type, std::TYPE##_ptr<BreaksBase<N>>>::value, \
+        "DoesntBreak" #N " has wrong holder_type!")
+CHECK_HOLDER(1, unique); CHECK_HOLDER(2, unique); CHECK_HOLDER(3, unique); CHECK_HOLDER(4, unique); CHECK_HOLDER(5, unique);
+CHECK_HOLDER(6, shared); CHECK_HOLDER(7, shared); CHECK_HOLDER(8, shared);
+
+// There's no nice way to test that these fail because they fail to compile; leave them here,
+// though, so that they can be manually tested by uncommenting them (and seeing that compilation
+// failures occur).
+
+// Force instantiation of Breaks<N> (a typedef alone won't); N is stringized into the message, no trailing ';':
+#define CHECK_BROKEN(N) static_assert(std::is_same<typename Breaks##N::type, BreaksBase<-N>>::value, \
+        "Breaks" #N " has wrong type!")
+
+//// Two holder classes:
+//typedef py::class_<BreaksBase<-1>, std::unique_ptr<BreaksBase<-1>>, std::unique_ptr<BreaksBase<-1>>> Breaks1;
+//CHECK_BROKEN(1);
+//// Two aliases:
+//typedef py::class_<BreaksBase<-2>, BreaksTramp<-2>, BreaksTramp<-2>> Breaks2;
+//CHECK_BROKEN(2);
+//// Holder + 2 aliases
+//typedef py::class_<BreaksBase<-3>, std::unique_ptr<BreaksBase<-3>>, BreaksTramp<-3>, BreaksTramp<-3>> Breaks3;
+//CHECK_BROKEN(3);
+//// Alias + 2 holders
+//typedef py::class_<BreaksBase<-4>, std::unique_ptr<BreaksBase<-4>>, BreaksTramp<-4>, std::shared_ptr<BreaksBase<-4>>> Breaks4;
+//CHECK_BROKEN(4);
+//// Invalid option (not a subclass or holder)
+//typedef py::class_<BreaksBase<-5>, BreaksTramp<-4>> Breaks5;
+//CHECK_BROKEN(5);
+//// Invalid option: multiple inheritance not supported:
+//template <> struct BreaksBase<-8> : BreaksBase<-6>, BreaksBase<-7> {};
+//typedef py::class_<BreaksBase<-8>, BreaksBase<-6>, BreaksBase<-7>> Breaks8;
+//CHECK_BROKEN(8);
diff --git a/pybind11/tests/test_class.py b/pybind11/tests/test_class.py
new file mode 100644
index 000000000..412d6798e
--- /dev/null
+++ b/pybind11/tests/test_class.py
@@ -0,0 +1,235 @@
+import pytest
+
+from pybind11_tests import class_ as m
+from pybind11_tests import UserType, ConstructorStats
+
+
+def test_repr():
+    # In Python 3.3+, repr() accesses __qualname__
+    assert "pybind11_type" in repr(type(UserType))
+    assert "UserType" in repr(UserType)
+
+
+def test_instance(msg):
+    with pytest.raises(TypeError) as excinfo:
+        m.NoConstructor()
+    assert msg(excinfo.value) == "m.class_.NoConstructor: No constructor defined!"
+
+    instance = m.NoConstructor.new_instance()
+
+    cstats = ConstructorStats.get(m.NoConstructor)
+    assert cstats.alive() == 1
+    del instance
+    assert cstats.alive() == 0
+
+
+def test_docstrings(doc):
+    assert doc(UserType) == "A `py::class_` type for testing"
+    assert UserType.__name__ == "UserType"
+    assert UserType.__module__ == "pybind11_tests"
+    assert UserType.get_value.__name__ == "get_value"
+    assert UserType.get_value.__module__ == "pybind11_tests"
+
+    assert doc(UserType.get_value) == """
+        get_value(self: m.UserType) -> int
+
+        Get value using a method
+    """
+    assert doc(UserType.value) == "Get/set value using a property"
+
+    assert doc(m.NoConstructor.new_instance) == """
+        new_instance() -> m.class_.NoConstructor
+
+        Return an instance
+    """
+
+
+def test_inheritance(msg):
+    roger = m.Rabbit('Rabbit')
+    assert roger.name() + " is a " + roger.species() == "Rabbit is a parrot"
+    assert m.pet_name_species(roger) == "Rabbit is a parrot"
+
+    polly = m.Pet('Polly', 'parrot')
+    assert polly.name() + " is a " + polly.species() == "Polly is a parrot"
+    assert m.pet_name_species(polly) == "Polly is a parrot"
+
+    molly = m.Dog('Molly')
+    assert molly.name() + " is a " + molly.species() == "Molly is a dog"
+    assert m.pet_name_species(molly) == "Molly is a dog"
+
+    fred = m.Hamster('Fred')
+    assert fred.name() + " is a " + fred.species() == "Fred is a rodent"
+
+    assert m.dog_bark(molly) == "Woof!"
+
+    with pytest.raises(TypeError) as excinfo:
+        m.dog_bark(polly)
+    assert msg(excinfo.value) == """
+        dog_bark(): incompatible function arguments. The following argument types are supported:
+            1. (arg0: m.class_.Dog) -> str
+
+        Invoked with: <m.class_.Pet object at 0>
+    """
+
+    with pytest.raises(TypeError) as excinfo:
+        m.Chimera("lion", "goat")
+    assert "No constructor defined!" in str(excinfo.value)
+
+
+def test_automatic_upcasting():
+    assert type(m.return_class_1()).__name__ == "DerivedClass1"
+    assert type(m.return_class_2()).__name__ == "DerivedClass2"
+    assert type(m.return_none()).__name__ == "NoneType"
+    # Repeat these a few times in a random order to ensure no invalid caching is applied
+    assert type(m.return_class_n(1)).__name__ == "DerivedClass1"
+    assert type(m.return_class_n(2)).__name__ == "DerivedClass2"
+    assert type(m.return_class_n(0)).__name__ == "BaseClass"
+    assert type(m.return_class_n(2)).__name__ == "DerivedClass2"
+    assert type(m.return_class_n(2)).__name__ == "DerivedClass2"
+    assert type(m.return_class_n(0)).__name__ == "BaseClass"
+    assert type(m.return_class_n(1)).__name__ == "DerivedClass1"
+
+
+def test_isinstance():
+    objects = [tuple(), dict(), m.Pet("Polly", "parrot")] + [m.Dog("Molly")] * 4
+    expected = (True, True, True, True, True, False, False)
+    assert m.check_instances(objects) == expected
+
+
+def test_mismatched_holder():
+    import re
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.mismatched_holder_1()
+    assert re.match('generic_type: type ".*MismatchDerived1" does not have a non-default '
+                    'holder type while its base ".*MismatchBase1" does', str(excinfo.value))
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.mismatched_holder_2()
+    assert re.match('generic_type: type ".*MismatchDerived2" has a non-default holder type '
+                    'while its base ".*MismatchBase2" does not', str(excinfo.value))
+
+
+def test_override_static():
+    """#511: problem with inheritance + overwritten def_static"""
+    b = m.MyBase.make()
+    d1 = m.MyDerived.make2()
+    d2 = m.MyDerived.make()
+
+    assert isinstance(b, m.MyBase)
+    assert isinstance(d1, m.MyDerived)
+    assert isinstance(d2, m.MyDerived)
+
+
+def test_implicit_conversion_life_support():
+    """Ensure the lifetime of temporary objects created for implicit conversions"""
+    assert m.implicitly_convert_argument(UserType(5)) == 5
+    assert m.implicitly_convert_variable(UserType(5)) == 5
+
+    assert "outside a bound function" in m.implicitly_convert_variable_fail(UserType(5))
+
+
+def test_operator_new_delete(capture):
+    """Tests that class-specific operator new/delete functions are invoked"""
+
+    class SubAliased(m.AliasedHasOpNewDelSize):
+        pass
+
+    with capture:
+        a = m.HasOpNewDel()
+        b = m.HasOpNewDelSize()
+        d = m.HasOpNewDelBoth()
+    assert capture == """
+        A new 8
+        B new 4
+        D new 32
+    """
+    sz_alias = str(m.AliasedHasOpNewDelSize.size_alias)
+    sz_noalias = str(m.AliasedHasOpNewDelSize.size_noalias)
+    with capture:
+        c = m.AliasedHasOpNewDelSize()
+        c2 = SubAliased()
+    assert capture == (
+        "C new " + sz_noalias + "\n" +
+        "C new " + sz_alias + "\n"
+    )
+
+    with capture:
+        del a
+        pytest.gc_collect()
+        del b
+        pytest.gc_collect()
+        del d
+        pytest.gc_collect()
+    assert capture == """
+        A delete
+        B delete 4
+        D delete
+    """
+
+    with capture:
+        del c
+        pytest.gc_collect()
+        del c2
+        pytest.gc_collect()
+    assert capture == (
+        "C delete " + sz_noalias + "\n" +
+        "C delete " + sz_alias + "\n"
+    )
+
+
+def test_bind_protected_functions():
+    """Expose protected member functions to Python using a helper class"""
+    a = m.ProtectedA()
+    assert a.foo() == 42
+
+    b = m.ProtectedB()
+    assert b.foo() == 42
+
+    class C(m.ProtectedB):
+        def __init__(self):
+            m.ProtectedB.__init__(self)
+
+        def foo(self):
+            return 0
+
+    c = C()
+    assert c.foo() == 0
+
+
+def test_brace_initialization():
+    """ Tests that simple POD classes can be constructed using C++11 brace initialization """
+    a = m.BraceInitialization(123, "test")
+    assert a.field1 == 123
+    assert a.field2 == "test"
+
+
+@pytest.unsupported_on_pypy
+def test_class_refcount():
+    """Instances must correctly increase/decrease the reference count of their types (#1029)"""
+    from sys import getrefcount
+
+    class PyDog(m.Dog):
+        pass
+
+    for cls in m.Dog, PyDog:
+        refcount_1 = getrefcount(cls)
+        molly = [cls("Molly") for _ in range(10)]
+        refcount_2 = getrefcount(cls)
+
+        del molly
+        pytest.gc_collect()
+        refcount_3 = getrefcount(cls)
+
+        assert refcount_1 == refcount_3
+        assert refcount_2 > refcount_1
+
+
+def test_reentrant_implicit_conversion_failure(msg):
+    # ensure that there is no runaway reentrant implicit conversion (#1035)
+    with pytest.raises(TypeError) as excinfo:
+        m.BogusImplicitConversion(0)
+    assert msg(excinfo.value) == '''__init__(): incompatible constructor arguments. The following argument types are supported:
+    1. m.class_.BogusImplicitConversion(arg0: m.class_.BogusImplicitConversion)
+
+Invoked with: 0'''
diff --git a/pybind11/tests/test_class_args.cpp b/pybind11/tests/test_class_args.cpp
deleted file mode 100644
index e18b39db2..000000000
--- a/pybind11/tests/test_class_args.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
-    tests/test_class_args.cpp -- tests that various way of defining a class work
-
-    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
-
-    All rights reserved. Use of this source code is governed by a
-    BSD-style license that can be found in the LICENSE file.
-*/
-
-#include "pybind11_tests.h"
-
-
-template <int N> class BreaksBase {};
-template <int N> class BreaksTramp : public BreaksBase<N> {};
-// These should all compile just fine:
-typedef py::class_<BreaksBase<1>, std::unique_ptr<BreaksBase<1>>, BreaksTramp<1>> DoesntBreak1;
-typedef py::class_<BreaksBase<2>, BreaksTramp<2>, std::unique_ptr<BreaksBase<2>>> DoesntBreak2;
-typedef py::class_<BreaksBase<3>, std::unique_ptr<BreaksBase<3>>> DoesntBreak3;
-typedef py::class_<BreaksBase<4>, BreaksTramp<4>> DoesntBreak4;
-typedef py::class_<BreaksBase<5>> DoesntBreak5;
-typedef py::class_<BreaksBase<6>, std::shared_ptr<BreaksBase<6>>, BreaksTramp<6>> DoesntBreak6;
-typedef py::class_<BreaksBase<7>, BreaksTramp<7>, std::shared_ptr<BreaksBase<7>>> DoesntBreak7;
-typedef py::class_<BreaksBase<8>, std::shared_ptr<BreaksBase<8>>> DoesntBreak8;
-#define CHECK_BASE(N) static_assert(std::is_same<typename DoesntBreak##N::type, BreaksBase<N>>::value, \
-        "DoesntBreak" #N " has wrong type!")
-CHECK_BASE(1); CHECK_BASE(2); CHECK_BASE(3); CHECK_BASE(4); CHECK_BASE(5); CHECK_BASE(6); CHECK_BASE(7); CHECK_BASE(8);
-#define CHECK_ALIAS(N) static_assert(DoesntBreak##N::has_alias && std::is_same<typename DoesntBreak##N::type_alias, BreaksTramp<N>>::value, \
-        "DoesntBreak" #N " has wrong type_alias!")
-#define CHECK_NOALIAS(N) static_assert(!DoesntBreak##N::has_alias && std::is_void<typename DoesntBreak##N::type_alias>::value, \
-        "DoesntBreak" #N " has type alias, but shouldn't!")
-CHECK_ALIAS(1); CHECK_ALIAS(2); CHECK_NOALIAS(3); CHECK_ALIAS(4); CHECK_NOALIAS(5); CHECK_ALIAS(6); CHECK_ALIAS(7); CHECK_NOALIAS(8);
-#define CHECK_HOLDER(N, TYPE) static_assert(std::is_same<typename DoesntBreak##N::holder_type, std::TYPE##_ptr<BreaksBase<N>>>::value, \
-        "DoesntBreak" #N " has wrong holder_type!")
-CHECK_HOLDER(1, unique); CHECK_HOLDER(2, unique); CHECK_HOLDER(3, unique); CHECK_HOLDER(4, unique); CHECK_HOLDER(5, unique);
-CHECK_HOLDER(6, shared); CHECK_HOLDER(7, shared); CHECK_HOLDER(8, shared);
-
-// There's no nice way to test that these fail because they fail to compile; leave them here,
-// though, so that they can be manually tested by uncommenting them (and seeing that compilation
-// failures occurs).
-
-// We have to actually look into the type: the typedef alone isn't enough to instantiate the type:
-#define CHECK_BROKEN(N) static_assert(std::is_same<typename Breaks##N::type, BreaksBase<-N>>::value, \
-        "Breaks1 has wrong type!");
-
-//// Two holder classes:
-//typedef py::class_<BreaksBase<-1>, std::unique_ptr<BreaksBase<-1>>, std::unique_ptr<BreaksBase<-1>>> Breaks1;
-//CHECK_BROKEN(1);
-//// Two aliases:
-//typedef py::class_<BreaksBase<-2>, BreaksTramp<-2>, BreaksTramp<-2>> Breaks2;
-//CHECK_BROKEN(2);
-//// Holder + 2 aliases
-//typedef py::class_<BreaksBase<-3>, std::unique_ptr<BreaksBase<-3>>, BreaksTramp<-3>, BreaksTramp<-3>> Breaks3;
-//CHECK_BROKEN(3);
-//// Alias + 2 holders
-//typedef py::class_<BreaksBase<-4>, std::unique_ptr<BreaksBase<-4>>, BreaksTramp<-4>, std::shared_ptr<BreaksBase<-4>>> Breaks4;
-//CHECK_BROKEN(4);
-//// Invalid option (not a subclass or holder)
-//typedef py::class_<BreaksBase<-5>, BreaksTramp<-4>> Breaks5;
-//CHECK_BROKEN(5);
-//// Invalid option: multiple inheritance not supported:
-//template <> struct BreaksBase<-8> : BreaksBase<-6>, BreaksBase<-7> {};
-//typedef py::class_<BreaksBase<-8>, BreaksBase<-6>, BreaksBase<-7>> Breaks8;
-//CHECK_BROKEN(8);
-
-test_initializer class_args([](py::module &m) {
-    // Just test that this compiled okay
-    m.def("class_args_noop", []() {});
-});
diff --git a/pybind11/tests/test_class_args.py b/pybind11/tests/test_class_args.py
deleted file mode 100644
index 40cbcec9f..000000000
--- a/pybind11/tests/test_class_args.py
+++ /dev/null
@@ -1,8 +0,0 @@
-
-
-def test_class_args():
-    """There's basically nothing to test here; just make sure the code compiled
-    and declared its definition
-    """
-    from pybind11_tests import class_args_noop
-    class_args_noop()
diff --git a/pybind11/tests/test_cmake_build/CMakeLists.txt b/pybind11/tests/test_cmake_build/CMakeLists.txt
new file mode 100644
index 000000000..c9b5fcb2e
--- /dev/null
+++ b/pybind11/tests/test_cmake_build/CMakeLists.txt
@@ -0,0 +1,58 @@
+add_custom_target(test_cmake_build)
+
+if(CMAKE_VERSION VERSION_LESS 3.1)
+  # 3.0 needed for interface library for subdirectory_target/installed_target
+  # 3.1 needed for cmake -E env for testing
+  return()
+endif()
+
+include(CMakeParseArguments)
+# Adds target test_<name>, which runs `ctest --build-and-test` on the project in ./<name> and
+# builds its `check` target. With INSTALL, test_<name> also depends on the mock_install target.
+function(pybind11_add_build_test name)
+  cmake_parse_arguments(ARG "INSTALL" "" "" ${ARGN})
+  set(nested_cache_args
+    "-DCMAKE_PREFIX_PATH=${PROJECT_BINARY_DIR}/mock_install"
+    "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
+    "-DPYTHON_EXECUTABLE:FILEPATH=${PYTHON_EXECUTABLE}"
+    "-DPYBIND11_CPP_STANDARD=${PYBIND11_CPP_STANDARD}"
+  )
+  if(NOT ARG_INSTALL)
+    list(APPEND nested_cache_args "-DPYBIND11_PROJECT_DIR=${PROJECT_SOURCE_DIR}")
+  endif()
+  add_custom_target(test_${name}
+    ${CMAKE_CTEST_COMMAND} --quiet --output-log ${name}.log
+    --build-and-test "${CMAKE_CURRENT_SOURCE_DIR}/${name}" "${CMAKE_CURRENT_BINARY_DIR}/${name}"
+    --build-config Release --build-noclean
+    --build-generator ${CMAKE_GENERATOR}
+    $<$<BOOL:${CMAKE_GENERATOR_PLATFORM}>:--build-generator-platform> ${CMAKE_GENERATOR_PLATFORM}
+    --build-makeprogram ${CMAKE_MAKE_PROGRAM}
+    --build-target check
+    --build-options ${nested_cache_args}
+  )
+  add_dependencies(test_cmake_build test_${name})
+  if(ARG_INSTALL)
+    add_dependencies(test_${name} mock_install)
+  endif()
+endfunction()
+
+pybind11_add_build_test(subdirectory_function)
+pybind11_add_build_test(subdirectory_target)
+if(NOT "${PYTHON_MODULE_EXTENSION}" MATCHES "pypy")  # quoted: stays valid if the variable is empty
+  pybind11_add_build_test(subdirectory_embed)
+endif()
+
+if(PYBIND11_INSTALL)
+  add_custom_target(mock_install ${CMAKE_COMMAND}
+    "-DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR}/mock_install"
+    -P "${PROJECT_BINARY_DIR}/cmake_install.cmake"
+  )
+  # Installed-package variants: INSTALL makes each test target depend on mock_install.
+  pybind11_add_build_test(installed_function INSTALL)
+  pybind11_add_build_test(installed_target INSTALL)
+  if(NOT "${PYTHON_MODULE_EXTENSION}" MATCHES "pypy")  # quoted: stays valid if the variable is empty
+    pybind11_add_build_test(installed_embed INSTALL)
+  endif()
+endif()
+
+add_dependencies(check test_cmake_build)
diff --git a/pybind11/tests/test_cmake_build/embed.cpp b/pybind11/tests/test_cmake_build/embed.cpp
new file mode 100644
index 000000000..b9581d2fd
--- /dev/null
+++ b/pybind11/tests/test_cmake_build/embed.cpp
@@ -0,0 +1,21 @@
+#include <pybind11/embed.h>
+namespace py = pybind11;
+
+PYBIND11_EMBEDDED_MODULE(test_cmake_build, m) {
+    m.def("add", [](int i, int j) { return i + j; });
+}
+
+int main(int argc, char *argv[]) {
+    if (argc != 2)
+        throw std::runtime_error("Expected test.py file as the first argument");
+    auto test_py_file = argv[1];
+
+    py::scoped_interpreter guard{};
+
+    auto m = py::module::import("test_cmake_build");
+    if (m.attr("add")(1, 2).cast<int>() != 3)
+        throw std::runtime_error("embed.cpp failed");
+
+    py::module::import("sys").attr("argv") = py::make_tuple("test.py", "embed.cpp");
+    py::eval_file(test_py_file, py::globals());
+}
diff --git a/pybind11/tests/test_cmake_build/installed_embed/CMakeLists.txt b/pybind11/tests/test_cmake_build/installed_embed/CMakeLists.txt
new file mode 100644
index 000000000..f7fc09c21
--- /dev/null
+++ b/pybind11/tests/test_cmake_build/installed_embed/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 3.0)
+project(test_installed_embed CXX)
+
+set(CMAKE_MODULE_PATH "")
+find_package(pybind11 CONFIG REQUIRED)
+message(STATUS "Found pybind11 v${pybind11_VERSION}: ${pybind11_INCLUDE_DIRS}")
+
+add_executable(test_cmake_build ../embed.cpp)
+target_link_libraries(test_cmake_build PRIVATE pybind11::embed)
+
+# Do not treat includes from IMPORTED target as SYSTEM (Python headers in pybind11::embed).
+# This may be needed to resolve header conflicts, e.g. between Python release and debug headers.
+set_target_properties(test_cmake_build PROPERTIES NO_SYSTEM_FROM_IMPORTED ON)
+
+add_custom_target(check $<TARGET_FILE:test_cmake_build> ${PROJECT_SOURCE_DIR}/../test.py)
diff --git a/pybind11/tests/test_cmake_build/installed_target/CMakeLists.txt b/pybind11/tests/test_cmake_build/installed_target/CMakeLists.txt
index dd206592f..cd3ae6f7d 100644
--- a/pybind11/tests/test_cmake_build/installed_target/CMakeLists.txt
+++ b/pybind11/tests/test_cmake_build/installed_target/CMakeLists.txt
@@ -14,5 +14,9 @@ target_link_libraries(test_cmake_build PRIVATE pybind11::module)
 set_target_properties(test_cmake_build PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}"
                                                   SUFFIX "${PYTHON_MODULE_EXTENSION}")
 
+# Do not treat includes from IMPORTED target as SYSTEM (Python headers in pybind11::module).
+# This may be needed to resolve header conflicts, e.g. between Python release and debug headers.
+set_target_properties(test_cmake_build PROPERTIES NO_SYSTEM_FROM_IMPORTED ON)
+
 add_custom_target(check ${CMAKE_COMMAND} -E env PYTHONPATH=$<TARGET_FILE_DIR:test_cmake_build>
                   ${PYTHON_EXECUTABLE} ${PROJECT_SOURCE_DIR}/../test.py ${PROJECT_NAME})
diff --git a/pybind11/tests/test_cmake_build/main.cpp b/pybind11/tests/test_cmake_build/main.cpp
index e0f5b69c9..e30f2c4b9 100644
--- a/pybind11/tests/test_cmake_build/main.cpp
+++ b/pybind11/tests/test_cmake_build/main.cpp
@@ -1,10 +1,6 @@
 #include <pybind11/pybind11.h>
 namespace py = pybind11;
 
-PYBIND11_PLUGIN(test_cmake_build) {
-    py::module m("test_cmake_build");
-
+PYBIND11_MODULE(test_cmake_build, m) {
     m.def("add", [](int i, int j) { return i + j; });
-
-    return m.ptr();
 }
diff --git a/pybind11/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt b/pybind11/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt
new file mode 100644
index 000000000..88ba60dd5
--- /dev/null
+++ b/pybind11/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt
@@ -0,0 +1,25 @@
+cmake_minimum_required(VERSION 3.0)
+project(test_subdirectory_embed CXX)
+
+set(PYBIND11_INSTALL ON CACHE BOOL "")
+set(PYBIND11_EXPORT_NAME test_export)
+
+add_subdirectory(${PYBIND11_PROJECT_DIR} pybind11)
+
+# Test basic target functionality
+add_executable(test_cmake_build ../embed.cpp)
+target_link_libraries(test_cmake_build PRIVATE pybind11::embed)
+
+add_custom_target(check $<TARGET_FILE:test_cmake_build> ${PROJECT_SOURCE_DIR}/../test.py)
+
+# Test custom export group -- PYBIND11_EXPORT_NAME
+add_library(test_embed_lib ../embed.cpp)
+target_link_libraries(test_embed_lib PRIVATE pybind11::embed)
+
+install(TARGETS test_embed_lib
+        EXPORT  test_export
+        ARCHIVE DESTINATION lib
+        LIBRARY DESTINATION lib
+        RUNTIME DESTINATION bin)
+install(EXPORT      test_export  FILE test_export-Targets.cmake  # DESTINATION names a directory
+        DESTINATION lib/cmake/test_export)
diff --git a/pybind11/tests/test_constants_and_functions.cpp b/pybind11/tests/test_constants_and_functions.cpp
index 653bdf6b6..8c9ef7f67 100644
--- a/pybind11/tests/test_constants_and_functions.cpp
+++ b/pybind11/tests/test_constants_and_functions.cpp
@@ -23,6 +23,8 @@ std::string test_function3(int i) {
     return "test_function(" + std::to_string(i) + ")";
 }
 
+py::str test_function4()           { return "test_function()"; }
+py::str test_function4(char *)     { return "test_function(char *)"; }
 py::str test_function4(int, float) { return "test_function(int, float)"; }
 py::str test_function4(float, int) { return "test_function(float, int)"; }
 
@@ -61,17 +63,23 @@ struct C {
 }
 
 
-test_initializer constants_and_functions([](py::module &m) {
+TEST_SUBMODULE(constants_and_functions, m) {
+    // test_constants
     m.attr("some_constant") = py::int_(14);
 
+    // test_function_overloading
     m.def("test_function", &test_function1);
     m.def("test_function", &test_function2);
     m.def("test_function", &test_function3);
 
 #if defined(PYBIND11_OVERLOAD_CAST)
+    m.def("test_function", py::overload_cast<>(&test_function4));
+    m.def("test_function", py::overload_cast<char *>(&test_function4));
     m.def("test_function", py::overload_cast<int, float>(&test_function4));
     m.def("test_function", py::overload_cast<float, int>(&test_function4));
 #else
+    m.def("test_function", static_cast<py::str (*)()>(&test_function4));
+    m.def("test_function", static_cast<py::str (*)(char *)>(&test_function4));
     m.def("test_function", static_cast<py::str (*)(int, float)>(&test_function4));
     m.def("test_function", static_cast<py::str (*)(float, int)>(&test_function4));
 #endif
@@ -81,12 +89,13 @@ test_initializer constants_and_functions([](py::module &m) {
         .value("ESecondEntry", ESecondEntry)
         .export_values();
 
+    // test_bytes
     m.def("return_bytes", &return_bytes);
     m.def("print_bytes", &print_bytes);
 
+    // test_exception_specifiers
     using namespace test_exc_sp;
-    py::module m2 = m.def_submodule("exc_sp");
-    py::class_<C>(m2, "C")
+    py::class_<C>(m, "C")
         .def(py::init<>())
         .def("m1", &C::m1)
         .def("m2", &C::m2)
@@ -97,8 +106,8 @@ test_initializer constants_and_functions([](py::module &m) {
         .def("m7", &C::m7)
         .def("m8", &C::m8)
         ;
-    m2.def("f1", f1);
-    m2.def("f2", f2);
-    m2.def("f3", f3);
-    m2.def("f4", f4);
-});
+    m.def("f1", f1);
+    m.def("f2", f2);
+    m.def("f3", f3);
+    m.def("f4", f4);
+}
diff --git a/pybind11/tests/test_constants_and_functions.py b/pybind11/tests/test_constants_and_functions.py
index 2a570d2e5..472682d61 100644
--- a/pybind11/tests/test_constants_and_functions.py
+++ b/pybind11/tests/test_constants_and_functions.py
@@ -1,33 +1,29 @@
+from pybind11_tests import constants_and_functions as m
 
 
 def test_constants():
-    from pybind11_tests import some_constant
-
-    assert some_constant == 14
+    assert m.some_constant == 14
 
 
 def test_function_overloading():
-    from pybind11_tests import MyEnum, test_function
-
-    assert test_function() == "test_function()"
-    assert test_function(7) == "test_function(7)"
-    assert test_function(MyEnum.EFirstEntry) == "test_function(enum=1)"
-    assert test_function(MyEnum.ESecondEntry) == "test_function(enum=2)"
+    assert m.test_function() == "test_function()"
+    assert m.test_function(7) == "test_function(7)"
+    assert m.test_function(m.MyEnum.EFirstEntry) == "test_function(enum=1)"
+    assert m.test_function(m.MyEnum.ESecondEntry) == "test_function(enum=2)"
 
-    assert test_function(1, 1.0) == "test_function(int, float)"
-    assert test_function(2.0, 2) == "test_function(float, int)"
+    assert m.test_function() == "test_function()"
+    assert m.test_function("abcd") == "test_function(char *)"
+    assert m.test_function(1, 1.0) == "test_function(int, float)"
+    assert m.test_function(1, 1.0) == "test_function(int, float)"
+    assert m.test_function(2.0, 2) == "test_function(float, int)"
 
 
 def test_bytes():
-    from pybind11_tests import return_bytes, print_bytes
-
-    assert print_bytes(return_bytes()) == "bytes[1 0 2 0]"
+    assert m.print_bytes(m.return_bytes()) == "bytes[1 0 2 0]"
 
 
 def test_exception_specifiers():
-    from pybind11_tests.exc_sp import C, f1, f2, f3, f4
-
-    c = C()
+    c = m.C()
     assert c.m1(2) == 1
     assert c.m2(3) == 1
     assert c.m3(5) == 2
@@ -37,7 +33,7 @@ def test_exception_specifiers():
     assert c.m7(20) == 13
     assert c.m8(29) == 21
 
-    assert f1(33) == 34
-    assert f2(53) == 55
-    assert f3(86) == 89
-    assert f4(140) == 144
+    assert m.f1(33) == 34
+    assert m.f2(53) == 55
+    assert m.f3(86) == 89
+    assert m.f4(140) == 144
diff --git a/pybind11/tests/test_copy_move.cpp b/pybind11/tests/test_copy_move.cpp
new file mode 100644
index 000000000..94113e3af
--- /dev/null
+++ b/pybind11/tests/test_copy_move.cpp
@@ -0,0 +1,213 @@
+/*
+    tests/test_copy_move.cpp -- 'copy' and 'move' return value policies
+                                and related tests
+
+    Copyright (c) 2016 Ben North <ben@redfrontdoor.org>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+#include <pybind11/stl.h>
+
+template <typename derived>
+struct empty {
+    static const derived& get_one() { return instance_; }
+    static derived instance_;
+};
+
+struct lacking_copy_ctor : public empty<lacking_copy_ctor> {
+    lacking_copy_ctor() {}
+    lacking_copy_ctor(const lacking_copy_ctor& other) = delete;
+};
+
+template <> lacking_copy_ctor empty<lacking_copy_ctor>::instance_ = {};
+
+struct lacking_move_ctor : public empty<lacking_move_ctor> {
+    lacking_move_ctor() {}
+    lacking_move_ctor(const lacking_move_ctor& other) = delete;
+    lacking_move_ctor(lacking_move_ctor&& other) = delete;
+};
+
+template <> lacking_move_ctor empty<lacking_move_ctor>::instance_ = {};
+
+/* Custom type caster move/copy test classes */
+class MoveOnlyInt {
+public:
+    MoveOnlyInt() { print_default_created(this); }
+    MoveOnlyInt(int v) : value{std::move(v)} { print_created(this, value); }
+    MoveOnlyInt(MoveOnlyInt &&m) { print_move_created(this, m.value); std::swap(value, m.value); }
+    MoveOnlyInt &operator=(MoveOnlyInt &&m) { print_move_assigned(this, m.value); std::swap(value, m.value); return *this; }
+    MoveOnlyInt(const MoveOnlyInt &) = delete;
+    MoveOnlyInt &operator=(const MoveOnlyInt &) = delete;
+    ~MoveOnlyInt() { print_destroyed(this); }
+    // Zero-initialised so the std::swap in the move operations never reads an indeterminate int.
+    int value = 0;
+};
+class MoveOrCopyInt {
+public:
+    MoveOrCopyInt() { print_default_created(this); }
+    MoveOrCopyInt(int v) : value{std::move(v)} { print_created(this, value); }
+    MoveOrCopyInt(MoveOrCopyInt &&m) { print_move_created(this, m.value); std::swap(value, m.value); }
+    MoveOrCopyInt &operator=(MoveOrCopyInt &&m) { print_move_assigned(this, m.value); std::swap(value, m.value); return *this; }
+    MoveOrCopyInt(const MoveOrCopyInt &c) { print_copy_created(this, c.value); value = c.value; }
+    MoveOrCopyInt &operator=(const MoveOrCopyInt &c) { print_copy_assigned(this, c.value); value = c.value; return *this; }
+    ~MoveOrCopyInt() { print_destroyed(this); }
+    // Zero-initialised so the std::swap in the move operations never reads an indeterminate int.
+    int value = 0;
+};
+class CopyOnlyInt {
+public:
+    CopyOnlyInt() { print_default_created(this); }
+    CopyOnlyInt(int v) : value{std::move(v)} { print_created(this, value); }
+    CopyOnlyInt(const CopyOnlyInt &c) { print_copy_created(this, c.value); value = c.value; }
+    CopyOnlyInt &operator=(const CopyOnlyInt &c) { print_copy_assigned(this, c.value); value = c.value; return *this; }
+    ~CopyOnlyInt() { print_destroyed(this); }
+    // Zero-initialised so copying a default-constructed instance never reads an indeterminate int.
+    int value = 0;
+};
+NAMESPACE_BEGIN(pybind11)
+NAMESPACE_BEGIN(detail)
+template <> struct type_caster<MoveOnlyInt> {
+    PYBIND11_TYPE_CASTER(MoveOnlyInt, _("MoveOnlyInt"));
+    bool load(handle src, bool) { value = MoveOnlyInt(src.cast<int>()); return true; }
+    static handle cast(const MoveOnlyInt &m, return_value_policy r, handle p) { return pybind11::cast(m.value, r, p); }
+};
+
+template <> struct type_caster<MoveOrCopyInt> {
+    PYBIND11_TYPE_CASTER(MoveOrCopyInt, _("MoveOrCopyInt"));
+    bool load(handle src, bool) { value = MoveOrCopyInt(src.cast<int>()); return true; }
+    static handle cast(const MoveOrCopyInt &m, return_value_policy r, handle p) { return pybind11::cast(m.value, r, p); }
+};
+
+template <> struct type_caster<CopyOnlyInt> {
+protected:
+    CopyOnlyInt value;
+public:
+    static PYBIND11_DESCR name() { return _("CopyOnlyInt"); }
+    bool load(handle src, bool) { value = CopyOnlyInt(src.cast<int>()); return true; }
+    static handle cast(const CopyOnlyInt &m, return_value_policy r, handle p) { return pybind11::cast(m.value, r, p); }
+    static handle cast(const CopyOnlyInt *src, return_value_policy policy, handle parent) {
+        if (!src) return none().release();
+        return cast(*src, policy, parent);
+    }
+    operator CopyOnlyInt*() { return &value; }
+    operator CopyOnlyInt&() { return value; }
+    template <typename T> using cast_op_type = pybind11::detail::cast_op_type<T>;
+};
+NAMESPACE_END(detail)
+NAMESPACE_END(pybind11)
+
+TEST_SUBMODULE(copy_move_policies, m) {
+    // test_lacking_copy_ctor
+    py::class_<lacking_copy_ctor>(m, "lacking_copy_ctor")
+        .def_static("get_one", &lacking_copy_ctor::get_one,
+                    py::return_value_policy::copy);
+    // test_lacking_move_ctor
+    py::class_<lacking_move_ctor>(m, "lacking_move_ctor")
+        .def_static("get_one", &lacking_move_ctor::get_one,
+                    py::return_value_policy::move);
+
+    // test_move_and_copy_casts
+    m.def("move_and_copy_casts", [](py::object o) {
+        int r = 0;
+        r += py::cast<MoveOrCopyInt>(o).value; /* moves */
+        r += py::cast<MoveOnlyInt>(o).value; /* moves */
+        r += py::cast<CopyOnlyInt>(o).value; /* copies */
+        MoveOrCopyInt m1(py::cast<MoveOrCopyInt>(o)); /* moves */
+        MoveOnlyInt m2(py::cast<MoveOnlyInt>(o)); /* moves */
+        CopyOnlyInt m3(py::cast<CopyOnlyInt>(o)); /* copies */
+        r += m1.value + m2.value + m3.value;
+
+        return r;
+    });
+
+    // test_move_and_copy_loads
+    m.def("move_only", [](MoveOnlyInt m) { return m.value; });
+    m.def("move_or_copy", [](MoveOrCopyInt m) { return m.value; });
+    m.def("copy_only", [](CopyOnlyInt m) { return m.value; });
+    m.def("move_pair", [](std::pair<MoveOnlyInt, MoveOrCopyInt> p) {
+        return p.first.value + p.second.value;
+    });
+    m.def("move_tuple", [](std::tuple<MoveOnlyInt, MoveOrCopyInt, MoveOnlyInt> t) {
+        return std::get<0>(t).value + std::get<1>(t).value + std::get<2>(t).value;
+    });
+    m.def("copy_tuple", [](std::tuple<CopyOnlyInt, CopyOnlyInt> t) {
+        return std::get<0>(t).value + std::get<1>(t).value;
+    });
+    m.def("move_copy_nested", [](std::pair<MoveOnlyInt, std::pair<std::tuple<MoveOrCopyInt, CopyOnlyInt, std::tuple<MoveOnlyInt>>, MoveOrCopyInt>> x) {
+        return x.first.value + std::get<0>(x.second.first).value + std::get<1>(x.second.first).value +
+            std::get<0>(std::get<2>(x.second.first)).value + x.second.second.value;
+    });
+    m.def("move_and_copy_cstats", []() {
+        ConstructorStats::gc();
+        // Reset counts to 0 so that previous tests don't affect later ones:
+        auto &mc = ConstructorStats::get<MoveOrCopyInt>();
+        mc.move_assignments = mc.move_constructions = mc.copy_assignments = mc.copy_constructions = 0;
+        auto &mo = ConstructorStats::get<MoveOnlyInt>();
+        mo.move_assignments = mo.move_constructions = mo.copy_assignments = mo.copy_constructions = 0;
+        auto &co = ConstructorStats::get<CopyOnlyInt>();
+        co.move_assignments = co.move_constructions = co.copy_assignments = co.copy_constructions = 0;
+        py::dict d;
+        d["MoveOrCopyInt"] = py::cast(mc, py::return_value_policy::reference);
+        d["MoveOnlyInt"] = py::cast(mo, py::return_value_policy::reference);
+        d["CopyOnlyInt"] = py::cast(co, py::return_value_policy::reference);
+        return d;
+    });
+#ifdef PYBIND11_HAS_OPTIONAL
+    // test_move_and_copy_load_optional
+    m.attr("has_optional") = true;
+    m.def("move_optional", [](std::optional<MoveOnlyInt> o) {
+        return o->value;
+    });
+    m.def("move_or_copy_optional", [](std::optional<MoveOrCopyInt> o) {
+        return o->value;
+    });
+    m.def("copy_optional", [](std::optional<CopyOnlyInt> o) {
+        return o->value;
+    });
+    m.def("move_optional_tuple", [](std::optional<std::tuple<MoveOrCopyInt, MoveOnlyInt, CopyOnlyInt>> x) {
+        return std::get<0>(*x).value + std::get<1>(*x).value + std::get<2>(*x).value;
+    });
+#else
+    m.attr("has_optional") = false;
+#endif
+
+    // #70 compilation issue if operator new is not public
+    struct PrivateOpNew {
+        int value = 1;
+    private:
+#if defined(_MSC_VER)
+#  pragma warning(disable: 4822) // warning C4822: local class member function does not have a body
+#endif
+        void *operator new(size_t bytes);
+    };
+    py::class_<PrivateOpNew>(m, "PrivateOpNew").def_readonly("value", &PrivateOpNew::value);
+    m.def("private_op_new_value", []() { return PrivateOpNew(); });
+    m.def("private_op_new_reference", []() -> const PrivateOpNew & {
+        static PrivateOpNew x{};
+        return x;
+    }, py::return_value_policy::reference);
+
+    // test_move_fallback
+    // #389: rvp::move should fall-through to copy on non-movable objects
+    struct MoveIssue1 {
+        int v;
+        MoveIssue1(int v) : v{v} {}
+        MoveIssue1(const MoveIssue1 &c) = default;
+        MoveIssue1(MoveIssue1 &&) = delete;
+    };
+    py::class_<MoveIssue1>(m, "MoveIssue1").def(py::init<int>()).def_readwrite("value", &MoveIssue1::v);
+
+    struct MoveIssue2 {
+        int v;
+        MoveIssue2(int v) : v{v} {}
+        MoveIssue2(MoveIssue2 &&) = default;
+    };
+    py::class_<MoveIssue2>(m, "MoveIssue2").def(py::init<int>()).def_readwrite("value", &MoveIssue2::v);
+
+    m.def("get_moveissue1", [](int i) { return new MoveIssue1(i); }, py::return_value_policy::move);
+    m.def("get_moveissue2", [](int i) { return MoveIssue2(i); }, py::return_value_policy::move);
+}
diff --git a/pybind11/tests/test_copy_move.py b/pybind11/tests/test_copy_move.py
new file mode 100644
index 000000000..aff2d99f2
--- /dev/null
+++ b/pybind11/tests/test_copy_move.py
@@ -0,0 +1,112 @@
+import pytest
+from pybind11_tests import copy_move_policies as m
+
+
+def test_lacking_copy_ctor():
+    with pytest.raises(RuntimeError) as excinfo:
+        m.lacking_copy_ctor.get_one()
+    assert "the object is non-copyable!" in str(excinfo.value)
+
+
+def test_lacking_move_ctor():
+    with pytest.raises(RuntimeError) as excinfo:
+        m.lacking_move_ctor.get_one()
+    assert "the object is neither movable nor copyable!" in str(excinfo.value)
+
+
+def test_move_and_copy_casts():
+    """Cast some values in C++ via custom type casters and count the number of moves/copies."""
+
+    cstats = m.move_and_copy_cstats()
+    c_m, c_mc, c_c = cstats["MoveOnlyInt"], cstats["MoveOrCopyInt"], cstats["CopyOnlyInt"]
+
+    # The type move constructions/assignments below each get incremented: the move assignment comes
+    # from the type_caster load; the move construction happens when extracting that via a cast or
+    # loading into an argument.
+    assert m.move_and_copy_casts(3) == 18
+    assert c_m.copy_assignments + c_m.copy_constructions == 0
+    assert c_m.move_assignments == 2
+    assert c_m.move_constructions >= 2
+    assert c_mc.alive() == 0
+    assert c_mc.copy_assignments + c_mc.copy_constructions == 0
+    assert c_mc.move_assignments == 2
+    assert c_mc.move_constructions >= 2
+    assert c_c.alive() == 0
+    assert c_c.copy_assignments == 2
+    assert c_c.copy_constructions >= 2
+    assert c_m.alive() + c_mc.alive() + c_c.alive() == 0
+
+
+def test_move_and_copy_loads():
+    """Call some functions that load arguments via custom type casters and count the number of
+    moves/copies."""
+
+    cstats = m.move_and_copy_cstats()
+    c_m, c_mc, c_c = cstats["MoveOnlyInt"], cstats["MoveOrCopyInt"], cstats["CopyOnlyInt"]
+
+    assert m.move_only(10) == 10  # 1 move, c_m
+    assert m.move_or_copy(11) == 11  # 1 move, c_mc
+    assert m.copy_only(12) == 12  # 1 copy, c_c
+    assert m.move_pair((13, 14)) == 27  # 1 c_m move, 1 c_mc move
+    assert m.move_tuple((15, 16, 17)) == 48  # 2 c_m moves, 1 c_mc move
+    assert m.copy_tuple((18, 19)) == 37  # 2 c_c copies
+    # Direct constructions: 2 c_m moves, 2 c_mc moves, 1 c_c copy
+    # Extra moves/copies when moving pairs/tuples: 3 c_m, 3 c_mc, 2 c_c
+    assert m.move_copy_nested((1, ((2, 3, (4,)), 5))) == 15
+
+    assert c_m.copy_assignments + c_m.copy_constructions == 0
+    assert c_m.move_assignments == 6
+    assert c_m.move_constructions == 9
+    assert c_mc.copy_assignments + c_mc.copy_constructions == 0
+    assert c_mc.move_assignments == 5
+    assert c_mc.move_constructions == 8
+    assert c_c.copy_assignments == 4
+    assert c_c.copy_constructions == 6
+    assert c_m.alive() + c_mc.alive() + c_c.alive() == 0
+
+
+@pytest.mark.skipif(not m.has_optional, reason='no <optional>')
+def test_move_and_copy_load_optional():
+    """Tests move/copy loads of std::optional arguments"""
+
+    cstats = m.move_and_copy_cstats()
+    c_m, c_mc, c_c = cstats["MoveOnlyInt"], cstats["MoveOrCopyInt"], cstats["CopyOnlyInt"]
+
+    # The extra move/copy constructions below come from the std::optional move (which has to move
+    # its arguments):
+    assert m.move_optional(10) == 10  # c_m: 1 move assign, 2 move construct
+    assert m.move_or_copy_optional(11) == 11  # c_mc: 1 move assign, 2 move construct
+    assert m.copy_optional(12) == 12  # c_c: 1 copy assign, 2 copy construct
+    # 1 move assign + move construct moves each of c_m, c_mc, 1 c_c copy
+    # +1 move/copy construct each from moving the tuple
+    # +1 move/copy construct each from moving the optional (which moves the tuple again)
+    assert m.move_optional_tuple((3, 4, 5)) == 12
+
+    assert c_m.copy_assignments + c_m.copy_constructions == 0
+    assert c_m.move_assignments == 2
+    assert c_m.move_constructions == 5
+    assert c_mc.copy_assignments + c_mc.copy_constructions == 0
+    assert c_mc.move_assignments == 2
+    assert c_mc.move_constructions == 5
+    assert c_c.copy_assignments == 2
+    assert c_c.copy_constructions == 5
+    assert c_m.alive() + c_mc.alive() + c_c.alive() == 0
+
+
+def test_private_op_new():
+    """An object with a private `operator new` cannot be returned by value"""
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.private_op_new_value()
+    assert "the object is neither movable nor copyable" in str(excinfo.value)
+
+    assert m.private_op_new_reference().value == 1
+
+
+def test_move_fallback():
+    """#389: rvp::move should fall-through to copy on non-movable objects"""
+
+    m2 = m.get_moveissue2(2)
+    assert m2.value == 2
+    m1 = m.get_moveissue1(1)
+    assert m1.value == 1
diff --git a/pybind11/tests/test_copy_move_policies.cpp b/pybind11/tests/test_copy_move_policies.cpp
deleted file mode 100644
index 6f7907c1f..000000000
--- a/pybind11/tests/test_copy_move_policies.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
-    tests/test_copy_move_policies.cpp -- 'copy' and 'move'
-                                         return value policies
-
-    Copyright (c) 2016 Ben North <ben@redfrontdoor.org>
-
-    All rights reserved. Use of this source code is governed by a
-    BSD-style license that can be found in the LICENSE file.
-*/
-
-#include "pybind11_tests.h"
-
-template <typename derived>
-struct empty {
-    static const derived& get_one() { return instance_; }
-    static derived instance_;
-};
-
-struct lacking_copy_ctor : public empty<lacking_copy_ctor> {
-    lacking_copy_ctor() {}
-    lacking_copy_ctor(const lacking_copy_ctor& other) = delete;
-};
-
-template <> lacking_copy_ctor empty<lacking_copy_ctor>::instance_ = {};
-
-struct lacking_move_ctor : public empty<lacking_move_ctor> {
-    lacking_move_ctor() {}
-    lacking_move_ctor(const lacking_move_ctor& other) = delete;
-    lacking_move_ctor(lacking_move_ctor&& other) = delete;
-};
-
-template <> lacking_move_ctor empty<lacking_move_ctor>::instance_ = {};
-
-test_initializer copy_move_policies([](py::module &m) {
-    py::class_<lacking_copy_ctor>(m, "lacking_copy_ctor")
-        .def_static("get_one", &lacking_copy_ctor::get_one,
-                    py::return_value_policy::copy);
-    py::class_<lacking_move_ctor>(m, "lacking_move_ctor")
-        .def_static("get_one", &lacking_move_ctor::get_one,
-                    py::return_value_policy::move);
-});
diff --git a/pybind11/tests/test_copy_move_policies.py b/pybind11/tests/test_copy_move_policies.py
deleted file mode 100644
index edcf38075..000000000
--- a/pybind11/tests/test_copy_move_policies.py
+++ /dev/null
@@ -1,15 +0,0 @@
-import pytest
-
-
-def test_lacking_copy_ctor():
-    from pybind11_tests import lacking_copy_ctor
-    with pytest.raises(RuntimeError) as excinfo:
-        lacking_copy_ctor.get_one()
-    assert "the object is non-copyable!" in str(excinfo.value)
-
-
-def test_lacking_move_ctor():
-    from pybind11_tests import lacking_move_ctor
-    with pytest.raises(RuntimeError) as excinfo:
-        lacking_move_ctor.get_one()
-    assert "the object is neither movable nor copyable!" in str(excinfo.value)
diff --git a/pybind11/tests/test_docstring_options.cpp b/pybind11/tests/test_docstring_options.cpp
index 74178c272..8c8f79fd5 100644
--- a/pybind11/tests/test_docstring_options.cpp
+++ b/pybind11/tests/test_docstring_options.cpp
@@ -9,14 +9,8 @@
 
 #include "pybind11_tests.h"
 
-struct DocstringTestFoo {
-    int value;
-    void setValue(int v) { value = v; }
-    int getValue() const { return value; }
-};
-
-test_initializer docstring_generation([](py::module &m) {
-
+TEST_SUBMODULE(docstring_options, m) {
+    // test_docstring_options
     {
         py::options options;
         options.disable_function_signatures();
@@ -24,6 +18,15 @@ test_initializer docstring_generation([](py::module &m) {
         m.def("test_function1", [](int, int) {}, py::arg("a"), py::arg("b"));
         m.def("test_function2", [](int, int) {}, py::arg("a"), py::arg("b"), "A custom docstring");
 
+        m.def("test_overloaded1", [](int) {}, py::arg("i"), "Overload docstring");
+        m.def("test_overloaded1", [](double) {}, py::arg("d"));
+
+        m.def("test_overloaded2", [](int) {}, py::arg("i"), "overload docstring 1");
+        m.def("test_overloaded2", [](double) {}, py::arg("d"), "overload docstring 2");
+
+        m.def("test_overloaded3", [](int) {}, py::arg("i"));
+        m.def("test_overloaded3", [](double) {}, py::arg("d"), "Overload docstr");
+
         options.enable_function_signatures();
 
         m.def("test_function3", [](int, int) {}, py::arg("a"), py::arg("b"));
@@ -46,8 +49,13 @@ test_initializer docstring_generation([](py::module &m) {
         py::options options;
         options.disable_user_defined_docstrings();
 
+        struct DocstringTestFoo {
+            int value;
+            void setValue(int v) { value = v; }
+            int getValue() const { return value; }
+        };
         py::class_<DocstringTestFoo>(m, "DocstringTestFoo", "This is a class docstring")
             .def_property("value_prop", &DocstringTestFoo::getValue, &DocstringTestFoo::setValue, "This is a property docstring")
         ;
     }
-});
+}
diff --git a/pybind11/tests/test_docstring_options.py b/pybind11/tests/test_docstring_options.py
index 66ad6b89f..0dbca609e 100644
--- a/pybind11/tests/test_docstring_options.py
+++ b/pybind11/tests/test_docstring_options.py
@@ -1,32 +1,38 @@
+from pybind11_tests import docstring_options as m
 
 
 def test_docstring_options():
-    from pybind11_tests import (test_function1, test_function2, test_function3,
-                                test_function4, test_function5, test_function6,
-                                test_function7, DocstringTestFoo)
-
     # options.disable_function_signatures()
-    assert not test_function1.__doc__
+    assert not m.test_function1.__doc__
+
+    assert m.test_function2.__doc__ == "A custom docstring"
+
+    # docstring specified on just the first overload definition:
+    assert m.test_overloaded1.__doc__ == "Overload docstring"
+
+    # docstring on both overloads:
+    assert m.test_overloaded2.__doc__ == "overload docstring 1\noverload docstring 2"
 
-    assert test_function2.__doc__ == "A custom docstring"
+    # docstring on only second overload:
+    assert m.test_overloaded3.__doc__ == "Overload docstr"
 
     # options.enable_function_signatures()
-    assert test_function3.__doc__ .startswith("test_function3(a: int, b: int) -> None")
+    assert m.test_function3.__doc__ .startswith("test_function3(a: int, b: int) -> None")
 
-    assert test_function4.__doc__ .startswith("test_function4(a: int, b: int) -> None")
-    assert test_function4.__doc__ .endswith("A custom docstring\n")
+    assert m.test_function4.__doc__ .startswith("test_function4(a: int, b: int) -> None")
+    assert m.test_function4.__doc__ .endswith("A custom docstring\n")
 
     # options.disable_function_signatures()
     # options.disable_user_defined_docstrings()
-    assert not test_function5.__doc__
+    assert not m.test_function5.__doc__
 
     # nested options.enable_user_defined_docstrings()
-    assert test_function6.__doc__ == "A custom docstring"
+    assert m.test_function6.__doc__ == "A custom docstring"
 
     # RAII destructor
-    assert test_function7.__doc__ .startswith("test_function7(a: int, b: int) -> None")
-    assert test_function7.__doc__ .endswith("A custom docstring\n")
+    assert m.test_function7.__doc__ .startswith("test_function7(a: int, b: int) -> None")
+    assert m.test_function7.__doc__ .endswith("A custom docstring\n")
 
     # Suppression of user-defined docstrings for non-function objects
-    assert not DocstringTestFoo.__doc__
-    assert not DocstringTestFoo.value_prop.__doc__
+    assert not m.DocstringTestFoo.__doc__
+    assert not m.DocstringTestFoo.value_prop.__doc__
diff --git a/pybind11/tests/test_eigen.cpp b/pybind11/tests/test_eigen.cpp
index 588cdceb3..17b156ce4 100644
--- a/pybind11/tests/test_eigen.cpp
+++ b/pybind11/tests/test_eigen.cpp
@@ -8,55 +8,151 @@
 */
 
 #include "pybind11_tests.h"
+#include "constructor_stats.h"
 #include <pybind11/eigen.h>
+#include <pybind11/stl.h>
 #include <Eigen/Cholesky>
 
-Eigen::VectorXf double_col(const Eigen::VectorXf& x)
-{ return 2.0f * x; }
+using MatrixXdR = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
 
-Eigen::RowVectorXf double_row(const Eigen::RowVectorXf& x)
-{ return 2.0f * x; }
 
-Eigen::MatrixXf double_mat_cm(const Eigen::MatrixXf& x)
-{ return 2.0f * x; }
 
-// Different ways of passing via Eigen::Ref; the first and second are the Eigen-recommended
-Eigen::MatrixXd cholesky1(Eigen::Ref<Eigen::MatrixXd> &x) { return x.llt().matrixL(); }
-Eigen::MatrixXd cholesky2(const Eigen::Ref<const Eigen::MatrixXd> &x) { return x.llt().matrixL(); }
-Eigen::MatrixXd cholesky3(const Eigen::Ref<Eigen::MatrixXd> &x) { return x.llt().matrixL(); }
-Eigen::MatrixXd cholesky4(Eigen::Ref<const Eigen::MatrixXd> &x) { return x.llt().matrixL(); }
-Eigen::MatrixXd cholesky5(Eigen::Ref<Eigen::MatrixXd> x) { return x.llt().matrixL(); }
-Eigen::MatrixXd cholesky6(Eigen::Ref<const Eigen::MatrixXd> x) { return x.llt().matrixL(); }
+// Sets/resets a testing reference matrix to have values of 10*r + c, where r and c are the
+// (1-based) row/column number.
+template <typename M> void reset_ref(M &x) {
+    for (int i = 0; i < x.rows(); i++) for (int j = 0; j < x.cols(); j++)
+        x(i, j) = 11 + 10*i + j;
+}
 
-typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXfRowMajor;
-MatrixXfRowMajor double_mat_rm(const MatrixXfRowMajor& x)
-{ return 2.0f * x; }
+// Returns a static, column-major matrix
+Eigen::MatrixXd &get_cm() {
+    static Eigen::MatrixXd *x;
+    if (!x) {
+        x = new Eigen::MatrixXd(3, 3);
+        reset_ref(*x);
+    }
+    return *x;
+}
+// Likewise, but row-major
+MatrixXdR &get_rm() {
+    static MatrixXdR *x;
+    if (!x) {
+        x = new MatrixXdR(3, 3);
+        reset_ref(*x);
+    }
+    return *x;
+}
+// Resets the values of the static matrices returned by get_cm()/get_rm()
+void reset_refs() {
+    reset_ref(get_cm());
+    reset_ref(get_rm());
+}
 
-test_initializer eigen([](py::module &m) {
-    typedef Eigen::Matrix<float, 5, 6, Eigen::RowMajor> FixedMatrixR;
-    typedef Eigen::Matrix<float, 5, 6> FixedMatrixC;
-    typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> DenseMatrixR;
-    typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> DenseMatrixC;
-    typedef Eigen::SparseMatrix<float, Eigen::RowMajor> SparseMatrixR;
-    typedef Eigen::SparseMatrix<float> SparseMatrixC;
+// Returns the element at (zero-based) row 2, column 1 of a matrix (used to test copy/nocopy)
+double get_elem(Eigen::Ref<const Eigen::MatrixXd> m) { return m(2, 1); }
+
+
+// Returns a matrix with 10*r + 100*c added to each matrix element (to help test that the matrix
+// reference is referencing rows/columns correctly).
+template <typename MatrixArgType> Eigen::MatrixXd adjust_matrix(MatrixArgType m) {
+    Eigen::MatrixXd ret(m);
+    for (int c = 0; c < m.cols(); c++) for (int r = 0; r < m.rows(); r++)
+        ret(r, c) += 10*r + 100*c;
+    return ret;
+}
+
+struct CustomOperatorNew {
+    CustomOperatorNew() = default;
+
+    Eigen::Matrix4d a = Eigen::Matrix4d::Zero();
+    Eigen::Matrix4d b = Eigen::Matrix4d::Identity();
+
+    EIGEN_MAKE_ALIGNED_OPERATOR_NEW;
+};
+
+TEST_SUBMODULE(eigen, m) {
+    using FixedMatrixR = Eigen::Matrix<float, 5, 6, Eigen::RowMajor>;
+    using FixedMatrixC = Eigen::Matrix<float, 5, 6>;
+    using DenseMatrixR = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    using DenseMatrixC = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>;
+    using FourRowMatrixC = Eigen::Matrix<float, 4, Eigen::Dynamic>; // column-major (Eigen default)
+    using FourColMatrixC = Eigen::Matrix<float, Eigen::Dynamic, 4>; // column-major (Eigen default)
+    using FourRowMatrixR = Eigen::Matrix<float, 4, Eigen::Dynamic, Eigen::RowMajor>;
+    using FourColMatrixR = Eigen::Matrix<float, Eigen::Dynamic, 4, Eigen::RowMajor>;
+    using SparseMatrixR = Eigen::SparseMatrix<float, Eigen::RowMajor>;
+    using SparseMatrixC = Eigen::SparseMatrix<float>;
 
     m.attr("have_eigen") = true;
 
-    // Non-symmetric matrix with zero elements
-    Eigen::MatrixXf mat(5, 6);
-    mat << 0, 3, 0, 0, 0, 11, 22, 0, 0, 0, 17, 11, 7, 5, 0, 1, 0, 11, 0,
-        0, 0, 0, 0, 11, 0, 0, 14, 0, 8, 11;
-
-    m.def("double_col", &double_col);
-    m.def("double_row", &double_row);
-    m.def("double_mat_cm", &double_mat_cm);
-    m.def("double_mat_rm", &double_mat_rm);
-    m.def("cholesky1", &cholesky1);
-    m.def("cholesky2", &cholesky2);
-    m.def("cholesky3", &cholesky3);
-    m.def("cholesky4", &cholesky4);
-    m.def("cholesky5", &cholesky5);
-    m.def("cholesky6", &cholesky6);
+    // various tests
+    m.def("double_col", [](const Eigen::VectorXf &x) -> Eigen::VectorXf { return 2.0f * x; });
+    m.def("double_row", [](const Eigen::RowVectorXf &x) -> Eigen::RowVectorXf { return 2.0f * x; });
+    m.def("double_complex", [](const Eigen::VectorXcf &x) -> Eigen::VectorXcf { return 2.0f * x; });
+    m.def("double_threec", [](py::EigenDRef<Eigen::Vector3f> x) { x *= 2; });
+    m.def("double_threer", [](py::EigenDRef<Eigen::RowVector3f> x) { x *= 2; });
+    m.def("double_mat_cm", [](Eigen::MatrixXf x) -> Eigen::MatrixXf { return 2.0f * x; });
+    m.def("double_mat_rm", [](DenseMatrixR x) -> DenseMatrixR { return 2.0f * x; });
+
+    // test_eigen_ref_to_python
+    // Different ways of passing via Eigen::Ref; the first and second are the Eigen-recommended
+    m.def("cholesky1", [](Eigen::Ref<MatrixXdR> x) -> Eigen::MatrixXd { return x.llt().matrixL(); });
+    m.def("cholesky2", [](const Eigen::Ref<const MatrixXdR> &x) -> Eigen::MatrixXd { return x.llt().matrixL(); });
+    m.def("cholesky3", [](const Eigen::Ref<MatrixXdR> &x) -> Eigen::MatrixXd { return x.llt().matrixL(); });
+    m.def("cholesky4", [](Eigen::Ref<const MatrixXdR> x) -> Eigen::MatrixXd { return x.llt().matrixL(); });
+
+    // test_eigen_ref_mutators
+    // Mutators: these add some value to the given element using Eigen, but Eigen should be mapping into
+    // the numpy array data and so the result should show up there.  There are three versions: one that
+    // works on a contiguous-row matrix (numpy's default), one for a contiguous-column matrix, and one
+    // for any matrix.
+    auto add_rm = [](Eigen::Ref<MatrixXdR> x, int r, int c, double v) { x(r,c) += v; };
+    auto add_cm = [](Eigen::Ref<Eigen::MatrixXd> x, int r, int c, double v) { x(r,c) += v; };
+
+    // Mutators (Eigen maps into numpy variables):
+    m.def("add_rm", add_rm); // Only takes row-contiguous
+    m.def("add_cm", add_cm); // Only takes column-contiguous
+    // Overloaded versions that will accept either row or column contiguous:
+    m.def("add1", add_rm);
+    m.def("add1", add_cm);
+    m.def("add2", add_cm);
+    m.def("add2", add_rm);
+    // This one accepts a matrix of any stride:
+    m.def("add_any", [](py::EigenDRef<Eigen::MatrixXd> x, int r, int c, double v) { x(r,c) += v; });
+
+    // Return mutable references (numpy maps into eigen variables)
+    m.def("get_cm_ref", []() { return Eigen::Ref<Eigen::MatrixXd>(get_cm()); });
+    m.def("get_rm_ref", []() { return Eigen::Ref<MatrixXdR>(get_rm()); });
+    // The same references, but non-mutable (numpy maps into eigen variables, but is !writeable)
+    m.def("get_cm_const_ref", []() { return Eigen::Ref<const Eigen::MatrixXd>(get_cm()); });
+    m.def("get_rm_const_ref", []() { return Eigen::Ref<const MatrixXdR>(get_rm()); });
+
+    m.def("reset_refs", reset_refs); // Restores get_{cm,rm}_ref to original values
+
+    // Increments and returns ref to (same) matrix
+    m.def("incr_matrix", [](Eigen::Ref<Eigen::MatrixXd> m, double v) {
+        m += Eigen::MatrixXd::Constant(m.rows(), m.cols(), v);
+        return m;
+    }, py::return_value_policy::reference);
+
+    // Same, but accepts a matrix of any strides
+    m.def("incr_matrix_any", [](py::EigenDRef<Eigen::MatrixXd> m, double v) {
+        m += Eigen::MatrixXd::Constant(m.rows(), m.cols(), v);
+        return m;
+    }, py::return_value_policy::reference);
+
+    // Returns an eigen slice of even rows
+    m.def("even_rows", [](py::EigenDRef<Eigen::MatrixXd> m) {
+        return py::EigenDMap<Eigen::MatrixXd>(
+                m.data(), (m.rows() + 1) / 2, m.cols(),
+                py::EigenDStride(m.outerStride(), 2 * m.innerStride()));
+    }, py::return_value_policy::reference);
+
+    // Returns an eigen slice of even columns
+    m.def("even_cols", [](py::EigenDRef<Eigen::MatrixXd> m) {
+        return py::EigenDMap<Eigen::MatrixXd>(
+                m.data(), m.rows(), (m.cols() + 1) / 2,
+                py::EigenDStride(2 * m.outerStride(), m.innerStride()));
+    }, py::return_value_policy::reference);
 
     // Returns diagonals: a vector-like object with an inner stride != 1
     m.def("diagonal", [](const Eigen::Ref<const Eigen::MatrixXd> &x) { return x.diagonal(); });
@@ -68,6 +164,54 @@ test_initializer eigen([](py::module &m) {
         return x.block(start_row, start_col, block_rows, block_cols);
     });
 
+    // test_eigen_return_references, test_eigen_keepalive
+    // return value referencing/copying tests:
+    class ReturnTester {
+        Eigen::MatrixXd mat = create();
+    public:
+        ReturnTester() { print_created(this); }
+        ~ReturnTester() { print_destroyed(this); }
+        static Eigen::MatrixXd create() { return Eigen::MatrixXd::Ones(10, 10); }
+        static const Eigen::MatrixXd createConst() { return Eigen::MatrixXd::Ones(10, 10); }
+        Eigen::MatrixXd &get() { return mat; }
+        Eigen::MatrixXd *getPtr() { return &mat; }
+        const Eigen::MatrixXd &view() { return mat; }
+        const Eigen::MatrixXd *viewPtr() { return &mat; }
+        Eigen::Ref<Eigen::MatrixXd> ref() { return mat; }
+        Eigen::Ref<const Eigen::MatrixXd> refConst() { return mat; }
+        Eigen::Block<Eigen::MatrixXd> block(int r, int c, int nrow, int ncol) { return mat.block(r, c, nrow, ncol); }
+        Eigen::Block<const Eigen::MatrixXd> blockConst(int r, int c, int nrow, int ncol) const { return mat.block(r, c, nrow, ncol); }
+        py::EigenDMap<Eigen::Matrix2d> corners() { return py::EigenDMap<Eigen::Matrix2d>(mat.data(),
+                    py::EigenDStride(mat.outerStride() * (mat.outerSize()-1), mat.innerStride() * (mat.innerSize()-1))); }
+        py::EigenDMap<const Eigen::Matrix2d> cornersConst() const { return py::EigenDMap<const Eigen::Matrix2d>(mat.data(),
+                    py::EigenDStride(mat.outerStride() * (mat.outerSize()-1), mat.innerStride() * (mat.innerSize()-1))); }
+    };
+    using rvp = py::return_value_policy;
+    py::class_<ReturnTester>(m, "ReturnTester")
+        .def(py::init<>())
+        .def_static("create", &ReturnTester::create)
+        .def_static("create_const", &ReturnTester::createConst)
+        .def("get", &ReturnTester::get, rvp::reference_internal)
+        .def("get_ptr", &ReturnTester::getPtr, rvp::reference_internal)
+        .def("view", &ReturnTester::view, rvp::reference_internal)
+        .def("view_ptr", &ReturnTester::viewPtr, rvp::reference_internal) // const-pointer return, mirrors get_ptr
+        .def("copy_get", &ReturnTester::get)   // Default rvp: copy
+        .def("copy_view", &ReturnTester::view) //         "
+        .def("ref", &ReturnTester::ref) // Default for Ref is to reference
+        .def("ref_const", &ReturnTester::refConst) // Likewise, but const
+        .def("ref_safe", &ReturnTester::ref, rvp::reference_internal)
+        .def("ref_const_safe", &ReturnTester::refConst, rvp::reference_internal)
+        .def("copy_ref", &ReturnTester::ref, rvp::copy)
+        .def("copy_ref_const", &ReturnTester::refConst, rvp::copy)
+        .def("block", &ReturnTester::block)
+        .def("block_safe", &ReturnTester::block, rvp::reference_internal)
+        .def("block_const", &ReturnTester::blockConst, rvp::reference_internal)
+        .def("copy_block", &ReturnTester::block, rvp::copy)
+        .def("corners", &ReturnTester::corners, rvp::reference_internal)
+        .def("corners_const", &ReturnTester::cornersConst, rvp::reference_internal)
+        ;
+
+    // test_special_matrix_objects
     // Returns a DiagonalMatrix with diagonal (1,2,3,...)
     m.def("incr_diag", [](int k) {
         Eigen::DiagonalMatrix<int, Eigen::Dynamic> m(k);
@@ -84,51 +228,90 @@ test_initializer eigen([](py::module &m) {
             return m.selfadjointView<Eigen::Upper>();
     });
 
-    m.def("fixed_r", [mat]() -> FixedMatrixR {
-        return FixedMatrixR(mat);
-    });
+    // Test matrix for various functions below.
+    Eigen::MatrixXf mat(5, 6);
+    mat << 0,  3,  0,  0,  0, 11,
+           22, 0,  0,  0, 17, 11,
+           7,  5,  0,  1,  0, 11,
+           0,  0,  0,  0,  0, 11,
+           0,  0, 14,  0,  8, 11;
 
-    m.def("fixed_c", [mat]() -> FixedMatrixC {
-        return FixedMatrixC(mat);
-    });
+    // test_fixed, and various other tests
+    m.def("fixed_r", [mat]() -> FixedMatrixR { return FixedMatrixR(mat); });
+    m.def("fixed_r_const", [mat]() -> const FixedMatrixR { return FixedMatrixR(mat); });
+    m.def("fixed_c", [mat]() -> FixedMatrixC { return FixedMatrixC(mat); });
+    m.def("fixed_copy_r", [](const FixedMatrixR &m) -> FixedMatrixR { return m; });
+    m.def("fixed_copy_c", [](const FixedMatrixC &m) -> FixedMatrixC { return m; });
+    // test_mutator_descriptors
+    m.def("fixed_mutator_r", [](Eigen::Ref<FixedMatrixR>) {});
+    m.def("fixed_mutator_c", [](Eigen::Ref<FixedMatrixC>) {});
+    m.def("fixed_mutator_a", [](py::EigenDRef<FixedMatrixC>) {});
+    // test_dense
+    m.def("dense_r", [mat]() -> DenseMatrixR { return DenseMatrixR(mat); });
+    m.def("dense_c", [mat]() -> DenseMatrixC { return DenseMatrixC(mat); });
+    m.def("dense_copy_r", [](const DenseMatrixR &m) -> DenseMatrixR { return m; });
+    m.def("dense_copy_c", [](const DenseMatrixC &m) -> DenseMatrixC { return m; });
+    // test_sparse, test_sparse_signature
+    m.def("sparse_r", [mat]() -> SparseMatrixR { return Eigen::SparseView<Eigen::MatrixXf>(mat); });
+    m.def("sparse_c", [mat]() -> SparseMatrixC { return Eigen::SparseView<Eigen::MatrixXf>(mat); });
+    m.def("sparse_copy_r", [](const SparseMatrixR &m) -> SparseMatrixR { return m; });
+    m.def("sparse_copy_c", [](const SparseMatrixC &m) -> SparseMatrixC { return m; });
+    // test_partially_fixed
+    m.def("partial_copy_four_rm_r", [](const FourRowMatrixR &m) -> FourRowMatrixR { return m; });
+    m.def("partial_copy_four_rm_c", [](const FourColMatrixR &m) -> FourColMatrixR { return m; });
+    m.def("partial_copy_four_cm_r", [](const FourRowMatrixC &m) -> FourRowMatrixC { return m; });
+    m.def("partial_copy_four_cm_c", [](const FourColMatrixC &m) -> FourColMatrixC { return m; });
 
-    m.def("fixed_passthrough_r", [](const FixedMatrixR &m) -> FixedMatrixR {
-        return m;
-    });
+    // test_cpp_casting
+    // Test that we can cast a numpy object to a Eigen::MatrixXd explicitly
+    m.def("cpp_copy", [](py::handle m) { return m.cast<Eigen::MatrixXd>()(1, 0); });
+    m.def("cpp_ref_c", [](py::handle m) { return m.cast<Eigen::Ref<Eigen::MatrixXd>>()(1, 0); });
+    m.def("cpp_ref_r", [](py::handle m) { return m.cast<Eigen::Ref<MatrixXdR>>()(1, 0); });
+    m.def("cpp_ref_any", [](py::handle m) { return m.cast<py::EigenDRef<Eigen::MatrixXd>>()(1, 0); });
 
-    m.def("fixed_passthrough_c", [](const FixedMatrixC &m) -> FixedMatrixC {
-        return m;
-    });
 
-    m.def("dense_r", [mat]() -> DenseMatrixR {
-        return DenseMatrixR(mat);
-    });
+    // test_nocopy_wrapper
+    // Test that we can prevent copying into an argument that would normally copy: First a version
+    // that would allow copying (if types or strides don't match) for comparison:
+    m.def("get_elem", &get_elem);
+    // Now an alternative that tells pybind to fail rather than copy:
+    m.def("get_elem_nocopy", [](Eigen::Ref<const Eigen::MatrixXd> m) -> double { return get_elem(m); },
+            py::arg().noconvert());
+    // Also test a row-major-only no-copy const ref:
+    m.def("get_elem_rm_nocopy", [](Eigen::Ref<const Eigen::Matrix<long, -1, -1, Eigen::RowMajor>> &m) -> long { return m(2, 1); },
+            py::arg().noconvert());
 
-    m.def("dense_c", [mat]() -> DenseMatrixC {
-        return DenseMatrixC(mat);
-    });
+    // test_issue738
+    // Issue #738: 1xN or Nx1 2D matrices were neither accepted nor properly copied with an
+    // incompatible stride value on the length-1 dimension--but that should be allowed (without
+    // requiring a copy!) because the stride value can be safely ignored on a size-1 dimension.
+    m.def("iss738_f1", &adjust_matrix<const Eigen::Ref<const Eigen::MatrixXd> &>, py::arg().noconvert());
+    m.def("iss738_f2", &adjust_matrix<const Eigen::Ref<const Eigen::Matrix<double, -1, -1, Eigen::RowMajor>> &>, py::arg().noconvert());
 
-    m.def("dense_passthrough_r", [](const DenseMatrixR &m) -> DenseMatrixR {
-        return m;
-    });
+    // test_named_arguments
+    // Make sure named arguments are working properly:
+    m.def("matrix_multiply", [](const py::EigenDRef<const Eigen::MatrixXd> A, const py::EigenDRef<const Eigen::MatrixXd> B)
+            -> Eigen::MatrixXd {
+        if (A.cols() != B.rows()) throw std::domain_error("Nonconformable matrices!");
+        return A * B;
+    }, py::arg("A"), py::arg("B"));
 
-    m.def("dense_passthrough_c", [](const DenseMatrixC &m) -> DenseMatrixC {
-        return m;
-    });
+    // test_custom_operator_new
+    py::class_<CustomOperatorNew>(m, "CustomOperatorNew")
+        .def(py::init<>())
+        .def_readonly("a", &CustomOperatorNew::a)
+        .def_readonly("b", &CustomOperatorNew::b);
 
-    m.def("sparse_r", [mat]() -> SparseMatrixR {
-        return Eigen::SparseView<Eigen::MatrixXf>(mat);
+    // test_eigen_ref_life_support
+    // In case of a failure (the caster's temp array does not live long enough), creating
+    // a new array (np.ones(10)) increases the chances that the temp array will be garbage
+    // collected and/or that its memory will be overwritten with different values.
+    m.def("get_elem_direct", [](Eigen::Ref<const Eigen::VectorXd> v) {
+        py::module::import("numpy").attr("ones")(10);
+        return v(5);
     });
-
-    m.def("sparse_c", [mat]() -> SparseMatrixC {
-        return Eigen::SparseView<Eigen::MatrixXf>(mat);
-    });
-
-    m.def("sparse_passthrough_r", [](const SparseMatrixR &m) -> SparseMatrixR {
-        return m;
-    });
-
-    m.def("sparse_passthrough_c", [](const SparseMatrixC &m) -> SparseMatrixC {
-        return m;
+    m.def("get_elem_indirect", [](std::vector<Eigen::Ref<const Eigen::VectorXd>> v) {
+        py::module::import("numpy").attr("ones")(10);
+        return v[0](5);
     });
-});
+}
diff --git a/pybind11/tests/test_eigen.py b/pybind11/tests/test_eigen.py
index b0092fc8b..4ac8cbf5d 100644
--- a/pybind11/tests/test_eigen.py
+++ b/pybind11/tests/test_eigen.py
@@ -1,9 +1,13 @@
 import pytest
+from pybind11_tests import ConstructorStats
+
+pytestmark = pytest.requires_eigen_and_numpy
 
 with pytest.suppress(ImportError):
+    from pybind11_tests import eigen as m
     import numpy as np
 
-    ref = np.array([[ 0,  3,  0,  0,  0, 11],
+    ref = np.array([[ 0.,  3,  0,  0,  0, 11],
                     [22,  0,  0,  0, 17, 11],
                     [ 7,  5,  0,  1,  0, 11],
                     [ 0,  0,  0,  0,  0, 11],
@@ -18,80 +22,577 @@ def assert_sparse_equal_ref(sparse_mat):
     assert_equal_ref(sparse_mat.todense())
 
 
-@pytest.requires_eigen_and_numpy
 def test_fixed():
-    from pybind11_tests import fixed_r, fixed_c, fixed_passthrough_r, fixed_passthrough_c
-
-    assert_equal_ref(fixed_c())
-    assert_equal_ref(fixed_r())
-    assert_equal_ref(fixed_passthrough_r(fixed_r()))
-    assert_equal_ref(fixed_passthrough_c(fixed_c()))
-    assert_equal_ref(fixed_passthrough_r(fixed_c()))
-    assert_equal_ref(fixed_passthrough_c(fixed_r()))
+    assert_equal_ref(m.fixed_c())
+    assert_equal_ref(m.fixed_r())
+    assert_equal_ref(m.fixed_copy_r(m.fixed_r()))
+    assert_equal_ref(m.fixed_copy_c(m.fixed_c()))
+    assert_equal_ref(m.fixed_copy_r(m.fixed_c()))
+    assert_equal_ref(m.fixed_copy_c(m.fixed_r()))
 
 
-@pytest.requires_eigen_and_numpy
 def test_dense():
-    from pybind11_tests import dense_r, dense_c, dense_passthrough_r, dense_passthrough_c
-
-    assert_equal_ref(dense_r())
-    assert_equal_ref(dense_c())
-    assert_equal_ref(dense_passthrough_r(dense_r()))
-    assert_equal_ref(dense_passthrough_c(dense_c()))
-    assert_equal_ref(dense_passthrough_r(dense_c()))
-    assert_equal_ref(dense_passthrough_c(dense_r()))
+    assert_equal_ref(m.dense_r())
+    assert_equal_ref(m.dense_c())
+    assert_equal_ref(m.dense_copy_r(m.dense_r()))
+    assert_equal_ref(m.dense_copy_c(m.dense_c()))
+    assert_equal_ref(m.dense_copy_r(m.dense_c()))
+    assert_equal_ref(m.dense_copy_c(m.dense_r()))
+
+
+def test_partially_fixed():
+    ref2 = np.array([[0., 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]])
+    np.testing.assert_array_equal(m.partial_copy_four_rm_r(ref2), ref2)
+    np.testing.assert_array_equal(m.partial_copy_four_rm_c(ref2), ref2)
+    np.testing.assert_array_equal(m.partial_copy_four_rm_r(ref2[:, 1]), ref2[:, [1]])
+    np.testing.assert_array_equal(m.partial_copy_four_rm_c(ref2[0, :]), ref2[[0], :])
+    np.testing.assert_array_equal(m.partial_copy_four_rm_r(ref2[:, (0, 2)]), ref2[:, (0, 2)])
+    np.testing.assert_array_equal(
+        m.partial_copy_four_rm_c(ref2[(3, 1, 2), :]), ref2[(3, 1, 2), :])
+
+    np.testing.assert_array_equal(m.partial_copy_four_cm_r(ref2), ref2)
+    np.testing.assert_array_equal(m.partial_copy_four_cm_c(ref2), ref2)
+    np.testing.assert_array_equal(m.partial_copy_four_cm_r(ref2[:, 1]), ref2[:, [1]])
+    np.testing.assert_array_equal(m.partial_copy_four_cm_c(ref2[0, :]), ref2[[0], :])
+    np.testing.assert_array_equal(m.partial_copy_four_cm_r(ref2[:, (0, 2)]), ref2[:, (0, 2)])
+    np.testing.assert_array_equal(
+        m.partial_copy_four_cm_c(ref2[(3, 1, 2), :]), ref2[(3, 1, 2), :])
+
+    # TypeError should be raised for a shape mismatch
+    functions = [m.partial_copy_four_rm_r, m.partial_copy_four_rm_c,
+                 m.partial_copy_four_cm_r, m.partial_copy_four_cm_c]
+    matrix_with_wrong_shape = [[1, 2],
+                               [3, 4]]
+    for f in functions:
+        with pytest.raises(TypeError) as excinfo:
+            f(matrix_with_wrong_shape)
+        assert "incompatible function arguments" in str(excinfo.value)
+
+
+def test_mutator_descriptors():
+    zr = np.arange(30, dtype='float32').reshape(5, 6)  # row-major
+    zc = zr.reshape(6, 5).transpose()  # column-major
+
+    m.fixed_mutator_r(zr)
+    m.fixed_mutator_c(zc)
+    m.fixed_mutator_a(zr)
+    m.fixed_mutator_a(zc)
+    with pytest.raises(TypeError) as excinfo:
+        m.fixed_mutator_r(zc)
+    assert ('(arg0: numpy.ndarray[float32[5, 6], flags.writeable, flags.c_contiguous]) -> None'
+            in str(excinfo.value))
+    with pytest.raises(TypeError) as excinfo:
+        m.fixed_mutator_c(zr)
+    assert ('(arg0: numpy.ndarray[float32[5, 6], flags.writeable, flags.f_contiguous]) -> None'
+            in str(excinfo.value))
+    with pytest.raises(TypeError) as excinfo:
+        m.fixed_mutator_a(np.array([[1, 2], [3, 4]], dtype='float32'))
+    assert ('(arg0: numpy.ndarray[float32[5, 6], flags.writeable]) -> None'
+            in str(excinfo.value))
+    zr.flags.writeable = False
+    with pytest.raises(TypeError):
+        m.fixed_mutator_r(zr)
+    with pytest.raises(TypeError):
+        m.fixed_mutator_a(zr)
+
+
+def test_cpp_casting():
+    assert m.cpp_copy(m.fixed_r()) == 22.
+    assert m.cpp_copy(m.fixed_c()) == 22.
+    z = np.array([[5., 6], [7, 8]])
+    assert m.cpp_copy(z) == 7.
+    assert m.cpp_copy(m.get_cm_ref()) == 21.
+    assert m.cpp_copy(m.get_rm_ref()) == 21.
+    assert m.cpp_ref_c(m.get_cm_ref()) == 21.
+    assert m.cpp_ref_r(m.get_rm_ref()) == 21.
+    with pytest.raises(RuntimeError) as excinfo:
+        # Can't reference m.fixed_c: it contains floats, m.cpp_ref_any wants doubles
+        m.cpp_ref_any(m.fixed_c())
+    assert 'Unable to cast Python instance' in str(excinfo.value)
+    with pytest.raises(RuntimeError) as excinfo:
+        # Can't reference m.fixed_r: it contains floats, m.cpp_ref_any wants doubles
+        m.cpp_ref_any(m.fixed_r())
+    assert 'Unable to cast Python instance' in str(excinfo.value)
+    assert m.cpp_ref_any(m.ReturnTester.create()) == 1.
+
+    assert m.cpp_ref_any(m.get_cm_ref()) == 21.
+    assert m.cpp_ref_any(m.get_rm_ref()) == 21.  # also cover the row-major reference
+
+
+def test_pass_readonly_array():
+    z = np.full((5, 6), 42.0)
+    z.flags.writeable = False
+    np.testing.assert_array_equal(z, m.fixed_copy_r(z))
+    np.testing.assert_array_equal(m.fixed_r_const(), m.fixed_r())
+    assert not m.fixed_r_const().flags.writeable
+    np.testing.assert_array_equal(m.fixed_copy_r(m.fixed_r_const()), m.fixed_r_const())
 
 
-@pytest.requires_eigen_and_numpy
 def test_nonunit_stride_from_python():
-    from pybind11_tests import double_row, double_col, double_mat_cm, double_mat_rm
+    counting_mat = np.arange(9.0, dtype=np.float32).reshape((3, 3))
+    second_row = counting_mat[1, :]
+    second_col = counting_mat[:, 1]
+    np.testing.assert_array_equal(m.double_row(second_row), 2.0 * second_row)
+    np.testing.assert_array_equal(m.double_col(second_row), 2.0 * second_row)
+    np.testing.assert_array_equal(m.double_complex(second_row), 2.0 * second_row)
+    np.testing.assert_array_equal(m.double_row(second_col), 2.0 * second_col)
+    np.testing.assert_array_equal(m.double_col(second_col), 2.0 * second_col)
+    np.testing.assert_array_equal(m.double_complex(second_col), 2.0 * second_col)
+
+    counting_3d = np.arange(27.0, dtype=np.float32).reshape((3, 3, 3))
+    slices = [counting_3d[0, :, :], counting_3d[:, 0, :], counting_3d[:, :, 0]]
+    for slice_idx, ref_mat in enumerate(slices):
+        np.testing.assert_array_equal(m.double_mat_cm(ref_mat), 2.0 * ref_mat)
+        np.testing.assert_array_equal(m.double_mat_rm(ref_mat), 2.0 * ref_mat)
+
+    # Mutator:
+    m.double_threer(second_row)
+    m.double_threec(second_col)
+    np.testing.assert_array_equal(counting_mat, [[0., 2, 2], [6, 16, 10], [6, 14, 8]])
+
+
+def test_negative_stride_from_python(msg):
+    """Eigen doesn't support (as of yet) negative strides. When a function takes an Eigen matrix by
+    copy or const reference, we can pass a numpy array that has negative strides.  Otherwise, an
+    exception will be thrown as Eigen will not be able to map the numpy array."""
 
     counting_mat = np.arange(9.0, dtype=np.float32).reshape((3, 3))
-    first_row = counting_mat[0, :]
-    first_col = counting_mat[:, 0]
-    assert np.array_equal(double_row(first_row), 2.0 * first_row)
-    assert np.array_equal(double_col(first_row), 2.0 * first_row)
-    assert np.array_equal(double_row(first_col), 2.0 * first_col)
-    assert np.array_equal(double_col(first_col), 2.0 * first_col)
+    counting_mat = counting_mat[::-1, ::-1]
+    second_row = counting_mat[1, :]
+    second_col = counting_mat[:, 1]
+    np.testing.assert_array_equal(m.double_row(second_row), 2.0 * second_row)
+    np.testing.assert_array_equal(m.double_col(second_row), 2.0 * second_row)
+    np.testing.assert_array_equal(m.double_complex(second_row), 2.0 * second_row)
+    np.testing.assert_array_equal(m.double_row(second_col), 2.0 * second_col)
+    np.testing.assert_array_equal(m.double_col(second_col), 2.0 * second_col)
+    np.testing.assert_array_equal(m.double_complex(second_col), 2.0 * second_col)
 
     counting_3d = np.arange(27.0, dtype=np.float32).reshape((3, 3, 3))
+    counting_3d = counting_3d[::-1, ::-1, ::-1]
     slices = [counting_3d[0, :, :], counting_3d[:, 0, :], counting_3d[:, :, 0]]
     for slice_idx, ref_mat in enumerate(slices):
-        assert np.array_equal(double_mat_cm(ref_mat), 2.0 * ref_mat)
-        assert np.array_equal(double_mat_rm(ref_mat), 2.0 * ref_mat)
+        np.testing.assert_array_equal(m.double_mat_cm(ref_mat), 2.0 * ref_mat)
+        np.testing.assert_array_equal(m.double_mat_rm(ref_mat), 2.0 * ref_mat)
 
+    # Mutator:
+    with pytest.raises(TypeError) as excinfo:
+        m.double_threer(second_row)
+    assert msg(excinfo.value) == """
+        double_threer(): incompatible function arguments. The following argument types are supported:
+            1. (arg0: numpy.ndarray[float32[1, 3], flags.writeable]) -> None
 
-@pytest.requires_eigen_and_numpy
-def test_nonunit_stride_to_python():
-    from pybind11_tests import diagonal, diagonal_1, diagonal_n, block
+        Invoked with: array([ 5.,  4.,  3.], dtype=float32)
+    """  # noqa: E501 line too long
+
+    with pytest.raises(TypeError) as excinfo:
+        m.double_threec(second_col)
+    assert msg(excinfo.value) == """
+        double_threec(): incompatible function arguments. The following argument types are supported:
+            1. (arg0: numpy.ndarray[float32[3, 1], flags.writeable]) -> None
+
+        Invoked with: array([ 7.,  4.,  1.], dtype=float32)
+    """  # noqa: E501 line too long
 
-    assert np.all(diagonal(ref) == ref.diagonal())
-    assert np.all(diagonal_1(ref) == ref.diagonal(1))
+
+def test_nonunit_stride_to_python():
+    assert np.all(m.diagonal(ref) == ref.diagonal())
+    assert np.all(m.diagonal_1(ref) == ref.diagonal(1))
     for i in range(-5, 7):
-        assert np.all(diagonal_n(ref, i) == ref.diagonal(i)), "diagonal_n({})".format(i)
+        assert np.all(m.diagonal_n(ref, i) == ref.diagonal(i)), "m.diagonal_n({})".format(i)
 
-    assert np.all(block(ref, 2, 1, 3, 3) == ref[2:5, 1:4])
-    assert np.all(block(ref, 1, 4, 4, 2) == ref[1:, 4:])
-    assert np.all(block(ref, 1, 4, 3, 2) == ref[1:4, 4:])
+    assert np.all(m.block(ref, 2, 1, 3, 3) == ref[2:5, 1:4])
+    assert np.all(m.block(ref, 1, 4, 4, 2) == ref[1:, 4:])
+    assert np.all(m.block(ref, 1, 4, 3, 2) == ref[1:4, 4:])
 
 
-@pytest.requires_eigen_and_numpy
 def test_eigen_ref_to_python():
-    from pybind11_tests import cholesky1, cholesky2, cholesky3, cholesky4, cholesky5, cholesky6
-
-    chols = [cholesky1, cholesky2, cholesky3, cholesky4, cholesky5, cholesky6]
+    chols = [m.cholesky1, m.cholesky2, m.cholesky3, m.cholesky4]
     for i, chol in enumerate(chols, start=1):
-        mymat = chol(np.array([[1, 2, 4], [2, 13, 23], [4, 23, 77]]))
+        mymat = chol(np.array([[1., 2, 4], [2, 13, 23], [4, 23, 77]]))
         assert np.all(mymat == np.array([[1, 0, 0], [2, 3, 0], [4, 5, 6]])), "cholesky{}".format(i)
 
 
-@pytest.requires_eigen_and_numpy
-def test_special_matrix_objects():
-    from pybind11_tests import incr_diag, symmetric_upper, symmetric_lower
+def assign_both(a1, a2, r, c, v):
+    a1[r, c] = v
+    a2[r, c] = v
+
+
+def array_copy_but_one(a, r, c, v):
+    z = np.array(a, copy=True)
+    z[r, c] = v
+    return z
+
+
+def test_eigen_return_references():
+    """Tests various ways of returning references and non-referencing copies"""
+
+    master = np.ones((10, 10))
+    a = m.ReturnTester()
+    a_get1 = a.get()
+    assert not a_get1.flags.owndata and a_get1.flags.writeable
+    assign_both(a_get1, master, 3, 3, 5)
+    a_get2 = a.get_ptr()
+    assert not a_get2.flags.owndata and a_get2.flags.writeable
+    assign_both(a_get2, master, 2, 3, 6)  # write through the get_ptr() view, not a_get1 again
+
+    a_view1 = a.view()
+    assert not a_view1.flags.owndata and not a_view1.flags.writeable
+    with pytest.raises(ValueError):
+        a_view1[2, 3] = 4
+    a_view2 = a.view_ptr()
+    assert not a_view2.flags.owndata and not a_view2.flags.writeable
+    with pytest.raises(ValueError):
+        a_view2[2, 3] = 4
+
+    a_copy1 = a.copy_get()
+    assert a_copy1.flags.owndata and a_copy1.flags.writeable
+    np.testing.assert_array_equal(a_copy1, master)
+    a_copy1[7, 7] = -44  # Shouldn't affect anything else
+    c1want = array_copy_but_one(master, 7, 7, -44)
+    a_copy2 = a.copy_view()
+    assert a_copy2.flags.owndata and a_copy2.flags.writeable
+    np.testing.assert_array_equal(a_copy2, master)
+    a_copy2[4, 4] = -22  # Shouldn't affect anything else
+    c2want = array_copy_but_one(master, 4, 4, -22)
+
+    a_ref1 = a.ref()
+    assert not a_ref1.flags.owndata and a_ref1.flags.writeable
+    assign_both(a_ref1, master, 1, 1, 15)
+    a_ref2 = a.ref_const()
+    assert not a_ref2.flags.owndata and not a_ref2.flags.writeable
+    with pytest.raises(ValueError):
+        a_ref2[5, 5] = 33
+    a_ref3 = a.ref_safe()
+    assert not a_ref3.flags.owndata and a_ref3.flags.writeable
+    assign_both(a_ref3, master, 0, 7, 99)
+    a_ref4 = a.ref_const_safe()
+    assert not a_ref4.flags.owndata and not a_ref4.flags.writeable
+    with pytest.raises(ValueError):
+        a_ref4[7, 0] = 987654321
+
+    a_copy3 = a.copy_ref()
+    assert a_copy3.flags.owndata and a_copy3.flags.writeable
+    np.testing.assert_array_equal(a_copy3, master)
+    a_copy3[8, 1] = 11
+    c3want = array_copy_but_one(master, 8, 1, 11)
+    a_copy4 = a.copy_ref_const()
+    assert a_copy4.flags.owndata and a_copy4.flags.writeable
+    np.testing.assert_array_equal(a_copy4, master)
+    a_copy4[8, 4] = 88
+    c4want = array_copy_but_one(master, 8, 4, 88)
+
+    a_block1 = a.block(3, 3, 2, 2)
+    assert not a_block1.flags.owndata and a_block1.flags.writeable
+    a_block1[0, 0] = 55
+    master[3, 3] = 55
+    a_block2 = a.block_safe(2, 2, 3, 2)
+    assert not a_block2.flags.owndata and a_block2.flags.writeable
+    a_block2[2, 1] = -123
+    master[4, 3] = -123
+    a_block3 = a.block_const(6, 7, 4, 3)
+    assert not a_block3.flags.owndata and not a_block3.flags.writeable
+    with pytest.raises(ValueError):
+        a_block3[2, 2] = -44444
+
+    a_copy5 = a.copy_block(2, 2, 2, 3)
+    assert a_copy5.flags.owndata and a_copy5.flags.writeable
+    np.testing.assert_array_equal(a_copy5, master[2:4, 2:5])
+    a_copy5[1, 1] = 777
+    c5want = array_copy_but_one(master[2:4, 2:5], 1, 1, 777)
+
+    a_corn1 = a.corners()
+    assert not a_corn1.flags.owndata and a_corn1.flags.writeable
+    a_corn1 *= 50
+    a_corn1[1, 1] = 999
+    master[0, 0] = 50
+    master[0, 9] = 50
+    master[9, 0] = 50
+    master[9, 9] = 999
+    a_corn2 = a.corners_const()
+    assert not a_corn2.flags.owndata and not a_corn2.flags.writeable
+    with pytest.raises(ValueError):
+        a_corn2[1, 0] = 51
+
+    # All of the changes made all the way along should be visible everywhere
+    # now (except for the copies, of course)
+    np.testing.assert_array_equal(a_get1, master)
+    np.testing.assert_array_equal(a_get2, master)
+    np.testing.assert_array_equal(a_view1, master)
+    np.testing.assert_array_equal(a_view2, master)
+    np.testing.assert_array_equal(a_ref1, master)
+    np.testing.assert_array_equal(a_ref2, master)
+    np.testing.assert_array_equal(a_ref3, master)
+    np.testing.assert_array_equal(a_ref4, master)
+    np.testing.assert_array_equal(a_block1, master[3:5, 3:5])
+    np.testing.assert_array_equal(a_block2, master[2:5, 2:4])
+    np.testing.assert_array_equal(a_block3, master[6:10, 7:10])
+    np.testing.assert_array_equal(a_corn1, master[0::master.shape[0] - 1, 0::master.shape[1] - 1])
+    np.testing.assert_array_equal(a_corn2, master[0::master.shape[0] - 1, 0::master.shape[1] - 1])
+
+    np.testing.assert_array_equal(a_copy1, c1want)
+    np.testing.assert_array_equal(a_copy2, c2want)
+    np.testing.assert_array_equal(a_copy3, c3want)
+    np.testing.assert_array_equal(a_copy4, c4want)
+    np.testing.assert_array_equal(a_copy5, c5want)
+
+
+def assert_keeps_alive(cl, method, *args):
+    cstats = ConstructorStats.get(cl)
+    start_with = cstats.alive()
+    a = cl()
+    assert cstats.alive() == start_with + 1
+    z = method(a, *args)
+    assert cstats.alive() == start_with + 1
+    del a
+    # Here's the keep alive in action:
+    assert cstats.alive() == start_with + 1
+    del z
+    # Keep alive should have expired:
+    assert cstats.alive() == start_with
+
+
+def test_eigen_keepalive():
+    a = m.ReturnTester()
+    cstats = ConstructorStats.get(m.ReturnTester)
+    assert cstats.alive() == 1
+    unsafe = [a.ref(), a.ref_const(), a.block(1, 2, 3, 4)]
+    copies = [a.copy_get(), a.copy_view(), a.copy_ref(), a.copy_ref_const(),
+              a.copy_block(4, 3, 2, 1)]
+    del a
+    assert cstats.alive() == 0
+    del unsafe
+    del copies
+
+    for meth in [m.ReturnTester.get, m.ReturnTester.get_ptr, m.ReturnTester.view,
+                 m.ReturnTester.view_ptr, m.ReturnTester.ref_safe, m.ReturnTester.ref_const_safe,
+                 m.ReturnTester.corners, m.ReturnTester.corners_const]:
+        assert_keeps_alive(m.ReturnTester, meth)
+
+    for meth in [m.ReturnTester.block_safe, m.ReturnTester.block_const]:
+        assert_keeps_alive(m.ReturnTester, meth, 4, 3, 2, 1)
+
+
+def test_eigen_ref_mutators():
+    """Tests Eigen's ability to mutate numpy values"""
+
+    orig = np.array([[1., 2, 3], [4, 5, 6], [7, 8, 9]])
+    zr = np.array(orig)
+    zc = np.array(orig, order='F')
+    m.add_rm(zr, 1, 0, 100)
+    assert np.all(zr == np.array([[1., 2, 3], [104, 5, 6], [7, 8, 9]]))
+    m.add_cm(zc, 1, 0, 200)
+    assert np.all(zc == np.array([[1., 2, 3], [204, 5, 6], [7, 8, 9]]))
+
+    m.add_any(zr, 1, 0, 20)
+    assert np.all(zr == np.array([[1., 2, 3], [124, 5, 6], [7, 8, 9]]))
+    m.add_any(zc, 1, 0, 10)
+    assert np.all(zc == np.array([[1., 2, 3], [214, 5, 6], [7, 8, 9]]))
+
+    # Can't reference a col-major array with a row-major Ref, and vice versa:
+    with pytest.raises(TypeError):
+        m.add_rm(zc, 1, 0, 1)
+    with pytest.raises(TypeError):
+        m.add_cm(zr, 1, 0, 1)
+
+    # Overloads:
+    m.add1(zr, 1, 0, -100)
+    m.add2(zr, 1, 0, -20)
+    assert np.all(zr == orig)
+    m.add1(zc, 1, 0, -200)
+    m.add2(zc, 1, 0, -10)
+    assert np.all(zc == orig)
+
+    # a non-contiguous slice (this won't work on either the row- or
+    # column-contiguous refs, but should work for the any)
+    cornersr = zr[0::2, 0::2]
+    cornersc = zc[0::2, 0::2]
+
+    assert np.all(cornersr == np.array([[1., 3], [7, 9]]))
+    assert np.all(cornersc == np.array([[1., 3], [7, 9]]))
+
+    with pytest.raises(TypeError):
+        m.add_rm(cornersr, 0, 1, 25)
+    with pytest.raises(TypeError):
+        m.add_cm(cornersr, 0, 1, 25)
+    with pytest.raises(TypeError):
+        m.add_rm(cornersc, 0, 1, 25)
+    with pytest.raises(TypeError):
+        m.add_cm(cornersc, 0, 1, 25)
+    m.add_any(cornersr, 0, 1, 25)
+    m.add_any(cornersc, 0, 1, 44)
+    assert np.all(zr == np.array([[1., 2, 28], [4, 5, 6], [7, 8, 9]]))
+    assert np.all(zc == np.array([[1., 2, 47], [4, 5, 6], [7, 8, 9]]))
+
+    # You shouldn't be allowed to pass a non-writeable array to a mutating Eigen method:
+    zro = zr[0:4, 0:4]
+    zro.flags.writeable = False
+    with pytest.raises(TypeError):
+        m.add_rm(zro, 0, 0, 0)
+    with pytest.raises(TypeError):
+        m.add_any(zro, 0, 0, 0)
+    with pytest.raises(TypeError):
+        m.add1(zro, 0, 0, 0)
+    with pytest.raises(TypeError):
+        m.add2(zro, 0, 0, 0)
+
+    # integer array shouldn't be passable to a double-matrix-accepting mutating func:
+    zi = np.array([[1, 2], [3, 4]])
+    with pytest.raises(TypeError):
+        m.add_rm(zi, 0, 0, 0)  # full argument list, so only the dtype can be rejected
+
+
+def test_numpy_ref_mutators():
+    """Tests numpy mutating Eigen matrices (for returned Eigen::Ref<...>s)"""
+
+    m.reset_refs()  # In case another test already changed it
+
+    zc = m.get_cm_ref()
+    zcro = m.get_cm_const_ref()
+    zr = m.get_rm_ref()
+    zrro = m.get_rm_const_ref()
+
+    assert [zc[1, 2], zcro[1, 2], zr[1, 2], zrro[1, 2]] == [23] * 4
+
+    assert not zc.flags.owndata and zc.flags.writeable
+    assert not zr.flags.owndata and zr.flags.writeable
+    assert not zcro.flags.owndata and not zcro.flags.writeable
+    assert not zrro.flags.owndata and not zrro.flags.writeable
+
+    zc[1, 2] = 99
+    expect = np.array([[11., 12, 13], [21, 22, 99], [31, 32, 33]])
+    # We should have just changed zc, of course, but also zcro and the original eigen matrix
+    assert np.all(zc == expect)
+    assert np.all(zcro == expect)
+    assert np.all(m.get_cm_ref() == expect)
+
+    zr[1, 2] = 99
+    assert np.all(zr == expect)
+    assert np.all(zrro == expect)
+    assert np.all(m.get_rm_ref() == expect)
+
+    # Make sure the readonly ones are numpy-readonly:
+    with pytest.raises(ValueError):
+        zcro[1, 2] = 6
+    with pytest.raises(ValueError):
+        zrro[1, 2] = 6
+
+    # We should be able to explicitly copy like this (and since we're copying,
+    # the const should drop away)
+    y1 = np.array(m.get_cm_const_ref())
+
+    assert y1.flags.owndata and y1.flags.writeable
+    # We should get copies of the eigen data, which was modified above:
+    assert y1[1, 2] == 99
+    y1[1, 2] += 12
+    assert y1[1, 2] == 111
+    assert zc[1, 2] == 99  # Make sure we aren't referencing the original
+
+
+def test_both_ref_mutators():
+    """Tests a complex chain of nested eigen/numpy references"""
+
+    m.reset_refs()  # In case another test already changed it
+
+    z = m.get_cm_ref()  # numpy -> eigen
+    z[0, 2] -= 3
+    z2 = m.incr_matrix(z, 1)  # numpy -> eigen -> numpy -> eigen
+    z2[1, 1] += 6
+    z3 = m.incr_matrix(z, 2)  # (numpy -> eigen)^3
+    z3[2, 2] += -5
+    z4 = m.incr_matrix(z, 3)  # (numpy -> eigen)^4
+    z4[1, 1] -= 1
+    z5 = m.incr_matrix(z, 4)  # (numpy -> eigen)^5
+    z5[0, 0] = 0
+    assert np.all(z == z2)
+    assert np.all(z == z3)
+    assert np.all(z == z4)
+    assert np.all(z == z5)
+    expect = np.array([[0., 22, 20], [31, 37, 33], [41, 42, 38]])
+    assert np.all(z == expect)
+
+    y = np.array(range(100), dtype='float64').reshape(10, 10)
+    y2 = m.incr_matrix_any(y, 10)  # np -> eigen -> np
+    y3 = m.incr_matrix_any(y2[0::2, 0::2], -33)  # np -> eigen -> np slice -> np -> eigen -> np
+    y4 = m.even_rows(y3)  # numpy -> eigen slice -> (... y3)
+    y5 = m.even_cols(y4)  # numpy -> eigen slice -> (... y4)
+    y6 = m.incr_matrix_any(y5, 1000)  # numpy -> eigen -> (... y5)
+
+    # Apply same mutations using just numpy:
+    yexpect = np.array(range(100), dtype='float64').reshape(10, 10)
+    yexpect += 10
+    yexpect[0::2, 0::2] -= 33
+    yexpect[0::4, 0::4] += 1000
+    assert np.all(y6 == yexpect[0::4, 0::4])
+    assert np.all(y5 == yexpect[0::4, 0::4])
+    assert np.all(y4 == yexpect[0::4, 0::2])
+    assert np.all(y3 == yexpect[0::2, 0::2])
+    assert np.all(y2 == yexpect)
+    assert np.all(y == yexpect)
+
+
+def test_nocopy_wrapper():
+    # get_elem requires a column-contiguous matrix reference, but should be
+    # callable with other types of matrix (via copying):
+    int_matrix_colmajor = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], order='F')
+    dbl_matrix_colmajor = np.array(int_matrix_colmajor, dtype='double', order='F', copy=True)
+    int_matrix_rowmajor = np.array(int_matrix_colmajor, order='C', copy=True)
+    dbl_matrix_rowmajor = np.array(int_matrix_rowmajor, dtype='double', order='C', copy=True)
+
+    # All should be callable via get_elem:
+    assert m.get_elem(int_matrix_colmajor) == 8
+    assert m.get_elem(dbl_matrix_colmajor) == 8
+    assert m.get_elem(int_matrix_rowmajor) == 8
+    assert m.get_elem(dbl_matrix_rowmajor) == 8
+
+    # All but the second should fail with m.get_elem_nocopy:
+    with pytest.raises(TypeError) as excinfo:
+        m.get_elem_nocopy(int_matrix_colmajor)
+    assert ('get_elem_nocopy(): incompatible function arguments.' in str(excinfo.value) and
+            ', flags.f_contiguous' in str(excinfo.value))
+    assert m.get_elem_nocopy(dbl_matrix_colmajor) == 8
+    with pytest.raises(TypeError) as excinfo:
+        m.get_elem_nocopy(int_matrix_rowmajor)
+    assert ('get_elem_nocopy(): incompatible function arguments.' in str(excinfo.value) and
+            ', flags.f_contiguous' in str(excinfo.value))
+    with pytest.raises(TypeError) as excinfo:
+        m.get_elem_nocopy(dbl_matrix_rowmajor)
+    assert ('get_elem_nocopy(): incompatible function arguments.' in str(excinfo.value) and
+            ', flags.f_contiguous' in str(excinfo.value))
+
+    # For the row-major test, we take a long matrix in row-major, so only the third is allowed:
+    with pytest.raises(TypeError) as excinfo:
+        m.get_elem_rm_nocopy(int_matrix_colmajor)
+    assert ('get_elem_rm_nocopy(): incompatible function arguments.' in str(excinfo.value) and
+            ', flags.c_contiguous' in str(excinfo.value))
+    with pytest.raises(TypeError) as excinfo:
+        m.get_elem_rm_nocopy(dbl_matrix_colmajor)
+    assert ('get_elem_rm_nocopy(): incompatible function arguments.' in str(excinfo.value) and
+            ', flags.c_contiguous' in str(excinfo.value))
+    assert m.get_elem_rm_nocopy(int_matrix_rowmajor) == 8
+    with pytest.raises(TypeError) as excinfo:
+        m.get_elem_rm_nocopy(dbl_matrix_rowmajor)
+    assert ('get_elem_rm_nocopy(): incompatible function arguments.' in str(excinfo.value) and
+            ', flags.c_contiguous' in str(excinfo.value))
+
+
+def test_eigen_ref_life_support():
+    """Ensure the lifetime of temporary arrays created by the `Ref` caster
+
+    The `Ref` caster sometimes creates a copy which needs to stay alive. This needs to
+    happen both for direct casts (just the array) or indirectly (e.g. list of arrays).
+    """
 
-    assert np.all(incr_diag(7) == np.diag([1, 2, 3, 4, 5, 6, 7]))
+    a = np.full(shape=10, fill_value=8, dtype=np.int8)
+    assert m.get_elem_direct(a) == 8
 
-    asymm = np.array([[ 1,  2,  3,  4],
+    list_of_a = [a]
+    assert m.get_elem_indirect(list_of_a) == 8
+
+
+def test_special_matrix_objects():
+    assert np.all(m.incr_diag(7) == np.diag([1., 2, 3, 4, 5, 6, 7]))
+
+    asymm = np.array([[ 1.,  2,  3,  4],
                       [ 5,  6,  7,  8],
                       [ 9, 10, 11, 12],
                       [13, 14, 15, 16]])
@@ -102,44 +603,79 @@ def test_special_matrix_objects():
             symm_lower[i, j] = symm_lower[j, i]
             symm_upper[j, i] = symm_upper[i, j]
 
-    assert np.all(symmetric_lower(asymm) == symm_lower)
-    assert np.all(symmetric_upper(asymm) == symm_upper)
+    assert np.all(m.symmetric_lower(asymm) == symm_lower)
+    assert np.all(m.symmetric_upper(asymm) == symm_upper)
 
 
-@pytest.requires_eigen_and_numpy
 def test_dense_signature(doc):
-    from pybind11_tests import double_col, double_row, double_mat_rm
-
-    assert doc(double_col) == """
+    assert doc(m.double_col) == """
         double_col(arg0: numpy.ndarray[float32[m, 1]]) -> numpy.ndarray[float32[m, 1]]
     """
-    assert doc(double_row) == """
+    assert doc(m.double_row) == """
         double_row(arg0: numpy.ndarray[float32[1, n]]) -> numpy.ndarray[float32[1, n]]
     """
-    assert doc(double_mat_rm) == """
+    assert doc(m.double_complex) == """
+        double_complex(arg0: numpy.ndarray[complex64[m, 1]]) -> numpy.ndarray[complex64[m, 1]]
+    """
+    assert doc(m.double_mat_rm) == """
         double_mat_rm(arg0: numpy.ndarray[float32[m, n]]) -> numpy.ndarray[float32[m, n]]
     """
 
 
+def test_named_arguments():
+    a = np.array([[1.0, 2], [3, 4], [5, 6]])
+    b = np.ones((2, 1))
+
+    assert np.all(m.matrix_multiply(a, b) == np.array([[3.], [7], [11]]))
+    assert np.all(m.matrix_multiply(A=a, B=b) == np.array([[3.], [7], [11]]))
+    assert np.all(m.matrix_multiply(B=b, A=a) == np.array([[3.], [7], [11]]))
+
+    with pytest.raises(ValueError) as excinfo:
+        m.matrix_multiply(b, a)
+    assert str(excinfo.value) == 'Nonconformable matrices!'
+
+    with pytest.raises(ValueError) as excinfo:
+        m.matrix_multiply(A=b, B=a)
+    assert str(excinfo.value) == 'Nonconformable matrices!'
+
+    with pytest.raises(ValueError) as excinfo:
+        m.matrix_multiply(B=a, A=b)
+    assert str(excinfo.value) == 'Nonconformable matrices!'
+
+
 @pytest.requires_eigen_and_scipy
 def test_sparse():
-    from pybind11_tests import sparse_r, sparse_c, sparse_passthrough_r, sparse_passthrough_c
-
-    assert_sparse_equal_ref(sparse_r())
-    assert_sparse_equal_ref(sparse_c())
-    assert_sparse_equal_ref(sparse_passthrough_r(sparse_r()))
-    assert_sparse_equal_ref(sparse_passthrough_c(sparse_c()))
-    assert_sparse_equal_ref(sparse_passthrough_r(sparse_c()))
-    assert_sparse_equal_ref(sparse_passthrough_c(sparse_r()))
+    assert_sparse_equal_ref(m.sparse_r())
+    assert_sparse_equal_ref(m.sparse_c())
+    assert_sparse_equal_ref(m.sparse_copy_r(m.sparse_r()))
+    assert_sparse_equal_ref(m.sparse_copy_c(m.sparse_c()))
+    assert_sparse_equal_ref(m.sparse_copy_r(m.sparse_c()))
+    assert_sparse_equal_ref(m.sparse_copy_c(m.sparse_r()))
 
 
 @pytest.requires_eigen_and_scipy
 def test_sparse_signature(doc):
-    from pybind11_tests import sparse_passthrough_r, sparse_passthrough_c
-
-    assert doc(sparse_passthrough_r) == """
-        sparse_passthrough_r(arg0: scipy.sparse.csr_matrix[float32]) -> scipy.sparse.csr_matrix[float32]
+    assert doc(m.sparse_copy_r) == """
+        sparse_copy_r(arg0: scipy.sparse.csr_matrix[float32]) -> scipy.sparse.csr_matrix[float32]
     """  # noqa: E501 line too long
-    assert doc(sparse_passthrough_c) == """
-        sparse_passthrough_c(arg0: scipy.sparse.csc_matrix[float32]) -> scipy.sparse.csc_matrix[float32]
+    assert doc(m.sparse_copy_c) == """
+        sparse_copy_c(arg0: scipy.sparse.csc_matrix[float32]) -> scipy.sparse.csc_matrix[float32]
     """  # noqa: E501 line too long
+
+
+def test_issue738():
+    """Ignore strides on a length-1 dimension (even if they would be incompatible length > 1)"""
+    assert np.all(m.iss738_f1(np.array([[1., 2, 3]])) == np.array([[1., 102, 203]]))
+    assert np.all(m.iss738_f1(np.array([[1.], [2], [3]])) == np.array([[1.], [12], [23]]))
+
+    assert np.all(m.iss738_f2(np.array([[1., 2, 3]])) == np.array([[1., 102, 203]]))
+    assert np.all(m.iss738_f2(np.array([[1.], [2], [3]])) == np.array([[1.], [12], [23]]))
+
+
+def test_custom_operator_new():
+    """Using Eigen types as member variables requires a class-specific
+    operator new with proper alignment"""
+
+    o = m.CustomOperatorNew()
+    np.testing.assert_allclose(o.a, 0.0)
+    np.testing.assert_allclose(o.b.diagonal(), 1.0)
diff --git a/pybind11/tests/test_embed/CMakeLists.txt b/pybind11/tests/test_embed/CMakeLists.txt
new file mode 100644
index 000000000..0a43e0e22
--- /dev/null
+++ b/pybind11/tests/test_embed/CMakeLists.txt
@@ -0,0 +1,34 @@
+if(PYTHON_MODULE_EXTENSION MATCHES "pypy")  # bare name: no syntax error when the variable is empty
+  add_custom_target(cpptest)  # Dummy target on PyPy. Embedding is not supported.
+  set(_suppress_unused_variable_warning "${DOWNLOAD_CATCH}")
+  return()
+endif()
+
+find_package(Catch 1.9.3)
+if(NOT CATCH_FOUND)
+  message(STATUS "Catch not detected. Interpreter tests will be skipped. Install Catch headers"
+                 " manually or use `cmake -DDOWNLOAD_CATCH=1` to fetch them automatically.")
+  return()
+endif()
+
+add_executable(test_embed
+  catch.cpp
+  test_interpreter.cpp
+)
+target_include_directories(test_embed PRIVATE ${CATCH_INCLUDE_DIR})
+pybind11_enable_warnings(test_embed)
+
+if(NOT CMAKE_VERSION VERSION_LESS 3.0)
+  target_link_libraries(test_embed PRIVATE pybind11::embed)
+else()
+  target_include_directories(test_embed PRIVATE ${PYBIND11_INCLUDE_DIR} ${PYTHON_INCLUDE_DIRS})
+  target_compile_options(test_embed PRIVATE ${PYBIND11_CPP_STANDARD})
+  target_link_libraries(test_embed PRIVATE ${PYTHON_LIBRARIES})
+endif()
+
+find_package(Threads REQUIRED)
+target_link_libraries(test_embed PRIVATE ${CMAKE_THREAD_LIBS_INIT})  # executable: nothing consumes it
+
+add_custom_target(cpptest COMMAND $<TARGET_FILE:test_embed>
+                  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+add_dependencies(check cpptest)
diff --git a/pybind11/tests/test_embed/catch.cpp b/pybind11/tests/test_embed/catch.cpp
new file mode 100644
index 000000000..cface485d
--- /dev/null
+++ b/pybind11/tests/test_embed/catch.cpp
@@ -0,0 +1,19 @@
+// The Catch implementation is compiled here. This is a standalone
+// translation unit to avoid recompiling it for every test change.
+
+#include <pybind11/embed.h>
+
+#define CATCH_CONFIG_RUNNER
+#include <catch.hpp>
+
+namespace py = pybind11;
+
+// Test runner entry point: keeps one interpreter alive while the Catch session
+// runs. Uses the standard `char *argv[]` form of main().
+int main(int argc, char *argv[]) {
+    py::scoped_interpreter guard{};
+    auto result = Catch::Session().run(argc, argv);
+
+    // Clamp the result to 0xff before returning it as the process exit status.
+    return result < 0xff ? result : 0xff;
+}
diff --git a/pybind11/tests/test_embed/test_interpreter.cpp b/pybind11/tests/test_embed/test_interpreter.cpp
new file mode 100644
index 000000000..6b5f051f2
--- /dev/null
+++ b/pybind11/tests/test_embed/test_interpreter.cpp
@@ -0,0 +1,269 @@
+#include <pybind11/embed.h>
+#include <catch.hpp>
+
+#include <thread>
+#include <fstream>
+#include <functional>
+
+namespace py = pybind11;
+using namespace py::literals;
+
+class Widget {  // Abstract base exposed to Python by widget_module; subclasses supply the_answer().
+public:
+    Widget(std::string msg) : message(std::move(msg)) { }  // by-value sink: moved in, not copied twice
+    virtual ~Widget() = default;
+
+    std::string the_message() const { return message; }  // returns a copy of the stored text
+    virtual int the_answer() const = 0;  // pure virtual: must be overridden
+
+private:
+    std::string message;
+};
+
+class PyWidget final : public Widget {  // Trampoline: lets Python subclasses implement the_answer().
+    using Widget::Widget;  // reuse the base-class constructors
+
+    int the_answer() const override { PYBIND11_OVERLOAD_PURE(int, Widget, the_answer); }
+};
+
+PYBIND11_EMBEDDED_MODULE(widget_module, m) {  // Embedded module exposing `Widget` and `add`.
+    py::class_<Widget, PyWidget>(m, "Widget")  // PyWidget is the trampoline type for Widget
+        .def(py::init<std::string>())
+        .def_property_readonly("the_message", &Widget::the_message);
+
+    m.def("add", [](int i, int j) { return i + j; });
+}
+
+PYBIND11_EMBEDDED_MODULE(throw_exception, ) {  // Module body that always throws a C++ exception.
+    throw std::runtime_error("C++ Error");
+}
+
+PYBIND11_EMBEDDED_MODULE(throw_error_already_set, ) {  // Module body that triggers a Python error.
+    auto d = py::dict();
+    d["missing"].cast<py::object>();  // reads an absent key; "Import error handling" expects a KeyError
+}
+
+TEST_CASE("Pass classes and data between modules defined in C++ and Python") {
+    auto module = py::module::import("test_interpreter");  // the .py file next to this source
+    REQUIRE(py::hasattr(module, "DerivedWidget"));
+
+    auto locals = py::dict("hello"_a="Hello, World!", "x"_a=5, **module.attr("__dict__"));
+    py::exec(R"(
+        widget = DerivedWidget("{} - {}".format(hello, x))
+        message = widget.the_message
+    )", py::globals(), locals);  // `locals` is passed as the script's local namespace
+    REQUIRE(locals["message"].cast<std::string>() == "Hello, World! - 5");
+
+    auto py_widget = module.attr("DerivedWidget")("The question");
+    auto message = py_widget.attr("the_message");
+    REQUIRE(message.cast<std::string>() == "The question");
+
+    const auto &cpp_widget = py_widget.cast<const Widget &>();  // view the Python object as a C++ Widget
+    REQUIRE(cpp_widget.the_answer() == 42);  // 42 is what DerivedWidget.the_answer returns
+}
+
+TEST_CASE("Import error handling") {
+    REQUIRE_NOTHROW(py::module::import("widget_module"));
+    REQUIRE_THROWS_WITH(py::module::import("throw_exception"),  // module body throws std::runtime_error
+                        "ImportError: C++ Error");
+    REQUIRE_THROWS_WITH(py::module::import("throw_error_already_set"),  // module body hits a Python error
+                        Catch::Contains("ImportError: KeyError"));
+}
+
+TEST_CASE("There can be only one interpreter") {
+    static_assert(std::is_move_constructible<py::scoped_interpreter>::value, "");  // move-construct only
+    static_assert(!std::is_move_assignable<py::scoped_interpreter>::value, "");
+    static_assert(!std::is_copy_constructible<py::scoped_interpreter>::value, "");
+    static_assert(!std::is_copy_assignable<py::scoped_interpreter>::value, "");
+
+    REQUIRE_THROWS_WITH(py::initialize_interpreter(), "The interpreter is already running");
+    REQUIRE_THROWS_WITH(py::scoped_interpreter(), "The interpreter is already running");
+
+    py::finalize_interpreter();  // stop the running interpreter so a new guard may start one
+    REQUIRE_NOTHROW(py::scoped_interpreter());
+    {
+        auto pyi1 = py::scoped_interpreter();
+        auto pyi2 = std::move(pyi1);  // exercises the move constructor
+    }
+    py::initialize_interpreter();  // leave an interpreter running again on exit
+}
+
+bool has_pybind11_internals_builtin() {  // Do the interpreter's builtins contain PYBIND11_INTERNALS_ID?
+    auto builtins = py::handle(PyEval_GetBuiltins());
+    return builtins.contains(PYBIND11_INTERNALS_ID);
+}
+
+bool has_pybind11_internals_static() {  // Is the pointer from get_internals_ptr() currently non-null?
+    return nullptr != py::detail::get_internals_ptr();
+}
+
+TEST_CASE("Restart the interpreter") {
+    // Verify pre-restart state.
+    REQUIRE(py::module::import("widget_module").attr("add")(1, 2).cast<int>() == 3);
+    REQUIRE(has_pybind11_internals_builtin());
+    REQUIRE(has_pybind11_internals_static());
+
+    // Restart the interpreter.
+    py::finalize_interpreter();
+    REQUIRE(Py_IsInitialized() == 0);
+
+    py::initialize_interpreter();
+    REQUIRE(Py_IsInitialized() == 1);
+
+    // Internals are deleted after a restart.
+    REQUIRE_FALSE(has_pybind11_internals_builtin());
+    REQUIRE_FALSE(has_pybind11_internals_static());
+    pybind11::detail::get_internals();  // this call is expected to create them again
+    REQUIRE(has_pybind11_internals_builtin());
+    REQUIRE(has_pybind11_internals_static());
+
+    // Make sure that an interpreter with no get_internals() created until finalize still gets the
+    // internals destroyed
+    py::finalize_interpreter();
+    py::initialize_interpreter();
+    bool ran = false;  // flipped to true by the capsule callback below
+    py::module::import("__main__").attr("internals_destroy_test") =
+        py::capsule(&ran, [](void *ran) { py::detail::get_internals(); *static_cast<bool *>(ran) = true; });
+    REQUIRE_FALSE(has_pybind11_internals_builtin());
+    REQUIRE_FALSE(has_pybind11_internals_static());
+    REQUIRE_FALSE(ran);
+    py::finalize_interpreter();  // expected to trigger the capsule callback (checked via `ran`)
+    REQUIRE(ran);
+    py::initialize_interpreter();
+    REQUIRE_FALSE(has_pybind11_internals_builtin());
+    REQUIRE_FALSE(has_pybind11_internals_static());
+
+    // C++ modules can be reloaded.
+    auto cpp_module = py::module::import("widget_module");
+    REQUIRE(cpp_module.attr("add")(1, 2).cast<int>() == 3);
+
+    // C++ type information is reloaded and can be used in python modules.
+    auto py_module = py::module::import("test_interpreter");
+    auto py_widget = py_module.attr("DerivedWidget")("Hello after restart");
+    REQUIRE(py_widget.attr("the_message").cast<std::string>() == "Hello after restart");
+}
+
+TEST_CASE("Subinterpreter") {
+    // Add tags to the modules in the main interpreter and test the basics.
+    py::module::import("__main__").attr("main_tag") = "main interpreter";
+    {
+        auto m = py::module::import("widget_module");
+        m.attr("extension_module_tag") = "added to module in main interpreter";
+
+        REQUIRE(m.attr("add")(1, 2).cast<int>() == 3);
+    }
+    REQUIRE(has_pybind11_internals_builtin());
+    REQUIRE(has_pybind11_internals_static());
+
+    // Create and switch to a subinterpreter.
+    auto main_tstate = PyThreadState_Get();
+    auto sub_tstate = Py_NewInterpreter();
+
+    // Subinterpreters get their own copy of builtins. detail::get_internals() still
+    // works by returning from the static variable, i.e. all interpreters share a single
+    // global pybind11::internals;
+    REQUIRE_FALSE(has_pybind11_internals_builtin());
+    REQUIRE(has_pybind11_internals_static());
+
+    // Module tags should be gone: check the same "main_tag" attribute that was set above.
+    REQUIRE_FALSE(py::hasattr(py::module::import("__main__"), "main_tag"));
+    {
+        auto m = py::module::import("widget_module");
+        REQUIRE_FALSE(py::hasattr(m, "extension_module_tag"));
+
+        // Function bindings should still work.
+        REQUIRE(m.attr("add")(1, 2).cast<int>() == 3);
+    }
+
+    // Restore main interpreter.
+    Py_EndInterpreter(sub_tstate);
+    PyThreadState_Swap(main_tstate);
+
+    REQUIRE(py::hasattr(py::module::import("__main__"), "main_tag"));
+    REQUIRE(py::hasattr(py::module::import("widget_module"), "extension_module_tag"));
+}
+
+TEST_CASE("Execution frame") {
+    // When the interpreter is embedded, there is no execution frame, but `py::exec`
+    // should still function by using reasonable globals: `__main__.__dict__`.
+    py::exec("var = dict(number=42)");  // called without explicit globals/locals arguments
+    REQUIRE(py::globals()["var"]["number"].cast<int>() == 42);  // `var` must be visible via py::globals()
+}
+
+TEST_CASE("Threads") {
+    // Restart interpreter to ensure threads are not initialized
+    py::finalize_interpreter();
+    py::initialize_interpreter();
+    REQUIRE_FALSE(has_pybind11_internals_static());
+
+    constexpr auto num_threads = 10;
+    auto locals = py::dict("count"_a=0);  // shared counter, incremented once per thread
+
+    {
+        py::gil_scoped_release gil_release{};  // release the GIL while the worker threads run
+        REQUIRE(has_pybind11_internals_static());  // internals exist once the release guard is built
+
+        auto threads = std::vector<std::thread>();
+        for (auto i = 0; i < num_threads; ++i) {
+            threads.emplace_back([&]() {
+                py::gil_scoped_acquire gil{};  // each worker holds the GIL while touching `locals`
+                locals["count"] = locals["count"].cast<int>() + 1;
+            });
+        }
+
+        for (auto &thread : threads) {
+            thread.join();
+        }
+    }
+
+    REQUIRE(locals["count"].cast<int>() == num_threads);
+}
+
+// Scope exit utility https://stackoverflow.com/a/36644501/7255855
+struct scope_exit {  // Invokes the stored callable, if it is non-empty, on destruction.
+    std::function<void()> f_;
+    explicit scope_exit(std::function<void()> f) noexcept : f_(std::move(f)) {}
+    ~scope_exit() { if (f_) f_(); }  // NOTE(review): copying is not disabled; a copy would call f_ again
+};
+
+TEST_CASE("Reload module from file") {
+    // Disable generation of cached bytecode (.pyc files) for this test, otherwise
+    // Python might pick up an old version from the cache instead of the new versions
+    // of the .py files generated below
+    auto sys = py::module::import("sys");
+    bool dont_write_bytecode = sys.attr("dont_write_bytecode").cast<bool>();  // remember the old value
+    sys.attr("dont_write_bytecode") = true;
+    // Reset the value at scope exit
+    scope_exit reset_dont_write_bytecode([&]() {
+        sys.attr("dont_write_bytecode") = dont_write_bytecode;
+    });
+
+    std::string module_name = "test_module_reload";
+    std::string module_file = module_name + ".py";  // relative path: created in the working directory
+
+    // Create the module .py file
+    std::ofstream test_module(module_file);
+    test_module << "def test():\n";
+    test_module << "    return 1\n";
+    test_module.close();
+    // Delete the file at scope exit
+    scope_exit delete_module_file([&]() {
+        std::remove(module_file.c_str());  // NOTE(review): <cstdio> is not included directly by this file
+    });
+
+    // Import the module from file
+    auto module = py::module::import(module_name.c_str());
+    int result = module.attr("test")().cast<int>();
+    REQUIRE(result == 1);
+
+    // Update the module .py file with a small change
+    test_module.open(module_file);  // reuses the same stream object for the second version
+    test_module << "def test():\n";
+    test_module << "    return 2\n";
+    test_module.close();
+
+    // Reload the module
+    module.reload();
+    result = module.attr("test")().cast<int>();
+    REQUIRE(result == 2);
+}
diff --git a/pybind11/tests/test_embed/test_interpreter.py b/pybind11/tests/test_embed/test_interpreter.py
new file mode 100644
index 000000000..26a047921
--- /dev/null
+++ b/pybind11/tests/test_embed/test_interpreter.py
@@ -0,0 +1,9 @@
+from widget_module import Widget
+
+
+class DerivedWidget(Widget):  # Python subclass of the Widget class from widget_module
+    def __init__(self, message):
+        Widget.__init__(self, message)  # hand the message to the base-class constructor
+
+    def the_answer(self):  # the value test_interpreter.cpp expects from the_answer()
+        return 42
diff --git a/pybind11/tests/test_enum.cpp b/pybind11/tests/test_enum.cpp
index 09f334cdb..49f31ba1f 100644
--- a/pybind11/tests/test_enum.cpp
+++ b/pybind11/tests/test_enum.cpp
@@ -9,60 +9,63 @@
 
 #include "pybind11_tests.h"
 
-enum UnscopedEnum {
-    EOne = 1,
-    ETwo
-};
-
-enum class ScopedEnum {
-    Two = 2,
-    Three
-};
-
-enum Flags {
-    Read = 4,
-    Write = 2,
-    Execute = 1
-};
-
-class ClassWithUnscopedEnum {
-public:
-    enum EMode {
-        EFirstMode = 1,
-        ESecondMode
+TEST_SUBMODULE(enums, m) {
+    // test_unscoped_enum
+    enum UnscopedEnum {
+        EOne = 1,
+        ETwo
     };
-
-    static EMode test_function(EMode mode) {
-        return mode;
-    }
-};
-
-std::string test_scoped_enum(ScopedEnum z) {
-    return "ScopedEnum::" + std::string(z == ScopedEnum::Two ? "Two" : "Three");
-}
-
-test_initializer enums([](py::module &m) {
-    m.def("test_scoped_enum", &test_scoped_enum);
-
     py::enum_<UnscopedEnum>(m, "UnscopedEnum", py::arithmetic())
         .value("EOne", EOne)
         .value("ETwo", ETwo)
         .export_values();
 
+    // test_scoped_enum
+    enum class ScopedEnum {
+        Two = 2,
+        Three
+    };
     py::enum_<ScopedEnum>(m, "ScopedEnum", py::arithmetic())
         .value("Two", ScopedEnum::Two)
         .value("Three", ScopedEnum::Three);
 
+    m.def("test_scoped_enum", [](ScopedEnum z) {
+        return "ScopedEnum::" + std::string(z == ScopedEnum::Two ? "Two" : "Three");
+    });
+
+    // test_binary_operators
+    enum Flags {
+        Read = 4,
+        Write = 2,
+        Execute = 1
+    };
     py::enum_<Flags>(m, "Flags", py::arithmetic())
         .value("Read", Flags::Read)
         .value("Write", Flags::Write)
         .value("Execute", Flags::Execute)
         .export_values();
 
+    // test_implicit_conversion
+    class ClassWithUnscopedEnum {
+    public:
+        enum EMode {
+            EFirstMode = 1,
+            ESecondMode
+        };
+
+        static EMode test_function(EMode mode) {
+            return mode;
+        }
+    };
     py::class_<ClassWithUnscopedEnum> exenum_class(m, "ClassWithUnscopedEnum");
     exenum_class.def_static("test_function", &ClassWithUnscopedEnum::test_function);
     py::enum_<ClassWithUnscopedEnum::EMode>(exenum_class, "EMode")
         .value("EFirstMode", ClassWithUnscopedEnum::EFirstMode)
         .value("ESecondMode", ClassWithUnscopedEnum::ESecondMode)
         .export_values();
-});
+
+    // test_enum_to_int: no-op functions taking plain integer types; test_enum.py passes enum values
+    m.def("test_enum_to_int", [](int) { });
+    m.def("test_enum_to_uint", [](uint32_t) { });
+    m.def("test_enum_to_long_long", [](long long) { });
+}
diff --git a/pybind11/tests/test_enum.py b/pybind11/tests/test_enum.py
index de5f3c6f6..d8eff5278 100644
--- a/pybind11/tests/test_enum.py
+++ b/pybind11/tests/test_enum.py
@@ -1,42 +1,50 @@
 import pytest
+from pybind11_tests import enums as m
 
 
 def test_unscoped_enum():
-    from pybind11_tests import UnscopedEnum, EOne
-
-    assert str(UnscopedEnum.EOne) == "UnscopedEnum.EOne"
-    assert str(UnscopedEnum.ETwo) == "UnscopedEnum.ETwo"
-    assert str(EOne) == "UnscopedEnum.EOne"
+    assert str(m.UnscopedEnum.EOne) == "UnscopedEnum.EOne"
+    assert str(m.UnscopedEnum.ETwo) == "UnscopedEnum.ETwo"
+    assert str(m.EOne) == "UnscopedEnum.EOne"
+    # __members__ property
+    assert m.UnscopedEnum.__members__ == \
+        {"EOne": m.UnscopedEnum.EOne, "ETwo": m.UnscopedEnum.ETwo}
+    # __members__ readonly
+    with pytest.raises(AttributeError):
+        m.UnscopedEnum.__members__ = {}
+    # __members__ returns a copy
+    foo = m.UnscopedEnum.__members__
+    foo["bar"] = "baz"
+    assert m.UnscopedEnum.__members__ == \
+        {"EOne": m.UnscopedEnum.EOne, "ETwo": m.UnscopedEnum.ETwo}
 
     # no TypeError exception for unscoped enum ==/!= int comparisons
-    y = UnscopedEnum.ETwo
+    y = m.UnscopedEnum.ETwo
     assert y == 2
     assert y != 3
 
-    assert int(UnscopedEnum.ETwo) == 2
-    assert str(UnscopedEnum(2)) == "UnscopedEnum.ETwo"
+    assert int(m.UnscopedEnum.ETwo) == 2
+    assert str(m.UnscopedEnum(2)) == "UnscopedEnum.ETwo"
 
     # order
-    assert UnscopedEnum.EOne < UnscopedEnum.ETwo
-    assert UnscopedEnum.EOne < 2
-    assert UnscopedEnum.ETwo > UnscopedEnum.EOne
-    assert UnscopedEnum.ETwo > 1
-    assert UnscopedEnum.ETwo <= 2
-    assert UnscopedEnum.ETwo >= 2
-    assert UnscopedEnum.EOne <= UnscopedEnum.ETwo
-    assert UnscopedEnum.EOne <= 2
-    assert UnscopedEnum.ETwo >= UnscopedEnum.EOne
-    assert UnscopedEnum.ETwo >= 1
-    assert not (UnscopedEnum.ETwo < UnscopedEnum.EOne)
-    assert not (2 < UnscopedEnum.EOne)
+    assert m.UnscopedEnum.EOne < m.UnscopedEnum.ETwo
+    assert m.UnscopedEnum.EOne < 2
+    assert m.UnscopedEnum.ETwo > m.UnscopedEnum.EOne
+    assert m.UnscopedEnum.ETwo > 1
+    assert m.UnscopedEnum.ETwo <= 2
+    assert m.UnscopedEnum.ETwo >= 2
+    assert m.UnscopedEnum.EOne <= m.UnscopedEnum.ETwo
+    assert m.UnscopedEnum.EOne <= 2
+    assert m.UnscopedEnum.ETwo >= m.UnscopedEnum.EOne
+    assert m.UnscopedEnum.ETwo >= 1
+    assert not (m.UnscopedEnum.ETwo < m.UnscopedEnum.EOne)
+    assert not (2 < m.UnscopedEnum.EOne)
 
 
 def test_scoped_enum():
-    from pybind11_tests import ScopedEnum, test_scoped_enum
-
-    assert test_scoped_enum(ScopedEnum.Three) == "ScopedEnum::Three"
-    z = ScopedEnum.Two
-    assert test_scoped_enum(z) == "ScopedEnum::Two"
+    assert m.test_scoped_enum(m.ScopedEnum.Three) == "ScopedEnum::Three"
+    z = m.ScopedEnum.Two
+    assert m.test_scoped_enum(z) == "ScopedEnum::Two"
 
     # expected TypeError exceptions for scoped enum ==/!= int comparisons
     with pytest.raises(TypeError):
@@ -45,23 +53,21 @@ def test_scoped_enum():
         assert z != 3
 
     # order
-    assert ScopedEnum.Two < ScopedEnum.Three
-    assert ScopedEnum.Three > ScopedEnum.Two
-    assert ScopedEnum.Two <= ScopedEnum.Three
-    assert ScopedEnum.Two <= ScopedEnum.Two
-    assert ScopedEnum.Two >= ScopedEnum.Two
-    assert ScopedEnum.Three >= ScopedEnum.Two
+    assert m.ScopedEnum.Two < m.ScopedEnum.Three
+    assert m.ScopedEnum.Three > m.ScopedEnum.Two
+    assert m.ScopedEnum.Two <= m.ScopedEnum.Three
+    assert m.ScopedEnum.Two <= m.ScopedEnum.Two
+    assert m.ScopedEnum.Two >= m.ScopedEnum.Two
+    assert m.ScopedEnum.Three >= m.ScopedEnum.Two
 
 
 def test_implicit_conversion():
-    from pybind11_tests import ClassWithUnscopedEnum
-
-    assert str(ClassWithUnscopedEnum.EMode.EFirstMode) == "EMode.EFirstMode"
-    assert str(ClassWithUnscopedEnum.EFirstMode) == "EMode.EFirstMode"
+    assert str(m.ClassWithUnscopedEnum.EMode.EFirstMode) == "EMode.EFirstMode"
+    assert str(m.ClassWithUnscopedEnum.EFirstMode) == "EMode.EFirstMode"
 
-    f = ClassWithUnscopedEnum.test_function
-    first = ClassWithUnscopedEnum.EFirstMode
-    second = ClassWithUnscopedEnum.ESecondMode
+    f = m.ClassWithUnscopedEnum.test_function
+    first = m.ClassWithUnscopedEnum.EFirstMode
+    second = m.ClassWithUnscopedEnum.ESecondMode
 
     assert f(first) == 1
 
@@ -86,23 +92,30 @@ def test_implicit_conversion():
 
 
 def test_binary_operators():
-    from pybind11_tests import Flags
-
-    assert int(Flags.Read) == 4
-    assert int(Flags.Write) == 2
-    assert int(Flags.Execute) == 1
-    assert int(Flags.Read | Flags.Write | Flags.Execute) == 7
-    assert int(Flags.Read | Flags.Write) == 6
-    assert int(Flags.Read | Flags.Execute) == 5
-    assert int(Flags.Write | Flags.Execute) == 3
-    assert int(Flags.Write | 1) == 3
-
-    state = Flags.Read | Flags.Write
-    assert (state & Flags.Read) != 0
-    assert (state & Flags.Write) != 0
-    assert (state & Flags.Execute) == 0
+    assert int(m.Flags.Read) == 4
+    assert int(m.Flags.Write) == 2
+    assert int(m.Flags.Execute) == 1
+    assert int(m.Flags.Read | m.Flags.Write | m.Flags.Execute) == 7
+    assert int(m.Flags.Read | m.Flags.Write) == 6
+    assert int(m.Flags.Read | m.Flags.Execute) == 5
+    assert int(m.Flags.Write | m.Flags.Execute) == 3
+    assert int(m.Flags.Write | 1) == 3
+
+    state = m.Flags.Read | m.Flags.Write
+    assert (state & m.Flags.Read) != 0
+    assert (state & m.Flags.Write) != 0
+    assert (state & m.Flags.Execute) == 0
     assert (state & 1) == 0
 
     state2 = ~state
     assert state2 == -7
     assert int(state ^ state2) == -1
+
+
+def test_enum_to_int():
+    """Enum values are accepted where int, uint32_t or long long is expected."""
+    converters = (m.test_enum_to_int, m.test_enum_to_uint, m.test_enum_to_long_long)
+    values = (m.Flags.Read, m.ClassWithUnscopedEnum.EMode.EFirstMode)
+    for convert in converters:
+        for value in values:
+            convert(value)
diff --git a/pybind11/tests/test_eval.cpp b/pybind11/tests/test_eval.cpp
index ed4c226fe..e09482191 100644
--- a/pybind11/tests/test_eval.cpp
+++ b/pybind11/tests/test_eval.cpp
@@ -11,7 +11,9 @@
 #include <pybind11/eval.h>
 #include "pybind11_tests.h"
 
-test_initializer eval([](py::module &m) {
+TEST_SUBMODULE(eval_, m) {
+    // test_evals
+
     auto global = py::dict(py::module::import("__main__").attr("__dict__"));
 
     m.def("test_eval_statements", [global]() {
@@ -20,14 +22,24 @@ test_initializer eval([](py::module &m) {
             return 42;
         });
 
-        auto result = py::eval<py::eval_statements>(
-            "print('Hello World!');\n"
-            "x = call_test();",
+        // Regular string literal
+        py::exec(
+            "message = 'Hello World!'\n"
+            "x = call_test()",
             global, local
         );
+
+        // Multi-line raw string literal
+        py::exec(R"(
+            if x == 42:
+                print(message)
+            else:
+                raise RuntimeError
+            )", global, local
+        );
         auto x = local["x"].cast<int>();
 
-        return result == py::none() && x == 42;
+        return x == 42;
     });
 
     m.def("test_eval", [global]() {
@@ -45,7 +57,7 @@ test_initializer eval([](py::module &m) {
 
         auto result = py::eval<py::eval_single_statement>("x = call_test()", py::dict(), local);
         auto x = local["x"].cast<int>();
-        return result == py::none() && x == 42;
+        return result.is_none() && x == 42;
     });
 
     m.def("test_eval_file", [global](py::str filename) {
@@ -56,7 +68,7 @@ test_initializer eval([](py::module &m) {
         local["call_test2"] = py::cpp_function([&](int value) { val_out = value; });
 
         auto result = py::eval_file(filename, global, local);
-        return val_out == 43 && result == py::none();
+        return val_out == 43 && result.is_none();
     });
 
     m.def("test_eval_failure", []() {
@@ -76,4 +88,4 @@ test_initializer eval([](py::module &m) {
         }
         return false;
     });
-});
+}
diff --git a/pybind11/tests/test_eval.py b/pybind11/tests/test_eval.py
index 8715dbadb..bda4ef6bf 100644
--- a/pybind11/tests/test_eval.py
+++ b/pybind11/tests/test_eval.py
@@ -1,19 +1,17 @@
 import os
+from pybind11_tests import eval_ as m
 
 
 def test_evals(capture):
-    from pybind11_tests import (test_eval_statements, test_eval, test_eval_single_statement,
-                                test_eval_file, test_eval_failure, test_eval_file_failure)
-
     with capture:
-        assert test_eval_statements()
+        assert m.test_eval_statements()
     assert capture == "Hello World!"
 
-    assert test_eval()
-    assert test_eval_single_statement()
+    assert m.test_eval()
+    assert m.test_eval_single_statement()
 
     filename = os.path.join(os.path.dirname(__file__), "test_eval_call.py")
-    assert test_eval_file(filename)
+    assert m.test_eval_file(filename)
 
-    assert test_eval_failure()
-    assert test_eval_file_failure()
+    assert m.test_eval_failure()
+    assert m.test_eval_file_failure()
diff --git a/pybind11/tests/test_exceptions.cpp b/pybind11/tests/test_exceptions.cpp
index 706b500f2..ae28abb48 100644
--- a/pybind11/tests/test_exceptions.cpp
+++ b/pybind11/tests/test_exceptions.cpp
@@ -58,34 +58,6 @@ class MyException5_1 : public MyException5 {
     using MyException5::MyException5;
 };
 
-void throws1() {
-    throw MyException("this error should go to a custom type");
-}
-
-void throws2() {
-    throw MyException2("this error should go to a standard Python exception");
-}
-
-void throws3() {
-    throw MyException3("this error cannot be translated");
-}
-
-void throws4() {
-    throw MyException4("this error is rethrown");
-}
-
-void throws5() {
-    throw MyException5("this is a helper-defined translated exception");
-}
-
-void throws5_1() {
-    throw MyException5_1("MyException5 subclass");
-}
-
-void throws_logic_error() {
-    throw std::logic_error("this error should fall through to the standard handler");
-}
-
 struct PythonCallInDestructor {
     PythonCallInDestructor(const py::dict &d) : d(d) {}
     ~PythonCallInDestructor() { d["good"] = true; }
@@ -93,7 +65,11 @@ struct PythonCallInDestructor {
     py::dict d;
 };
 
-test_initializer custom_exceptions([](py::module &m) {
+TEST_SUBMODULE(exceptions, m) {
+    m.def("throw_std_exception", []() {  // test_std_exception expects a Python RuntimeError from this
+        throw std::runtime_error("This exception was intentionally thrown.");
+    });
+
     // make a new custom exception and use it as a translation target
     static py::exception<MyException> ex(m, "MyException");
     py::register_exception_translator([](std::exception_ptr p) {
@@ -133,13 +109,20 @@ test_initializer custom_exceptions([](py::module &m) {
     // A slightly more complicated one that declares MyException5_1 as a subclass of MyException5
     py::register_exception<MyException5_1>(m, "MyException5_1", ex5.ptr());
 
-    m.def("throws1", &throws1);
-    m.def("throws2", &throws2);
-    m.def("throws3", &throws3);
-    m.def("throws4", &throws4);
-    m.def("throws5", &throws5);
-    m.def("throws5_1", &throws5_1);
-    m.def("throws_logic_error", &throws_logic_error);
+    m.def("throws1", []() { throw MyException("this error should go to a custom type"); });
+    m.def("throws2", []() { throw MyException2("this error should go to a standard Python exception"); });
+    m.def("throws3", []() { throw MyException3("this error cannot be translated"); });
+    m.def("throws4", []() { throw MyException4("this error is rethrown"); });
+    m.def("throws5", []() { throw MyException5("this is a helper-defined translated exception"); });
+    m.def("throws5_1", []() { throw MyException5_1("MyException5 subclass"); });
+    m.def("throws_logic_error", []() { throw std::logic_error("this error should fall through to the standard handler"); });
+    m.def("exception_matches", []() {  // a missing-key lookup must raise something matching KeyError
+        py::dict foo;
+        try { py::object o = foo["bar"]; }  // assigning forces the lookup; foo["bar"] alone is lazy
+        catch (py::error_already_set& ex) {
+            if (!ex.matches(PyExc_KeyError)) throw;  // any other error type is propagated
+        }
+    });
 
     m.def("throw_already_set", [](bool err) {
         if (err)
@@ -170,4 +153,16 @@ test_initializer custom_exceptions([](py::module &m) {
         }
         return false;
     });
-});
+
+    // test_nested_throws
+    m.def("try_catch", [](py::object exc_type, py::function f, py::args args) {  // calls f(*args)
+        try { f(*args); }
+        catch (py::error_already_set &ex) {
+            if (ex.matches(exc_type))
+                py::print(ex.what());  // expected type: swallow it and print its description
+            else
+                throw;  // any other Python error propagates to the caller
+        }
+    });
+
+}
diff --git a/pybind11/tests/test_exceptions.py b/pybind11/tests/test_exceptions.py
index 0025e4eb6..8d37c09b8 100644
--- a/pybind11/tests/test_exceptions.py
+++ b/pybind11/tests/test_exceptions.py
@@ -1,74 +1,144 @@
 import pytest
 
+from pybind11_tests import exceptions as m
+import pybind11_cross_module_tests as cm
 
-def test_error_already_set(msg):
-    from pybind11_tests import throw_already_set
 
+def test_std_exception(msg):
     with pytest.raises(RuntimeError) as excinfo:
-        throw_already_set(False)
+        m.throw_std_exception()
+    assert msg(excinfo.value) == "This exception was intentionally thrown."
+
+
+def test_error_already_set(msg):
+    with pytest.raises(RuntimeError) as excinfo:
+        m.throw_already_set(False)
     assert msg(excinfo.value) == "Unknown internal error occurred"
 
     with pytest.raises(ValueError) as excinfo:
-        throw_already_set(True)
+        m.throw_already_set(True)
     assert msg(excinfo.value) == "foo"
 
 
-def test_python_call_in_catch():
-    from pybind11_tests import python_call_in_destructor
+def test_cross_module_exceptions():
+    """Exceptions coming out of the cross-module extension have the expected
+    Python type and message."""
+
+    # (callable, expected exception type, expected message)
+    expectations = [
+        (cm.raise_runtime_error, RuntimeError, "My runtime error"),
+        (cm.raise_value_error, ValueError, "My value error"),
+        (cm.throw_pybind_value_error, ValueError, "pybind11 value error"),
+        (cm.throw_pybind_type_error, TypeError, "pybind11 type error"),
+    ]
+    for raiser, exc_type, text in expectations:
+        with pytest.raises(exc_type) as excinfo:
+            raiser()
+        assert str(excinfo.value) == text
+
+    # For StopIteration only the exception type is checked, not a message.
+    with pytest.raises(StopIteration):
+        cm.throw_stop_iteration()
+
 
+def test_python_call_in_catch():
     d = {}
-    assert python_call_in_destructor(d) is True
+    assert m.python_call_in_destructor(d) is True
     assert d["good"] is True
 
 
-def test_custom(msg):
-    from pybind11_tests import (MyException, MyException5, MyException5_1,
-                                throws1, throws2, throws3, throws4, throws5, throws5_1,
-                                throws_logic_error)
+def test_exception_matches():  # passes as long as the C++ helper does not raise
+    m.exception_matches()
 
-    # Can we catch a MyException?"
-    with pytest.raises(MyException) as excinfo:
-        throws1()
+
+def test_custom(msg):
+    # Can we catch a MyException?
+    with pytest.raises(m.MyException) as excinfo:
+        m.throws1()
     assert msg(excinfo.value) == "this error should go to a custom type"
 
     # Can we translate to standard Python exceptions?
     with pytest.raises(RuntimeError) as excinfo:
-        throws2()
+        m.throws2()
     assert msg(excinfo.value) == "this error should go to a standard Python exception"
 
     # Can we handle unknown exceptions?
     with pytest.raises(RuntimeError) as excinfo:
-        throws3()
+        m.throws3()
     assert msg(excinfo.value) == "Caught an unknown exception!"
 
     # Can we delegate to another handler by rethrowing?
-    with pytest.raises(MyException) as excinfo:
-        throws4()
+    with pytest.raises(m.MyException) as excinfo:
+        m.throws4()
     assert msg(excinfo.value) == "this error is rethrown"
 
-    # "Can we fall-through to the default handler?"
+    # Can we fall-through to the default handler?
     with pytest.raises(RuntimeError) as excinfo:
-        throws_logic_error()
+        m.throws_logic_error()
     assert msg(excinfo.value) == "this error should fall through to the standard handler"
 
     # Can we handle a helper-declared exception?
-    with pytest.raises(MyException5) as excinfo:
-        throws5()
+    with pytest.raises(m.MyException5) as excinfo:
+        m.throws5()
     assert msg(excinfo.value) == "this is a helper-defined translated exception"
 
     # Exception subclassing:
-    with pytest.raises(MyException5) as excinfo:
-        throws5_1()
+    with pytest.raises(m.MyException5) as excinfo:
+        m.throws5_1()
     assert msg(excinfo.value) == "MyException5 subclass"
-    assert isinstance(excinfo.value, MyException5_1)
+    assert isinstance(excinfo.value, m.MyException5_1)
 
-    with pytest.raises(MyException5_1) as excinfo:
-        throws5_1()
+    with pytest.raises(m.MyException5_1) as excinfo:
+        m.throws5_1()
     assert msg(excinfo.value) == "MyException5 subclass"
 
-    with pytest.raises(MyException5) as excinfo:
+    with pytest.raises(m.MyException5) as excinfo:
         try:
-            throws5()
-        except MyException5_1:
+            m.throws5()
+        except m.MyException5_1:
             raise RuntimeError("Exception error: caught child from parent")
     assert msg(excinfo.value) == "this is a helper-defined translated exception"
+
+
+def test_nested_throws(capture):
+    """Tests nested (e.g. C++ -> Python -> C++) exception handling"""
+
+    def throw_myex():  # raises the bound MyException from Python code
+        raise m.MyException("nested error")
+
+    def throw_myex5():  # raises the bound MyException5 from Python code
+        raise m.MyException5("nested error 5")
+
+    # In the comments below, the exception is caught in the first step, thrown in the last step
+
+    # C++ -> Python
+    with capture:
+        m.try_catch(m.MyException5, throw_myex5)
+    assert str(capture).startswith("MyException5: nested error 5")
+
+    # Python -> C++ -> Python
+    with pytest.raises(m.MyException) as excinfo:
+        m.try_catch(m.MyException5, throw_myex)
+    assert str(excinfo.value) == "nested error"
+
+    def pycatch(exctype, f, *args):  # Python-level counterpart of m.try_catch
+        try:
+            f(*args)
+        except m.MyException as e:  # NOTE(review): `exctype` is ignored here -- intentional?
+            print(e)
+
+    # C++ -> Python -> C++ -> Python
+    with capture:
+        m.try_catch(
+            m.MyException5, pycatch, m.MyException, m.try_catch, m.MyException, throw_myex5)
+    assert str(capture).startswith("MyException5: nested error 5")
+
+    # C++ -> Python -> C++
+    with capture:
+        m.try_catch(m.MyException, pycatch, m.MyException5, m.throws4)
+    assert capture == "this error is rethrown"
+
+    # Python -> C++ -> Python -> C++
+    with pytest.raises(m.MyException5) as excinfo:
+        m.try_catch(m.MyException, pycatch, m.MyException, m.throws5)
+    assert str(excinfo.value) == "this is a helper-defined translated exception"
diff --git a/pybind11/tests/test_factory_constructors.cpp b/pybind11/tests/test_factory_constructors.cpp
new file mode 100644
index 000000000..fb33377b2
--- /dev/null
+++ b/pybind11/tests/test_factory_constructors.cpp
@@ -0,0 +1,337 @@
+/*
+    tests/test_factory_constructors.cpp -- tests construction from a factory function
+                                           via py::init()
+
+    Copyright (c) 2017 Jason Rhinelander <jason@imaginary.ca>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "constructor_stats.h"
+#include <cmath>
+
+// Classes for testing python construction via C++ factory function:
+// Not publicly constructible, copyable, or movable:
+class TestFactory1 {
+    friend class TestFactoryHelper;
+    TestFactory1() : value("(empty)") { print_default_created(this); }
+    TestFactory1(int v) : value(std::to_string(v)) { print_created(this, value); }
+    TestFactory1(std::string v) : value(std::move(v)) { print_created(this, value); }
+    TestFactory1(TestFactory1 &&) = delete;
+    TestFactory1(const TestFactory1 &) = delete;
+    TestFactory1 &operator=(TestFactory1 &&) = delete;
+    TestFactory1 &operator=(const TestFactory1 &) = delete;
+public:
+    std::string value;
+    ~TestFactory1() { print_destroyed(this); }
+};
+// Non-public construction, but moveable:
+class TestFactory2 {
+    friend class TestFactoryHelper;
+    TestFactory2() : value("(empty2)") { print_default_created(this); }
+    TestFactory2(int v) : value(std::to_string(v)) { print_created(this, value); }
+    TestFactory2(std::string v) : value(std::move(v)) { print_created(this, value); }
+public:
+    TestFactory2(TestFactory2 &&m) { value = std::move(m.value); print_move_created(this); }
+    TestFactory2 &operator=(TestFactory2 &&m) { value = std::move(m.value); print_move_assigned(this); return *this; }
+    std::string value;
+    ~TestFactory2() { print_destroyed(this); }
+};
+// Mixed direct/factory construction:
+class TestFactory3 {
+protected:
+    friend class TestFactoryHelper;
+    TestFactory3() : value("(empty3)") { print_default_created(this); }
+    TestFactory3(int v) : value(std::to_string(v)) { print_created(this, value); }
+public:
+    TestFactory3(std::string v) : value(std::move(v)) { print_created(this, value); }
+    TestFactory3(TestFactory3 &&m) { value = std::move(m.value); print_move_created(this); }
+    TestFactory3 &operator=(TestFactory3 &&m) { value = std::move(m.value); print_move_assigned(this); return *this; }
+    std::string value;
+    virtual ~TestFactory3() { print_destroyed(this); }
+};
+// Inheritance test
+class TestFactory4 : public TestFactory3 {
+public:
+    TestFactory4() : TestFactory3() { print_default_created(this); }
+    TestFactory4(int v) : TestFactory3(v) { print_created(this, v); }
+    virtual ~TestFactory4() { print_destroyed(this); }
+};
+// Another class for an invalid downcast test
+class TestFactory5 : public TestFactory3 {
+public:
+    TestFactory5(int i) : TestFactory3(i) { print_created(this, i); }
+    virtual ~TestFactory5() { print_destroyed(this); }
+};
+
+class TestFactory6 {
+protected:
+    int value;
+    bool alias = false;
+public:
+    TestFactory6(int i) : value{i} { print_created(this, i); }
+    TestFactory6(TestFactory6 &&f) { print_move_created(this); value = f.value; alias = f.alias; }
+    TestFactory6(const TestFactory6 &f) { print_copy_created(this); value = f.value; alias = f.alias; }
+    virtual ~TestFactory6() { print_destroyed(this); }
+    virtual int get() { return value; }
+    bool has_alias() { return alias; }
+};
+class PyTF6 : public TestFactory6 {
+public:
+    // Special constructor that allows the factory to construct a PyTF6 from a TestFactory6 only
+    // when an alias is needed:
+    PyTF6(TestFactory6 &&base) : TestFactory6(std::move(base)) { alias = true; print_created(this, "move", value); }
+    PyTF6(int i) : TestFactory6(i) { alias = true; print_created(this, i); }
+    PyTF6(PyTF6 &&f) : TestFactory6(std::move(f)) { print_move_created(this); }
+    PyTF6(const PyTF6 &f) : TestFactory6(f) { print_copy_created(this); }
+    PyTF6(std::string s) : TestFactory6((int) s.size()) { alias = true; print_created(this, s); }
+    virtual ~PyTF6() { print_destroyed(this); }
+    int get() override { PYBIND11_OVERLOAD(int, TestFactory6, get, /*no args*/); }
+};
+
+class TestFactory7 {
+protected:
+    int value;
+    bool alias = false;
+public:
+    TestFactory7(int i) : value{i} { print_created(this, i); }
+    TestFactory7(TestFactory7 &&f) { print_move_created(this); value = f.value; alias = f.alias; }
+    TestFactory7(const TestFactory7 &f) { print_copy_created(this); value = f.value; alias = f.alias; }
+    virtual ~TestFactory7() { print_destroyed(this); }
+    virtual int get() { return value; }
+    bool has_alias() { return alias; }
+};
+class PyTF7 : public TestFactory7 {
+public:
+    PyTF7(int i) : TestFactory7(i) { alias = true; print_created(this, i); }
+    PyTF7(PyTF7 &&f) : TestFactory7(std::move(f)) { print_move_created(this); }
+    PyTF7(const PyTF7 &f) : TestFactory7(f) { print_copy_created(this); }
+    virtual ~PyTF7() { print_destroyed(this); }
+    int get() override { PYBIND11_OVERLOAD(int, TestFactory7, get, /*no args*/); }
+};
+
+
+class TestFactoryHelper {
+public:
+    // Non-movable, non-copyable type:
+    // Return via pointer:
+    static TestFactory1 *construct1() { return new TestFactory1(); }
+    // Holder:
+    static std::unique_ptr<TestFactory1> construct1(int a) { return std::unique_ptr<TestFactory1>(new TestFactory1(a)); }
+    // pointer again
+    static TestFactory1 *construct1_string(std::string a) { return new TestFactory1(a); }
+
+    // Moveable type:
+    // pointer:
+    static TestFactory2 *construct2() { return new TestFactory2(); }
+    // holder:
+    static std::unique_ptr<TestFactory2> construct2(int a) { return std::unique_ptr<TestFactory2>(new TestFactory2(a)); }
+    // by value moving:
+    static TestFactory2 construct2(std::string a) { return TestFactory2(a); }
+
+    // shared_ptr holder type:
+    // pointer:
+    static TestFactory3 *construct3() { return new TestFactory3(); }
+    // holder:
+    static std::shared_ptr<TestFactory3> construct3(int a) { return std::shared_ptr<TestFactory3>(new TestFactory3(a)); }
+};
+
+TEST_SUBMODULE(factory_constructors, m) {
+
+    // Define various trivial types to allow simpler overload resolution:
+    py::module m_tag = m.def_submodule("tag");
+#define MAKE_TAG_TYPE(Name) \
+    struct Name##_tag {}; \
+    py::class_<Name##_tag>(m_tag, #Name "_tag").def(py::init<>()); \
+    m_tag.attr(#Name) = py::cast(Name##_tag{})
+    MAKE_TAG_TYPE(pointer);
+    MAKE_TAG_TYPE(unique_ptr);
+    MAKE_TAG_TYPE(move);
+    MAKE_TAG_TYPE(shared_ptr);
+    MAKE_TAG_TYPE(derived);
+    MAKE_TAG_TYPE(TF4);
+    MAKE_TAG_TYPE(TF5);
+    MAKE_TAG_TYPE(null_ptr);
+    MAKE_TAG_TYPE(base);
+    MAKE_TAG_TYPE(invalid_base);
+    MAKE_TAG_TYPE(alias);
+    MAKE_TAG_TYPE(unaliasable);
+    MAKE_TAG_TYPE(mixed);
+
+    // test_init_factory_basic, test_init_factory_signature, test_invalid_self
+    py::class_<TestFactory1>(m, "TestFactory1")
+        .def(py::init([](unique_ptr_tag, int v) { return TestFactoryHelper::construct1(v); }))
+        .def(py::init(&TestFactoryHelper::construct1_string)) // raw function pointer
+        .def(py::init([](pointer_tag) { return TestFactoryHelper::construct1(); }))
+        .def(py::init([](py::handle, int v, py::handle) { return TestFactoryHelper::construct1(v); }))
+        .def_readwrite("value", &TestFactory1::value)
+        ;
+    py::class_<TestFactory2>(m, "TestFactory2")
+        .def(py::init([](pointer_tag, int v) { return TestFactoryHelper::construct2(v); }))
+        .def(py::init([](unique_ptr_tag, std::string v) { return TestFactoryHelper::construct2(v); }))
+        .def(py::init([](move_tag) { return TestFactoryHelper::construct2(); }))
+        .def_readwrite("value", &TestFactory2::value)
+        ;
+
+    // Stateful & reused:
+    int c = 1;
+    auto c4a = [c](pointer_tag, TF4_tag, int a) { (void) c; return new TestFactory4(a);};
+
+    // test_init_factory_basic, test_init_factory_casting
+    py::class_<TestFactory3, std::shared_ptr<TestFactory3>>(m, "TestFactory3")
+        .def(py::init([](pointer_tag, int v) { return TestFactoryHelper::construct3(v); }))
+        .def(py::init([](shared_ptr_tag) { return TestFactoryHelper::construct3(); }))
+        .def("__init__", [](TestFactory3 &self, std::string v) { new (&self) TestFactory3(v); }) // placement-new ctor
+
+        // factories returning a derived type:
+        .def(py::init(c4a)) // derived ptr
+        .def(py::init([](pointer_tag, TF5_tag, int a) { return new TestFactory5(a); }))
+        // derived shared ptr:
+        .def(py::init([](shared_ptr_tag, TF4_tag, int a) { return std::make_shared<TestFactory4>(a); }))
+        .def(py::init([](shared_ptr_tag, TF5_tag, int a) { return std::make_shared<TestFactory5>(a); }))
+
+        // Returns nullptr:
+        .def(py::init([](null_ptr_tag) { return (TestFactory3 *) nullptr; }))
+
+        .def_readwrite("value", &TestFactory3::value)
+        ;
+
+    // test_init_factory_casting
+    py::class_<TestFactory4, TestFactory3, std::shared_ptr<TestFactory4>>(m, "TestFactory4")
+        .def(py::init(c4a)) // pointer
+        ;
+
+    // Doesn't need to be registered, but registering makes getting ConstructorStats easier:
+    py::class_<TestFactory5, TestFactory3, std::shared_ptr<TestFactory5>>(m, "TestFactory5");
+
+    // test_init_factory_alias
+    // Alias testing
+    py::class_<TestFactory6, PyTF6>(m, "TestFactory6")
+        .def(py::init([](base_tag, int i) { return TestFactory6(i); }))
+        .def(py::init([](alias_tag, int i) { return PyTF6(i); }))
+        .def(py::init([](alias_tag, std::string s) { return PyTF6(s); }))
+        .def(py::init([](alias_tag, pointer_tag, int i) { return new PyTF6(i); }))
+        .def(py::init([](base_tag, pointer_tag, int i) { return new TestFactory6(i); }))
+        .def(py::init([](base_tag, alias_tag, pointer_tag, int i) { return (TestFactory6 *) new PyTF6(i); }))
+
+        .def("get", &TestFactory6::get)
+        .def("has_alias", &TestFactory6::has_alias)
+
+        .def_static("get_cstats", &ConstructorStats::get<TestFactory6>, py::return_value_policy::reference)
+        .def_static("get_alias_cstats", &ConstructorStats::get<PyTF6>, py::return_value_policy::reference)
+        ;
+
+    // test_init_factory_dual
+    // Separate alias constructor testing
+    py::class_<TestFactory7, PyTF7, std::shared_ptr<TestFactory7>>(m, "TestFactory7")
+        .def(py::init(
+            [](int i) { return TestFactory7(i); },
+            [](int i) { return PyTF7(i); }))
+        .def(py::init(
+            [](pointer_tag, int i) { return new TestFactory7(i); },
+            [](pointer_tag, int i) { return new PyTF7(i); }))
+        .def(py::init(
+            [](mixed_tag, int i) { return new TestFactory7(i); },
+            [](mixed_tag, int i) { return PyTF7(i); }))
+        .def(py::init(
+            [](mixed_tag, std::string s) { return TestFactory7((int) s.size()); },
+            [](mixed_tag, std::string s) { return new PyTF7((int) s.size()); }))
+        .def(py::init(
+            [](base_tag, pointer_tag, int i) { return new TestFactory7(i); },
+            [](base_tag, pointer_tag, int i) { return (TestFactory7 *) new PyTF7(i); }))
+        .def(py::init(
+            [](alias_tag, pointer_tag, int i) { return new PyTF7(i); },
+            [](alias_tag, pointer_tag, int i) { return new PyTF7(10*i); }))
+        .def(py::init(
+            [](shared_ptr_tag, base_tag, int i) { return std::make_shared<TestFactory7>(i); },
+            [](shared_ptr_tag, base_tag, int i) { auto *p = new PyTF7(i); return std::shared_ptr<TestFactory7>(p); }))
+        .def(py::init(
+            [](shared_ptr_tag, invalid_base_tag, int i) { return std::make_shared<TestFactory7>(i); },
+            [](shared_ptr_tag, invalid_base_tag, int i) { return std::make_shared<TestFactory7>(i); })) // <-- invalid alias factory
+
+        .def("get", &TestFactory7::get)
+        .def("has_alias", &TestFactory7::has_alias)
+
+        .def_static("get_cstats", &ConstructorStats::get<TestFactory7>, py::return_value_policy::reference)
+        .def_static("get_alias_cstats", &ConstructorStats::get<PyTF7>, py::return_value_policy::reference)
+        ;
+
+    // test_no_placement_new
+    // Class with a custom new operator but *without* a placement new operator (issue #948)
+    class NoPlacementNew {
+    public:
+        NoPlacementNew(int i) : i(i) { }
+        static void *operator new(std::size_t s) {
+            auto *p = ::operator new(s);
+            py::print("operator new called, returning", reinterpret_cast<uintptr_t>(p));
+            return p;
+        }
+        static void operator delete(void *p) {
+            py::print("operator delete called on", reinterpret_cast<uintptr_t>(p));
+            ::operator delete(p);
+        }
+        int i;
+    };
+    // As of 2.2, `py::init<args>` no longer requires placement new
+    py::class_<NoPlacementNew>(m, "NoPlacementNew")
+        .def(py::init<int>())
+        .def(py::init([]() { return new NoPlacementNew(100); }))
+        .def_readwrite("i", &NoPlacementNew::i)
+        ;
+
+
+    // test_reallocations
+    // Class that has verbose operator_new/operator_delete calls
+    struct NoisyAlloc {
+        NoisyAlloc(int i) { py::print(py::str("NoisyAlloc(int {})").format(i)); }
+        NoisyAlloc(double d) { py::print(py::str("NoisyAlloc(double {})").format(d)); }
+        ~NoisyAlloc() { py::print("~NoisyAlloc()"); }
+
+        static void *operator new(size_t s) { py::print("noisy new"); return ::operator new(s); }
+        static void *operator new(size_t, void *p) { py::print("noisy placement new"); return p; }
+        static void operator delete(void *p, size_t) { py::print("noisy delete"); ::operator delete(p); }
+        static void operator delete(void *, void *) { py::print("noisy placement delete"); }
+#if defined(_MSC_VER) && _MSC_VER < 1910
+        // MSVC 2015 bug: the above "noisy delete" isn't invoked (fixed in MSVC 2017)
+        static void operator delete(void *p) { py::print("noisy delete"); ::operator delete(p); }
+#endif
+    };
+    py::class_<NoisyAlloc>(m, "NoisyAlloc")
+        // Since these overloads have the same number of arguments, the dispatcher will try each of
+        // them until the arguments convert.  Thus we can get a pre-allocation here when passing a
+        // single non-integer:
+        .def("__init__", [](NoisyAlloc *a, int i) { new (a) NoisyAlloc(i); }) // Regular constructor, runs first, requires preallocation
+        .def(py::init([](double d) { return new NoisyAlloc(d); }))
+
+        // The two-argument version: first the factory pointer overload.
+        .def(py::init([](int i, int) { return new NoisyAlloc(i); }))
+        // Return-by-value:
+        .def(py::init([](double d, int) { return NoisyAlloc(d); }))
+        // Old-style placement new init; requires preallocation
+        .def("__init__", [](NoisyAlloc &a, double d, double) { new (&a) NoisyAlloc(d); })
+        // Requires deallocation of previous overload preallocated value:
+        .def(py::init([](int i, double) { return new NoisyAlloc(i); }))
+        // Regular again: requires yet another preallocation
+        .def("__init__", [](NoisyAlloc &a, int i, std::string) { new (&a) NoisyAlloc(i); })
+        ;
+
+
+
+
+    // static_assert testing (the following def's should all fail with appropriate compilation errors):
+#if 0
+    struct BadF1Base {};
+    struct BadF1 : BadF1Base {};
+    struct PyBadF1 : BadF1 {};
+    py::class_<BadF1, PyBadF1, std::shared_ptr<BadF1>> bf1(m, "BadF1");
+    // wrapped factory function must return a compatible pointer, holder, or value
+    bf1.def(py::init([]() { return 3; }));
+    // incompatible factory function pointer return type
+    bf1.def(py::init([]() { static int three = 3; return &three; }));
+    // incompatible factory function std::shared_ptr<T> return type: cannot convert shared_ptr<T> to holder
+    // (non-polymorphic base)
+    bf1.def(py::init([]() { return std::shared_ptr<BadF1Base>(new BadF1()); }));
+#endif
+}
diff --git a/pybind11/tests/test_factory_constructors.py b/pybind11/tests/test_factory_constructors.py
new file mode 100644
index 000000000..78a3910ad
--- /dev/null
+++ b/pybind11/tests/test_factory_constructors.py
@@ -0,0 +1,459 @@
+import pytest
+import re
+
+from pybind11_tests import factory_constructors as m
+from pybind11_tests.factory_constructors import tag
+from pybind11_tests import ConstructorStats
+
+
+def test_init_factory_basic():
+    """Tests the py::init() factory wrapper around various ways of returning the object"""
+
+    cstats = [ConstructorStats.get(c) for c in [m.TestFactory1, m.TestFactory2, m.TestFactory3]]
+    cstats[0].alive()  # force gc
+    n_inst = ConstructorStats.detail_reg_inst()
+
+    x1 = m.TestFactory1(tag.unique_ptr, 3)
+    assert x1.value == "3"
+    y1 = m.TestFactory1(tag.pointer)
+    assert y1.value == "(empty)"
+    z1 = m.TestFactory1("hi!")
+    assert z1.value == "hi!"
+
+    assert ConstructorStats.detail_reg_inst() == n_inst + 3
+
+    x2 = m.TestFactory2(tag.move)
+    assert x2.value == "(empty2)"
+    y2 = m.TestFactory2(tag.pointer, 7)
+    assert y2.value == "7"
+    z2 = m.TestFactory2(tag.unique_ptr, "hi again")
+    assert z2.value == "hi again"
+
+    assert ConstructorStats.detail_reg_inst() == n_inst + 6
+
+    x3 = m.TestFactory3(tag.shared_ptr)
+    assert x3.value == "(empty3)"
+    y3 = m.TestFactory3(tag.pointer, 42)
+    assert y3.value == "42"
+    z3 = m.TestFactory3("bye")
+    assert z3.value == "bye"
+
+    with pytest.raises(TypeError) as excinfo:
+        m.TestFactory3(tag.null_ptr)
+    assert str(excinfo.value) == "pybind11::init(): factory function returned nullptr"
+
+    assert [i.alive() for i in cstats] == [3, 3, 3]
+    assert ConstructorStats.detail_reg_inst() == n_inst + 9
+
+    del x1, y2, y3, z3
+    assert [i.alive() for i in cstats] == [2, 2, 1]
+    assert ConstructorStats.detail_reg_inst() == n_inst + 5
+    del x2, x3, y1, z1, z2
+    assert [i.alive() for i in cstats] == [0, 0, 0]
+    assert ConstructorStats.detail_reg_inst() == n_inst
+
+    assert [i.values() for i in cstats] == [
+        ["3", "hi!"],
+        ["7", "hi again"],
+        ["42", "bye"]
+    ]
+    assert [i.default_constructions for i in cstats] == [1, 1, 1]
+
+
+def test_init_factory_signature(msg):
+    with pytest.raises(TypeError) as excinfo:
+        m.TestFactory1("invalid", "constructor", "arguments")
+    assert msg(excinfo.value) == """
+        __init__(): incompatible constructor arguments. The following argument types are supported:
+            1. m.factory_constructors.TestFactory1(arg0: m.factory_constructors.tag.unique_ptr_tag, arg1: int)
+            2. m.factory_constructors.TestFactory1(arg0: str)
+            3. m.factory_constructors.TestFactory1(arg0: m.factory_constructors.tag.pointer_tag)
+            4. m.factory_constructors.TestFactory1(arg0: handle, arg1: int, arg2: handle)
+
+        Invoked with: 'invalid', 'constructor', 'arguments'
+    """  # noqa: E501 line too long
+
+    assert msg(m.TestFactory1.__init__.__doc__) == """
+        __init__(*args, **kwargs)
+        Overloaded function.
+
+        1. __init__(self: m.factory_constructors.TestFactory1, arg0: m.factory_constructors.tag.unique_ptr_tag, arg1: int) -> None
+
+        2. __init__(self: m.factory_constructors.TestFactory1, arg0: str) -> None
+
+        3. __init__(self: m.factory_constructors.TestFactory1, arg0: m.factory_constructors.tag.pointer_tag) -> None
+
+        4. __init__(self: m.factory_constructors.TestFactory1, arg0: handle, arg1: int, arg2: handle) -> None
+    """  # noqa: E501 line too long
+
+
+def test_init_factory_casting():
+    """Tests the py::init() factory wrapper with various upcasting and downcasting returns"""
+
+    cstats = [ConstructorStats.get(c) for c in [m.TestFactory3, m.TestFactory4, m.TestFactory5]]
+    cstats[0].alive()  # force gc
+    n_inst = ConstructorStats.detail_reg_inst()
+
+    # Construction from derived references:
+    a = m.TestFactory3(tag.pointer, tag.TF4, 4)
+    assert a.value == "4"
+    b = m.TestFactory3(tag.shared_ptr, tag.TF4, 5)
+    assert b.value == "5"
+    c = m.TestFactory3(tag.pointer, tag.TF5, 6)
+    assert c.value == "6"
+    d = m.TestFactory3(tag.shared_ptr, tag.TF5, 7)
+    assert d.value == "7"
+
+    assert ConstructorStats.detail_reg_inst() == n_inst + 4
+
+    # Shared a lambda with TF3:
+    e = m.TestFactory4(tag.pointer, tag.TF4, 8)
+    assert e.value == "8"
+
+    assert ConstructorStats.detail_reg_inst() == n_inst + 5
+    assert [i.alive() for i in cstats] == [5, 3, 2]
+
+    del a
+    assert [i.alive() for i in cstats] == [4, 2, 2]
+    assert ConstructorStats.detail_reg_inst() == n_inst + 4
+
+    del b, c, e
+    assert [i.alive() for i in cstats] == [1, 0, 1]
+    assert ConstructorStats.detail_reg_inst() == n_inst + 1
+
+    del d
+    assert [i.alive() for i in cstats] == [0, 0, 0]
+    assert ConstructorStats.detail_reg_inst() == n_inst
+
+    assert [i.values() for i in cstats] == [
+        ["4", "5", "6", "7", "8"],
+        ["4", "5", "8"],
+        ["6", "7"]
+    ]
+
+
+def test_init_factory_alias():
+    """Tests the py::init() factory wrapper with value conversions and alias types"""
+
+    cstats = [m.TestFactory6.get_cstats(), m.TestFactory6.get_alias_cstats()]
+    cstats[0].alive()  # force gc
+    n_inst = ConstructorStats.detail_reg_inst()
+
+    a = m.TestFactory6(tag.base, 1)
+    assert a.get() == 1
+    assert not a.has_alias()
+    b = m.TestFactory6(tag.alias, "hi there")
+    assert b.get() == 8
+    assert b.has_alias()
+    c = m.TestFactory6(tag.alias, 3)
+    assert c.get() == 3
+    assert c.has_alias()
+    d = m.TestFactory6(tag.alias, tag.pointer, 4)
+    assert d.get() == 4
+    assert d.has_alias()
+    e = m.TestFactory6(tag.base, tag.pointer, 5)
+    assert e.get() == 5
+    assert not e.has_alias()
+    f = m.TestFactory6(tag.base, tag.alias, tag.pointer, 6)
+    assert f.get() == 6
+    assert f.has_alias()
+
+    assert ConstructorStats.detail_reg_inst() == n_inst + 6
+    assert [i.alive() for i in cstats] == [6, 4]
+
+    del a, b, e
+    assert [i.alive() for i in cstats] == [3, 3]
+    assert ConstructorStats.detail_reg_inst() == n_inst + 3
+    del f, c, d
+    assert [i.alive() for i in cstats] == [0, 0]
+    assert ConstructorStats.detail_reg_inst() == n_inst
+
+    class MyTest(m.TestFactory6):
+        def __init__(self, *args):
+            m.TestFactory6.__init__(self, *args)
+
+        def get(self):
+            return -5 + m.TestFactory6.get(self)
+
+    # Return Class by value, moved into new alias:
+    z = MyTest(tag.base, 123)
+    assert z.get() == 118
+    assert z.has_alias()
+
+    # Return alias by value, moved into new alias:
+    y = MyTest(tag.alias, "why hello!")
+    assert y.get() == 5
+    assert y.has_alias()
+
+    # Return Class by pointer, moved into new alias then original destroyed:
+    x = MyTest(tag.base, tag.pointer, 47)
+    assert x.get() == 42
+    assert x.has_alias()
+
+    assert ConstructorStats.detail_reg_inst() == n_inst + 3
+    assert [i.alive() for i in cstats] == [3, 3]
+    del x, y, z
+    assert [i.alive() for i in cstats] == [0, 0]
+    assert ConstructorStats.detail_reg_inst() == n_inst
+
+    assert [i.values() for i in cstats] == [
+        ["1", "8", "3", "4", "5", "6", "123", "10", "47"],
+        ["hi there", "3", "4", "6", "move", "123", "why hello!", "move", "47"]
+    ]
+
+
+def test_init_factory_dual():
+    """Tests init factory functions with dual main/alias factory functions"""
+    from pybind11_tests.factory_constructors import TestFactory7
+
+    cstats = [TestFactory7.get_cstats(), TestFactory7.get_alias_cstats()]
+    cstats[0].alive()  # force gc
+    n_inst = ConstructorStats.detail_reg_inst()
+
+    class PythFactory7(TestFactory7):
+        def get(self):
+            return 100 + TestFactory7.get(self)
+
+    a1 = TestFactory7(1)
+    a2 = PythFactory7(2)
+    assert a1.get() == 1
+    assert a2.get() == 102
+    assert not a1.has_alias()
+    assert a2.has_alias()
+
+    b1 = TestFactory7(tag.pointer, 3)
+    b2 = PythFactory7(tag.pointer, 4)
+    assert b1.get() == 3
+    assert b2.get() == 104
+    assert not b1.has_alias()
+    assert b2.has_alias()
+
+    c1 = TestFactory7(tag.mixed, 5)
+    c2 = PythFactory7(tag.mixed, 6)
+    assert c1.get() == 5
+    assert c2.get() == 106
+    assert not c1.has_alias()
+    assert c2.has_alias()
+
+    d1 = TestFactory7(tag.base, tag.pointer, 7)
+    d2 = PythFactory7(tag.base, tag.pointer, 8)
+    assert d1.get() == 7
+    assert d2.get() == 108
+    assert not d1.has_alias()
+    assert d2.has_alias()
+
+    # Both return an alias; the second multiplies the value by 10:
+    e1 = TestFactory7(tag.alias, tag.pointer, 9)
+    e2 = PythFactory7(tag.alias, tag.pointer, 10)
+    assert e1.get() == 9
+    assert e2.get() == 200
+    assert e1.has_alias()
+    assert e2.has_alias()
+
+    f1 = TestFactory7(tag.shared_ptr, tag.base, 11)
+    f2 = PythFactory7(tag.shared_ptr, tag.base, 12)
+    assert f1.get() == 11
+    assert f2.get() == 112
+    assert not f1.has_alias()
+    assert f2.has_alias()
+
+    g1 = TestFactory7(tag.shared_ptr, tag.invalid_base, 13)
+    assert g1.get() == 13
+    assert not g1.has_alias()
+    with pytest.raises(TypeError) as excinfo:
+        PythFactory7(tag.shared_ptr, tag.invalid_base, 14)
+    assert (str(excinfo.value) ==
+            "pybind11::init(): construction failed: returned holder-wrapped instance is not an "
+            "alias instance")
+
+    assert [i.alive() for i in cstats] == [13, 7]
+    assert ConstructorStats.detail_reg_inst() == n_inst + 13
+
+    del a1, a2, b1, d1, e1, e2
+    assert [i.alive() for i in cstats] == [7, 4]
+    assert ConstructorStats.detail_reg_inst() == n_inst + 7
+    del b2, c1, c2, d2, f1, f2, g1
+    assert [i.alive() for i in cstats] == [0, 0]
+    assert ConstructorStats.detail_reg_inst() == n_inst
+
+    assert [i.values() for i in cstats] == [
+        ["1", "2", "3", "4", "5", "6", "7", "8", "9", "100", "11", "12", "13", "14"],
+        ["2", "4", "6", "8", "9", "100", "12"]
+    ]
+
+
+def test_no_placement_new(capture):
+    """Prior to 2.2, `py::init<...>` relied on the type supporting placement
+    new; this tests a class without placement new support."""
+    with capture:
+        a = m.NoPlacementNew(123)
+
+    found = re.search(r'^operator new called, returning (\d+)\n$', str(capture))
+    assert found
+    assert a.i == 123
+    with capture:
+        del a
+        pytest.gc_collect()
+    assert capture == "operator delete called on " + found.group(1)
+
+    with capture:
+        b = m.NoPlacementNew()
+
+    found = re.search(r'^operator new called, returning (\d+)\n$', str(capture))
+    assert found
+    assert b.i == 100
+    with capture:
+        del b
+        pytest.gc_collect()
+    assert capture == "operator delete called on " + found.group(1)
+
+
+def test_multiple_inheritance():
+    class MITest(m.TestFactory1, m.TestFactory2):
+        def __init__(self):
+            m.TestFactory1.__init__(self, tag.unique_ptr, 33)
+            m.TestFactory2.__init__(self, tag.move)
+
+    a = MITest()
+    assert m.TestFactory1.value.fget(a) == "33"
+    assert m.TestFactory2.value.fget(a) == "(empty2)"
+
+
+def create_and_destroy(*args):
+    a = m.NoisyAlloc(*args)
+    print("---")  # separator: output above is from construction, below from destruction
+    del a
+    pytest.gc_collect()  # presumably forces a GC pass so destruction output appears before return
+
+
+def strip_comments(s):
+    return re.sub(r'\s+#.*', '', s)  # drop whitespace followed by '#' through end of line
+
+
+def test_reallocations(capture, msg):
+    """When the constructor is overloaded, previous overloads can require a preallocated value.
+    This test makes sure that such preallocated values only happen when they might be necessary,
+    and that they are deallocated properly"""
+
+    pytest.gc_collect()
+
+    with capture:
+        create_and_destroy(1)
+    assert msg(capture) == """
+        noisy new
+        noisy placement new
+        NoisyAlloc(int 1)
+        ---
+        ~NoisyAlloc()
+        noisy delete
+    """
+    with capture:
+        create_and_destroy(1.5)
+    assert msg(capture) == strip_comments("""
+        noisy new               # allocation required to attempt first overload
+        noisy delete            # have to dealloc before considering factory init overload
+        noisy new               # pointer factory calling "new", part 1: allocation
+        NoisyAlloc(double 1.5)  # ... part two, invoking constructor
+        ---
+        ~NoisyAlloc()  # Destructor
+        noisy delete   # operator delete
+    """)
+
+    with capture:
+        create_and_destroy(2, 3)
+    assert msg(capture) == strip_comments("""
+        noisy new          # pointer factory calling "new", allocation
+        NoisyAlloc(int 2)  # constructor
+        ---
+        ~NoisyAlloc()  # Destructor
+        noisy delete   # operator delete
+    """)
+
+    with capture:
+        create_and_destroy(2.5, 3)
+    assert msg(capture) == strip_comments("""
+        NoisyAlloc(double 2.5)  # construction (local func variable: operator_new not called)
+        noisy new               # return-by-value "new" part 1: allocation
+        ~NoisyAlloc()           # moved-away local func variable destruction
+        ---
+        ~NoisyAlloc()  # Destructor
+        noisy delete   # operator delete
+    """)
+
+    with capture:
+        create_and_destroy(3.5, 4.5)
+    assert msg(capture) == strip_comments("""
+        noisy new               # preallocation needed before invoking placement-new overload
+        noisy placement new     # Placement new
+        NoisyAlloc(double 3.5)  # construction
+        ---
+        ~NoisyAlloc()  # Destructor
+        noisy delete   # operator delete
+    """)
+
+    with capture:
+        create_and_destroy(4, 0.5)
+    assert msg(capture) == strip_comments("""
+        noisy new          # preallocation needed before invoking placement-new overload
+        noisy delete       # deallocation of preallocated storage
+        noisy new          # Factory pointer allocation
+        NoisyAlloc(int 4)  # factory pointer construction
+        ---
+        ~NoisyAlloc()  # Destructor
+        noisy delete   # operator delete
+    """)
+
+    with capture:
+        create_and_destroy(5, "hi")
+    assert msg(capture) == strip_comments("""
+        noisy new            # preallocation needed before invoking first placement new
+        noisy delete         # delete before considering new-style constructor
+        noisy new            # preallocation for second placement new
+        noisy placement new  # Placement new in the second placement new overload
+        NoisyAlloc(int 5)    # construction
+        ---
+        ~NoisyAlloc()  # Destructor
+        noisy delete   # operator delete
+    """)
+
+
+@pytest.unsupported_on_py2
+def test_invalid_self():
+    """Tests invocation of the pybind-registered base class with an invalid `self` argument.  You
+    can only actually do this on Python 3: Python 2 raises an exception itself if you try."""
+    class NotPybindDerived(object):
+        pass
+
+    # Attempts to initialize with an invalid type passed as `self`:
+    class BrokenTF1(m.TestFactory1):
+        def __init__(self, bad):
+            if bad == 1:
+                a = m.TestFactory2(tag.pointer, 1)
+                m.TestFactory1.__init__(a, tag.pointer)
+            elif bad == 2:
+                a = NotPybindDerived()
+                m.TestFactory1.__init__(a, tag.pointer)
+
+    # Same as above, but for a class with an alias:
+    class BrokenTF6(m.TestFactory6):
+        def __init__(self, bad):
+            if bad == 1:
+                a = m.TestFactory2(tag.pointer, 1)
+                m.TestFactory6.__init__(a, tag.base, 1)
+            elif bad == 2:
+                a = m.TestFactory2(tag.pointer, 1)
+                m.TestFactory6.__init__(a, tag.alias, 1)
+            elif bad == 3:
+                m.TestFactory6.__init__(NotPybindDerived.__new__(NotPybindDerived), tag.base, 1)
+            elif bad == 4:
+                m.TestFactory6.__init__(NotPybindDerived.__new__(NotPybindDerived), tag.alias, 1)
+
+    for arg in (1, 2):
+        with pytest.raises(TypeError) as excinfo:
+            BrokenTF1(arg)
+        assert str(excinfo.value) == "__init__(self, ...) called with invalid `self` argument"
+
+    for arg in (1, 2, 3, 4):
+        with pytest.raises(TypeError) as excinfo:
+            BrokenTF6(arg)
+        assert str(excinfo.value) == "__init__(self, ...) called with invalid `self` argument"
diff --git a/pybind11/tests/test_inheritance.cpp b/pybind11/tests/test_inheritance.cpp
deleted file mode 100644
index 2ec0b4a7a..000000000
--- a/pybind11/tests/test_inheritance.cpp
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
-    tests/test_inheritance.cpp -- inheritance, automatic upcasting for polymorphic types
-
-    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
-
-    All rights reserved. Use of this source code is governed by a
-    BSD-style license that can be found in the LICENSE file.
-*/
-
-#include "pybind11_tests.h"
-
-class Pet {
-public:
-    Pet(const std::string &name, const std::string &species)
-        : m_name(name), m_species(species) {}
-    std::string name() const { return m_name; }
-    std::string species() const { return m_species; }
-private:
-    std::string m_name;
-    std::string m_species;
-};
-
-class Dog : public Pet {
-public:
-    Dog(const std::string &name) : Pet(name, "dog") {}
-    std::string bark() const { return "Woof!"; }
-};
-
-class Rabbit : public Pet {
-public:
-    Rabbit(const std::string &name) : Pet(name, "parrot") {}
-};
-
-class Hamster : public Pet {
-public:
-    Hamster(const std::string &name) : Pet(name, "rodent") {}
-};
-
-std::string pet_name_species(const Pet &pet) {
-    return pet.name() + " is a " + pet.species();
-}
-
-std::string dog_bark(const Dog &dog) {
-    return dog.bark();
-}
-
-
-struct BaseClass { virtual ~BaseClass() {} };
-struct DerivedClass1 : BaseClass { };
-struct DerivedClass2 : BaseClass { };
-
-test_initializer inheritance([](py::module &m) {
-    py::class_<Pet> pet_class(m, "Pet");
-    pet_class
-        .def(py::init<std::string, std::string>())
-        .def("name", &Pet::name)
-        .def("species", &Pet::species);
-
-    /* One way of declaring a subclass relationship: reference parent's class_ object */
-    py::class_<Dog>(m, "Dog", pet_class)
-        .def(py::init<std::string>());
-
-    /* Another way of declaring a subclass relationship: reference parent's C++ type */
-    py::class_<Rabbit, Pet>(m, "Rabbit")
-        .def(py::init<std::string>());
-
-    /* And another: list parent in class template arguments */
-    py::class_<Hamster, Pet>(m, "Hamster")
-        .def(py::init<std::string>());
-
-    m.def("pet_name_species", pet_name_species);
-    m.def("dog_bark", dog_bark);
-
-    py::class_<BaseClass>(m, "BaseClass").def(py::init<>());
-    py::class_<DerivedClass1>(m, "DerivedClass1").def(py::init<>());
-    py::class_<DerivedClass2>(m, "DerivedClass2").def(py::init<>());
-
-    m.def("return_class_1", []() -> BaseClass* { return new DerivedClass1(); });
-    m.def("return_class_2", []() -> BaseClass* { return new DerivedClass2(); });
-    m.def("return_class_n", [](int n) -> BaseClass* {
-        if (n == 1) return new DerivedClass1();
-        if (n == 2) return new DerivedClass2();
-        return new BaseClass();
-    });
-    m.def("return_none", []() -> BaseClass* { return nullptr; });
-
-    m.def("test_isinstance", [](py::list l) {
-        struct Unregistered { }; // checks missing type_info code path
-
-        return py::make_tuple(
-            py::isinstance<py::tuple>(l[0]),
-            py::isinstance<py::dict>(l[1]),
-            py::isinstance<Pet>(l[2]),
-            py::isinstance<Pet>(l[3]),
-            py::isinstance<Dog>(l[4]),
-            py::isinstance<Rabbit>(l[5]),
-            py::isinstance<Unregistered>(l[6])
-        );
-    });
-});
diff --git a/pybind11/tests/test_inheritance.py b/pybind11/tests/test_inheritance.py
deleted file mode 100644
index 7bb52be02..000000000
--- a/pybind11/tests/test_inheritance.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import pytest
-
-
-def test_inheritance(msg):
-    from pybind11_tests import Pet, Dog, Rabbit, Hamster, dog_bark, pet_name_species
-
-    roger = Rabbit('Rabbit')
-    assert roger.name() + " is a " + roger.species() == "Rabbit is a parrot"
-    assert pet_name_species(roger) == "Rabbit is a parrot"
-
-    polly = Pet('Polly', 'parrot')
-    assert polly.name() + " is a " + polly.species() == "Polly is a parrot"
-    assert pet_name_species(polly) == "Polly is a parrot"
-
-    molly = Dog('Molly')
-    assert molly.name() + " is a " + molly.species() == "Molly is a dog"
-    assert pet_name_species(molly) == "Molly is a dog"
-
-    fred = Hamster('Fred')
-    assert fred.name() + " is a " + fred.species() == "Fred is a rodent"
-
-    assert dog_bark(molly) == "Woof!"
-
-    with pytest.raises(TypeError) as excinfo:
-        dog_bark(polly)
-    assert msg(excinfo.value) == """
-        dog_bark(): incompatible function arguments. The following argument types are supported:
-            1. (arg0: m.Dog) -> str
-
-        Invoked with: <m.Pet object at 0>
-    """
-
-
-def test_automatic_upcasting():
-    from pybind11_tests import return_class_1, return_class_2, return_class_n, return_none
-
-    assert type(return_class_1()).__name__ == "DerivedClass1"
-    assert type(return_class_2()).__name__ == "DerivedClass2"
-    assert type(return_none()).__name__ == "NoneType"
-    # Repeat these a few times in a random order to ensure no invalid caching is applied
-    assert type(return_class_n(1)).__name__ == "DerivedClass1"
-    assert type(return_class_n(2)).__name__ == "DerivedClass2"
-    assert type(return_class_n(0)).__name__ == "BaseClass"
-    assert type(return_class_n(2)).__name__ == "DerivedClass2"
-    assert type(return_class_n(2)).__name__ == "DerivedClass2"
-    assert type(return_class_n(0)).__name__ == "BaseClass"
-    assert type(return_class_n(1)).__name__ == "DerivedClass1"
-
-
-def test_isinstance():
-    from pybind11_tests import test_isinstance, Pet, Dog
-
-    objects = [tuple(), dict(), Pet("Polly", "parrot")] + [Dog("Molly")] * 4
-    expected = (True, True, True, True, True, False, False)
-    assert test_isinstance(objects) == expected
diff --git a/pybind11/tests/test_iostream.cpp b/pybind11/tests/test_iostream.cpp
new file mode 100644
index 000000000..e67f88af5
--- /dev/null
+++ b/pybind11/tests/test_iostream.cpp
@@ -0,0 +1,73 @@
+/*
+    tests/test_iostream.cpp -- Usage of scoped_ostream_redirect
+
+    Copyright (c) 2017 Henry F. Schreiner
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+
+#include <pybind11/iostream.h>
+#include "pybind11_tests.h"
+#include <iostream>
+
+
+void noisy_function(std::string msg, bool flush) {
+
+    std::cout << msg;
+    if (flush)
+        std::cout << std::flush;
+}
+
+void noisy_funct_dual(std::string msg, std::string emsg) {
+    std::cout << msg;
+    std::cerr << emsg;
+}
+
+TEST_SUBMODULE(iostream, m) {
+
+    add_ostream_redirect(m);
+
+    // Bindings exercised by the tests in test_iostream.py
+
+    m.def("captured_output_default", [](std::string msg) {
+        py::scoped_ostream_redirect redir;
+        std::cout << msg << std::flush;
+    });
+
+    m.def("captured_output", [](std::string msg) {
+        py::scoped_ostream_redirect redir(std::cout, py::module::import("sys").attr("stdout"));
+        std::cout << msg << std::flush;
+    });
+
+    m.def("guard_output", &noisy_function,
+            py::call_guard<py::scoped_ostream_redirect>(),
+            py::arg("msg"), py::arg("flush")=true);
+
+    m.def("captured_err", [](std::string msg) {
+        py::scoped_ostream_redirect redir(std::cerr, py::module::import("sys").attr("stderr"));
+        std::cerr << msg << std::flush;
+    });
+
+    m.def("noisy_function", &noisy_function, py::arg("msg"), py::arg("flush") = true);
+
+    m.def("dual_guard", &noisy_funct_dual,
+            py::call_guard<py::scoped_ostream_redirect, py::scoped_estream_redirect>(),
+            py::arg("msg"), py::arg("emsg"));
+
+    m.def("raw_output", [](std::string msg) {
+        std::cout << msg << std::flush;
+    });
+
+    m.def("raw_err", [](std::string msg) {
+        std::cerr << msg << std::flush;
+    });
+
+    m.def("captured_dual", [](std::string msg, std::string emsg) {
+        py::scoped_ostream_redirect redirout(std::cout, py::module::import("sys").attr("stdout"));
+        py::scoped_ostream_redirect redirerr(std::cerr, py::module::import("sys").attr("stderr"));
+        std::cout << msg << std::flush;
+        std::cerr << emsg << std::flush;
+    });
+}
diff --git a/pybind11/tests/test_iostream.py b/pybind11/tests/test_iostream.py
new file mode 100644
index 000000000..3364849a4
--- /dev/null
+++ b/pybind11/tests/test_iostream.py
@@ -0,0 +1,203 @@
+from pybind11_tests import iostream as m
+import sys
+
+from contextlib import contextmanager
+
+try:
+    # Python 3
+    from io import StringIO
+except ImportError:
+    # Python 2
+    try:
+        from cStringIO import StringIO
+    except ImportError:
+        from StringIO import StringIO
+
+try:
+    # Python 3.4+ provides contextlib.redirect_stdout
+    from contextlib import redirect_stdout
+except ImportError:
+    @contextmanager
+    def redirect_stdout(target):
+        original = sys.stdout
+        sys.stdout = target
+        yield
+        sys.stdout = original
+
+try:
+    # Python 3.5+ provides contextlib.redirect_stderr
+    from contextlib import redirect_stderr
+except ImportError:
+    @contextmanager
+    def redirect_stderr(target):
+        original = sys.stderr
+        sys.stderr = target
+        yield
+        sys.stderr = original
+
+
+def test_captured(capsys):
+    msg = "I've been redirected to Python, I hope!"
+    m.captured_output(msg)
+    stdout, stderr = capsys.readouterr()
+    assert stdout == msg
+    assert stderr == ''
+
+    m.captured_output_default(msg)
+    stdout, stderr = capsys.readouterr()
+    assert stdout == msg
+    assert stderr == ''
+
+    m.captured_err(msg)
+    stdout, stderr = capsys.readouterr()
+    assert stdout == ''
+    assert stderr == msg
+
+
+def test_guard_capture(capsys):
+    msg = "I've been redirected to Python, I hope!"
+    m.guard_output(msg)
+    stdout, stderr = capsys.readouterr()
+    assert stdout == msg
+    assert stderr == ''
+
+
+def test_series_captured(capture):
+    with capture:
+        m.captured_output("a")
+        m.captured_output("b")
+    assert capture == "ab"
+
+
+def test_flush(capfd):
+    msg = "(not flushed)"
+    msg2 = "(flushed)"
+
+    with m.ostream_redirect():
+        m.noisy_function(msg, flush=False)
+        stdout, stderr = capfd.readouterr()
+        assert stdout == ''
+
+        m.noisy_function(msg2, flush=True)
+        stdout, stderr = capfd.readouterr()
+        assert stdout == msg + msg2
+
+        m.noisy_function(msg, flush=False)
+
+    stdout, stderr = capfd.readouterr()
+    assert stdout == msg
+
+
+def test_not_captured(capfd):
+    msg = "Something that should not show up in log"
+    stream = StringIO()
+    with redirect_stdout(stream):
+        m.raw_output(msg)
+    stdout, stderr = capfd.readouterr()
+    assert stdout == msg
+    assert stderr == ''
+    assert stream.getvalue() == ''
+
+    stream = StringIO()
+    with redirect_stdout(stream):
+        m.captured_output(msg)
+    stdout, stderr = capfd.readouterr()
+    assert stdout == ''
+    assert stderr == ''
+    assert stream.getvalue() == msg
+
+
+def test_err(capfd):
+    msg = "Something that should not show up in log"
+    stream = StringIO()
+    with redirect_stderr(stream):
+        m.raw_err(msg)
+    stdout, stderr = capfd.readouterr()
+    assert stdout == ''
+    assert stderr == msg
+    assert stream.getvalue() == ''
+
+    stream = StringIO()
+    with redirect_stderr(stream):
+        m.captured_err(msg)
+    stdout, stderr = capfd.readouterr()
+    assert stdout == ''
+    assert stderr == ''
+    assert stream.getvalue() == msg
+
+
+def test_multi_captured(capfd):
+    stream = StringIO()
+    with redirect_stdout(stream):
+        m.captured_output("a")
+        m.raw_output("b")
+        m.captured_output("c")
+        m.raw_output("d")
+    stdout, stderr = capfd.readouterr()
+    assert stdout == 'bd'
+    assert stream.getvalue() == 'ac'
+
+
+def test_dual(capsys):
+    m.captured_dual("a", "b")
+    stdout, stderr = capsys.readouterr()
+    assert stdout == "a"
+    assert stderr == "b"
+
+
+def test_redirect(capfd):
+    msg = "Should not be in log!"
+    stream = StringIO()
+    with redirect_stdout(stream):
+        m.raw_output(msg)
+    stdout, stderr = capfd.readouterr()
+    assert stdout == msg
+    assert stream.getvalue() == ''
+
+    stream = StringIO()
+    with redirect_stdout(stream):
+        with m.ostream_redirect():
+            m.raw_output(msg)
+    stdout, stderr = capfd.readouterr()
+    assert stdout == ''
+    assert stream.getvalue() == msg
+
+    stream = StringIO()
+    with redirect_stdout(stream):
+        m.raw_output(msg)
+    stdout, stderr = capfd.readouterr()
+    assert stdout == msg
+    assert stream.getvalue() == ''
+
+
+def test_redirect_err(capfd):
+    msg = "StdOut"
+    msg2 = "StdErr"
+
+    stream = StringIO()
+    with redirect_stderr(stream):
+        with m.ostream_redirect(stdout=False):
+            m.raw_output(msg)
+            m.raw_err(msg2)
+    stdout, stderr = capfd.readouterr()
+    assert stdout == msg
+    assert stderr == ''
+    assert stream.getvalue() == msg2
+
+
+def test_redirect_both(capfd):
+    msg = "StdOut"
+    msg2 = "StdErr"
+
+    stream = StringIO()
+    stream2 = StringIO()
+    with redirect_stdout(stream):
+        with redirect_stderr(stream2):
+            with m.ostream_redirect():
+                m.raw_output(msg)
+                m.raw_err(msg2)
+    stdout, stderr = capfd.readouterr()
+    assert stdout == ''
+    assert stderr == ''
+    assert stream.getvalue() == msg
+    assert stream2.getvalue() == msg2
diff --git a/pybind11/tests/test_issues.cpp b/pybind11/tests/test_issues.cpp
deleted file mode 100644
index 4c59a1b12..000000000
--- a/pybind11/tests/test_issues.cpp
+++ /dev/null
@@ -1,401 +0,0 @@
-/*
-    tests/test_issues.cpp -- collection of testcases for miscellaneous issues
-
-    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
-
-    All rights reserved. Use of this source code is governed by a
-    BSD-style license that can be found in the LICENSE file.
-*/
-
-#include "pybind11_tests.h"
-#include "constructor_stats.h"
-#include <pybind11/stl.h>
-#include <pybind11/operators.h>
-#include <pybind11/complex.h>
-
-#define TRACKERS(CLASS) CLASS() { print_default_created(this); } ~CLASS() { print_destroyed(this); }
-struct NestABase { int value = -2; TRACKERS(NestABase) };
-struct NestA : NestABase { int value = 3; NestA& operator+=(int i) { value += i; return *this; } TRACKERS(NestA) };
-struct NestB { NestA a; int value = 4; NestB& operator-=(int i) { value -= i; return *this; } TRACKERS(NestB) };
-struct NestC { NestB b; int value = 5; NestC& operator*=(int i) { value *= i; return *this; } TRACKERS(NestC) };
-
-/// #393
-class OpTest1 {};
-class OpTest2 {};
-
-OpTest1 operator+(const OpTest1 &, const OpTest1 &) {
-    py::print("Add OpTest1 with OpTest1");
-    return OpTest1();
-}
-OpTest2 operator+(const OpTest2 &, const OpTest2 &) {
-    py::print("Add OpTest2 with OpTest2");
-    return OpTest2();
-}
-OpTest2 operator+(const OpTest2 &, const OpTest1 &) {
-    py::print("Add OpTest2 with OpTest1");
-    return OpTest2();
-}
-
-// #461
-class Dupe1 {
-public:
-    Dupe1(int v) : v_{v} {}
-    int get_value() const { return v_; }
-private:
-    int v_;
-};
-class Dupe2 {};
-class Dupe3 {};
-class DupeException : public std::runtime_error {};
-
-// #478
-template <typename T> class custom_unique_ptr {
-public:
-    custom_unique_ptr() { print_default_created(this); }
-    custom_unique_ptr(T *ptr) : _ptr{ptr} { print_created(this, ptr); }
-    custom_unique_ptr(custom_unique_ptr<T> &&move) : _ptr{move._ptr} { move._ptr = nullptr; print_move_created(this); }
-    custom_unique_ptr &operator=(custom_unique_ptr<T> &&move) { print_move_assigned(this); if (_ptr) destruct_ptr(); _ptr = move._ptr; move._ptr = nullptr; return *this; }
-    custom_unique_ptr(const custom_unique_ptr<T> &) = delete;
-    void operator=(const custom_unique_ptr<T> &copy) = delete;
-    ~custom_unique_ptr() { print_destroyed(this); if (_ptr) destruct_ptr(); }
-private:
-    T *_ptr = nullptr;
-    void destruct_ptr() { delete _ptr; }
-};
-PYBIND11_DECLARE_HOLDER_TYPE(T, custom_unique_ptr<T>);
-
-/// Issue #528: templated constructor
-struct TplConstrClass {
-    template <typename T> TplConstrClass(const T &arg) : str{arg} {}
-    std::string str;
-    bool operator==(const TplConstrClass &t) const { return t.str == str; }
-};
-namespace std {
-template <> struct hash<TplConstrClass> { size_t operator()(const TplConstrClass &t) const { return std::hash<std::string>()(t.str); } };
-}
-
-
-void init_issues(py::module &m) {
-    py::module m2 = m.def_submodule("issues");
-
-#if !defined(_MSC_VER)
-    // Visual Studio 2015 currently cannot compile this test
-    // (see the comment in type_caster_base::make_copy_constructor)
-    // #70 compilation issue if operator new is not public
-    class NonConstructible { private: void *operator new(size_t bytes) throw(); };
-    py::class_<NonConstructible>(m, "Foo");
-    m2.def("getstmt", []() -> NonConstructible * { return nullptr; },
-        py::return_value_policy::reference);
-#endif
-
-    // #137: const char* isn't handled properly
-    m2.def("print_cchar", [](const char *s) { return std::string(s); });
-
-    // #150: char bindings broken
-    m2.def("print_char", [](char c) { return std::string(1, c); });
-
-    // #159: virtual function dispatch has problems with similar-named functions
-    struct Base { virtual std::string dispatch() const {
-        /* for some reason MSVC2015 can't compile this if the function is pure virtual */
-        return {};
-    }; };
-
-    struct DispatchIssue : Base {
-        virtual std::string dispatch() const {
-            PYBIND11_OVERLOAD_PURE(std::string, Base, dispatch, /* no arguments */);
-        }
-    };
-
-    py::class_<Base, DispatchIssue>(m2, "DispatchIssue")
-        .def(py::init<>())
-        .def("dispatch", &Base::dispatch);
-
-    m2.def("dispatch_issue_go", [](const Base * b) { return b->dispatch(); });
-
-    struct Placeholder { int i; Placeholder(int i) : i(i) { } };
-
-    py::class_<Placeholder>(m2, "Placeholder")
-        .def(py::init<int>())
-        .def("__repr__", [](const Placeholder &p) { return "Placeholder[" + std::to_string(p.i) + "]"; });
-
-    // #171: Can't return reference wrappers (or STL datastructures containing them)
-    m2.def("return_vec_of_reference_wrapper", [](std::reference_wrapper<Placeholder> p4) {
-        Placeholder *p1 = new Placeholder{1};
-        Placeholder *p2 = new Placeholder{2};
-        Placeholder *p3 = new Placeholder{3};
-        std::vector<std::reference_wrapper<Placeholder>> v;
-        v.push_back(std::ref(*p1));
-        v.push_back(std::ref(*p2));
-        v.push_back(std::ref(*p3));
-        v.push_back(p4);
-        return v;
-    });
-
-    // #181: iterator passthrough did not compile
-    m2.def("iterator_passthrough", [](py::iterator s) -> py::iterator {
-        return py::make_iterator(std::begin(s), std::end(s));
-    });
-
-    // #187: issue involving std::shared_ptr<> return value policy & garbage collection
-    struct ElementBase { virtual void foo() { } /* Force creation of virtual table */ };
-    struct ElementA : ElementBase {
-        ElementA(int v) : v(v) { }
-        int value() { return v; }
-        int v;
-    };
-
-    struct ElementList {
-        void add(std::shared_ptr<ElementBase> e) { l.push_back(e); }
-        std::vector<std::shared_ptr<ElementBase>> l;
-    };
-
-    py::class_<ElementBase, std::shared_ptr<ElementBase>> (m2, "ElementBase");
-
-    py::class_<ElementA, ElementBase, std::shared_ptr<ElementA>>(m2, "ElementA")
-        .def(py::init<int>())
-        .def("value", &ElementA::value);
-
-    py::class_<ElementList, std::shared_ptr<ElementList>>(m2, "ElementList")
-        .def(py::init<>())
-        .def("add", &ElementList::add)
-        .def("get", [](ElementList &el) {
-            py::list list;
-            for (auto &e : el.l)
-                list.append(py::cast(e));
-            return list;
-        });
-
-    // (no id): should not be able to pass 'None' to a reference argument
-    m2.def("get_element", [](ElementA &el) { return el.value(); });
-
-    // (no id): don't cast doubles to ints
-    m2.def("expect_float", [](float f) { return f; });
-    m2.def("expect_int", [](int i) { return i; });
-
-    try {
-        py::class_<Placeholder>(m2, "Placeholder");
-        throw std::logic_error("Expected an exception!");
-    } catch (std::runtime_error &) {
-        /* All good */
-    }
-
-    // Issue #283: __str__ called on uninitialized instance when constructor arguments invalid
-    class StrIssue {
-    public:
-        StrIssue(int i) : val{i} {}
-        StrIssue() : StrIssue(-1) {}
-        int value() const { return val; }
-    private:
-        int val;
-    };
-    py::class_<StrIssue> si(m2, "StrIssue");
-    si  .def(py::init<int>())
-        .def(py::init<>())
-        .def("__str__", [](const StrIssue &si) { return "StrIssue[" + std::to_string(si.value()) + "]"; })
-        ;
-
-    // Issue #328: first member in a class can't be used in operators
-    py::class_<NestABase>(m2, "NestABase").def(py::init<>()).def_readwrite("value", &NestABase::value);
-    py::class_<NestA>(m2, "NestA").def(py::init<>()).def(py::self += int())
-        .def("as_base", [](NestA &a) -> NestABase& { return (NestABase&) a; }, py::return_value_policy::reference_internal);
-    py::class_<NestB>(m2, "NestB").def(py::init<>()).def(py::self -= int()).def_readwrite("a", &NestB::a);
-    py::class_<NestC>(m2, "NestC").def(py::init<>()).def(py::self *= int()).def_readwrite("b", &NestC::b);
-    m2.def("get_NestA", [](const NestA &a) { return a.value; });
-    m2.def("get_NestB", [](const NestB &b) { return b.value; });
-    m2.def("get_NestC", [](const NestC &c) { return c.value; });
-
-    // Issue 389: r_v_p::move should fall-through to copy on non-movable objects
-    class MoveIssue1 {
-    public:
-        MoveIssue1(int v) : v{v} {}
-        MoveIssue1(const MoveIssue1 &c) { v = c.v; }
-        MoveIssue1(MoveIssue1 &&) = delete;
-        int v;
-    };
-    class MoveIssue2 {
-    public:
-        MoveIssue2(int v) : v{v} {}
-        MoveIssue2(MoveIssue2 &&) = default;
-        int v;
-    };
-    py::class_<MoveIssue1>(m2, "MoveIssue1").def(py::init<int>()).def_readwrite("value", &MoveIssue1::v);
-    py::class_<MoveIssue2>(m2, "MoveIssue2").def(py::init<int>()).def_readwrite("value", &MoveIssue2::v);
-    m2.def("get_moveissue1", [](int i) -> MoveIssue1 * { return new MoveIssue1(i); }, py::return_value_policy::move);
-    m2.def("get_moveissue2", [](int i) { return MoveIssue2(i); }, py::return_value_policy::move);
-
-    // Issues 392/397: overridding reference-returning functions
-    class OverrideTest {
-    public:
-        struct A { std::string value = "hi"; };
-        std::string v;
-        A a;
-        explicit OverrideTest(const std::string &v) : v{v} {}
-        virtual std::string str_value() { return v; }
-        virtual std::string &str_ref() { return v; }
-        virtual A A_value() { return a; }
-        virtual A &A_ref() { return a; }
-    };
-    class PyOverrideTest : public OverrideTest {
-    public:
-        using OverrideTest::OverrideTest;
-        std::string str_value() override { PYBIND11_OVERLOAD(std::string, OverrideTest, str_value); }
-        // Not allowed (uncommenting should hit a static_assert failure): we can't get a reference
-        // to a python numeric value, since we only copy values in the numeric type caster:
-//      std::string &str_ref() override { PYBIND11_OVERLOAD(std::string &, OverrideTest, str_ref); }
-        // But we can work around it like this:
-    private:
-        std::string _tmp;
-        std::string str_ref_helper() { PYBIND11_OVERLOAD(std::string, OverrideTest, str_ref); }
-    public:
-        std::string &str_ref() override { return _tmp = str_ref_helper(); }
-
-        A A_value() override { PYBIND11_OVERLOAD(A, OverrideTest, A_value); }
-        A &A_ref() override { PYBIND11_OVERLOAD(A &, OverrideTest, A_ref); }
-    };
-    py::class_<OverrideTest::A>(m2, "OverrideTest_A")
-        .def_readwrite("value", &OverrideTest::A::value);
-    py::class_<OverrideTest, PyOverrideTest>(m2, "OverrideTest")
-        .def(py::init<const std::string &>())
-        .def("str_value", &OverrideTest::str_value)
-//      .def("str_ref", &OverrideTest::str_ref)
-        .def("A_value", &OverrideTest::A_value)
-        .def("A_ref", &OverrideTest::A_ref);
-
-    /// Issue 393: need to return NotSupported to ensure correct arithmetic operator behavior
-    py::class_<OpTest1>(m2, "OpTest1")
-        .def(py::init<>())
-        .def(py::self + py::self);
-
-    py::class_<OpTest2>(m2, "OpTest2")
-        .def(py::init<>())
-        .def(py::self + py::self)
-        .def("__add__", [](const OpTest2& c2, const OpTest1& c1) { return c2 + c1; })
-        .def("__radd__", [](const OpTest2& c2, const OpTest1& c1) { return c2 + c1; });
-
-    // Issue 388: Can't make iterators via make_iterator() with different r/v policies
-    static std::vector<int> list = { 1, 2, 3 };
-    m2.def("make_iterator_1", []() { return py::make_iterator<py::return_value_policy::copy>(list); });
-    m2.def("make_iterator_2", []() { return py::make_iterator<py::return_value_policy::automatic>(list); });
-
-    static std::vector<std::string> nothrows;
-    // Issue 461: registering two things with the same name:
-    py::class_<Dupe1>(m2, "Dupe1")
-        .def("get_value", &Dupe1::get_value)
-        ;
-    m2.def("dupe1_factory", [](int v) { return new Dupe1(v); });
-
-    py::class_<Dupe2>(m2, "Dupe2");
-    py::exception<DupeException>(m2, "DupeException");
-
-    try {
-        m2.def("Dupe1", [](int v) { return new Dupe1(v); });
-        nothrows.emplace_back("Dupe1");
-    }
-    catch (std::runtime_error &) {}
-    try {
-        py::class_<Dupe3>(m2, "dupe1_factory");
-        nothrows.emplace_back("dupe1_factory");
-    }
-    catch (std::runtime_error &) {}
-    try {
-        py::exception<Dupe3>(m2, "Dupe2");
-        nothrows.emplace_back("Dupe2");
-    }
-    catch (std::runtime_error &) {}
-    try {
-        m2.def("DupeException", []() { return 30; });
-        nothrows.emplace_back("DupeException1");
-    }
-    catch (std::runtime_error &) {}
-    try {
-        py::class_<DupeException>(m2, "DupeException");
-        nothrows.emplace_back("DupeException2");
-    }
-    catch (std::runtime_error &) {}
-    m2.def("dupe_exception_failures", []() {
-        py::list l;
-        for (auto &e : nothrows) l.append(py::cast(e));
-        return l;
-    });
-
-    /// Issue #471: shared pointer instance not dellocated
-    class SharedChild : public std::enable_shared_from_this<SharedChild> {
-    public:
-        SharedChild() { print_created(this); }
-        ~SharedChild() { print_destroyed(this); }
-    };
-
-    class SharedParent {
-    public:
-        SharedParent() : child(std::make_shared<SharedChild>()) { }
-        const SharedChild &get_child() const { return *child; }
-
-    private:
-        std::shared_ptr<SharedChild> child;
-    };
-
-    py::class_<SharedChild, std::shared_ptr<SharedChild>>(m, "SharedChild");
-    py::class_<SharedParent, std::shared_ptr<SharedParent>>(m, "SharedParent")
-        .def(py::init<>())
-        .def("get_child", &SharedParent::get_child, py::return_value_policy::reference);
-
-    /// Issue/PR #478: unique ptrs constructed and freed without destruction
-    class SpecialHolderObj {
-    public:
-        int val = 0;
-        SpecialHolderObj *ch = nullptr;
-        SpecialHolderObj(int v, bool make_child = true) : val{v}, ch{make_child ? new SpecialHolderObj(val+1, false) : nullptr}
-        { print_created(this, val); }
-        ~SpecialHolderObj() { delete ch; print_destroyed(this); }
-        SpecialHolderObj *child() { return ch; }
-    };
-
-    py::class_<SpecialHolderObj, custom_unique_ptr<SpecialHolderObj>>(m, "SpecialHolderObj")
-        .def(py::init<int>())
-        .def("child", &SpecialHolderObj::child, pybind11::return_value_policy::reference_internal)
-        .def_readwrite("val", &SpecialHolderObj::val)
-        .def_static("holder_cstats", &ConstructorStats::get<custom_unique_ptr<SpecialHolderObj>>,
-                py::return_value_policy::reference);
-
-    /// Issue #484: number conversion generates unhandled exceptions
-    m2.def("test_complex", [](float x) { py::print("{}"_s.format(x)); });
-    m2.def("test_complex", [](std::complex<float> x) { py::print("({}, {})"_s.format(x.real(), x.imag())); });
-
-    /// Issue #511: problem with inheritance + overwritten def_static
-    struct MyBase {
-        static std::unique_ptr<MyBase> make() {
-            return std::unique_ptr<MyBase>(new MyBase());
-        }
-    };
-
-    struct MyDerived : MyBase {
-        static std::unique_ptr<MyDerived> make() {
-            return std::unique_ptr<MyDerived>(new MyDerived());
-        }
-    };
-
-    py::class_<MyBase>(m2, "MyBase")
-        .def_static("make", &MyBase::make);
-
-    py::class_<MyDerived, MyBase>(m2, "MyDerived")
-        .def_static("make", &MyDerived::make)
-        .def_static("make2", &MyDerived::make);
-
-    py::dict d;
-    std::string bar = "bar";
-    d["str"] = bar;
-    d["num"] = 3.7;
-
-    /// Issue #528: templated constructor
-    m2.def("tpl_constr_vector", [](std::vector<TplConstrClass> &) {});
-    m2.def("tpl_constr_map", [](std::unordered_map<TplConstrClass, TplConstrClass> &) {});
-    m2.def("tpl_constr_set", [](std::unordered_set<TplConstrClass> &) {});
-#if defined(PYBIND11_HAS_OPTIONAL)
-    m2.def("tpl_constr_optional", [](std::optional<TplConstrClass> &) {});
-#elif defined(PYBIND11_HAS_EXP_OPTIONAL)
-    m2.def("tpl_constr_optional", [](std::experimental::optional<TplConstrClass> &) {});
-#endif
-}
-
-// MSVC workaround: trying to use a lambda here crashes MSCV
-test_initializer issues(&init_issues);
diff --git a/pybind11/tests/test_issues.py b/pybind11/tests/test_issues.py
deleted file mode 100644
index e60b5ca90..000000000
--- a/pybind11/tests/test_issues.py
+++ /dev/null
@@ -1,251 +0,0 @@
-import pytest
-from pybind11_tests import ConstructorStats
-
-
-def test_regressions():
-    from pybind11_tests.issues import print_cchar, print_char
-
-    # #137: const char* isn't handled properly
-    assert print_cchar("const char *") == "const char *"
-    # #150: char bindings broken
-    assert print_char("c") == "c"
-
-
-def test_dispatch_issue(msg):
-    """#159: virtual function dispatch has problems with similar-named functions"""
-    from pybind11_tests.issues import DispatchIssue, dispatch_issue_go
-
-    class PyClass1(DispatchIssue):
-        def dispatch(self):
-            return "Yay.."
-
-    class PyClass2(DispatchIssue):
-        def dispatch(self):
-            with pytest.raises(RuntimeError) as excinfo:
-                super(PyClass2, self).dispatch()
-            assert msg(excinfo.value) == 'Tried to call pure virtual function "Base::dispatch"'
-
-            p = PyClass1()
-            return dispatch_issue_go(p)
-
-    b = PyClass2()
-    assert dispatch_issue_go(b) == "Yay.."
-
-
-def test_reference_wrapper():
-    """#171: Can't return reference wrappers (or STL data structures containing them)"""
-    from pybind11_tests.issues import Placeholder, return_vec_of_reference_wrapper
-
-    assert str(return_vec_of_reference_wrapper(Placeholder(4))) == \
-        "[Placeholder[1], Placeholder[2], Placeholder[3], Placeholder[4]]"
-
-
-def test_iterator_passthrough():
-    """#181: iterator passthrough did not compile"""
-    from pybind11_tests.issues import iterator_passthrough
-
-    assert list(iterator_passthrough(iter([3, 5, 7, 9, 11, 13, 15]))) == [3, 5, 7, 9, 11, 13, 15]
-
-
-def test_shared_ptr_gc():
-    """// #187: issue involving std::shared_ptr<> return value policy & garbage collection"""
-    from pybind11_tests.issues import ElementList, ElementA
-
-    el = ElementList()
-    for i in range(10):
-        el.add(ElementA(i))
-    pytest.gc_collect()
-    for i, v in enumerate(el.get()):
-        assert i == v.value()
-
-
-def test_no_id(msg):
-    from pybind11_tests.issues import get_element, expect_float, expect_int
-
-    with pytest.raises(TypeError) as excinfo:
-        get_element(None)
-    assert msg(excinfo.value) == """
-        get_element(): incompatible function arguments. The following argument types are supported:
-            1. (arg0: m.issues.ElementA) -> int
-
-        Invoked with: None
-    """
-
-    with pytest.raises(TypeError) as excinfo:
-        expect_int(5.2)
-    assert msg(excinfo.value) == """
-        expect_int(): incompatible function arguments. The following argument types are supported:
-            1. (arg0: int) -> int
-
-        Invoked with: 5.2
-    """
-    assert expect_float(12) == 12
-
-
-def test_str_issue(msg):
-    """Issue #283: __str__ called on uninitialized instance when constructor arguments invalid"""
-    from pybind11_tests.issues import StrIssue
-
-    assert str(StrIssue(3)) == "StrIssue[3]"
-
-    with pytest.raises(TypeError) as excinfo:
-        str(StrIssue("no", "such", "constructor"))
-    assert msg(excinfo.value) == """
-        __init__(): incompatible constructor arguments. The following argument types are supported:
-            1. m.issues.StrIssue(arg0: int)
-            2. m.issues.StrIssue()
-
-        Invoked with: 'no', 'such', 'constructor'
-    """
-
-
-def test_nested():
-    """ #328: first member in a class can't be used in operators"""
-    from pybind11_tests.issues import NestA, NestB, NestC, get_NestA, get_NestB, get_NestC
-
-    a = NestA()
-    b = NestB()
-    c = NestC()
-
-    a += 10
-    assert get_NestA(a) == 13
-    b.a += 100
-    assert get_NestA(b.a) == 103
-    c.b.a += 1000
-    assert get_NestA(c.b.a) == 1003
-    b -= 1
-    assert get_NestB(b) == 3
-    c.b -= 3
-    assert get_NestB(c.b) == 1
-    c *= 7
-    assert get_NestC(c) == 35
-
-    abase = a.as_base()
-    assert abase.value == -2
-    a.as_base().value += 44
-    assert abase.value == 42
-    assert c.b.a.as_base().value == -2
-    c.b.a.as_base().value += 44
-    assert c.b.a.as_base().value == 42
-
-    del c
-    pytest.gc_collect()
-    del a  # Should't delete while abase is still alive
-    pytest.gc_collect()
-
-    assert abase.value == 42
-    del abase, b
-    pytest.gc_collect()
-
-
-def test_move_fallback():
-    from pybind11_tests.issues import get_moveissue1, get_moveissue2
-    m2 = get_moveissue2(2)
-    assert m2.value == 2
-    m1 = get_moveissue1(1)
-    assert m1.value == 1
-
-
-def test_override_ref():
-    from pybind11_tests.issues import OverrideTest
-    o = OverrideTest("asdf")
-
-    # Not allowed (see associated .cpp comment)
-    # i = o.str_ref()
-    # assert o.str_ref() == "asdf"
-    assert o.str_value() == "asdf"
-
-    assert o.A_value().value == "hi"
-    a = o.A_ref()
-    assert a.value == "hi"
-    a.value = "bye"
-    assert a.value == "bye"
-
-
-def test_operators_notimplemented(capture):
-    from pybind11_tests.issues import OpTest1, OpTest2
-    with capture:
-        c1, c2 = OpTest1(), OpTest2()
-        c1 + c1
-        c2 + c2
-        c2 + c1
-        c1 + c2
-    assert capture == """
-        Add OpTest1 with OpTest1
-        Add OpTest2 with OpTest2
-        Add OpTest2 with OpTest1
-        Add OpTest2 with OpTest1
-    """
-
-
-def test_iterator_rvpolicy():
-    """ Issue 388: Can't make iterators via make_iterator() with different r/v policies """
-    from pybind11_tests.issues import make_iterator_1
-    from pybind11_tests.issues import make_iterator_2
-
-    assert list(make_iterator_1()) == [1, 2, 3]
-    assert list(make_iterator_2()) == [1, 2, 3]
-    assert not isinstance(make_iterator_1(), type(make_iterator_2()))
-
-
-def test_dupe_assignment():
-    """ Issue 461: overwriting a class with a function """
-    from pybind11_tests.issues import dupe_exception_failures
-    assert dupe_exception_failures() == []
-
-
-def test_enable_shared_from_this_with_reference_rvp():
-    """ Issue #471: shared pointer instance not dellocated """
-    from pybind11_tests import SharedParent, SharedChild
-
-    parent = SharedParent()
-    child = parent.get_child()
-
-    cstats = ConstructorStats.get(SharedChild)
-    assert cstats.alive() == 1
-    del child, parent
-    assert cstats.alive() == 0
-
-
-def test_non_destructed_holders():
-    """ Issue #478: unique ptrs constructed and freed without destruction """
-    from pybind11_tests import SpecialHolderObj
-
-    a = SpecialHolderObj(123)
-    b = a.child()
-
-    assert a.val == 123
-    assert b.val == 124
-
-    cstats = SpecialHolderObj.holder_cstats()
-    assert cstats.alive() == 1
-    del b
-    assert cstats.alive() == 1
-    del a
-    assert cstats.alive() == 0
-
-
-def test_complex_cast(capture):
-    """ Issue #484: number conversion generates unhandled exceptions """
-    from pybind11_tests.issues import test_complex
-
-    with capture:
-        test_complex(1)
-        test_complex(2j)
-
-    assert capture == """
-        1.0
-        (0.0, 2.0)
-    """
-
-
-def test_inheritance_override_def_static():
-    from pybind11_tests.issues import MyBase, MyDerived
-
-    b = MyBase.make()
-    d1 = MyDerived.make2()
-    d2 = MyDerived.make()
-
-    assert isinstance(b, MyBase)
-    assert isinstance(d1, MyDerived)
-    assert isinstance(d2, MyDerived)
diff --git a/pybind11/tests/test_keep_alive.cpp b/pybind11/tests/test_keep_alive.cpp
deleted file mode 100644
index cd62a02e8..000000000
--- a/pybind11/tests/test_keep_alive.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
-    tests/test_keep_alive.cpp -- keep_alive modifier (pybind11's version
-    of Boost.Python's with_custodian_and_ward / with_custodian_and_ward_postcall)
-
-    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
-
-    All rights reserved. Use of this source code is governed by a
-    BSD-style license that can be found in the LICENSE file.
-*/
-
-#include "pybind11_tests.h"
-
-class Child {
-public:
-    Child() { py::print("Allocating child."); }
-    ~Child() { py::print("Releasing child."); }
-};
-
-class Parent {
-public:
-    Parent() { py::print("Allocating parent."); }
-    ~Parent() { py::print("Releasing parent."); }
-    void addChild(Child *) { }
-    Child *returnChild() { return new Child(); }
-    Child *returnNullChild() { return nullptr; }
-};
-
-test_initializer keep_alive([](py::module &m) {
-    py::class_<Parent>(m, "Parent")
-        .def(py::init<>())
-        .def("addChild", &Parent::addChild)
-        .def("addChildKeepAlive", &Parent::addChild, py::keep_alive<1, 2>())
-        .def("returnChild", &Parent::returnChild)
-        .def("returnChildKeepAlive", &Parent::returnChild, py::keep_alive<1, 0>())
-        .def("returnNullChildKeepAliveChild", &Parent::returnNullChild, py::keep_alive<1, 0>())
-        .def("returnNullChildKeepAliveParent", &Parent::returnNullChild, py::keep_alive<0, 1>());
-
-    py::class_<Child>(m, "Child")
-        .def(py::init<>());
-});
diff --git a/pybind11/tests/test_keep_alive.py b/pybind11/tests/test_keep_alive.py
deleted file mode 100644
index bfd7d40c3..000000000
--- a/pybind11/tests/test_keep_alive.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import pytest
-
-
-def test_keep_alive_argument(capture):
-    from pybind11_tests import Parent, Child
-
-    with capture:
-        p = Parent()
-    assert capture == "Allocating parent."
-    with capture:
-        p.addChild(Child())
-        pytest.gc_collect()
-    assert capture == """
-        Allocating child.
-        Releasing child.
-    """
-    with capture:
-        del p
-        pytest.gc_collect()
-    assert capture == "Releasing parent."
-
-    with capture:
-        p = Parent()
-    assert capture == "Allocating parent."
-    with capture:
-        p.addChildKeepAlive(Child())
-        pytest.gc_collect()
-    assert capture == "Allocating child."
-    with capture:
-        del p
-        pytest.gc_collect()
-    assert capture == """
-        Releasing parent.
-        Releasing child.
-    """
-
-
-def test_keep_alive_return_value(capture):
-    from pybind11_tests import Parent
-
-    with capture:
-        p = Parent()
-    assert capture == "Allocating parent."
-    with capture:
-        p.returnChild()
-        pytest.gc_collect()
-    assert capture == """
-        Allocating child.
-        Releasing child.
-    """
-    with capture:
-        del p
-        pytest.gc_collect()
-    assert capture == "Releasing parent."
-
-    with capture:
-        p = Parent()
-    assert capture == "Allocating parent."
-    with capture:
-        p.returnChildKeepAlive()
-        pytest.gc_collect()
-    assert capture == "Allocating child."
-    with capture:
-        del p
-        pytest.gc_collect()
-    assert capture == """
-        Releasing parent.
-        Releasing child.
-    """
-
-
-def test_return_none(capture):
-    from pybind11_tests import Parent
-
-    with capture:
-        p = Parent()
-    assert capture == "Allocating parent."
-    with capture:
-        p.returnNullChildKeepAliveChild()
-        pytest.gc_collect()
-    assert capture == ""
-    with capture:
-        del p
-        pytest.gc_collect()
-    assert capture == "Releasing parent."
-
-    with capture:
-        p = Parent()
-    assert capture == "Allocating parent."
-    with capture:
-        p.returnNullChildKeepAliveParent()
-        pytest.gc_collect()
-    assert capture == ""
-    with capture:
-        del p
-        pytest.gc_collect()
-    assert capture == "Releasing parent."
diff --git a/pybind11/tests/test_kwargs_and_defaults.cpp b/pybind11/tests/test_kwargs_and_defaults.cpp
index 24fc0cd5b..165f8017e 100644
--- a/pybind11/tests/test_kwargs_and_defaults.cpp
+++ b/pybind11/tests/test_kwargs_and_defaults.cpp
@@ -10,47 +10,62 @@
 #include "pybind11_tests.h"
 #include <pybind11/stl.h>
 
-std::string kw_func(int x, int y) { return "x=" + std::to_string(x) + ", y=" + std::to_string(y); }
-
-std::string kw_func4(const std::vector<int> &entries) {
-    std::string ret = "{";
-    for (int i : entries)
-        ret += std::to_string(i) + " ";
-    ret.back() = '}';
-    return ret;
-}
+TEST_SUBMODULE(kwargs_and_defaults, m) {
+    auto kw_func = [](int x, int y) { return "x=" + std::to_string(x) + ", y=" + std::to_string(y); };
 
-py::tuple args_function(py::args args) {
-    return args;
-}
+    // test_named_arguments
+    m.def("kw_func0", kw_func);
+    m.def("kw_func1", kw_func, py::arg("x"), py::arg("y"));
+    m.def("kw_func2", kw_func, py::arg("x") = 100, py::arg("y") = 200);
+    m.def("kw_func3", [](const char *) { }, py::arg("data") = std::string("Hello world!"));
 
-py::tuple args_kwargs_function(py::args args, py::kwargs kwargs) {
-    return py::make_tuple(args, kwargs);
-}
+    /* A fancier default argument */
+    std::vector<int> list{{13, 17}};
+    m.def("kw_func4", [](const std::vector<int> &entries) {
+        std::string ret = "{";
+        for (int i : entries)
+            ret += std::to_string(i) + " ";
+        ret.back() = '}';
+        return ret;
+    }, py::arg("myList") = list);
 
-struct KWClass {
-    void foo(int, float) {}
-};
+    m.def("kw_func_udl", kw_func, "x"_a, "y"_a=300);
+    m.def("kw_func_udl_z", kw_func, "x"_a, "y"_a=0);
 
-test_initializer arg_keywords_and_defaults([](py::module &m) {
-    m.def("kw_func0", &kw_func);
-    m.def("kw_func1", &kw_func, py::arg("x"), py::arg("y"));
-    m.def("kw_func2", &kw_func, py::arg("x") = 100, py::arg("y") = 200);
-    m.def("kw_func3", [](const char *) { }, py::arg("data") = std::string("Hello world!"));
+    // test_arg_and_kwargs
+    m.def("args_function", [](py::args args) -> py::tuple { return args; });
+    m.def("args_kwargs_function", [](py::args args, py::kwargs kwargs) {
+        return py::make_tuple(args, kwargs);
+    });
 
-    /* A fancier default argument */
-    std::vector<int> list;
-    list.push_back(13);
-    list.push_back(17);
-    m.def("kw_func4", &kw_func4, py::arg("myList") = list);
+    // test_mixed_args_and_kwargs
+    m.def("mixed_plus_args", [](int i, double j, py::args args) {
+        return py::make_tuple(i, j, args);
+    });
+    m.def("mixed_plus_kwargs", [](int i, double j, py::kwargs kwargs) {
+        return py::make_tuple(i, j, kwargs);
+    });
+    auto mixed_plus_both = [](int i, double j, py::args args, py::kwargs kwargs) {
+        return py::make_tuple(i, j, args, kwargs);
+    };
+    m.def("mixed_plus_args_kwargs", mixed_plus_both);
 
-    m.def("args_function", &args_function);
-    m.def("args_kwargs_function", &args_kwargs_function);
+    m.def("mixed_plus_args_kwargs_defaults", mixed_plus_both,
+            py::arg("i") = 1, py::arg("j") = 3.14159);
 
-    m.def("kw_func_udl", &kw_func, "x"_a, "y"_a=300);
-    m.def("kw_func_udl_z", &kw_func, "x"_a, "y"_a=0);
+    // pybind11 won't allow these to be bound: args and kwargs, if present, must be at the end.
+    // Uncomment these to test that the static_assert is indeed working:
+//    m.def("bad_args1", [](py::args, int) {});
+//    m.def("bad_args2", [](py::kwargs, int) {});
+//    m.def("bad_args3", [](py::kwargs, py::args) {});
+//    m.def("bad_args4", [](py::args, int, py::kwargs) {});
+//    m.def("bad_args5", [](py::args, py::kwargs, int) {});
+//    m.def("bad_args6", [](py::args, py::args) {});
+//    m.def("bad_args7", [](py::kwargs, py::kwargs) {});
 
+    // test_function_signatures (along with most of the above)
+    struct KWClass { void foo(int, float) {} };
     py::class_<KWClass>(m, "KWClass")
         .def("foo0", &KWClass::foo)
         .def("foo1", &KWClass::foo, "x"_a, "y"_a);
-});
+}
diff --git a/pybind11/tests/test_kwargs_and_defaults.py b/pybind11/tests/test_kwargs_and_defaults.py
index 852d03c6e..733fe8593 100644
--- a/pybind11/tests/test_kwargs_and_defaults.py
+++ b/pybind11/tests/test_kwargs_and_defaults.py
@@ -1,57 +1,107 @@
 import pytest
-from pybind11_tests import (kw_func0, kw_func1, kw_func2, kw_func3, kw_func4, args_function,
-                            args_kwargs_function, kw_func_udl, kw_func_udl_z, KWClass)
+from pybind11_tests import kwargs_and_defaults as m
 
 
 def test_function_signatures(doc):
-    assert doc(kw_func0) == "kw_func0(arg0: int, arg1: int) -> str"
-    assert doc(kw_func1) == "kw_func1(x: int, y: int) -> str"
-    assert doc(kw_func2) == "kw_func2(x: int=100, y: int=200) -> str"
-    assert doc(kw_func3) == "kw_func3(data: str='Hello world!') -> None"
-    assert doc(kw_func4) == "kw_func4(myList: List[int]=[13, 17]) -> str"
-    assert doc(kw_func_udl) == "kw_func_udl(x: int, y: int=300) -> str"
-    assert doc(kw_func_udl_z) == "kw_func_udl_z(x: int, y: int=0) -> str"
-    assert doc(args_function) == "args_function(*args) -> tuple"
-    assert doc(args_kwargs_function) == "args_kwargs_function(*args, **kwargs) -> tuple"
-    assert doc(KWClass.foo0) == "foo0(self: m.KWClass, arg0: int, arg1: float) -> None"
-    assert doc(KWClass.foo1) == "foo1(self: m.KWClass, x: int, y: float) -> None"
+    assert doc(m.kw_func0) == "kw_func0(arg0: int, arg1: int) -> str"
+    assert doc(m.kw_func1) == "kw_func1(x: int, y: int) -> str"
+    assert doc(m.kw_func2) == "kw_func2(x: int=100, y: int=200) -> str"
+    assert doc(m.kw_func3) == "kw_func3(data: str='Hello world!') -> None"
+    assert doc(m.kw_func4) == "kw_func4(myList: List[int]=[13, 17]) -> str"
+    assert doc(m.kw_func_udl) == "kw_func_udl(x: int, y: int=300) -> str"
+    assert doc(m.kw_func_udl_z) == "kw_func_udl_z(x: int, y: int=0) -> str"
+    assert doc(m.args_function) == "args_function(*args) -> tuple"
+    assert doc(m.args_kwargs_function) == "args_kwargs_function(*args, **kwargs) -> tuple"
+    assert doc(m.KWClass.foo0) == \
+        "foo0(self: m.kwargs_and_defaults.KWClass, arg0: int, arg1: float) -> None"
+    assert doc(m.KWClass.foo1) == \
+        "foo1(self: m.kwargs_and_defaults.KWClass, x: int, y: float) -> None"
 
 
 def test_named_arguments(msg):
-    assert kw_func0(5, 10) == "x=5, y=10"
+    assert m.kw_func0(5, 10) == "x=5, y=10"
 
-    assert kw_func1(5, 10) == "x=5, y=10"
-    assert kw_func1(5, y=10) == "x=5, y=10"
-    assert kw_func1(y=10, x=5) == "x=5, y=10"
+    assert m.kw_func1(5, 10) == "x=5, y=10"
+    assert m.kw_func1(5, y=10) == "x=5, y=10"
+    assert m.kw_func1(y=10, x=5) == "x=5, y=10"
 
-    assert kw_func2() == "x=100, y=200"
-    assert kw_func2(5) == "x=5, y=200"
-    assert kw_func2(x=5) == "x=5, y=200"
-    assert kw_func2(y=10) == "x=100, y=10"
-    assert kw_func2(5, 10) == "x=5, y=10"
-    assert kw_func2(x=5, y=10) == "x=5, y=10"
+    assert m.kw_func2() == "x=100, y=200"
+    assert m.kw_func2(5) == "x=5, y=200"
+    assert m.kw_func2(x=5) == "x=5, y=200"
+    assert m.kw_func2(y=10) == "x=100, y=10"
+    assert m.kw_func2(5, 10) == "x=5, y=10"
+    assert m.kw_func2(x=5, y=10) == "x=5, y=10"
 
     with pytest.raises(TypeError) as excinfo:
         # noinspection PyArgumentList
-        kw_func2(x=5, y=10, z=12)
-    assert msg(excinfo.value) == """
-        kw_func2(): incompatible function arguments. The following argument types are supported:
-            1. (x: int=100, y: int=200) -> str
-
-        Invoked with:
-    """
+        m.kw_func2(x=5, y=10, z=12)
+    assert excinfo.match(
+        r'(?s)^kw_func2\(\): incompatible.*Invoked with: kwargs: ((x=5|y=10|z=12)(, |$))' + '{3}$')
 
-    assert kw_func4() == "{13 17}"
-    assert kw_func4(myList=[1, 2, 3]) == "{1 2 3}"
+    assert m.kw_func4() == "{13 17}"
+    assert m.kw_func4(myList=[1, 2, 3]) == "{1 2 3}"
 
-    assert kw_func_udl(x=5, y=10) == "x=5, y=10"
-    assert kw_func_udl_z(x=5) == "x=5, y=0"
+    assert m.kw_func_udl(x=5, y=10) == "x=5, y=10"
+    assert m.kw_func_udl_z(x=5) == "x=5, y=0"
 
 
 def test_arg_and_kwargs():
     args = 'arg1_value', 'arg2_value', 3
-    assert args_function(*args) == args
+    assert m.args_function(*args) == args
 
     args = 'a1', 'a2'
     kwargs = dict(arg3='a3', arg4=4)
-    assert args_kwargs_function(*args, **kwargs) == (args, kwargs)
+    assert m.args_kwargs_function(*args, **kwargs) == (args, kwargs)
+
+
+def test_mixed_args_and_kwargs(msg):
+    mpa = m.mixed_plus_args
+    mpk = m.mixed_plus_kwargs
+    mpak = m.mixed_plus_args_kwargs
+    mpakd = m.mixed_plus_args_kwargs_defaults
+
+    assert mpa(1, 2.5, 4, 99.5, None) == (1, 2.5, (4, 99.5, None))
+    assert mpa(1, 2.5) == (1, 2.5, ())
+    with pytest.raises(TypeError) as excinfo:
+        assert mpa(1)
+    assert msg(excinfo.value) == """
+        mixed_plus_args(): incompatible function arguments. The following argument types are supported:
+            1. (arg0: int, arg1: float, *args) -> tuple
+
+        Invoked with: 1
+    """  # noqa: E501 line too long
+    with pytest.raises(TypeError) as excinfo:
+        assert mpa()
+    assert msg(excinfo.value) == """
+        mixed_plus_args(): incompatible function arguments. The following argument types are supported:
+            1. (arg0: int, arg1: float, *args) -> tuple
+
+        Invoked with:
+    """  # noqa: E501 line too long
+
+    assert mpk(-2, 3.5, pi=3.14159, e=2.71828) == (-2, 3.5, {'e': 2.71828, 'pi': 3.14159})
+    assert mpak(7, 7.7, 7.77, 7.777, 7.7777, minusseven=-7) == (
+        7, 7.7, (7.77, 7.777, 7.7777), {'minusseven': -7})
+    assert mpakd() == (1, 3.14159, (), {})
+    assert mpakd(3) == (3, 3.14159, (), {})
+    assert mpakd(j=2.71828) == (1, 2.71828, (), {})
+    assert mpakd(k=42) == (1, 3.14159, (), {'k': 42})
+    assert mpakd(1, 1, 2, 3, 5, 8, then=13, followedby=21) == (
+        1, 1, (2, 3, 5, 8), {'then': 13, 'followedby': 21})
+    # Arguments specified both positionally and via kwargs should fail:
+    with pytest.raises(TypeError) as excinfo:
+        assert mpakd(1, i=1)
+    assert msg(excinfo.value) == """
+        mixed_plus_args_kwargs_defaults(): incompatible function arguments. The following argument types are supported:
+            1. (i: int=1, j: float=3.14159, *args, **kwargs) -> tuple
+
+        Invoked with: 1; kwargs: i=1
+    """  # noqa: E501 line too long
+    with pytest.raises(TypeError) as excinfo:
+        assert mpakd(1, 2, j=1)
+    assert msg(excinfo.value) == """
+        mixed_plus_args_kwargs_defaults(): incompatible function arguments. The following argument types are supported:
+            1. (i: int=1, j: float=3.14159, *args, **kwargs) -> tuple
+
+        Invoked with: 1, 2; kwargs: j=1
+    """  # noqa: E501 line too long
diff --git a/pybind11/tests/test_local_bindings.cpp b/pybind11/tests/test_local_bindings.cpp
new file mode 100644
index 000000000..97c02dbeb
--- /dev/null
+++ b/pybind11/tests/test_local_bindings.cpp
@@ -0,0 +1,101 @@
+/*
+    tests/test_local_bindings.cpp -- tests the py::module_local class feature which makes a class
+                                     binding local to the module in which it is defined.
+
+    Copyright (c) 2017 Jason Rhinelander <jason@imaginary.ca>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include "local_bindings.h"
+#include <pybind11/stl.h>
+#include <pybind11/stl_bind.h>
+#include <numeric>
+
+TEST_SUBMODULE(local_bindings, m) {
+    // test_load_external
+    m.def("load_external1", [](ExternalType1 &e) { return e.i; });
+    m.def("load_external2", [](ExternalType2 &e) { return e.i; });
+
+    // test_local_bindings
+    // Register a class with py::module_local:
+    bind_local<LocalType, -1>(m, "LocalType", py::module_local())
+        .def("get3", [](LocalType &t) { return t.i + 3; })
+        ;
+
+    m.def("local_value", [](LocalType &l) { return l.i; });
+
+    // test_nonlocal_failure
+    // The main pybind11 test module is loaded first, so this registration will succeed (the second
+    // one, in pybind11_cross_module_tests.cpp, is designed to fail):
+    bind_local<NonLocalType, 0>(m, "NonLocalType")
+        .def(py::init<int>())
+        .def("get", [](LocalType &i) { return i.i; })
+        ;
+
+    // test_duplicate_local
+    // py::module_local declarations should be visible across compilation units that get linked together;
+    // this tries to register a duplicate local.  It depends on a definition in test_class.cpp and
+    // should raise a runtime error from the duplicate definition attempt.  If test_class isn't
+    // available it *also* throws a runtime error (with "test_class not enabled" as value).
+    m.def("register_local_external", [m]() {
+        auto main = py::module::import("pybind11_tests");
+        if (py::hasattr(main, "class_")) {
+            bind_local<LocalExternal, 7>(m, "LocalExternal", py::module_local());
+        }
+        else throw std::runtime_error("test_class not enabled");
+    });
+
+    // test_stl_bind_local
+    // stl_bind.h binders default to py::module_local if the types are local or converting:
+    py::bind_vector<LocalVec>(m, "LocalVec");
+    py::bind_map<LocalMap>(m, "LocalMap");
+    // and global if the type (or one of the types, for the map) is global:
+    py::bind_vector<NonLocalVec>(m, "NonLocalVec");
+    py::bind_map<NonLocalMap>(m, "NonLocalMap");
+
+    // test_stl_bind_global
+    // They can, however, be overridden to global using `py::module_local(false)`:
+    bind_local<NonLocal2, 10>(m, "NonLocal2");
+    py::bind_vector<LocalVec2>(m, "LocalVec2", py::module_local());
+    py::bind_map<NonLocalMap2>(m, "NonLocalMap2", py::module_local(false));
+
+    // test_mixed_local_global
+    // We try this both with the global type registered first and vice versa (the order shouldn't
+    // matter).
+    m.def("register_mixed_global", [m]() {
+        bind_local<MixedGlobalLocal, 100>(m, "MixedGlobalLocal", py::module_local(false));
+    });
+    m.def("register_mixed_local", [m]() {
+        bind_local<MixedLocalGlobal, 1000>(m, "MixedLocalGlobal", py::module_local());
+    });
+    m.def("get_mixed_gl", [](int i) { return MixedGlobalLocal(i); });
+    m.def("get_mixed_lg", [](int i) { return MixedLocalGlobal(i); });
+
+    // test_internal_locals_differ
+    m.def("local_cpp_types_addr", []() { return (uintptr_t) &py::detail::registered_local_types_cpp(); });
+
+    // test_stl_caster_vs_stl_bind
+    m.def("load_vector_via_caster", [](std::vector<int> v) {
+        return std::accumulate(v.begin(), v.end(), 0);
+    });
+
+    // test_cross_module_calls
+    m.def("return_self", [](LocalVec *v) { return v; });
+    m.def("return_copy", [](const LocalVec &v) { return LocalVec(v); });
+
+    class Cat : public pets::Pet { public: Cat(std::string name) : Pet(name) {}; };
+    py::class_<pets::Pet>(m, "Pet", py::module_local())
+        .def("get_name", &pets::Pet::name);
+    // Binding for local extending class:
+    py::class_<Cat, pets::Pet>(m, "Cat")
+        .def(py::init<std::string>());
+    m.def("pet_name", [](pets::Pet &p) { return p.name(); });
+
+    py::class_<MixGL>(m, "MixGL").def(py::init<int>());
+    m.def("get_gl_value", [](MixGL &o) { return o.i + 10; });
+
+    py::class_<MixGL2>(m, "MixGL2").def(py::init<int>());
+}
diff --git a/pybind11/tests/test_local_bindings.py b/pybind11/tests/test_local_bindings.py
new file mode 100644
index 000000000..b3dc3619c
--- /dev/null
+++ b/pybind11/tests/test_local_bindings.py
@@ -0,0 +1,226 @@
+import pytest
+
+from pybind11_tests import local_bindings as m
+
+
+def test_load_external():
+    """Load a `py::module_local` type that's only registered in an external module"""
+    import pybind11_cross_module_tests as cm
+
+    assert m.load_external1(cm.ExternalType1(11)) == 11
+    assert m.load_external2(cm.ExternalType2(22)) == 22
+
+    with pytest.raises(TypeError) as excinfo:
+        assert m.load_external2(cm.ExternalType1(21)) == 21
+    assert "incompatible function arguments" in str(excinfo.value)
+
+    with pytest.raises(TypeError) as excinfo:
+        assert m.load_external1(cm.ExternalType2(12)) == 12
+    assert "incompatible function arguments" in str(excinfo.value)
+
+
+def test_local_bindings():
+    """Tests that duplicate `py::module_local` class bindings work across modules"""
+
+    # Make sure we can load the second module with the conflicting (but local) definition:
+    import pybind11_cross_module_tests as cm
+
+    i1 = m.LocalType(5)
+    assert i1.get() == 4
+    assert i1.get3() == 8
+
+    i2 = cm.LocalType(10)
+    assert i2.get() == 11
+    assert i2.get2() == 12
+
+    assert not hasattr(i1, 'get2')
+    assert not hasattr(i2, 'get3')
+
+    # Loading within the local module
+    assert m.local_value(i1) == 5
+    assert cm.local_value(i2) == 10
+
+    # Cross-module loading works as well (on failure, the type loader looks for
+    # external module-local converters):
+    assert m.local_value(i2) == 10
+    assert cm.local_value(i1) == 5
+
+
+def test_nonlocal_failure():
+    """Tests that attempting to register a non-local type in multiple modules fails"""
+    import pybind11_cross_module_tests as cm
+
+    with pytest.raises(RuntimeError) as excinfo:
+        cm.register_nonlocal()
+    assert str(excinfo.value) == 'generic_type: type "NonLocalType" is already registered!'
+
+
+def test_duplicate_local():
+    """Tests that registering a class twice with py::module_local in the same module fails"""
+    with pytest.raises(RuntimeError) as excinfo:
+        m.register_local_external()
+    import pybind11_tests
+    assert str(excinfo.value) == (
+        'generic_type: type "LocalExternal" is already registered!'
+        if hasattr(pybind11_tests, 'class_') else 'test_class not enabled')
+
+
+def test_stl_bind_local():
+    import pybind11_cross_module_tests as cm
+
+    v1, v2 = m.LocalVec(), cm.LocalVec()
+    v1.append(m.LocalType(1))
+    v1.append(m.LocalType(2))
+    v2.append(cm.LocalType(1))
+    v2.append(cm.LocalType(2))
+
+    # Cross module value loading:
+    v1.append(cm.LocalType(3))
+    v2.append(m.LocalType(3))
+
+    assert [i.get() for i in v1] == [0, 1, 2]
+    assert [i.get() for i in v2] == [2, 3, 4]
+
+    v3, v4 = m.NonLocalVec(), cm.NonLocalVec2()
+    v3.append(m.NonLocalType(1))
+    v3.append(m.NonLocalType(2))
+    v4.append(m.NonLocal2(3))
+    v4.append(m.NonLocal2(4))
+
+    assert [i.get() for i in v3] == [1, 2]
+    assert [i.get() for i in v4] == [13, 14]
+
+    d1, d2 = m.LocalMap(), cm.LocalMap()
+    d1["a"] = v1[0]
+    d1["b"] = v1[1]
+    d2["c"] = v2[0]
+    d2["d"] = v2[1]
+    assert {i: d1[i].get() for i in d1} == {'a': 0, 'b': 1}
+    assert {i: d2[i].get() for i in d2} == {'c': 2, 'd': 3}
+
+
+def test_stl_bind_global():
+    import pybind11_cross_module_tests as cm
+
+    with pytest.raises(RuntimeError) as excinfo:
+        cm.register_nonlocal_map()
+    assert str(excinfo.value) == 'generic_type: type "NonLocalMap" is already registered!'
+
+    with pytest.raises(RuntimeError) as excinfo:
+        cm.register_nonlocal_vec()
+    assert str(excinfo.value) == 'generic_type: type "NonLocalVec" is already registered!'
+
+    with pytest.raises(RuntimeError) as excinfo:
+        cm.register_nonlocal_map2()
+    assert str(excinfo.value) == 'generic_type: type "NonLocalMap2" is already registered!'
+
+
+def test_mixed_local_global():
+    """Local types take precedence over globally registered types: a module with a `module_local`
+    type can be registered even if the type is already registered globally.  Within the module,
+    casting will go to the local type; outside the module casting goes to the global type."""
+    import pybind11_cross_module_tests as cm
+    m.register_mixed_global()
+    m.register_mixed_local()
+
+    a = []
+    a.append(m.MixedGlobalLocal(1))
+    a.append(m.MixedLocalGlobal(2))
+    a.append(m.get_mixed_gl(3))
+    a.append(m.get_mixed_lg(4))
+
+    assert [x.get() for x in a] == [101, 1002, 103, 1004]
+
+    cm.register_mixed_global_local()
+    cm.register_mixed_local_global()
+    a.append(m.MixedGlobalLocal(5))
+    a.append(m.MixedLocalGlobal(6))
+    a.append(cm.MixedGlobalLocal(7))
+    a.append(cm.MixedLocalGlobal(8))
+    a.append(m.get_mixed_gl(9))
+    a.append(m.get_mixed_lg(10))
+    a.append(cm.get_mixed_gl(11))
+    a.append(cm.get_mixed_lg(12))
+
+    assert [x.get() for x in a] == \
+        [101, 1002, 103, 1004, 105, 1006, 207, 2008, 109, 1010, 211, 2012]
+
+
+def test_internal_locals_differ():
+    """Makes sure the internal local type map differs across the two modules"""
+    import pybind11_cross_module_tests as cm
+    assert m.local_cpp_types_addr() != cm.local_cpp_types_addr()
+
+
+def test_stl_caster_vs_stl_bind(msg):
+    """One module uses a generic vector caster from `<pybind11/stl.h>` while the other
+    exports `std::vector<int>` via `py::bind_vector` and `py::module_local`"""
+    import pybind11_cross_module_tests as cm
+
+    v1 = cm.VectorInt([1, 2, 3])
+    assert m.load_vector_via_caster(v1) == 6
+    assert cm.load_vector_via_binding(v1) == 6
+
+    v2 = [1, 2, 3]
+    assert m.load_vector_via_caster(v2) == 6
+    with pytest.raises(TypeError) as excinfo:
+        cm.load_vector_via_binding(v2) == 6
+    assert msg(excinfo.value) == """
+    load_vector_via_binding(): incompatible function arguments. The following argument types are supported:
+        1. (arg0: pybind11_cross_module_tests.VectorInt) -> int
+
+    Invoked with: [1, 2, 3]
+    """  # noqa: E501 line too long
+
+
+def test_cross_module_calls():
+    import pybind11_cross_module_tests as cm
+
+    v1 = m.LocalVec()
+    v1.append(m.LocalType(1))
+    v2 = cm.LocalVec()
+    v2.append(cm.LocalType(2))
+
+    # Returning the self pointer should get picked up as returning an existing
+    # instance (even when that instance is of a foreign, non-local type).
+    assert m.return_self(v1) is v1
+    assert cm.return_self(v2) is v2
+    assert m.return_self(v2) is v2
+    assert cm.return_self(v1) is v1
+
+    assert m.LocalVec is not cm.LocalVec
+    # Returning a copy, on the other hand, always goes to the local type,
+    # regardless of where the source type came from.
+    assert type(m.return_copy(v1)) is m.LocalVec
+    assert type(m.return_copy(v2)) is m.LocalVec
+    assert type(cm.return_copy(v1)) is cm.LocalVec
+    assert type(cm.return_copy(v2)) is cm.LocalVec
+
+    # Test the example given in the documentation (which also tests inheritance casting):
+    mycat = m.Cat("Fluffy")
+    mydog = cm.Dog("Rover")
+    assert mycat.get_name() == "Fluffy"
+    assert mydog.name() == "Rover"
+    assert m.Cat.__base__.__name__ == "Pet"
+    assert cm.Dog.__base__.__name__ == "Pet"
+    assert m.Cat.__base__ is not cm.Dog.__base__
+    assert m.pet_name(mycat) == "Fluffy"
+    assert m.pet_name(mydog) == "Rover"
+    assert cm.pet_name(mycat) == "Fluffy"
+    assert cm.pet_name(mydog) == "Rover"
+
+    assert m.MixGL is not cm.MixGL
+    a = m.MixGL(1)
+    b = cm.MixGL(2)
+    assert m.get_gl_value(a) == 11
+    assert m.get_gl_value(b) == 12
+    assert cm.get_gl_value(a) == 101
+    assert cm.get_gl_value(b) == 102
+
+    c, d = m.MixGL2(3), cm.MixGL2(4)
+    with pytest.raises(TypeError) as excinfo:
+        m.get_gl_value(c)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.get_gl_value(d)
+    assert "incompatible function arguments" in str(excinfo.value)
diff --git a/pybind11/tests/test_methods_and_attributes.cpp b/pybind11/tests/test_methods_and_attributes.cpp
index f7d6d6855..cd15869f4 100644
--- a/pybind11/tests/test_methods_and_attributes.cpp
+++ b/pybind11/tests/test_methods_and_attributes.cpp
@@ -26,34 +26,43 @@ public:
     void operator=(const ExampleMandA &e) { print_copy_assigned(this); value = e.value; }
     void operator=(ExampleMandA &&e) { print_move_assigned(this); value = e.value; }
 
-    void add1(ExampleMandA other) { value += other.value; }           // passing by value
-    void add2(ExampleMandA &other) { value += other.value; }          // passing by reference
-    void add3(const ExampleMandA &other) { value += other.value; }    // passing by const reference
-    void add4(ExampleMandA *other) { value += other->value; }         // passing by pointer
-    void add5(const ExampleMandA *other) { value += other->value; }   // passing by const pointer
-
-    void add6(int other) { value += other; }                      // passing by value
-    void add7(int &other) { value += other; }                     // passing by reference
-    void add8(const int &other) { value += other; }               // passing by const reference
-    void add9(int *other) { value += *other; }                    // passing by pointer
-    void add10(const int *other) { value += *other; }             // passing by const pointer
-
-    ExampleMandA self1() { return *this; }                            // return by value
-    ExampleMandA &self2() { return *this; }                           // return by reference
-    const ExampleMandA &self3() { return *this; }                     // return by const reference
-    ExampleMandA *self4() { return this; }                            // return by pointer
-    const ExampleMandA *self5() { return this; }                      // return by const pointer
-
-    int internal1() { return value; }                             // return by value
-    int &internal2() { return value; }                            // return by reference
-    const int &internal3() { return value; }                      // return by const reference
-    int *internal4() { return &value; }                           // return by pointer
-    const int *internal5() { return &value; }                     // return by const pointer
-
-    py::str overloaded(int, float) { return "(int, float)"; }
-    py::str overloaded(float, int) { return "(float, int)"; }
-    py::str overloaded(int, float) const { return "(int, float) const"; }
-    py::str overloaded(float, int) const { return "(float, int) const"; }
+    void add1(ExampleMandA other) { value += other.value; }         // passing by value
+    void add2(ExampleMandA &other) { value += other.value; }        // passing by reference
+    void add3(const ExampleMandA &other) { value += other.value; }  // passing by const reference
+    void add4(ExampleMandA *other) { value += other->value; }       // passing by pointer
+    void add5(const ExampleMandA *other) { value += other->value; } // passing by const pointer
+
+    void add6(int other) { value += other; }                        // passing by value
+    void add7(int &other) { value += other; }                       // passing by reference
+    void add8(const int &other) { value += other; }                 // passing by const reference
+    void add9(int *other) { value += *other; }                      // passing by pointer
+    void add10(const int *other) { value += *other; }               // passing by const pointer
+
+    ExampleMandA self1() { return *this; }                          // return by value
+    ExampleMandA &self2() { return *this; }                         // return by reference
+    const ExampleMandA &self3() { return *this; }                   // return by const reference
+    ExampleMandA *self4() { return this; }                          // return by pointer
+    const ExampleMandA *self5() { return this; }                    // return by const pointer
+
+    int internal1() { return value; }                               // return by value
+    int &internal2() { return value; }                              // return by reference
+    const int &internal3() { return value; }                        // return by const reference
+    int *internal4() { return &value; }                             // return by pointer
+    const int *internal5() { return &value; }                       // return by const pointer
+
+    py::str overloaded()             { return "()"; }
+    py::str overloaded(int)          { return "(int)"; }
+    py::str overloaded(int, float)   { return "(int, float)"; }
+    py::str overloaded(float, int)   { return "(float, int)"; }
+    py::str overloaded(int, int)     { return "(int, int)"; }
+    py::str overloaded(float, float) { return "(float, float)"; }
+    py::str overloaded(int)          const { return "(int) const"; }
+    py::str overloaded(int, float)   const { return "(int, float) const"; }
+    py::str overloaded(float, int)   const { return "(float, int) const"; }
+    py::str overloaded(int, int)     const { return "(int, int) const"; }
+    py::str overloaded(float, float) const { return "(float, float) const"; }
+
+    static py::str overloaded(float) { return "static float"; }
 
     int value = 0;
 };
@@ -68,38 +77,135 @@ struct TestProperties {
     static int static_get() { return static_value; }
     static void static_set(int v) { static_value = v; }
 };
-
 int TestProperties::static_value = 1;
 
-struct SimpleValue { int value = 1; };
+struct TestPropertiesOverride : TestProperties {
+    int value = 99;          // same name as TestProperties::value, so it hides the base member
+    static int static_value; // likewise hides TestProperties::static_value (base is initialised to 1)
+};
+int TestPropertiesOverride::static_value = 99;
 
 struct TestPropRVP {
-    SimpleValue v1;
-    SimpleValue v2;
-    static SimpleValue sv1;
-    static SimpleValue sv2;
-
-    const SimpleValue &get1() const { return v1; }
-    const SimpleValue &get2() const { return v2; }
-    SimpleValue get_rvalue() const { return v2; }
-    void set1(int v) { v1.value = v; }
-    void set2(int v) { v2.value = v; }
+    UserType v1{1};
+    UserType v2{1};
+    static UserType sv1;
+    static UserType sv2;
+
+    const UserType &get1() const { return v1; }
+    const UserType &get2() const { return v2; }
+    UserType get_rvalue() const { return v2; }
+    void set1(int v) { v1.set(v); }
+    void set2(int v) { v2.set(v); }
 };
+UserType TestPropRVP::sv1(1);
+UserType TestPropRVP::sv2(1);
+
+// py::arg/py::arg_v testing: these argument types just record how their caster's load() was invoked
+class ArgInspector1 { public: std::string arg = "(default arg inspector 1)"; };
+class ArgInspector2 { public: std::string arg = "(default arg inspector 2)"; };
+class ArgAlwaysConverts { };
+namespace pybind11 { namespace detail {
+template <> struct type_caster<ArgInspector1> {
+public:
+    PYBIND11_TYPE_CASTER(ArgInspector1, _("ArgInspector1"));
 
-SimpleValue TestPropRVP::sv1{};
-SimpleValue TestPropRVP::sv2{};
+    bool load(handle src, bool convert) {
+        value.arg = "loading ArgInspector1 argument " +
+            std::string(convert ? "WITH" : "WITHOUT") + " conversion allowed.  "
+            "Argument value = " + (std::string) str(src);
+        return true; // never rejects the argument; the recorded string is the observable result
+    }
 
-class DynamicClass {
+    static handle cast(const ArgInspector1 &src, return_value_policy, handle) {
+        return str(src.arg).release(); // hands the recorded text back to Python as a str
+    }
+};
+template <> struct type_caster<ArgInspector2> {
 public:
-    DynamicClass() { print_default_created(this); }
-    ~DynamicClass() { print_destroyed(this); }
+    PYBIND11_TYPE_CASTER(ArgInspector2, _("ArgInspector2"));
+
+    bool load(handle src, bool convert) {
+        value.arg = "loading ArgInspector2 argument " +
+            std::string(convert ? "WITH" : "WITHOUT") + " conversion allowed.  "
+            "Argument value = " + (std::string) str(src);
+        return true; // never rejects the argument; the recorded string is the observable result
+    }
+
+    static handle cast(const ArgInspector2 &src, return_value_policy, handle) {
+        return str(src.arg).release(); // hands the recorded text back to Python as a str
+    }
 };
+template <> struct type_caster<ArgAlwaysConverts> {
+public:
+    PYBIND11_TYPE_CASTER(ArgAlwaysConverts, _("ArgAlwaysConverts"));
 
-class CppDerivedDynamicClass : public DynamicClass { };
+    bool load(handle, bool convert) {
+        return convert; // loads only when conversion is allowed; the source object is ignored
+    }
 
-test_initializer methods_and_attributes([](py::module &m) {
-    py::class_<ExampleMandA>(m, "ExampleMandA")
-        .def(py::init<>())
+    static handle cast(const ArgAlwaysConverts &, return_value_policy, handle) {
+        return py::none().release(); // carries no state, so Python just receives None
+    }
+};
+}}
+
+// test_custom_caster_destruction: every special member reports itself through the print_* helpers
+class DestructionTester {
+public:
+    DestructionTester() { print_default_created(this); }
+    ~DestructionTester() { print_destroyed(this); }
+    DestructionTester(const DestructionTester &) { print_copy_created(this); }
+    DestructionTester(DestructionTester &&) { print_move_created(this); }
+    DestructionTester &operator=(const DestructionTester &) { print_copy_assigned(this); return *this; }
+    DestructionTester &operator=(DestructionTester &&) { print_move_assigned(this); return *this; }
+};
+namespace pybind11 { namespace detail {
+template <> struct type_caster<DestructionTester> {
+    PYBIND11_TYPE_CASTER(DestructionTester, _("DestructionTester"));
+    bool load(handle, bool) { return true; } // accepts any Python object unconditionally
+
+    static handle cast(const DestructionTester &, return_value_policy, handle) {
+        return py::bool_(true).release(); // the C++ object is ignored; Python always receives True
+    }
+};
+}}
+
+// Test None-allowed py::arg argument policy
+class NoneTester { public: int answer = 42; };
+int none1(const NoneTester &obj) { return obj.answer; } // reference parameter: no null state to test for
+int none2(NoneTester *obj) { return obj ? obj->answer : -1; } // -1 reports a null pointer
+int none3(std::shared_ptr<NoneTester> &obj) { return obj ? obj->answer : -1; } // -1 reports an empty shared_ptr
+int none4(std::shared_ptr<NoneTester> *obj) { return obj && *obj ? (*obj)->answer : -1; } // -1 if either level is null
+int none5(std::shared_ptr<NoneTester> obj) { return obj ? obj->answer : -1; } // -1 reports an empty shared_ptr
+
+struct StrIssue {
+    int val = -1; // value kept by the default constructor; the bound __str__ prints it as "StrIssue[<val>]"
+
+    StrIssue() = default;
+    StrIssue(int i) : val{i} {}
+};
+
+// Issues #854, #910: incompatible function args when member function/pointer is in an unregistered base class
+class UnregisteredBase {
+public:
+    void do_nothing() const {}
+    void increase_value() { rw_value++; ro_value += 0.25; } // bumps both fields in one call
+    void set_int(int v) { rw_value = v; }
+    int get_int() const { return rw_value; }
+    double get_double() const { return ro_value; }
+    int rw_value = 42;      // bound as read-write via &RegisteredDerived::rw_value
+    double ro_value = 1.25; // bound as read-only via &RegisteredDerived::ro_value
+};
+class RegisteredDerived : public UnregisteredBase {
+public:
+    using UnregisteredBase::UnregisteredBase;
+    double sum() const { return rw_value + ro_value; } // the only member declared in the derived class itself
+};
+
+TEST_SUBMODULE(methods_and_attributes, m) {
+    // test_methods_and_attributes
+    py::class_<ExampleMandA> emna(m, "ExampleMandA");
+    emna.def(py::init<>())
         .def(py::init<int>())
         .def(py::init<const ExampleMandA&>())
         .def("add1", &ExampleMandA::add1)
@@ -123,20 +229,53 @@ test_initializer methods_and_attributes([](py::module &m) {
         .def("internal4", &ExampleMandA::internal4)
         .def("internal5", &ExampleMandA::internal5)
 #if defined(PYBIND11_OVERLOAD_CAST)
-        .def("overloaded", py::overload_cast<int, float>(&ExampleMandA::overloaded))
-        .def("overloaded", py::overload_cast<float, int>(&ExampleMandA::overloaded))
-        .def("overloaded_const", py::overload_cast<int, float>(&ExampleMandA::overloaded, py::const_))
-        .def("overloaded_const", py::overload_cast<float, int>(&ExampleMandA::overloaded, py::const_))
+        .def("overloaded", py::overload_cast<>(&ExampleMandA::overloaded))
+        .def("overloaded", py::overload_cast<int>(&ExampleMandA::overloaded))
+        .def("overloaded", py::overload_cast<int,   float>(&ExampleMandA::overloaded))
+        .def("overloaded", py::overload_cast<float,   int>(&ExampleMandA::overloaded))
+        .def("overloaded", py::overload_cast<int,     int>(&ExampleMandA::overloaded))
+        .def("overloaded", py::overload_cast<float, float>(&ExampleMandA::overloaded))
+        .def("overloaded_float", py::overload_cast<float, float>(&ExampleMandA::overloaded))
+        .def("overloaded_const", py::overload_cast<int         >(&ExampleMandA::overloaded, py::const_))
+        .def("overloaded_const", py::overload_cast<int,   float>(&ExampleMandA::overloaded, py::const_))
+        .def("overloaded_const", py::overload_cast<float,   int>(&ExampleMandA::overloaded, py::const_))
+        .def("overloaded_const", py::overload_cast<int,     int>(&ExampleMandA::overloaded, py::const_))
+        .def("overloaded_const", py::overload_cast<float, float>(&ExampleMandA::overloaded, py::const_))
 #else
-        .def("overloaded", static_cast<py::str (ExampleMandA::*)(int, float)>(&ExampleMandA::overloaded))
-        .def("overloaded", static_cast<py::str (ExampleMandA::*)(float, int)>(&ExampleMandA::overloaded))
-        .def("overloaded_const", static_cast<py::str (ExampleMandA::*)(int, float) const>(&ExampleMandA::overloaded))
-        .def("overloaded_const", static_cast<py::str (ExampleMandA::*)(float, int) const>(&ExampleMandA::overloaded))
+        .def("overloaded", static_cast<py::str (ExampleMandA::*)()>(&ExampleMandA::overloaded))
+        .def("overloaded", static_cast<py::str (ExampleMandA::*)(int)>(&ExampleMandA::overloaded))
+        .def("overloaded", static_cast<py::str (ExampleMandA::*)(int,   float)>(&ExampleMandA::overloaded))
+        .def("overloaded", static_cast<py::str (ExampleMandA::*)(float,   int)>(&ExampleMandA::overloaded))
+        .def("overloaded", static_cast<py::str (ExampleMandA::*)(int,     int)>(&ExampleMandA::overloaded))
+        .def("overloaded", static_cast<py::str (ExampleMandA::*)(float, float)>(&ExampleMandA::overloaded))
+        .def("overloaded_float", static_cast<py::str (ExampleMandA::*)(float, float)>(&ExampleMandA::overloaded))
+        .def("overloaded_const", static_cast<py::str (ExampleMandA::*)(int         ) const>(&ExampleMandA::overloaded))
+        .def("overloaded_const", static_cast<py::str (ExampleMandA::*)(int,   float) const>(&ExampleMandA::overloaded))
+        .def("overloaded_const", static_cast<py::str (ExampleMandA::*)(float,   int) const>(&ExampleMandA::overloaded))
+        .def("overloaded_const", static_cast<py::str (ExampleMandA::*)(int,     int) const>(&ExampleMandA::overloaded))
+        .def("overloaded_const", static_cast<py::str (ExampleMandA::*)(float, float) const>(&ExampleMandA::overloaded))
 #endif
+        // test_no_mixed_overloads
+        // Raise error if trying to mix static/non-static overloads on the same name:
+        .def_static("add_mixed_overloads1", []() {
+            auto emna = py::reinterpret_borrow<py::class_<ExampleMandA>>(py::module::import("pybind11_tests.methods_and_attributes").attr("ExampleMandA"));
+            emna.def       ("overload_mixed1", static_cast<py::str (ExampleMandA::*)(int, int)>(&ExampleMandA::overloaded))
+                .def_static("overload_mixed1", static_cast<py::str (              *)(float   )>(&ExampleMandA::overloaded));
+        })
+        .def_static("add_mixed_overloads2", []() {
+            auto emna = py::reinterpret_borrow<py::class_<ExampleMandA>>(py::module::import("pybind11_tests.methods_and_attributes").attr("ExampleMandA"));
+            emna.def_static("overload_mixed2", static_cast<py::str (              *)(float   )>(&ExampleMandA::overloaded))
+                .def       ("overload_mixed2", static_cast<py::str (ExampleMandA::*)(int, int)>(&ExampleMandA::overloaded));
+        })
         .def("__str__", &ExampleMandA::toString)
         .def_readwrite("value", &ExampleMandA::value);
 
-    py::class_<TestProperties>(m, "TestProperties", py::metaclass())
+    // test_copy_method
+    // Issue #443: can't call copied methods in Python 3
+    emna.attr("add2b") = emna.attr("add2");
+
+    // test_properties, test_static_properties, test_static_cls
+    py::class_<TestProperties>(m, "TestProperties")
         .def(py::init<>())
         .def_readonly("def_readonly", &TestProperties::value)
         .def_readwrite("def_readwrite", &TestProperties::value)
@@ -148,18 +287,24 @@ test_initializer methods_and_attributes([](py::module &m) {
                                       [](py::object) { return TestProperties::static_get(); })
         .def_property_static("def_property_static",
                              [](py::object) { return TestProperties::static_get(); },
-                             [](py::object, int v) { return TestProperties::static_set(v); });
+                             [](py::object, int v) { TestProperties::static_set(v); })
+        .def_property_static("static_cls",
+                             [](py::object cls) { return cls; },
+                             [](py::object cls, py::function f) { f(cls); });
 
-    py::class_<SimpleValue>(m, "SimpleValue")
-        .def_readwrite("value", &SimpleValue::value);
+    py::class_<TestPropertiesOverride, TestProperties>(m, "TestPropertiesOverride")
+        .def(py::init<>())
+        .def_readonly("def_readonly", &TestPropertiesOverride::value)
+        .def_readonly_static("def_readonly_static", &TestPropertiesOverride::static_value);
 
-    auto static_get1 = [](py::object) -> const SimpleValue & { return TestPropRVP::sv1; };
-    auto static_get2 = [](py::object) -> const SimpleValue & { return TestPropRVP::sv2; };
-    auto static_set1 = [](py::object, int v) { TestPropRVP::sv1.value = v; };
-    auto static_set2 = [](py::object, int v) { TestPropRVP::sv2.value = v; };
+    auto static_get1 = [](py::object) -> const UserType & { return TestPropRVP::sv1; };
+    auto static_get2 = [](py::object) -> const UserType & { return TestPropRVP::sv2; };
+    auto static_set1 = [](py::object, int v) { TestPropRVP::sv1.set(v); };
+    auto static_set2 = [](py::object, int v) { TestPropRVP::sv2.set(v); };
     auto rvp_copy = py::return_value_policy::copy;
 
-    py::class_<TestPropRVP>(m, "TestPropRVP", py::metaclass())
+    // test_property_return_value_policies
+    py::class_<TestPropRVP>(m, "TestPropRVP")
         .def(py::init<>())
         .def_property_readonly("ro_ref", &TestPropRVP::get1)
         .def_property_readonly("ro_copy", &TestPropRVP::get2, rvp_copy)
@@ -173,14 +318,129 @@ test_initializer methods_and_attributes([](py::module &m) {
         .def_property_static("static_rw_ref", static_get1, static_set1)
         .def_property_static("static_rw_copy", static_get2, static_set2, rvp_copy)
         .def_property_static("static_rw_func", py::cpp_function(static_get2, rvp_copy), static_set2)
+        // test_property_rvalue_policy
         .def_property_readonly("rvalue", &TestPropRVP::get_rvalue)
-        .def_property_readonly_static("static_rvalue", [](py::object) { return SimpleValue(); });
+        .def_property_readonly_static("static_rvalue", [](py::object) { return UserType(1); });
+
+    // test_metaclass_override
+    struct MetaclassOverride { };
+    py::class_<MetaclassOverride>(m, "MetaclassOverride", py::metaclass((PyObject *) &PyType_Type))
+        .def_property_readonly_static("readonly", [](py::object) { return 1; });
 
 #if !defined(PYPY_VERSION)
+    // test_dynamic_attributes
+    class DynamicClass {
+    public:
+        DynamicClass() { print_default_created(this); }
+        ~DynamicClass() { print_destroyed(this); }
+    };
     py::class_<DynamicClass>(m, "DynamicClass", py::dynamic_attr())
         .def(py::init());
 
+    class CppDerivedDynamicClass : public DynamicClass { };
     py::class_<CppDerivedDynamicClass, DynamicClass>(m, "CppDerivedDynamicClass")
         .def(py::init());
 #endif
-});
+
+    // test_noconvert_args
+    //
+    // Test converting.  The ArgAlwaysConverts is just there to make the first no-conversion pass
+    // fail so that our call always ends up happening via the second dispatch (the one that allows
+    // some conversion).
+    class ArgInspector {
+    public:
+        ArgInspector1 f(ArgInspector1 a, ArgAlwaysConverts) { return a; }
+        std::string g(ArgInspector1 a, const ArgInspector1 &b, int c, ArgInspector2 *d, ArgAlwaysConverts) {
+            return a.arg + "\n" + b.arg + "\n" + std::to_string(c) + "\n" + d->arg;
+        }
+        static ArgInspector2 h(ArgInspector2 a, ArgAlwaysConverts) { return a; }
+    };
+    py::class_<ArgInspector>(m, "ArgInspector")
+        .def(py::init<>())
+        .def("f", &ArgInspector::f, py::arg(), py::arg() = ArgAlwaysConverts())
+        .def("g", &ArgInspector::g, "a"_a.noconvert(), "b"_a, "c"_a.noconvert()=13, "d"_a=ArgInspector2(), py::arg() = ArgAlwaysConverts())
+        .def_static("h", &ArgInspector::h, py::arg().noconvert(), py::arg() = ArgAlwaysConverts())
+        ;
+    m.def("arg_inspect_func", [](ArgInspector2 a, ArgInspector1 b, ArgAlwaysConverts) { return a.arg + "\n" + b.arg; },
+            py::arg().noconvert(false), py::arg_v(nullptr, ArgInspector1()).noconvert(true), py::arg() = ArgAlwaysConverts());
+
+    m.def("floats_preferred", [](double f) { return 0.5 * f; }, py::arg("f"));
+    m.def("floats_only", [](double f) { return 0.5 * f; }, py::arg("f").noconvert());
+    m.def("ints_preferred", [](int i) { return i / 2; }, py::arg("i"));
+    m.def("ints_only", [](int i) { return i / 2; }, py::arg("i").noconvert());
+
+    // test_bad_arg_default
+    // Issue/PR #648: bad arg default debugging output
+#if !defined(NDEBUG)
+    m.attr("debug_enabled") = true;
+#else
+    m.attr("debug_enabled") = false;
+#endif
+    m.def("bad_arg_def_named", []{
+        auto m = py::module::import("pybind11_tests");
+        m.def("should_fail", [](int, UnregisteredType) {}, py::arg(), py::arg("a") = UnregisteredType());
+    });
+    m.def("bad_arg_def_unnamed", []{
+        auto m = py::module::import("pybind11_tests");
+        m.def("should_fail", [](int, UnregisteredType) {}, py::arg(), py::arg() = UnregisteredType());
+    });
+
+    // test_accepts_none
+    py::class_<NoneTester, std::shared_ptr<NoneTester>>(m, "NoneTester")
+        .def(py::init<>());
+    m.def("no_none1", &none1, py::arg().none(false));
+    m.def("no_none2", &none2, py::arg().none(false));
+    m.def("no_none3", &none3, py::arg().none(false));
+    m.def("no_none4", &none4, py::arg().none(false));
+    m.def("no_none5", &none5, py::arg().none(false));
+    m.def("ok_none1", &none1);
+    m.def("ok_none2", &none2, py::arg().none(true));
+    m.def("ok_none3", &none3);
+    m.def("ok_none4", &none4, py::arg().none(true));
+    m.def("ok_none5", &none5);
+
+    // test_str_issue
+    // Issue #283: __str__ called on uninitialized instance when constructor arguments invalid
+    py::class_<StrIssue>(m, "StrIssue")
+        .def(py::init<int>())
+        .def(py::init<>())
+        .def("__str__", [](const StrIssue &si) {
+            return "StrIssue[" + std::to_string(si.val) + "]"; }
+        );
+
+    // test_unregistered_base_implementations
+    //
+    // Issues #854/910: incompatible function args when member function/pointer is in unregistered
+    // base class.  The methods and member pointers below actually resolve to members/pointers in
+    // UnregisteredBase; before this test/fix they would be registered via lambda with a first
+    // argument of an unregistered type, and thus uncallable.
+    py::class_<RegisteredDerived>(m, "RegisteredDerived")
+        .def(py::init<>())
+        .def("do_nothing", &RegisteredDerived::do_nothing)
+        .def("increase_value", &RegisteredDerived::increase_value)
+        .def_readwrite("rw_value", &RegisteredDerived::rw_value)
+        .def_readonly("ro_value", &RegisteredDerived::ro_value)
+        // These should trigger a static_assert if uncommented
+        //.def_readwrite("fails", &UserType::value) // should trigger a static_assert if uncommented
+        //.def_readonly("fails", &UserType::value) // should trigger a static_assert if uncommented
+        .def_property("rw_value_prop", &RegisteredDerived::get_int, &RegisteredDerived::set_int)
+        .def_property_readonly("ro_value_prop", &RegisteredDerived::get_double)
+        // This one is in the registered class:
+        .def("sum", &RegisteredDerived::sum)
+        ;
+
+    using Adapted = decltype(py::method_adaptor<RegisteredDerived>(&RegisteredDerived::do_nothing));
+    static_assert(std::is_same<Adapted, void (RegisteredDerived::*)() const>::value, "");
+
+    // test_custom_caster_destruction
+    // Test that `take_ownership` works on types with a custom type caster when given a pointer
+
+    // default policy: don't take ownership:
+    m.def("custom_caster_no_destroy", []() { static auto *dt = new DestructionTester(); return dt; });
+
+    m.def("custom_caster_destroy", []() { return new DestructionTester(); },
+            py::return_value_policy::take_ownership); // Takes ownership: destroy when finished
+    m.def("custom_caster_destroy_const", []() -> const DestructionTester * { return new DestructionTester(); },
+            py::return_value_policy::take_ownership); // Likewise (const doesn't inhibit destruction)
+    m.def("destruction_tester_cstats", &ConstructorStats::get<DestructionTester>, py::return_value_policy::reference);
+}
diff --git a/pybind11/tests/test_methods_and_attributes.py b/pybind11/tests/test_methods_and_attributes.py
index 840ee707b..9fd9cb75c 100644
--- a/pybind11/tests/test_methods_and_attributes.py
+++ b/pybind11/tests/test_methods_and_attributes.py
@@ -1,10 +1,11 @@
 import pytest
-from pybind11_tests import ExampleMandA, ConstructorStats
+from pybind11_tests import methods_and_attributes as m
+from pybind11_tests import ConstructorStats
 
 
 def test_methods_and_attributes():
-    instance1 = ExampleMandA()
-    instance2 = ExampleMandA(32)
+    instance1 = m.ExampleMandA()
+    instance2 = m.ExampleMandA(32)
 
     instance1.add1(instance2)
     instance1.add2(instance2)
@@ -31,16 +32,27 @@ def test_methods_and_attributes():
     assert instance1.internal4() == 320
     assert instance1.internal5() == 320
 
+    assert instance1.overloaded() == "()"
+    assert instance1.overloaded(0) == "(int)"
     assert instance1.overloaded(1, 1.0) == "(int, float)"
     assert instance1.overloaded(2.0, 2) == "(float, int)"
-    assert instance1.overloaded_const(3, 3.0) == "(int, float) const"
-    assert instance1.overloaded_const(4.0, 4) == "(float, int) const"
+    assert instance1.overloaded(3,   3) == "(int, int)"
+    assert instance1.overloaded(4., 4.) == "(float, float)"
+    assert instance1.overloaded_const(-3) == "(int) const"
+    assert instance1.overloaded_const(5, 5.0) == "(int, float) const"
+    assert instance1.overloaded_const(6.0, 6) == "(float, int) const"
+    assert instance1.overloaded_const(7,   7) == "(int, int) const"
+    assert instance1.overloaded_const(8., 8.) == "(float, float) const"
+    assert instance1.overloaded_float(1, 1) == "(float, float)"
+    assert instance1.overloaded_float(1, 1.) == "(float, float)"
+    assert instance1.overloaded_float(1., 1) == "(float, float)"
+    assert instance1.overloaded_float(1., 1.) == "(float, float)"
 
     assert instance1.value == 320
     instance1.value = 100
     assert str(instance1) == "ExampleMandA[value=100]"
 
-    cstats = ConstructorStats.get(ExampleMandA)
+    cstats = ConstructorStats.get(m.ExampleMandA)
     assert cstats.alive() == 2
     del instance1, instance2
     assert cstats.alive() == 0
@@ -52,10 +64,25 @@ def test_methods_and_attributes():
     assert cstats.move_assignments == 0
 
 
-def test_properties():
-    from pybind11_tests import TestProperties
+def test_copy_method():
+    """Issue #443: calling copied methods fails in Python 3"""
+
+    m.ExampleMandA.add2c = m.ExampleMandA.add2  # copy made from the Python side
+    m.ExampleMandA.add2d = m.ExampleMandA.add2b  # copy of add2b, which the C++ module copied from add2
+    a = m.ExampleMandA(123)
+    assert a.value == 123
+    a.add2(m.ExampleMandA(-100))  # original method
+    assert a.value == 23
+    a.add2b(m.ExampleMandA(20))  # C++-side copy
+    assert a.value == 43
+    a.add2c(m.ExampleMandA(6))  # Python-side copy of the original
+    assert a.value == 49
+    a.add2d(m.ExampleMandA(-7))  # Python-side copy of the C++-side copy
+    assert a.value == 42
 
-    instance = TestProperties()
+
+def test_properties():
+    instance = m.TestProperties()
 
     assert instance.def_readonly == 1
     with pytest.raises(AttributeError):
@@ -73,31 +100,96 @@ def test_properties():
 
 
 def test_static_properties():
-    from pybind11_tests import TestProperties as Type
+    assert m.TestProperties.def_readonly_static == 1
+    with pytest.raises(AttributeError) as excinfo:
+        m.TestProperties.def_readonly_static = 2
+    assert "can't set attribute" in str(excinfo.value)  # check the exception text, not the ExceptionInfo wrapper
 
-    assert Type.def_readonly_static == 1
-    with pytest.raises(AttributeError):
-        Type.def_readonly_static = 2
+    m.TestProperties.def_readwrite_static = 2
+    assert m.TestProperties.def_readwrite_static == 2
 
-    Type.def_readwrite_static = 2
-    assert Type.def_readwrite_static == 2
+    assert m.TestProperties.def_property_readonly_static == 2
+    with pytest.raises(AttributeError) as excinfo:
+        m.TestProperties.def_property_readonly_static = 3
+    assert "can't set attribute" in str(excinfo.value)  # check the exception text, not the ExceptionInfo wrapper
 
-    assert Type.def_property_readonly_static == 2
-    with pytest.raises(AttributeError):
-        Type.def_property_readonly_static = 3
+    m.TestProperties.def_property_static = 3
+    assert m.TestProperties.def_property_static == 3
+
+    # Static property read and write via instance
+    instance = m.TestProperties()
+
+    m.TestProperties.def_readwrite_static = 0
+    assert m.TestProperties.def_readwrite_static == 0
+    assert instance.def_readwrite_static == 0  # a write through the class is visible through the instance
+
+    instance.def_readwrite_static = 2
+    assert m.TestProperties.def_readwrite_static == 2  # a write through the instance is visible on the class
+    assert instance.def_readwrite_static == 2
+
+    # It should be possible to override properties in derived classes
+    assert m.TestPropertiesOverride().def_readonly == 99
+    assert m.TestPropertiesOverride.def_readonly_static == 99
+
+
+def test_static_cls():
+    """Static property getters and setters expect the type object as their only argument"""
+
+    instance = m.TestProperties()
+    assert m.TestProperties.static_cls is m.TestProperties
+    assert instance.static_cls is m.TestProperties  # the getter receives the type even via an instance
+
+    def check_self(self):
+        assert self is m.TestProperties
+
+    m.TestProperties.static_cls = check_self  # the bound setter calls f(cls), so check_self runs here
+    instance.static_cls = check_self  # same check when assigning through an instance
+
+
+def test_metaclass_override():
+    """Overriding pybind11's default metaclass changes the behavior of `static_property`"""
+
+    assert type(m.ExampleMandA).__name__ == "pybind11_type"  # no metaclass given in the binding
+    assert type(m.MetaclassOverride).__name__ == "type"  # bound with py::metaclass(&PyType_Type)
 
-    Type.def_property_static = 3
-    assert Type.def_property_static == 3
+    assert m.MetaclassOverride.readonly == 1
+    assert type(m.MetaclassOverride.__dict__["readonly"]).__name__ == "pybind11_static_property"
+
+    # Regular `type` replaces the property instead of calling `__set__()`
+    m.MetaclassOverride.readonly = 2
+    assert m.MetaclassOverride.readonly == 2
+    assert isinstance(m.MetaclassOverride.__dict__["readonly"], int)  # the descriptor is gone from the class dict
+
+
+def test_no_mixed_overloads():
+    from pybind11_tests import debug_enabled  # selects which of the two message variants is expected
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.ExampleMandA.add_mixed_overloads1()  # binds an instance method first, then a static one
+    assert (str(excinfo.value) ==
+            "overloading a method with both static and instance methods is not supported; " +
+            ("compile in debug mode for more details" if not debug_enabled else
+             "error while attempting to bind static method ExampleMandA.overload_mixed1"
+             "(arg0: float) -> str")
+            )
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.ExampleMandA.add_mixed_overloads2()  # binds a static method first, then an instance one
+    assert (str(excinfo.value) ==
+            "overloading a method with both static and instance methods is not supported; " +
+            ("compile in debug mode for more details" if not debug_enabled else
+             "error while attempting to bind instance method ExampleMandA.overload_mixed2"
+             "(self: pybind11_tests.methods_and_attributes.ExampleMandA, arg0: int, arg1: int)"
+             " -> str")
+            )
 
 
 @pytest.mark.parametrize("access", ["ro", "rw", "static_ro", "static_rw"])
 def test_property_return_value_policies(access):
-    from pybind11_tests import TestPropRVP
-
     if not access.startswith("static"):
-        obj = TestPropRVP()
+        obj = m.TestPropRVP()
     else:
-        obj = TestPropRVP
+        obj = m.TestPropRVP
 
     ref = getattr(obj, access + "_ref")
     assert ref.value == 1
@@ -118,30 +210,20 @@ def test_property_return_value_policies(access):
 
 def test_property_rvalue_policy():
     """When returning an rvalue, the return value policy is automatically changed from
-    `reference(_internal)` to `move`. The following would not work otherwise.
-    """
-    from pybind11_tests import TestPropRVP
+    `reference(_internal)` to `move`. The following would not work otherwise."""
 
-    instance = TestPropRVP()
+    instance = m.TestPropRVP()
     o = instance.rvalue
     assert o.value == 1
 
-
-def test_property_rvalue_policy_static():
-    """When returning an rvalue, the return value policy is automatically changed from
-    `reference(_internal)` to `move`. The following would not work otherwise.
-    """
-    from pybind11_tests import TestPropRVP
-    o = TestPropRVP.static_rvalue
-    assert o.value == 1
+    os = m.TestPropRVP.static_rvalue
+    assert os.value == 1
 
 
 # https://bitbucket.org/pypy/pypy/issues/2447
 @pytest.unsupported_on_pypy
 def test_dynamic_attributes():
-    from pybind11_tests import DynamicClass, CppDerivedDynamicClass
-
-    instance = DynamicClass()
+    instance = m.DynamicClass()
     assert not hasattr(instance, "foo")
     assert "foo" not in dir(instance)
 
@@ -161,16 +243,16 @@ def test_dynamic_attributes():
         instance.__dict__ = []
     assert str(excinfo.value) == "__dict__ must be set to a dictionary, not a 'list'"
 
-    cstats = ConstructorStats.get(DynamicClass)
+    cstats = ConstructorStats.get(m.DynamicClass)
     assert cstats.alive() == 1
     del instance
     assert cstats.alive() == 0
 
     # Derived classes should work as well
-    class PythonDerivedDynamicClass(DynamicClass):
+    class PythonDerivedDynamicClass(m.DynamicClass):
         pass
 
-    for cls in CppDerivedDynamicClass, PythonDerivedDynamicClass:
+    for cls in m.CppDerivedDynamicClass, PythonDerivedDynamicClass:
         derived = cls()
         derived.foobar = 100
         assert derived.foobar == 100
@@ -183,23 +265,212 @@ def test_dynamic_attributes():
 # https://bitbucket.org/pypy/pypy/issues/2447
 @pytest.unsupported_on_pypy
 def test_cyclic_gc():
-    from pybind11_tests import DynamicClass
-
     # One object references itself
-    instance = DynamicClass()
+    instance = m.DynamicClass()
     instance.circular_reference = instance
 
-    cstats = ConstructorStats.get(DynamicClass)
+    cstats = ConstructorStats.get(m.DynamicClass)
     assert cstats.alive() == 1
     del instance
     assert cstats.alive() == 0
 
     # Two object reference each other
-    i1 = DynamicClass()
-    i2 = DynamicClass()
+    i1 = m.DynamicClass()
+    i2 = m.DynamicClass()
     i1.cycle = i2
     i2.cycle = i1
 
     assert cstats.alive() == 2
     del i1, i2
     assert cstats.alive() == 0
+
+
+def test_noconvert_args(msg):
+    a = m.ArgInspector()
+    assert msg(a.f("hi")) == """
+        loading ArgInspector1 argument WITH conversion allowed.  Argument value = hi
+    """
+    assert msg(a.g("this is a", "this is b")) == """
+        loading ArgInspector1 argument WITHOUT conversion allowed.  Argument value = this is a
+        loading ArgInspector1 argument WITH conversion allowed.  Argument value = this is b
+        13
+        loading ArgInspector2 argument WITH conversion allowed.  Argument value = (default arg inspector 2)
+    """  # noqa: E501 line too long
+    assert msg(a.g("this is a", "this is b", 42)) == """
+        loading ArgInspector1 argument WITHOUT conversion allowed.  Argument value = this is a
+        loading ArgInspector1 argument WITH conversion allowed.  Argument value = this is b
+        42
+        loading ArgInspector2 argument WITH conversion allowed.  Argument value = (default arg inspector 2)
+    """  # noqa: E501 line too long
+    assert msg(a.g("this is a", "this is b", 42, "this is d")) == """
+        loading ArgInspector1 argument WITHOUT conversion allowed.  Argument value = this is a
+        loading ArgInspector1 argument WITH conversion allowed.  Argument value = this is b
+        42
+        loading ArgInspector2 argument WITH conversion allowed.  Argument value = this is d
+    """
+    assert (a.h("arg 1") ==
+            "loading ArgInspector2 argument WITHOUT conversion allowed.  Argument value = arg 1")
+    assert msg(m.arg_inspect_func("A1", "A2")) == """
+        loading ArgInspector2 argument WITH conversion allowed.  Argument value = A1
+        loading ArgInspector1 argument WITHOUT conversion allowed.  Argument value = A2
+    """
+
+    assert m.floats_preferred(4) == 2.0
+    assert m.floats_only(4.0) == 2.0
+    with pytest.raises(TypeError) as excinfo:
+        m.floats_only(4)
+    assert msg(excinfo.value) == """
+        floats_only(): incompatible function arguments. The following argument types are supported:
+            1. (f: float) -> float
+
+        Invoked with: 4
+    """
+
+    assert m.ints_preferred(4) == 2
+    assert m.ints_preferred(True) == 0
+    with pytest.raises(TypeError) as excinfo:
+        m.ints_preferred(4.0)
+    assert msg(excinfo.value) == """
+        ints_preferred(): incompatible function arguments. The following argument types are supported:
+            1. (i: int) -> int
+
+        Invoked with: 4.0
+    """  # noqa: E501 line too long
+
+    assert m.ints_only(4) == 2
+    with pytest.raises(TypeError) as excinfo:
+        m.ints_only(4.0)
+    assert msg(excinfo.value) == """
+        ints_only(): incompatible function arguments. The following argument types are supported:
+            1. (i: int) -> int
+
+        Invoked with: 4.0
+    """
+
+
+def test_bad_arg_default(msg):
+    from pybind11_tests import debug_enabled
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.bad_arg_def_named()
+    assert msg(excinfo.value) == (
+        "arg(): could not convert default argument 'a: UnregisteredType' in function "
+        "'should_fail' into a Python object (type not registered yet?)"
+        if debug_enabled else
+        "arg(): could not convert default argument into a Python object (type not registered "
+        "yet?). Compile in debug mode for more information."
+    )
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.bad_arg_def_unnamed()
+    assert msg(excinfo.value) == (
+        "arg(): could not convert default argument 'UnregisteredType' in function "
+        "'should_fail' into a Python object (type not registered yet?)"
+        if debug_enabled else
+        "arg(): could not convert default argument into a Python object (type not registered "
+        "yet?). Compile in debug mode for more information."
+    )
+
+
+def test_accepts_none(msg):
+    a = m.NoneTester()
+    assert m.no_none1(a) == 42
+    assert m.no_none2(a) == 42
+    assert m.no_none3(a) == 42
+    assert m.no_none4(a) == 42
+    assert m.no_none5(a) == 42
+    assert m.ok_none1(a) == 42
+    assert m.ok_none2(a) == 42
+    assert m.ok_none3(a) == 42
+    assert m.ok_none4(a) == 42
+    assert m.ok_none5(a) == 42
+
+    with pytest.raises(TypeError) as excinfo:
+        m.no_none1(None)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.no_none2(None)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.no_none3(None)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.no_none4(None)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.no_none5(None)
+    assert "incompatible function arguments" in str(excinfo.value)
+
+    # The first one still raises because you can't pass None as an lvalue reference arg:
+    with pytest.raises(TypeError) as excinfo:
+        m.ok_none1(None)  # expected to raise, so there is no return value to compare
+    assert msg(excinfo.value) == """
+        ok_none1(): incompatible function arguments. The following argument types are supported:
+            1. (arg0: m.methods_and_attributes.NoneTester) -> int
+
+        Invoked with: None
+    """
+
+    # The rest take the argument as pointer or holder, and accept None:
+    assert m.ok_none2(None) == -1
+    assert m.ok_none3(None) == -1
+    assert m.ok_none4(None) == -1
+    assert m.ok_none5(None) == -1
+
+
+def test_str_issue(msg):
+    """#283: __str__ called on uninitialized instance when constructor arguments invalid"""
+
+    assert str(m.StrIssue(3)) == "StrIssue[3]"
+
+    with pytest.raises(TypeError) as excinfo:
+        str(m.StrIssue("no", "such", "constructor"))
+    assert msg(excinfo.value) == """
+        __init__(): incompatible constructor arguments. The following argument types are supported:
+            1. m.methods_and_attributes.StrIssue(arg0: int)
+            2. m.methods_and_attributes.StrIssue()
+
+        Invoked with: 'no', 'such', 'constructor'
+    """
+
+
+def test_unregistered_base_implementations():
+    a = m.RegisteredDerived()
+    a.do_nothing()
+    assert a.rw_value == 42
+    assert a.ro_value == 1.25
+    a.rw_value += 5
+    assert a.sum() == 48.25
+    a.increase_value()
+    assert a.rw_value == 48
+    assert a.ro_value == 1.5
+    assert a.sum() == 49.5
+    assert a.rw_value_prop == 48
+    a.rw_value_prop += 1
+    assert a.rw_value_prop == 49
+    a.increase_value()
+    assert a.ro_value_prop == 1.75
+
+
+def test_custom_caster_destruction():
+    """Tests that returning a pointer to a type that gets converted with a custom type caster gets
+    destroyed when the function has py::return_value_policy::take_ownership policy applied."""
+
+    cstats = m.destruction_tester_cstats()
+    # This one *doesn't* have take_ownership: the pointer should be used but not destroyed:
+    z = m.custom_caster_no_destroy()
+    assert cstats.alive() == 1 and cstats.default_constructions == 1
+    assert z
+
+    # take_ownership applied: this constructs a new object, casts it, then destroys it:
+    z = m.custom_caster_destroy()
+    assert z
+    assert cstats.default_constructions == 2
+
+    # Same, but with a const pointer return (which should *not* inhibit destruction):
+    z = m.custom_caster_destroy_const()
+    assert z
+    assert cstats.default_constructions == 3
+
+    # Make sure we still only have the original object (from ..._no_destroy()) alive:
+    assert cstats.alive() == 1
diff --git a/pybind11/tests/test_modules.cpp b/pybind11/tests/test_modules.cpp
index 50c7d8412..c1475fa62 100644
--- a/pybind11/tests/test_modules.cpp
+++ b/pybind11/tests/test_modules.cpp
@@ -11,42 +11,38 @@
 #include "pybind11_tests.h"
 #include "constructor_stats.h"
 
-std::string submodule_func() {
-    return "submodule_func()";
-}
-
-class A {
-public:
-    A(int v) : v(v) { print_created(this, v); }
-    ~A() { print_destroyed(this); }
-    A(const A&) { print_copy_created(this); }
-    A& operator=(const A &copy) { print_copy_assigned(this); v = copy.v; return *this; }
-    std::string toString() { return "A[" + std::to_string(v) + "]"; }
-private:
-    int v;
-};
-
-class B {
-public:
-    B() { print_default_created(this); }
-    ~B() { print_destroyed(this); }
-    B(const B&) { print_copy_created(this); }
-    B& operator=(const B &copy) { print_copy_assigned(this); a1 = copy.a1; a2 = copy.a2; return *this; }
-    A &get_a1() { return a1; }
-    A &get_a2() { return a2; }
-
-    A a1{1};
-    A a2{2};
-};
-
-test_initializer modules([](py::module &m) {
-    py::module m_sub = m.def_submodule("submodule");
-    m_sub.def("submodule_func", &submodule_func);
+TEST_SUBMODULE(modules, m) {
+    // test_nested_modules
+    py::module m_sub = m.def_submodule("subsubmodule");
+    m_sub.def("submodule_func", []() { return "submodule_func()"; });
 
+    // test_reference_internal
+    class A {  // Holds one int; every special member calls the matching print_*() helper.
+    public:
+        A(int v) : v(v) { print_created(this, v); }
+        ~A() { print_destroyed(this); }
+        A(const A &copy) : v(copy.v) { print_copy_created(this); }  // a copy must carry `v` over
+        A& operator=(const A &copy) { print_copy_assigned(this); v = copy.v; return *this; }
+        std::string toString() { return "A[" + std::to_string(v) + "]"; }
+    private:
+        int v;
+    };
     py::class_<A>(m_sub, "A")
         .def(py::init<int>())
         .def("__repr__", &A::toString);
 
+    class B {
+    public:
+        B() { print_default_created(this); }
+        ~B() { print_destroyed(this); }
+        B(const B&) { print_copy_created(this); }
+        B& operator=(const B &copy) { print_copy_assigned(this); a1 = copy.a1; a2 = copy.a2; return *this; }
+        A &get_a1() { return a1; }
+        A &get_a2() { return a2; }
+
+        A a1{1};
+        A a2{2};
+    };
     py::class_<B>(m_sub, "B")
         .def(py::init<>())
         .def("get_a1", &B::get_a1, "Return the internal A 1", py::return_value_policy::reference_internal)
@@ -55,4 +51,48 @@ test_initializer modules([](py::module &m) {
         .def_readwrite("a2", &B::a2);
 
     m.attr("OD") = py::module::import("collections").attr("OrderedDict");
-});
+
+    // test_duplicate_registration
+    // Registering two things with the same name
+    m.def("duplicate_registration", []() {
+        class Dupe1 { };
+        class Dupe2 { };
+        class Dupe3 { };
+        class DupeException { };
+
+        auto dm = py::module("dummy");
+        auto failures = py::list();
+
+        py::class_<Dupe1>(dm, "Dupe1");
+        py::class_<Dupe2>(dm, "Dupe2");
+        dm.def("dupe1_factory", []() { return Dupe1(); });
+        py::exception<DupeException>(dm, "DupeException");
+
+        try {
+            py::class_<Dupe1>(dm, "Dupe1");
+            failures.append("Dupe1 class");
+        } catch (std::runtime_error &) {}
+        try {
+            dm.def("Dupe1", []() { return Dupe1(); });
+            failures.append("Dupe1 function");
+        } catch (std::runtime_error &) {}
+        try {
+            py::class_<Dupe3>(dm, "dupe1_factory");
+            failures.append("dupe1_factory");
+        } catch (std::runtime_error &) {}
+        try {
+            py::exception<Dupe3>(dm, "Dupe2");
+            failures.append("Dupe2");
+        } catch (std::runtime_error &) {}
+        try {
+            dm.def("DupeException", []() { return 30; });
+            failures.append("DupeException1");
+        } catch (std::runtime_error &) {}
+        try {
+            py::class_<DupeException>(dm, "DupeException");
+            failures.append("DupeException2");
+        } catch (std::runtime_error &) {}
+
+        return failures;
+    });
+}
diff --git a/pybind11/tests/test_modules.py b/pybind11/tests/test_modules.py
index fe72f190a..2552838c2 100644
--- a/pybind11/tests/test_modules.py
+++ b/pybind11/tests/test_modules.py
@@ -1,32 +1,34 @@
+from pybind11_tests import modules as m
+from pybind11_tests.modules import subsubmodule as ms
+from pybind11_tests import ConstructorStats
+
 
 def test_nested_modules():
     import pybind11_tests
-    from pybind11_tests.submodule import submodule_func
-
     assert pybind11_tests.__name__ == "pybind11_tests"
-    assert pybind11_tests.submodule.__name__ == "pybind11_tests.submodule"
+    assert pybind11_tests.modules.__name__ == "pybind11_tests.modules"
+    assert pybind11_tests.modules.subsubmodule.__name__ == "pybind11_tests.modules.subsubmodule"
+    assert m.__name__ == "pybind11_tests.modules"
+    assert ms.__name__ == "pybind11_tests.modules.subsubmodule"
 
-    assert submodule_func() == "submodule_func()"
+    assert ms.submodule_func() == "submodule_func()"
 
 
 def test_reference_internal():
-    from pybind11_tests import ConstructorStats
-    from pybind11_tests.submodule import A, B
-
-    b = B()
+    b = ms.B()
     assert str(b.get_a1()) == "A[1]"
     assert str(b.a1) == "A[1]"
     assert str(b.get_a2()) == "A[2]"
     assert str(b.a2) == "A[2]"
 
-    b.a1 = A(42)
-    b.a2 = A(43)
+    b.a1 = ms.A(42)
+    b.a2 = ms.A(43)
     assert str(b.get_a1()) == "A[42]"
     assert str(b.a1) == "A[42]"
     assert str(b.get_a2()) == "A[43]"
     assert str(b.a2) == "A[43]"
 
-    astats, bstats = ConstructorStats.get(A), ConstructorStats.get(B)
+    astats, bstats = ConstructorStats.get(ms.A), ConstructorStats.get(ms.B)
     assert astats.alive() == 2
     assert bstats.alive() == 1
     del b
@@ -47,8 +49,24 @@ def test_reference_internal():
 
 
 def test_importing():
-    from pybind11_tests import OD
+    from pybind11_tests.modules import OD
     from collections import OrderedDict
 
     assert OD is OrderedDict
     assert str(OD([(1, 'a'), (2, 'b')])) == "OrderedDict([(1, 'a'), (2, 'b')])"
+
+
+def test_pydoc():
+    """Pydoc needs to be able to provide help() for everything inside a pybind11 module"""
+    import pybind11_tests
+    import pydoc
+
+    assert pybind11_tests.__name__ == "pybind11_tests"
+    assert pybind11_tests.__doc__ == "pybind11 test module"
+    assert pydoc.text.docmodule(pybind11_tests)
+
+
+def test_duplicate_registration():
+    """Registering two things with the same name"""
+
+    assert m.duplicate_registration() == []
diff --git a/pybind11/tests/test_multiple_inheritance.cpp b/pybind11/tests/test_multiple_inheritance.cpp
index c57cb852a..35f9d9c4e 100644
--- a/pybind11/tests/test_multiple_inheritance.cpp
+++ b/pybind11/tests/test_multiple_inheritance.cpp
@@ -9,75 +9,212 @@
 */
 
 #include "pybind11_tests.h"
+#include "constructor_stats.h"
 
-struct Base1 {
-    Base1(int i) : i(i) { }
-    int foo() { return i; }
+// Many bases for testing that multiple inheritance from many classes (i.e. requiring extra
+// space for holder constructed flags) works.
+template <int N> struct BaseN {
+    BaseN(int i) : i(i) { }
     int i;
 };
 
-struct Base2 {
-    Base2(int i) : i(i) { }
-    int bar() { return i; }
-    int i;
+// test_mi_static_properties
+struct Vanilla {  // plain base: one instance method, no static members
+    std::string vanilla() { return "Vanilla"; }
 };
-
-struct Base12 : Base1, Base2 {
-    Base12(int i, int j) : Base1(i), Base2(j) { }
+struct WithStatic1 {  // base contributing a static function and a static data member
+    static std::string static_func1() { return "WithStatic1"; }
+    static int static_value1;
 };
-
-struct MIType : Base12 {
-    MIType(int i, int j) : Base12(i, j) { }
+struct WithStatic2 {  // second base contributing a static function and a static data member
+    static std::string static_func2() { return "WithStatic2"; }
+    static int static_value2;
 };
-
-test_initializer multiple_inheritance([](py::module &m) {
-    py::class_<Base1>(m, "Base1")
-        .def(py::init<int>())
-        .def("foo", &Base1::foo);
-
-    py::class_<Base2>(m, "Base2")
-        .def(py::init<int>())
-        .def("bar", &Base2::bar);
-
+struct VanillaStaticMix1 : Vanilla, WithStatic1, WithStatic2 {
+    static std::string static_func() { return "VanillaStaticMix1"; }
+    static int static_value;
+};
+struct VanillaStaticMix2 : WithStatic1, Vanilla, WithStatic2 {
+    static std::string static_func() { return "VanillaStaticMix2"; }
+    static int static_value;
+};
+int WithStatic1::static_value1 = 1;
+int WithStatic2::static_value2 = 2;
+int VanillaStaticMix1::static_value = 12;
+int VanillaStaticMix2::static_value = 12;
+
+TEST_SUBMODULE(multiple_inheritance, m) {
+
+    // test_multiple_inheritance_mix1
+    // test_multiple_inheritance_mix2
+    struct Base1 {
+        Base1(int i) : i(i) { }
+        int foo() { return i; }
+        int i;
+    };
+    py::class_<Base1> b1(m, "Base1");
+    b1.def(py::init<int>())
+      .def("foo", &Base1::foo);
+
+    struct Base2 {
+        Base2(int i) : i(i) { }
+        int bar() { return i; }
+        int i;
+    };
+    py::class_<Base2> b2(m, "Base2");
+    b2.def(py::init<int>())
+      .def("bar", &Base2::bar);
+
+
+    // test_multiple_inheritance_cpp
+    struct Base12 : Base1, Base2 {
+        Base12(int i, int j) : Base1(i), Base2(j) { }
+    };
+    struct MIType : Base12 {
+        MIType(int i, int j) : Base12(i, j) { }
+    };
     py::class_<Base12, Base1, Base2>(m, "Base12");
-
     py::class_<MIType, Base12>(m, "MIType")
         .def(py::init<int, int>());
-});
-
-/* Test the case where not all base classes are specified,
-   and where pybind11 requires the py::multiple_inheritance
-   flag to perform proper casting between types */
-
-struct Base1a {
-    Base1a(int i) : i(i) { }
-    int foo() { return i; }
-    int i;
-};
 
-struct Base2a {
-    Base2a(int i) : i(i) { }
-    int bar() { return i; }
-    int i;
-};
 
-struct Base12a : Base1a, Base2a {
-    Base12a(int i, int j) : Base1a(i), Base2a(j) { }
-};
-
-test_initializer multiple_inheritance_nonexplicit([](py::module &m) {
+    // test_multiple_inheritance_python_many_bases
+    #define PYBIND11_BASEN(N) py::class_<BaseN<N>>(m, "BaseN" #N).def(py::init<int>()).def("f" #N, [](BaseN<N> &b) { return b.i + N; })
+    PYBIND11_BASEN( 1); PYBIND11_BASEN( 2); PYBIND11_BASEN( 3); PYBIND11_BASEN( 4);
+    PYBIND11_BASEN( 5); PYBIND11_BASEN( 6); PYBIND11_BASEN( 7); PYBIND11_BASEN( 8);
+    PYBIND11_BASEN( 9); PYBIND11_BASEN(10); PYBIND11_BASEN(11); PYBIND11_BASEN(12);
+    PYBIND11_BASEN(13); PYBIND11_BASEN(14); PYBIND11_BASEN(15); PYBIND11_BASEN(16);
+    PYBIND11_BASEN(17);
+
+    // Uncommenting this should result in a compile time failure (MI can only be specified via
+    // template parameters because pybind has to know the types involved; see discussion in #742 for
+    // details).
+//    struct Base12v2 : Base1, Base2 {
+//        Base12v2(int i, int j) : Base1(i), Base2(j) { }
+//    };
+//    py::class_<Base12v2>(m, "Base12v2", b1, b2)
+//        .def(py::init<int, int>());
+
+
+    // test_multiple_inheritance_virtbase
+    // Test the case where not all base classes are specified, and where pybind11 requires the
+    // py::multiple_inheritance flag to perform proper casting between types.
+    struct Base1a {
+        Base1a(int i) : i(i) { }
+        int foo() { return i; }
+        int i;
+    };
     py::class_<Base1a, std::shared_ptr<Base1a>>(m, "Base1a")
         .def(py::init<int>())
         .def("foo", &Base1a::foo);
 
+    struct Base2a {
+        Base2a(int i) : i(i) { }
+        int bar() { return i; }
+        int i;
+    };
     py::class_<Base2a, std::shared_ptr<Base2a>>(m, "Base2a")
         .def(py::init<int>())
         .def("bar", &Base2a::bar);
 
+    struct Base12a : Base1a, Base2a {
+        Base12a(int i, int j) : Base1a(i), Base2a(j) { }
+    };
     py::class_<Base12a, /* Base1 missing */ Base2a,
                std::shared_ptr<Base12a>>(m, "Base12a", py::multiple_inheritance())
         .def(py::init<int, int>());
 
     m.def("bar_base2a", [](Base2a *b) { return b->bar(); });
     m.def("bar_base2a_sharedptr", [](std::shared_ptr<Base2a> b) { return b->bar(); });
-});
+
+    // test_mi_unaligned_base
+    // test_mi_base_return
+    // Issue #801: invalid casting to derived type with MI bases
+    struct I801B1 { int a = 1; virtual ~I801B1() = default; };
+    struct I801B2 { int b = 2; virtual ~I801B2() = default; };
+    struct I801C : I801B1, I801B2 {};
+    struct I801D : I801C {}; // Indirect MI
+    // Unregistered classes:
+    struct I801B3 { int c = 3; virtual ~I801B3() = default; };
+    struct I801E : I801B3, I801D {};
+
+    py::class_<I801B1, std::shared_ptr<I801B1>>(m, "I801B1").def(py::init<>()).def_readonly("a", &I801B1::a);
+    py::class_<I801B2, std::shared_ptr<I801B2>>(m, "I801B2").def(py::init<>()).def_readonly("b", &I801B2::b);
+    py::class_<I801C, I801B1, I801B2, std::shared_ptr<I801C>>(m, "I801C").def(py::init<>());
+    py::class_<I801D, I801C, std::shared_ptr<I801D>>(m, "I801D").def(py::init<>());
+
+    // Two separate issues here: first, we want to recognize a pointer to a base type as being a
+    // known instance even when the pointer value is unequal (i.e. due to a non-first
+    // multiple-inheritance base class):
+    m.def("i801b1_c", [](I801C *c) { return static_cast<I801B1 *>(c); });
+    m.def("i801b2_c", [](I801C *c) { return static_cast<I801B2 *>(c); });
+    m.def("i801b1_d", [](I801D *d) { return static_cast<I801B1 *>(d); });
+    m.def("i801b2_d", [](I801D *d) { return static_cast<I801B2 *>(d); });
+
+    // Second, when returned a base class pointer to a derived instance, we cannot assume that the
+    // pointer is `reinterpret_cast`able to the derived pointer because, like above, the base class
+    // pointer could be offset.
+    m.def("i801c_b1", []() -> I801B1 * { return new I801C(); });
+    m.def("i801c_b2", []() -> I801B2 * { return new I801C(); });
+    m.def("i801d_b1", []() -> I801B1 * { return new I801D(); });
+    m.def("i801d_b2", []() -> I801B2 * { return new I801D(); });
+
+    // Return a base class pointer to a pybind-registered type when the actual derived type
+    // isn't pybind-registered (and uses multiple-inheritance to offset the pybind base)
+    m.def("i801e_c", []() -> I801C * { return new I801E(); });
+    m.def("i801e_b2", []() -> I801B2 * { return new I801E(); });
+
+
+    // test_mi_static_properties
+    py::class_<Vanilla>(m, "Vanilla")
+        .def(py::init<>())
+        .def("vanilla", &Vanilla::vanilla);
+
+    py::class_<WithStatic1>(m, "WithStatic1")
+        .def(py::init<>())
+        .def_static("static_func1", &WithStatic1::static_func1)
+        .def_readwrite_static("static_value1", &WithStatic1::static_value1);
+
+    py::class_<WithStatic2>(m, "WithStatic2")
+        .def(py::init<>())
+        .def_static("static_func2", &WithStatic2::static_func2)
+        .def_readwrite_static("static_value2", &WithStatic2::static_value2);
+
+    py::class_<VanillaStaticMix1, Vanilla, WithStatic1, WithStatic2>(
+        m, "VanillaStaticMix1")
+        .def(py::init<>())
+        .def_static("static_func", &VanillaStaticMix1::static_func)
+        .def_readwrite_static("static_value", &VanillaStaticMix1::static_value);
+
+    py::class_<VanillaStaticMix2, WithStatic1, Vanilla, WithStatic2>(
+        m, "VanillaStaticMix2")
+        .def(py::init<>())
+        .def_static("static_func", &VanillaStaticMix2::static_func)
+        .def_readwrite_static("static_value", &VanillaStaticMix2::static_value);
+
+
+#if !defined(PYPY_VERSION)
+    struct WithDict { };
+    struct VanillaDictMix1 : Vanilla, WithDict { };
+    struct VanillaDictMix2 : WithDict, Vanilla { };
+    py::class_<WithDict>(m, "WithDict", py::dynamic_attr()).def(py::init<>());
+    py::class_<VanillaDictMix1, Vanilla, WithDict>(m, "VanillaDictMix1").def(py::init<>());
+    py::class_<VanillaDictMix2, WithDict, Vanilla>(m, "VanillaDictMix2").def(py::init<>());
+#endif
+
+    // test_diamond_inheritance
+    // Issue #959: segfault when constructing diamond inheritance instance
+    // All of these have int members so that there will be various unequal pointers involved.
+    struct B { int b; virtual ~B() = default; };
+    struct C0 : public virtual B { int c0; };
+    struct C1 : public virtual B { int c1; };
+    struct D : public C0, public C1 { int d; };
+    py::class_<B>(m, "B")
+        .def("b", [](B *self) { return self; });
+    py::class_<C0, B>(m, "C0")
+        .def("c0", [](C0 *self) { return self; });
+    py::class_<C1, B>(m, "C1")
+        .def("c1", [](C1 *self) { return self; });
+    py::class_<D, C0, C1>(m, "D")
+        .def(py::init<>());
+}
diff --git a/pybind11/tests/test_multiple_inheritance.py b/pybind11/tests/test_multiple_inheritance.py
index c10298d70..475dd3b3d 100644
--- a/pybind11/tests/test_multiple_inheritance.py
+++ b/pybind11/tests/test_multiple_inheritance.py
@@ -1,15 +1,16 @@
-def test_multiple_inheritance_cpp():
-    from pybind11_tests import MIType
+import pytest
+from pybind11_tests import ConstructorStats
+from pybind11_tests import multiple_inheritance as m
+
 
-    mt = MIType(3, 4)
+def test_multiple_inheritance_cpp():
+    mt = m.MIType(3, 4)
 
     assert mt.foo() == 3
     assert mt.bar() == 4
 
 
 def test_multiple_inheritance_mix1():
-    from pybind11_tests import Base2
-
     class Base1:
         def __init__(self, i):
             self.i = i
@@ -17,10 +18,10 @@ def test_multiple_inheritance_mix1():
         def foo(self):
             return self.i
 
-    class MITypePy(Base1, Base2):
+    class MITypePy(Base1, m.Base2):
         def __init__(self, i, j):
             Base1.__init__(self, i)
-            Base2.__init__(self, j)
+            m.Base2.__init__(self, j)
 
     mt = MITypePy(3, 4)
 
@@ -29,7 +30,6 @@ def test_multiple_inheritance_mix1():
 
 
 def test_multiple_inheritance_mix2():
-    from pybind11_tests import Base1
 
     class Base2:
         def __init__(self, i):
@@ -38,9 +38,9 @@ def test_multiple_inheritance_mix2():
         def bar(self):
             return self.i
 
-    class MITypePy(Base1, Base2):
+    class MITypePy(m.Base1, Base2):
         def __init__(self, i, j):
-            Base1.__init__(self, i)
+            m.Base1.__init__(self, i)
             Base2.__init__(self, j)
 
     mt = MITypePy(3, 4)
@@ -49,14 +49,301 @@ def test_multiple_inheritance_mix2():
     assert mt.bar() == 4
 
 
+def test_multiple_inheritance_python():
+    """Python subclasses mixing m.Base1/m.Base2 with pure-Python bases, in varying base order."""
+    class MI1(m.Base1, m.Base2):
+        def __init__(self, i, j):
+            m.Base1.__init__(self, i)
+            m.Base2.__init__(self, j)
+
+    class B1(object):  # pure-Python base; supplies v()
+        def v(self):
+            return 1
+
+    class MI2(B1, m.Base1, m.Base2):  # pure-Python base listed before the bound ones
+        def __init__(self, i, j):
+            B1.__init__(self)
+            m.Base1.__init__(self, i)
+            m.Base2.__init__(self, j)
+
+    class MI3(MI2):  # reaches the bound bases only indirectly, through MI2
+        def __init__(self, i, j):
+            MI2.__init__(self, i, j)
+
+    class MI4(MI3, m.Base2):  # m.Base2 appears both directly and via MI3
+        def __init__(self, i, j):
+            MI3.__init__(self, i, j)
+            # This should be ignored (Base2 is already initialized via MI2):
+            m.Base2.__init__(self, i + 100)
+
+    class MI5(m.Base2, B1, m.Base1):  # pure-Python base placed between the bound bases
+        def __init__(self, i, j):
+            B1.__init__(self)
+            m.Base1.__init__(self, i)
+            m.Base2.__init__(self, j)
+
+    class MI6(m.Base2, B1):  # bound base listed first
+        def __init__(self, i):
+            m.Base2.__init__(self, i)
+            B1.__init__(self)
+
+    class B2(B1):
+        def v(self):
+            return 2
+
+    class B3(object):
+        def v(self):
+            return 3
+
+    class B4(B3, B2):  # defines its own v(), hiding B3.v and B2.v
+        def v(self):
+            return 4
+
+    class MI7(B4, MI6):
+        def __init__(self, i):
+            B4.__init__(self)
+            MI6.__init__(self, i)
+
+    class MI8(MI6, B3):
+        def __init__(self, i):
+            MI6.__init__(self, i)
+            B3.__init__(self)
+
+    class MI8b(B3, MI6):  # same bases as MI8, opposite order
+        def __init__(self, i):
+            B3.__init__(self)
+            MI6.__init__(self, i)
+
+    mi1 = MI1(1, 2)
+    assert mi1.foo() == 1  # foo() is expected to give back the value passed to m.Base1
+    assert mi1.bar() == 2  # bar() is expected to give back the value passed to m.Base2
+
+    mi2 = MI2(3, 4)
+    assert mi2.v() == 1
+    assert mi2.foo() == 3
+    assert mi2.bar() == 4
+
+    mi3 = MI3(5, 6)
+    assert mi3.v() == 1
+    assert mi3.foo() == 5
+    assert mi3.bar() == 6
+
+    mi4 = MI4(7, 8)
+    assert mi4.v() == 1
+    assert mi4.foo() == 7
+    assert mi4.bar() == 8  # not 107: the second m.Base2.__init__ call must have no effect
+
+    mi5 = MI5(10, 11)
+    assert mi5.v() == 1
+    assert mi5.foo() == 10
+    assert mi5.bar() == 11
+
+    mi6 = MI6(12)
+    assert mi6.v() == 1
+    assert mi6.bar() == 12
+
+    mi7 = MI7(13)
+    assert mi7.v() == 4  # B4 is listed before MI6, so B4.v is found first
+    assert mi7.bar() == 13
+
+    mi8 = MI8(14)
+    assert mi8.v() == 1  # MI6 (and with it B1.v) is searched before B3
+    assert mi8.bar() == 14
+
+    mi8b = MI8b(15)
+    assert mi8b.v() == 3  # B3 is searched before MI6
+    assert mi8b.bar() == 15
+
+
+def test_multiple_inheritance_python_many_bases():
+    """Instances with 4, 8, 9 and 17 registered C++ bases must expose every base's fN()."""
+    class MIMany14(m.BaseN1, m.BaseN2, m.BaseN3, m.BaseN4):
+        def __init__(self):
+            m.BaseN1.__init__(self, 1)
+            m.BaseN2.__init__(self, 2)
+            m.BaseN3.__init__(self, 3)
+            m.BaseN4.__init__(self, 4)
+
+    class MIMany58(m.BaseN5, m.BaseN6, m.BaseN7, m.BaseN8):
+        def __init__(self):
+            m.BaseN5.__init__(self, 5)
+            m.BaseN6.__init__(self, 6)
+            m.BaseN7.__init__(self, 7)
+            m.BaseN8.__init__(self, 8)
+
+    class MIMany916(m.BaseN9, m.BaseN10, m.BaseN11, m.BaseN12, m.BaseN13, m.BaseN14, m.BaseN15,
+                    m.BaseN16):
+        def __init__(self):
+            m.BaseN9.__init__(self, 9)
+            m.BaseN10.__init__(self, 10)
+            m.BaseN11.__init__(self, 11)
+            m.BaseN12.__init__(self, 12)
+            m.BaseN13.__init__(self, 13)
+            m.BaseN14.__init__(self, 14)
+            m.BaseN15.__init__(self, 15)
+            m.BaseN16.__init__(self, 16)
+
+    class MIMany19(MIMany14, MIMany58, m.BaseN9):
+        def __init__(self):
+            MIMany14.__init__(self)
+            MIMany58.__init__(self)
+            m.BaseN9.__init__(self, 9)
+
+    class MIMany117(MIMany14, MIMany58, MIMany916, m.BaseN17):
+        def __init__(self):
+            MIMany14.__init__(self)
+            MIMany58.__init__(self)
+            MIMany916.__init__(self)
+            m.BaseN17.__init__(self, 17)
+
+    # Inherits from 4 registered C++ classes: can fit in one pointer on any modern arch:
+    a = MIMany14()
+    for i in range(1, 5):  # f1..f4 -- range() excludes its stop value
+        assert getattr(a, "f" + str(i))() == 2 * i  # each base was initialised with its own N
+
+    # Inherits from 8: requires 1/2 pointers worth of holder flags on 32/64-bit arch:
+    b = MIMany916()
+    for i in range(9, 17):  # f9..f16
+        assert getattr(b, "f" + str(i))() == 2 * i
+
+    # Inherits from 9: requires >= 2 pointers worth of holder flags
+    c = MIMany19()
+    for i in range(1, 10):  # f1..f9
+        assert getattr(c, "f" + str(i))() == 2 * i
+
+    # Inherits from 17: requires >= 3 pointers worth of holder flags
+    d = MIMany117()
+    for i in range(1, 18):  # f1..f17; NOTE(review): assumes BaseN17 binds f17 -- confirm
+        assert getattr(d, "f" + str(i))() == 2 * i
+
+
 def test_multiple_inheritance_virtbase():
-    from pybind11_tests import Base12a, bar_base2a, bar_base2a_sharedptr
 
-    class MITypePy(Base12a):
+    class MITypePy(m.Base12a):
         def __init__(self, i, j):
-            Base12a.__init__(self, i, j)
+            m.Base12a.__init__(self, i, j)
 
     mt = MITypePy(3, 4)
     assert mt.bar() == 4
-    assert bar_base2a(mt) == 4
-    assert bar_base2a_sharedptr(mt) == 4
+    assert m.bar_base2a(mt) == 4
+    assert m.bar_base2a_sharedptr(mt) == 4
+
+
+def test_mi_static_properties():
+    """Bases with and without static properties can be combined, and the
+     outcome must not depend on the order in which the bases are listed"""
+    # Both instances are constructed up front, before any static value is modified.
+    for mix in (m.VanillaStaticMix1(), m.VanillaStaticMix2()):
+        assert mix.vanilla() == "Vanilla"
+        assert mix.static_func1() == "WithStatic1"
+        assert mix.static_func2() == "WithStatic2"
+        assert mix.static_func() == mix.__class__.__name__
+        # Values stored through the defining classes are readable via the instance.
+        m.WithStatic1.static_value1 = 1
+        m.WithStatic2.static_value2 = 2
+        assert mix.static_value1 == 1
+        assert mix.static_value2 == 2
+        assert mix.static_value == 12
+        # Values stored through the instance are read back unchanged.
+        mix.static_value1 = 0
+        assert mix.static_value1 == 0
+        mix.static_value2 = 0
+        assert mix.static_value2 == 0
+        mix.static_value = 0
+        assert mix.static_value == 0
+
+
+@pytest.unsupported_on_pypy
+def test_mi_dynamic_attributes():
+    """Bases with and without dynamic attribute support can be mixed"""
+    # A brand-new attribute must be storable on, and readable from, each mix.
+    for obj in (m.VanillaDictMix1(), m.VanillaDictMix2()):
+        obj.dynamic = 1
+        assert obj.dynamic == 1
+
+
+def test_mi_unaligned_base():
+    """A pointer to an offset (non-first MI) base must map back to the existing instance"""
+    # Registered-instance count before this test creates anything.
+    baseline = ConstructorStats.detail_reg_inst()
+
+    obj_c = m.I801C()
+    obj_d = m.I801D()
+    # Two objects, each additionally counted for its offset I801B2 base: 4 entries in total.
+    assert ConstructorStats.detail_reg_inst() == baseline + 4
+    c_via_b1 = m.i801b1_c(obj_c)
+    assert c_via_b1 is obj_c
+    c_via_b2 = m.i801b2_c(obj_c)
+    assert c_via_b2 is obj_c
+    d_via_b1 = m.i801b1_d(obj_d)
+    assert d_via_b1 is obj_d
+    d_via_b2 = m.i801b2_d(obj_d)
+    assert d_via_b2 is obj_d
+
+    assert ConstructorStats.detail_reg_inst() == baseline + 4  # nothing new registered
+    del obj_c, c_via_b1, c_via_b2
+    assert ConstructorStats.detail_reg_inst() == baseline + 2
+    del obj_d, d_via_b1, d_via_b2
+    assert ConstructorStats.detail_reg_inst() == baseline
+
+
+def test_mi_base_return():
+    """Returning a derived instance through an offset (non-first MI) base class pointer"""
+    # Registered-instance count before this test creates anything.
+    baseline = ConstructorStats.detail_reg_inst()
+
+    c_from_b1 = m.i801c_b1()
+    assert type(c_from_b1) is m.I801C
+    assert c_from_b1.a == 1
+    assert c_from_b1.b == 2
+
+    d_from_b1 = m.i801d_b1()
+    assert type(d_from_b1) is m.I801D
+    assert d_from_b1.a == 1
+    assert d_from_b1.b == 2
+
+    assert ConstructorStats.detail_reg_inst() == baseline + 4
+
+    c_from_b2 = m.i801c_b2()
+    assert type(c_from_b2) is m.I801C
+    assert c_from_b2.a == 1
+    assert c_from_b2.b == 2
+
+    d_from_b2 = m.i801d_b2()
+    assert type(d_from_b2) is m.I801D
+    assert d_from_b2.a == 1
+    assert d_from_b2.b == 2
+
+    assert ConstructorStats.detail_reg_inst() == baseline + 8
+
+    del c_from_b2
+    assert ConstructorStats.detail_reg_inst() == baseline + 6
+    del c_from_b1, d_from_b1, d_from_b2
+    assert ConstructorStats.detail_reg_inst() == baseline
+
+    # An unregistered derived type whose base is registered: the derived type
+    # itself cannot be detected, but the result must still be usable as an
+    # object of whichever registered type was returned.
+    e_as_c = m.i801e_c()
+    assert type(e_as_c) is m.I801C
+    assert e_as_c.a == 1
+    assert e_as_c.b == 2
+
+    e_as_b2 = m.i801e_b2()
+    assert type(e_as_b2) is m.I801B2
+    assert e_as_b2.b == 2
+
+
+def test_diamond_inheritance():
+    """Diamond-shaped inheritance must behave correctly (issue #959)"""
+
+    # Issue #959: constructing the object must not segfault:
+    diamond = m.D()
+
+    # Every base-pointer accessor, however chained, has to hand back this same Python object:
+    assert diamond.c0() is diamond
+    assert diamond.c1() is diamond
+    assert diamond.b() is diamond
+    assert diamond.c0().b() is diamond
+    assert diamond.c1().b() is diamond
+    assert diamond.c0().c1().b().c0().b() is diamond
diff --git a/pybind11/tests/test_numpy_array.cpp b/pybind11/tests/test_numpy_array.cpp
index 14c4c2999..2046c0e03 100644
--- a/pybind11/tests/test_numpy_array.cpp
+++ b/pybind11/tests/test_numpy_array.cpp
@@ -13,10 +13,10 @@
 #include <pybind11/stl.h>
 
 #include <cstdint>
-#include <vector>
 
 using arr = py::array;
 using arr_t = py::array_t<uint16_t, 0>;
+static_assert(std::is_same<arr_t::value_type, uint16_t>::value, "");
 
 template<typename... Ix> arr data(const arr& a, Ix... index) {
     return arr(a.nbytes() - a.offset_at(index...), (const uint8_t *) a.data(index...));
@@ -26,39 +26,25 @@ template<typename... Ix> arr data_t(const arr_t& a, Ix... index) {
     return arr(a.size() - a.index_at(index...), a.data(index...));
 }
 
-arr& mutate_data(arr& a) {
-    auto ptr = (uint8_t *) a.mutable_data();
-    for (size_t i = 0; i < a.nbytes(); i++)
-        ptr[i] = (uint8_t) (ptr[i] * 2);
-    return a;
-}
-
-arr_t& mutate_data_t(arr_t& a) {
-    auto ptr = a.mutable_data();
-    for (size_t i = 0; i < a.size(); i++)
-        ptr[i]++;
-    return a;
-}
-
 template<typename... Ix> arr& mutate_data(arr& a, Ix... index) {
     auto ptr = (uint8_t *) a.mutable_data(index...);
-    for (size_t i = 0; i < a.nbytes() - a.offset_at(index...); i++)
+    for (ssize_t i = 0; i < a.nbytes() - a.offset_at(index...); i++)
         ptr[i] = (uint8_t) (ptr[i] * 2);
     return a;
 }
 
 template<typename... Ix> arr_t& mutate_data_t(arr_t& a, Ix... index) {
     auto ptr = a.mutable_data(index...);
-    for (size_t i = 0; i < a.size() - a.index_at(index...); i++)
+    for (ssize_t i = 0; i < a.size() - a.index_at(index...); i++)
         ptr[i]++;
     return a;
 }
 
-template<typename... Ix> size_t index_at(const arr& a, Ix... idx) { return a.index_at(idx...); }
-template<typename... Ix> size_t index_at_t(const arr_t& a, Ix... idx) { return a.index_at(idx...); }
-template<typename... Ix> size_t offset_at(const arr& a, Ix... idx) { return a.offset_at(idx...); }
-template<typename... Ix> size_t offset_at_t(const arr_t& a, Ix... idx) { return a.offset_at(idx...); }
-template<typename... Ix> size_t at_t(const arr_t& a, Ix... idx) { return a.at(idx...); }
+template<typename... Ix> ssize_t index_at(const arr& a, Ix... idx) { return a.index_at(idx...); }
+template<typename... Ix> ssize_t index_at_t(const arr_t& a, Ix... idx) { return a.index_at(idx...); }
+template<typename... Ix> ssize_t offset_at(const arr& a, Ix... idx) { return a.offset_at(idx...); }
+template<typename... Ix> ssize_t offset_at_t(const arr_t& a, Ix... idx) { return a.offset_at(idx...); }
+template<typename... Ix> ssize_t at_t(const arr_t& a, Ix... idx) { return a.at(idx...); }
 template<typename... Ix> arr_t& mutate_at_t(arr_t& a, Ix... idx) { a.mutable_at(idx...)++; return a; }
 
 #define def_index_fn(name, type) \
@@ -67,55 +53,72 @@ template<typename... Ix> arr_t& mutate_at_t(arr_t& a, Ix... idx) { a.mutable_at(
     sm.def(#name, [](type a, int i, int j) { return name(a, i, j); }); \
     sm.def(#name, [](type a, int i, int j, int k) { return name(a, i, j, k); });
 
-test_initializer numpy_array([](py::module &m) {
-    auto sm = m.def_submodule("array");
+template <typename View, typename MutView> py::handle auxiliaries(View &&r, MutView &&r2) {
+    if (r.ndim() != 2) throw std::domain_error("error: ndim != 2"); // only 2-D input accepted
+    py::list summary; // filled in a fixed order: two elements, one comparison, six attributes
+    summary.append(*r.data(0, 0));                          // element (0, 0) as read through r
+    summary.append(*r2.mutable_data(0, 0));                 // element (0, 0) as read through r2
+    summary.append(r.data(0, 1) == r2.mutable_data(0, 1));  // do both give the same pointer?
+    summary.append(r.ndim());
+    summary.append(r.itemsize());
+    summary.append(r.shape(0));
+    summary.append(r.shape(1));
+    summary.append(r.size());
+    summary.append(r.nbytes());
+    return summary.release();
+}
+
+TEST_SUBMODULE(numpy_array, sm) {
+    try { py::module::import("numpy"); }
+    catch (...) { return; }
 
+    // test_array_attributes
     sm.def("ndim", [](const arr& a) { return a.ndim(); });
     sm.def("shape", [](const arr& a) { return arr(a.ndim(), a.shape()); });
-    sm.def("shape", [](const arr& a, size_t dim) { return a.shape(dim); });
+    sm.def("shape", [](const arr& a, ssize_t dim) { return a.shape(dim); });
     sm.def("strides", [](const arr& a) { return arr(a.ndim(), a.strides()); });
-    sm.def("strides", [](const arr& a, size_t dim) { return a.strides(dim); });
+    sm.def("strides", [](const arr& a, ssize_t dim) { return a.strides(dim); });
     sm.def("writeable", [](const arr& a) { return a.writeable(); });
     sm.def("size", [](const arr& a) { return a.size(); });
     sm.def("itemsize", [](const arr& a) { return a.itemsize(); });
     sm.def("nbytes", [](const arr& a) { return a.nbytes(); });
     sm.def("owndata", [](const arr& a) { return a.owndata(); });
 
-    def_index_fn(data, const arr&);
-    def_index_fn(data_t, const arr_t&);
+    // test_index_offset
     def_index_fn(index_at, const arr&);
     def_index_fn(index_at_t, const arr_t&);
     def_index_fn(offset_at, const arr&);
     def_index_fn(offset_at_t, const arr_t&);
+    // test_data
+    def_index_fn(data, const arr&);
+    def_index_fn(data_t, const arr_t&);
+    // test_mutate_data, test_mutate_readonly
     def_index_fn(mutate_data, arr&);
     def_index_fn(mutate_data_t, arr_t&);
     def_index_fn(at_t, const arr_t&);
     def_index_fn(mutate_at_t, arr_t&);
 
-    sm.def("make_f_array", [] {
-        return py::array_t<float>({ 2, 2 }, { 4, 8 });
-    });
-
-    sm.def("make_c_array", [] {
-        return py::array_t<float>({ 2, 2 }, { 8, 4 });
-    });
+    // test_make_c_f_array
+    sm.def("make_f_array", [] { return py::array_t<float>({ 2, 2 }, { 4, 8 }); });
+    sm.def("make_c_array", [] { return py::array_t<float>({ 2, 2 }, { 8, 4 }); });
 
+    // test_wrap
     sm.def("wrap", [](py::array a) {
         return py::array(
             a.dtype(),
-            std::vector<size_t>(a.shape(), a.shape() + a.ndim()),
-            std::vector<size_t>(a.strides(), a.strides() + a.ndim()),
+            {a.shape(), a.shape() + a.ndim()},
+            {a.strides(), a.strides() + a.ndim()},
             a.data(),
             a
         );
     });
 
+    // test_numpy_view
     struct ArrayClass {
         int data[2] = { 1, 2 };
         ArrayClass() { py::print("ArrayClass()"); }
         ~ArrayClass() { py::print("~ArrayClass()"); }
     };
-
     py::class_<ArrayClass>(sm, "ArrayClass")
         .def(py::init<>())
         .def("numpy_view", [](py::object &obj) {
@@ -125,16 +128,18 @@ test_initializer numpy_array([](py::module &m) {
         }
     );
 
+    // test_cast_numpy_int64_to_uint64
     sm.def("function_taking_uint64", [](uint64_t) { });
 
+    // test_isinstance
     sm.def("isinstance_untyped", [](py::object yes, py::object no) {
         return py::isinstance<py::array>(yes) && !py::isinstance<py::array>(no);
     });
-
     sm.def("isinstance_typed", [](py::object o) {
         return py::isinstance<py::array_t<double>>(o) && !py::isinstance<py::array_t<int>>(o);
     });
 
+    // test_constructors
     sm.def("default_constructors", []() {
         return py::dict(
             "array"_a=py::array(),
@@ -142,7 +147,6 @@ test_initializer numpy_array([](py::module &m) {
             "array_t<double>"_a=py::array_t<double>()
         );
     });
-
     sm.def("converting_constructors", [](py::object o) {
         return py::dict(
             "array"_a=py::array(o),
@@ -150,4 +154,142 @@ test_initializer numpy_array([](py::module &m) {
             "array_t<double>"_a=py::array_t<double>(o)
         );
     });
-});
+
+    // test_overload_resolution
+    sm.def("overloaded", [](py::array_t<double>) { return "double"; });
+    sm.def("overloaded", [](py::array_t<float>) { return "float"; });
+    sm.def("overloaded", [](py::array_t<int>) { return "int"; });
+    sm.def("overloaded", [](py::array_t<unsigned short>) { return "unsigned short"; });
+    sm.def("overloaded", [](py::array_t<long long>) { return "long long"; });
+    sm.def("overloaded", [](py::array_t<std::complex<double>>) { return "double complex"; });
+    sm.def("overloaded", [](py::array_t<std::complex<float>>) { return "float complex"; });
+
+    sm.def("overloaded2", [](py::array_t<std::complex<double>>) { return "double complex"; });
+    sm.def("overloaded2", [](py::array_t<double>) { return "double"; });
+    sm.def("overloaded2", [](py::array_t<std::complex<float>>) { return "float complex"; });
+    sm.def("overloaded2", [](py::array_t<float>) { return "float"; });
+
+    // Only accept the exact types:
+    sm.def("overloaded3", [](py::array_t<int>) { return "int"; }, py::arg().noconvert());
+    sm.def("overloaded3", [](py::array_t<double>) { return "double"; }, py::arg().noconvert());
+
+    // Make sure we don't do unsafe coercion (e.g. float to int) when not using forcecast, but
+    // rather that float gets converted via the safe (conversion to double) overload:
+    sm.def("overloaded4", [](py::array_t<long long, 0>) { return "long long"; });
+    sm.def("overloaded4", [](py::array_t<double, 0>) { return "double"; });
+
+    // But we do allow conversion to int if forcecast is enabled (but only if no overload matches
+    // without conversion)
+    sm.def("overloaded5", [](py::array_t<unsigned int>) { return "unsigned int"; });
+    sm.def("overloaded5", [](py::array_t<double>) { return "double"; });
+
+    // test_greedy_string_overload
+    // Issue 685: ndarray shouldn't go to std::string overload
+    sm.def("issue685", [](std::string) { return "string"; });
+    sm.def("issue685", [](py::array) { return "array"; });
+    sm.def("issue685", [](py::object) { return "other"; });
+
+    // test_array_unchecked_fixed_dims
+    sm.def("proxy_add2", [](py::array_t<double> a, double v) {
+        auto r = a.mutable_unchecked<2>();
+        for (ssize_t i = 0; i < r.shape(0); i++)
+            for (ssize_t j = 0; j < r.shape(1); j++)
+                r(i, j) += v;
+    }, py::arg().noconvert(), py::arg());
+
+    sm.def("proxy_init3", [](double start) {
+        py::array_t<double, py::array::c_style> a({ 3, 3, 3 });
+        auto r = a.mutable_unchecked<3>();
+        for (ssize_t i = 0; i < r.shape(0); i++)
+        for (ssize_t j = 0; j < r.shape(1); j++)
+        for (ssize_t k = 0; k < r.shape(2); k++)
+            r(i, j, k) = start++;
+        return a;
+    });
+    sm.def("proxy_init3F", [](double start) {
+        py::array_t<double, py::array::f_style> a({ 3, 3, 3 });
+        auto r = a.mutable_unchecked<3>();
+        for (ssize_t k = 0; k < r.shape(2); k++)
+        for (ssize_t j = 0; j < r.shape(1); j++)
+        for (ssize_t i = 0; i < r.shape(0); i++)
+            r(i, j, k) = start++;
+        return a;
+    });
+    sm.def("proxy_squared_L2_norm", [](py::array_t<double> a) {
+        auto r = a.unchecked<1>();
+        double sumsq = 0;
+        for (ssize_t i = 0; i < r.shape(0); i++)
+            sumsq += r[i] * r(i); // Either notation works for a 1D array
+        return sumsq;
+    });
+
+    sm.def("proxy_auxiliaries2", [](py::array_t<double> a) {
+        auto r = a.unchecked<2>();
+        auto r2 = a.mutable_unchecked<2>();
+        return auxiliaries(r, r2);
+    });
+
+    // test_array_unchecked_dyn_dims
+    // Same as the above, but without a compile-time dimensions specification:
+    sm.def("proxy_add2_dyn", [](py::array_t<double> a, double v) {
+        auto r = a.mutable_unchecked();
+        if (r.ndim() != 2) throw std::domain_error("error: ndim != 2");
+        for (ssize_t i = 0; i < r.shape(0); i++)
+            for (ssize_t j = 0; j < r.shape(1); j++)
+                r(i, j) += v;
+    }, py::arg().noconvert(), py::arg());
+    sm.def("proxy_init3_dyn", [](double start) {
+        py::array_t<double, py::array::c_style> a({ 3, 3, 3 });
+        auto r = a.mutable_unchecked();
+        if (r.ndim() != 3) throw std::domain_error("error: ndim != 3");
+        for (ssize_t i = 0; i < r.shape(0); i++)
+        for (ssize_t j = 0; j < r.shape(1); j++)
+        for (ssize_t k = 0; k < r.shape(2); k++)
+            r(i, j, k) = start++;
+        return a;
+    });
+    sm.def("proxy_auxiliaries2_dyn", [](py::array_t<double> a) {
+        return auxiliaries(a.unchecked(), a.mutable_unchecked());
+    });
+
+    sm.def("array_auxiliaries2", [](py::array_t<double> a) {
+        return auxiliaries(a, a);
+    });
+
+    // test_array_failures
+    // Issue #785: Uninformative "Unknown internal error" exception when constructing array from empty object:
+    sm.def("array_fail_test", []() { return py::array(py::object()); });
+    sm.def("array_t_fail_test", []() { return py::array_t<double>(py::object()); });
+    // Make sure the error from numpy is being passed through:
+    sm.def("array_fail_test_negative_size", []() { int c = 0; return py::array(-1, &c); });
+
+    // test_initializer_list
+    // Issue (unnumbered; reported in #788): regression: initializer lists can be ambiguous
+    sm.def("array_initializer_list1", []() { return py::array_t<float>(1); }); // { 1 } also works, but clang warns about it
+    sm.def("array_initializer_list2", []() { return py::array_t<float>({ 1, 2 }); });
+    sm.def("array_initializer_list3", []() { return py::array_t<float>({ 1, 2, 3 }); });
+    sm.def("array_initializer_list4", []() { return py::array_t<float>({ 1, 2, 3, 4 }); });
+
+    // test_array_resize
+    // reshape array to 2D without changing size
+    sm.def("array_reshape2", [](py::array_t<double> a) {
+        const ssize_t dim_sz = (ssize_t)std::sqrt(a.size());
+        if (dim_sz * dim_sz != a.size())
+            throw std::domain_error("array_reshape2: input array total size is not a squared integer");
+        a.resize({dim_sz, dim_sz});
+    });
+
+    // resize to 3D array with each dimension = N
+    sm.def("array_resize3", [](py::array_t<double> a, size_t N, bool refcheck) {
+        a.resize({N, N, N}, refcheck);
+    });
+
+    // test_array_create_and_resize
+    // return 2D array with Nrows = Ncols = N
+    sm.def("create_and_resize", [](size_t N) {
+        py::array_t<double> a;
+        a.resize({N, N});
+        std::fill(a.mutable_data(), a.mutable_data() + a.size(), 42.);
+        return a;
+    });
+}
diff --git a/pybind11/tests/test_numpy_array.py b/pybind11/tests/test_numpy_array.py
index b96790c39..27433934f 100644
--- a/pybind11/tests/test_numpy_array.py
+++ b/pybind11/tests/test_numpy_array.py
@@ -1,4 +1,7 @@
 import pytest
+from pybind11_tests import numpy_array as m
+
+pytestmark = pytest.requires_numpy
 
 with pytest.suppress(ImportError):
     import numpy as np
@@ -6,142 +9,117 @@ with pytest.suppress(ImportError):
 
 @pytest.fixture(scope='function')
 def arr():
-    return np.array([[1, 2, 3], [4, 5, 6]], '<u2')
+    return np.array([[1, 2, 3], [4, 5, 6]], '=u2')
 
 
-@pytest.requires_numpy
 def test_array_attributes():
-    from pybind11_tests.array import (
-        ndim, shape, strides, writeable, size, itemsize, nbytes, owndata
-    )
-
     a = np.array(0, 'f8')
-    assert ndim(a) == 0
-    assert all(shape(a) == [])
-    assert all(strides(a) == [])
+    assert m.ndim(a) == 0
+    assert all(m.shape(a) == [])
+    assert all(m.strides(a) == [])
     with pytest.raises(IndexError) as excinfo:
-        shape(a, 0)
+        m.shape(a, 0)
     assert str(excinfo.value) == 'invalid axis: 0 (ndim = 0)'
     with pytest.raises(IndexError) as excinfo:
-        strides(a, 0)
+        m.strides(a, 0)
     assert str(excinfo.value) == 'invalid axis: 0 (ndim = 0)'
-    assert writeable(a)
-    assert size(a) == 1
-    assert itemsize(a) == 8
-    assert nbytes(a) == 8
-    assert owndata(a)
+    assert m.writeable(a)
+    assert m.size(a) == 1
+    assert m.itemsize(a) == 8
+    assert m.nbytes(a) == 8
+    assert m.owndata(a)
 
     a = np.array([[1, 2, 3], [4, 5, 6]], 'u2').view()
     a.flags.writeable = False
-    assert ndim(a) == 2
-    assert all(shape(a) == [2, 3])
-    assert shape(a, 0) == 2
-    assert shape(a, 1) == 3
-    assert all(strides(a) == [6, 2])
-    assert strides(a, 0) == 6
-    assert strides(a, 1) == 2
+    assert m.ndim(a) == 2
+    assert all(m.shape(a) == [2, 3])
+    assert m.shape(a, 0) == 2
+    assert m.shape(a, 1) == 3
+    assert all(m.strides(a) == [6, 2])
+    assert m.strides(a, 0) == 6
+    assert m.strides(a, 1) == 2
     with pytest.raises(IndexError) as excinfo:
-        shape(a, 2)
+        m.shape(a, 2)
     assert str(excinfo.value) == 'invalid axis: 2 (ndim = 2)'
     with pytest.raises(IndexError) as excinfo:
-        strides(a, 2)
+        m.strides(a, 2)
     assert str(excinfo.value) == 'invalid axis: 2 (ndim = 2)'
-    assert not writeable(a)
-    assert size(a) == 6
-    assert itemsize(a) == 2
-    assert nbytes(a) == 12
-    assert not owndata(a)
+    assert not m.writeable(a)
+    assert m.size(a) == 6
+    assert m.itemsize(a) == 2
+    assert m.nbytes(a) == 12
+    assert not m.owndata(a)
 
 
-@pytest.requires_numpy
 @pytest.mark.parametrize('args, ret', [([], 0), ([0], 0), ([1], 3), ([0, 1], 1), ([1, 2], 5)])
 def test_index_offset(arr, args, ret):
-    from pybind11_tests.array import index_at, index_at_t, offset_at, offset_at_t
-    assert index_at(arr, *args) == ret
-    assert index_at_t(arr, *args) == ret
-    assert offset_at(arr, *args) == ret * arr.dtype.itemsize
-    assert offset_at_t(arr, *args) == ret * arr.dtype.itemsize
+    assert m.index_at(arr, *args) == ret
+    assert m.index_at_t(arr, *args) == ret
+    assert m.offset_at(arr, *args) == ret * arr.dtype.itemsize
+    assert m.offset_at_t(arr, *args) == ret * arr.dtype.itemsize
 
 
-@pytest.requires_numpy
 def test_dim_check_fail(arr):
-    from pybind11_tests.array import (index_at, index_at_t, offset_at, offset_at_t, data, data_t,
-                                      mutate_data, mutate_data_t)
-    for func in (index_at, index_at_t, offset_at, offset_at_t, data, data_t,
-                 mutate_data, mutate_data_t):
+    for func in (m.index_at, m.index_at_t, m.offset_at, m.offset_at_t, m.data, m.data_t,
+                 m.mutate_data, m.mutate_data_t):
         with pytest.raises(IndexError) as excinfo:
             func(arr, 1, 2, 3)
         assert str(excinfo.value) == 'too many indices for an array: 3 (ndim = 2)'
 
 
-@pytest.requires_numpy
 @pytest.mark.parametrize('args, ret',
                          [([], [1, 2, 3, 4, 5, 6]),
                           ([1], [4, 5, 6]),
                           ([0, 1], [2, 3, 4, 5, 6]),
                           ([1, 2], [6])])
 def test_data(arr, args, ret):
-    from pybind11_tests.array import data, data_t
-    assert all(data_t(arr, *args) == ret)
-    assert all(data(arr, *args)[::2] == ret)
-    assert all(data(arr, *args)[1::2] == 0)
-
-
-@pytest.requires_numpy
-def test_mutate_readonly(arr):
-    from pybind11_tests.array import mutate_data, mutate_data_t, mutate_at_t
-    arr.flags.writeable = False
-    for func, args in (mutate_data, ()), (mutate_data_t, ()), (mutate_at_t, (0, 0)):
-        with pytest.raises(RuntimeError) as excinfo:
-            func(arr, *args)
-        assert str(excinfo.value) == 'array is not writeable'
+    from sys import byteorder
+    assert all(m.data_t(arr, *args) == ret)
+    assert all(m.data(arr, *args)[(0 if byteorder == 'little' else 1)::2] == ret)
+    assert all(m.data(arr, *args)[(1 if byteorder == 'little' else 0)::2] == 0)
 
 
-@pytest.requires_numpy
 @pytest.mark.parametrize('dim', [0, 1, 3])
 def test_at_fail(arr, dim):
-    from pybind11_tests.array import at_t, mutate_at_t
-    for func in at_t, mutate_at_t:
+    for func in m.at_t, m.mutate_at_t:
         with pytest.raises(IndexError) as excinfo:
             func(arr, *([0] * dim))
         assert str(excinfo.value) == 'index dimension mismatch: {} (ndim = 2)'.format(dim)
 
 
-@pytest.requires_numpy
 def test_at(arr):
-    from pybind11_tests.array import at_t, mutate_at_t
+    assert m.at_t(arr, 0, 2) == 3
+    assert m.at_t(arr, 1, 0) == 4
 
-    assert at_t(arr, 0, 2) == 3
-    assert at_t(arr, 1, 0) == 4
+    assert all(m.mutate_at_t(arr, 0, 2).ravel() == [1, 2, 4, 4, 5, 6])
+    assert all(m.mutate_at_t(arr, 1, 0).ravel() == [1, 2, 4, 5, 5, 6])
 
-    assert all(mutate_at_t(arr, 0, 2).ravel() == [1, 2, 4, 4, 5, 6])
-    assert all(mutate_at_t(arr, 1, 0).ravel() == [1, 2, 4, 5, 5, 6])
 
+def test_mutate_readonly(arr):
+    arr.flags.writeable = False  # every mutating binding below must now refuse the array
+    for mutate, index in ((m.mutate_data, ()), (m.mutate_data_t, ()), (m.mutate_at_t, (0, 0))):
+        with pytest.raises(ValueError) as caught:
+            mutate(arr, *index)
+        assert str(caught.value) == 'array is not writeable'
 
-@pytest.requires_numpy
-def test_mutate_data(arr):
-    from pybind11_tests.array import mutate_data, mutate_data_t
 
-    assert all(mutate_data(arr).ravel() == [2, 4, 6, 8, 10, 12])
-    assert all(mutate_data(arr).ravel() == [4, 8, 12, 16, 20, 24])
-    assert all(mutate_data(arr, 1).ravel() == [4, 8, 12, 32, 40, 48])
-    assert all(mutate_data(arr, 0, 1).ravel() == [4, 16, 24, 64, 80, 96])
-    assert all(mutate_data(arr, 1, 2).ravel() == [4, 16, 24, 64, 80, 192])
+def test_mutate_data(arr):
+    assert all(m.mutate_data(arr).ravel() == [2, 4, 6, 8, 10, 12])
+    assert all(m.mutate_data(arr).ravel() == [4, 8, 12, 16, 20, 24])
+    assert all(m.mutate_data(arr, 1).ravel() == [4, 8, 12, 32, 40, 48])
+    assert all(m.mutate_data(arr, 0, 1).ravel() == [4, 16, 24, 64, 80, 96])
+    assert all(m.mutate_data(arr, 1, 2).ravel() == [4, 16, 24, 64, 80, 192])
 
-    assert all(mutate_data_t(arr).ravel() == [5, 17, 25, 65, 81, 193])
-    assert all(mutate_data_t(arr).ravel() == [6, 18, 26, 66, 82, 194])
-    assert all(mutate_data_t(arr, 1).ravel() == [6, 18, 26, 67, 83, 195])
-    assert all(mutate_data_t(arr, 0, 1).ravel() == [6, 19, 27, 68, 84, 196])
-    assert all(mutate_data_t(arr, 1, 2).ravel() == [6, 19, 27, 68, 84, 197])
+    assert all(m.mutate_data_t(arr).ravel() == [5, 17, 25, 65, 81, 193])
+    assert all(m.mutate_data_t(arr).ravel() == [6, 18, 26, 66, 82, 194])
+    assert all(m.mutate_data_t(arr, 1).ravel() == [6, 18, 26, 67, 83, 195])
+    assert all(m.mutate_data_t(arr, 0, 1).ravel() == [6, 19, 27, 68, 84, 196])
+    assert all(m.mutate_data_t(arr, 1, 2).ravel() == [6, 19, 27, 68, 84, 197])
 
 
-@pytest.requires_numpy
 def test_bounds_check(arr):
-    from pybind11_tests.array import (index_at, index_at_t, data, data_t,
-                                      mutate_data, mutate_data_t, at_t, mutate_at_t)
-    funcs = (index_at, index_at_t, data, data_t,
-             mutate_data, mutate_data_t, at_t, mutate_at_t)
-    for func in funcs:
+    for func in (m.index_at, m.index_at_t, m.data, m.data_t,
+                 m.mutate_data, m.mutate_data_t, m.at_t, m.mutate_at_t):
         with pytest.raises(IndexError) as excinfo:
             func(arr, 2, 0)
         assert str(excinfo.value) == 'index 2 is out of bounds for axis 0 with size 2'
@@ -150,22 +128,17 @@ def test_bounds_check(arr):
         assert str(excinfo.value) == 'index 4 is out of bounds for axis 1 with size 3'
 
 
-@pytest.requires_numpy
 def test_make_c_f_array():
-    from pybind11_tests.array import (
-        make_c_array, make_f_array
-    )
-    assert make_c_array().flags.c_contiguous
-    assert not make_c_array().flags.f_contiguous
-    assert make_f_array().flags.f_contiguous
-    assert not make_f_array().flags.c_contiguous
+    assert m.make_c_array().flags.c_contiguous
+    assert not m.make_c_array().flags.f_contiguous
+    assert m.make_f_array().flags.f_contiguous
+    assert not m.make_f_array().flags.c_contiguous
 
 
-@pytest.requires_numpy
 def test_wrap():
-    from pybind11_tests.array import wrap
-
-    def assert_references(a, b):
+    def assert_references(a, b, base=None):
+        if base is None:
+            base = a
         assert a is not b
         assert a.__array_interface__['data'][0] == b.__array_interface__['data'][0]
         assert a.shape == b.shape
@@ -177,44 +150,46 @@ def test_wrap():
         assert a.flags.updateifcopy == b.flags.updateifcopy
         assert np.all(a == b)
         assert not b.flags.owndata
-        assert b.base is a
+        assert b.base is base
         if a.flags.writeable and a.ndim == 2:
             a[0, 0] = 1234
             assert b[0, 0] == 1234
 
     a1 = np.array([1, 2], dtype=np.int16)
     assert a1.flags.owndata and a1.base is None
-    a2 = wrap(a1)
+    a2 = m.wrap(a1)
     assert_references(a1, a2)
 
     a1 = np.array([[1, 2], [3, 4]], dtype=np.float32, order='F')
     assert a1.flags.owndata and a1.base is None
-    a2 = wrap(a1)
+    a2 = m.wrap(a1)
     assert_references(a1, a2)
 
     a1 = np.array([[1, 2], [3, 4]], dtype=np.float32, order='C')
     a1.flags.writeable = False
-    a2 = wrap(a1)
+    a2 = m.wrap(a1)
     assert_references(a1, a2)
 
     a1 = np.random.random((4, 4, 4))
-    a2 = wrap(a1)
+    a2 = m.wrap(a1)
     assert_references(a1, a2)
 
-    a1 = a1.transpose()
-    a2 = wrap(a1)
-    assert_references(a1, a2)
+    a1t = a1.transpose()
+    a2 = m.wrap(a1t)
+    assert_references(a1t, a2, a1)
 
-    a1 = a1.diagonal()
-    a2 = wrap(a1)
-    assert_references(a1, a2)
+    a1d = a1.diagonal()
+    a2 = m.wrap(a1d)
+    assert_references(a1d, a2, a1)
+
+    a1m = a1[::-1, ::-1, ::-1]
+    a2 = m.wrap(a1m)
+    assert_references(a1m, a2, a1)
 
 
-@pytest.requires_numpy
 def test_numpy_view(capture):
-    from pybind11_tests.array import ArrayClass
     with capture:
-        ac = ArrayClass()
+        ac = m.ArrayClass()
         ac_view_1 = ac.numpy_view()
         ac_view_2 = ac.numpy_view()
         assert np.all(ac_view_1 == np.array([1, 2], dtype=np.int32))
@@ -240,35 +215,188 @@ def test_numpy_view(capture):
 
 
 @pytest.unsupported_on_pypy
-@pytest.requires_numpy
 def test_cast_numpy_int64_to_uint64():
-    from pybind11_tests.array import function_taking_uint64
-    function_taking_uint64(123)
-    function_taking_uint64(np.uint64(123))
+    m.function_taking_uint64(123)
+    m.function_taking_uint64(np.uint64(123))
 
 
-@pytest.requires_numpy
 def test_isinstance():
-    from pybind11_tests.array import isinstance_untyped, isinstance_typed
-
-    assert isinstance_untyped(np.array([1, 2, 3]), "not an array")
-    assert isinstance_typed(np.array([1.0, 2.0, 3.0]))
+    assert m.isinstance_untyped(np.array([1, 2, 3]), "not an array")
+    assert m.isinstance_typed(np.array([1.0, 2.0, 3.0]))
 
 
-@pytest.requires_numpy
 def test_constructors():
-    from pybind11_tests.array import default_constructors, converting_constructors
-
-    defaults = default_constructors()
+    defaults = m.default_constructors()
     for a in defaults.values():
         assert a.size == 0
     assert defaults["array"].dtype == np.array([]).dtype
     assert defaults["array_t<int32>"].dtype == np.int32
     assert defaults["array_t<double>"].dtype == np.float64
 
-    results = converting_constructors([1, 2, 3])
+    results = m.converting_constructors([1, 2, 3])
     for a in results.values():
         np.testing.assert_array_equal(a, [1, 2, 3])
     assert results["array"].dtype == np.int_
     assert results["array_t<int32>"].dtype == np.int32
     assert results["array_t<double>"].dtype == np.float64
+
+
+def test_overload_resolution(msg):
+    # Exact overload matches:
+    assert m.overloaded(np.array([1], dtype='float64')) == 'double'
+    assert m.overloaded(np.array([1], dtype='float32')) == 'float'
+    assert m.overloaded(np.array([1], dtype='ushort')) == 'unsigned short'
+    assert m.overloaded(np.array([1], dtype='intc')) == 'int'
+    assert m.overloaded(np.array([1], dtype='longlong')) == 'long long'
+    assert m.overloaded(np.array([1], dtype='complex')) == 'double complex'
+    assert m.overloaded(np.array([1], dtype='csingle')) == 'float complex'
+
+    # No exact match, should call first convertible version:
+    assert m.overloaded(np.array([1], dtype='uint8')) == 'double'
+
+    with pytest.raises(TypeError) as excinfo:
+        m.overloaded("not an array")
+    assert msg(excinfo.value) == """
+        overloaded(): incompatible function arguments. The following argument types are supported:
+            1. (arg0: numpy.ndarray[float64]) -> str
+            2. (arg0: numpy.ndarray[float32]) -> str
+            3. (arg0: numpy.ndarray[int32]) -> str
+            4. (arg0: numpy.ndarray[uint16]) -> str
+            5. (arg0: numpy.ndarray[int64]) -> str
+            6. (arg0: numpy.ndarray[complex128]) -> str
+            7. (arg0: numpy.ndarray[complex64]) -> str
+
+        Invoked with: 'not an array'
+    """
+
+    assert m.overloaded2(np.array([1], dtype='float64')) == 'double'
+    assert m.overloaded2(np.array([1], dtype='float32')) == 'float'
+    assert m.overloaded2(np.array([1], dtype='complex64')) == 'float complex'
+    assert m.overloaded2(np.array([1], dtype='complex128')) == 'double complex'
+    assert m.overloaded2(np.array([1], dtype='float32')) == 'float'
+
+    assert m.overloaded3(np.array([1], dtype='float64')) == 'double'
+    assert m.overloaded3(np.array([1], dtype='intc')) == 'int'
+    expected_exc = """
+        overloaded3(): incompatible function arguments. The following argument types are supported:
+            1. (arg0: numpy.ndarray[int32]) -> str
+            2. (arg0: numpy.ndarray[float64]) -> str
+
+        Invoked with:"""
+
+    with pytest.raises(TypeError) as excinfo:
+        m.overloaded3(np.array([1], dtype='uintc'))
+    assert msg(excinfo.value) == expected_exc + " array([1], dtype=uint32)"
+    with pytest.raises(TypeError) as excinfo:
+        m.overloaded3(np.array([1], dtype='float32'))
+    assert msg(excinfo.value) == expected_exc + " array([ 1.], dtype=float32)"
+    with pytest.raises(TypeError) as excinfo:
+        m.overloaded3(np.array([1], dtype='complex'))
+    assert msg(excinfo.value) == expected_exc + " array([ 1.+0.j])"
+
+    # Exact matches:
+    assert m.overloaded4(np.array([1], dtype='double')) == 'double'
+    assert m.overloaded4(np.array([1], dtype='longlong')) == 'long long'
+    # Non-exact matches requiring conversion.  Since float to integer isn't a
+    # safe conversion, it should go to the double overload, but short can go to
+    # either (and so should end up on the first-registered, the long long).
+    assert m.overloaded4(np.array([1], dtype='float32')) == 'double'
+    assert m.overloaded4(np.array([1], dtype='short')) == 'long long'
+
+    assert m.overloaded5(np.array([1], dtype='double')) == 'double'
+    assert m.overloaded5(np.array([1], dtype='uintc')) == 'unsigned int'
+    assert m.overloaded5(np.array([1], dtype='float32')) == 'unsigned int'
+
+
+def test_greedy_string_overload():
+    """Tests fix for #685 - ndarray shouldn't go to std::string overload"""
+
+    assert m.issue685("abc") == "string"
+    assert m.issue685(np.array([97, 98, 99], dtype='b')) == "array"
+    assert m.issue685(123) == "other"
+
+
+def test_array_unchecked_fixed_dims(msg):
+    z1 = np.array([[1, 2], [3, 4]], dtype='float64')
+    m.proxy_add2(z1, 10)
+    assert np.all(z1 == [[11, 12], [13, 14]])
+
+    with pytest.raises(ValueError) as excinfo:
+        m.proxy_add2(np.array([1., 2, 3]), 5.0)
+    assert msg(excinfo.value) == "array has incorrect number of dimensions: 1; expected 2"
+
+    expect_c = np.ndarray(shape=(3, 3, 3), buffer=np.array(range(3, 30)), dtype='int')
+    assert np.all(m.proxy_init3(3.0) == expect_c)
+    expect_f = np.transpose(expect_c)
+    assert np.all(m.proxy_init3F(3.0) == expect_f)
+
+    assert m.proxy_squared_L2_norm(np.array(range(6))) == 55
+    assert m.proxy_squared_L2_norm(np.array(range(6), dtype="float64")) == 55
+
+    assert m.proxy_auxiliaries2(z1) == [11, 11, True, 2, 8, 2, 2, 4, 32]
+    assert m.proxy_auxiliaries2(z1) == m.array_auxiliaries2(z1)
+
+
+def test_array_unchecked_dyn_dims(msg):
+    z1 = np.array([[1, 2], [3, 4]], dtype='float64')
+    m.proxy_add2_dyn(z1, 10)
+    assert np.all(z1 == [[11, 12], [13, 14]])
+
+    expect_c = np.ndarray(shape=(3, 3, 3), buffer=np.array(range(3, 30)), dtype='int')
+    assert np.all(m.proxy_init3_dyn(3.0) == expect_c)
+
+    assert m.proxy_auxiliaries2_dyn(z1) == [11, 11, True, 2, 8, 2, 2, 4, 32]
+    assert m.proxy_auxiliaries2_dyn(z1) == m.array_auxiliaries2(z1)
+
+
+def test_array_failure():
+    with pytest.raises(ValueError) as excinfo:
+        m.array_fail_test()
+    assert str(excinfo.value) == 'cannot create a pybind11::array from a nullptr'
+
+    with pytest.raises(ValueError) as excinfo:
+        m.array_t_fail_test()
+    assert str(excinfo.value) == 'cannot create a pybind11::array_t from a nullptr'
+
+    with pytest.raises(ValueError) as excinfo:
+        m.array_fail_test_negative_size()
+    assert str(excinfo.value) == 'negative dimensions are not allowed'
+
+
+def test_initializer_list():
+    assert m.array_initializer_list1().shape == (1,)
+    assert m.array_initializer_list2().shape == (1, 2)
+    assert m.array_initializer_list3().shape == (1, 2, 3)
+    assert m.array_initializer_list4().shape == (1, 2, 3, 4)
+
+
+def test_array_resize(msg):
+    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='float64')
+    m.array_reshape2(a)
+    assert(a.size == 9)
+    assert(np.all(a == [[1, 2, 3], [4, 5, 6], [7, 8, 9]]))
+
+    # total size change should succeed with refcheck off
+    m.array_resize3(a, 4, False)
+    assert(a.size == 64)
+    # ... and fail with refcheck on
+    try:
+        m.array_resize3(a, 3, True)
+    except ValueError as e:
+        assert(str(e).startswith("cannot resize an array"))
+    # transposed array doesn't own data
+    b = a.transpose()
+    try:
+        m.array_resize3(b, 3, False)
+    except ValueError as e:
+        assert(str(e).startswith("cannot resize this array: it does not own its data"))
+    # ... but reshape should be fine
+    m.array_reshape2(b)
+    assert(b.shape == (8, 8))
+
+
+@pytest.unsupported_on_pypy
+def test_array_create_and_resize(msg):
+    a = m.create_and_resize(2)
+    assert(a.size == 4)
+    assert(np.all(a == 42.))
diff --git a/pybind11/tests/test_numpy_dtypes.cpp b/pybind11/tests/test_numpy_dtypes.cpp
index 3894f6a30..ddec851f6 100644
--- a/pybind11/tests/test_numpy_dtypes.cpp
+++ b/pybind11/tests/test_numpy_dtypes.cpp
@@ -19,23 +19,25 @@
 namespace py = pybind11;
 
 struct SimpleStruct {
-    bool x;
-    uint32_t y;
-    float z;
+    bool bool_;
+    uint32_t uint_;
+    float float_;
+    long double ldbl_;
 };
 
 std::ostream& operator<<(std::ostream& os, const SimpleStruct& v) {
-    return os << "s:" << v.x << "," << v.y << "," << v.z;
+    return os << "s:" << v.bool_ << "," << v.uint_ << "," << v.float_ << "," << v.ldbl_;
 }
 
 PYBIND11_PACKED(struct PackedStruct {
-    bool x;
-    uint32_t y;
-    float z;
+    bool bool_;
+    uint32_t uint_;
+    float float_;
+    long double ldbl_;
 });
 
 std::ostream& operator<<(std::ostream& os, const PackedStruct& v) {
-    return os << "p:" << v.x << "," << v.y << "," << v.z;
+    return os << "p:" << v.bool_ << "," << v.uint_ << "," << v.float_ << "," << v.ldbl_;
 }
 
 PYBIND11_PACKED(struct NestedStruct {
@@ -48,10 +50,11 @@ std::ostream& operator<<(std::ostream& os, const NestedStruct& v) {
 }
 
 struct PartialStruct {
-    bool x;
-    uint32_t y;
-    float z;
+    bool bool_;
+    uint32_t uint_;
+    float float_;
     uint64_t dummy2;
+    long double ldbl_;
 };
 
 struct PartialNestedStruct {
@@ -67,6 +70,22 @@ struct StringStruct {
     std::array<char, 3> b;
 };
 
+struct ComplexStruct {
+    std::complex<float> cflt;
+    std::complex<double> cdbl;
+};
+
+std::ostream& operator<<(std::ostream& os, const ComplexStruct& v) {
+    return os << "c:" << v.cflt << "," << v.cdbl;
+}
+
+struct ArrayStruct {
+    char a[3][4];
+    int32_t b[2];
+    std::array<uint8_t, 3> c;
+    std::array<float, 2> d[4];
+};
+
 PYBIND11_PACKED(struct StructWithUglyNames {
     int8_t __x__;
     uint64_t __y__;
@@ -88,6 +107,27 @@ std::ostream& operator<<(std::ostream& os, const StringStruct& v) {
     return os << "'";
 }
 
+std::ostream& operator<<(std::ostream& os, const ArrayStruct& v) {
+    os << "a={";
+    for (int i = 0; i < 3; i++) {
+        if (i > 0)
+            os << ',';
+        os << '{';
+        for (int j = 0; j < 3; j++)
+            os << v.a[i][j] << ',';
+        os << v.a[i][3] << '}';
+    }
+    os << "},b={" << v.b[0] << ',' << v.b[1];
+    os << "},c={" << int(v.c[0]) << ',' << int(v.c[1]) << ',' << int(v.c[2]);
+    os << "},d={";
+    for (int i = 0; i < 4; i++) {
+        if (i > 0)
+            os << ',';
+        os << '{' << v.d[i][0] << ',' << v.d[i][1] << '}';
+    }
+    return os << '}';
+}
+
 std::ostream& operator<<(std::ostream& os, const EnumStruct& v) {
     return os << "e1=" << (v.e1 == E1::A ? "A" : "B") << ",e2=" << (v.e2 == E2::X ? "X" : "Y");
 }
@@ -99,67 +139,19 @@ py::array mkarray_via_buffer(size_t n) {
                                      1, { n }, { sizeof(T) }));
 }
 
+#define SET_TEST_VALS(s, i) do { \
+    s.bool_ = (i) % 2 != 0; \
+    s.uint_ = (uint32_t) (i); \
+    s.float_ = (float) (i) * 1.5f; \
+    s.ldbl_ = (long double) (i) * -2.5L; } while (0)
+
 template <typename S>
 py::array_t<S, 0> create_recarray(size_t n) {
     auto arr = mkarray_via_buffer<S>(n);
     auto req = arr.request();
     auto ptr = static_cast<S*>(req.ptr);
     for (size_t i = 0; i < n; i++) {
-        ptr[i].x = i % 2 != 0; ptr[i].y = (uint32_t) i; ptr[i].z = (float) i * 1.5f;
-    }
-    return arr;
-}
-
-std::string get_format_unbound() {
-    return py::format_descriptor<UnboundStruct>::format();
-}
-
-py::array_t<NestedStruct, 0> create_nested(size_t n) {
-    auto arr = mkarray_via_buffer<NestedStruct>(n);
-    auto req = arr.request();
-    auto ptr = static_cast<NestedStruct*>(req.ptr);
-    for (size_t i = 0; i < n; i++) {
-        ptr[i].a.x = i % 2 != 0; ptr[i].a.y = (uint32_t) i; ptr[i].a.z = (float) i * 1.5f;
-        ptr[i].b.x = (i + 1) % 2 != 0; ptr[i].b.y = (uint32_t) (i + 1); ptr[i].b.z = (float) (i + 1) * 1.5f;
-    }
-    return arr;
-}
-
-py::array_t<PartialNestedStruct, 0> create_partial_nested(size_t n) {
-    auto arr = mkarray_via_buffer<PartialNestedStruct>(n);
-    auto req = arr.request();
-    auto ptr = static_cast<PartialNestedStruct*>(req.ptr);
-    for (size_t i = 0; i < n; i++) {
-        ptr[i].a.x = i % 2 != 0; ptr[i].a.y = (uint32_t) i; ptr[i].a.z = (float) i * 1.5f;
-    }
-    return arr;
-}
-
-py::array_t<StringStruct, 0> create_string_array(bool non_empty) {
-    auto arr = mkarray_via_buffer<StringStruct>(non_empty ? 4 : 0);
-    if (non_empty) {
-        auto req = arr.request();
-        auto ptr = static_cast<StringStruct*>(req.ptr);
-        for (size_t i = 0; i < req.size * req.itemsize; i++)
-            static_cast<char*>(req.ptr)[i] = 0;
-        ptr[1].a[0] = 'a'; ptr[1].b[0] = 'a';
-        ptr[2].a[0] = 'a'; ptr[2].b[0] = 'a';
-        ptr[3].a[0] = 'a'; ptr[3].b[0] = 'a';
-
-        ptr[2].a[1] = 'b'; ptr[2].b[1] = 'b';
-        ptr[3].a[1] = 'b'; ptr[3].b[1] = 'b';
-
-        ptr[3].a[2] = 'c'; ptr[3].b[2] = 'c';
-    }
-    return arr;
-}
-
-py::array_t<EnumStruct, 0> create_enum_array(size_t n) {
-    auto arr = mkarray_via_buffer<EnumStruct>(n);
-    auto ptr = (EnumStruct *) arr.mutable_data();
-    for (size_t i = 0; i < n; i++) {
-        ptr[i].e1 = static_cast<E1>(-1 + ((int) i % 2) * 2);
-        ptr[i].e2 = static_cast<E2>(1 + (i % 2));
+        SET_TEST_VALS(ptr[i], i);
     }
     return arr;
 }
@@ -169,7 +161,7 @@ py::list print_recarray(py::array_t<S, 0> arr) {
     const auto req = arr.request();
     const auto ptr = static_cast<S*>(req.ptr);
     auto l = py::list();
-    for (size_t i = 0; i < req.size; i++) {
+    for (ssize_t i = 0; i < req.size; i++) {
         std::stringstream ss;
         ss << ptr[i];
         l.append(py::str(ss.str()));
@@ -177,47 +169,12 @@ py::list print_recarray(py::array_t<S, 0> arr) {
     return l;
 }
 
-py::list print_format_descriptors() {
-    const auto fmts = {
-        py::format_descriptor<SimpleStruct>::format(),
-        py::format_descriptor<PackedStruct>::format(),
-        py::format_descriptor<NestedStruct>::format(),
-        py::format_descriptor<PartialStruct>::format(),
-        py::format_descriptor<PartialNestedStruct>::format(),
-        py::format_descriptor<StringStruct>::format(),
-        py::format_descriptor<EnumStruct>::format()
-    };
-    auto l = py::list();
-    for (const auto &fmt : fmts) {
-        l.append(py::cast(fmt));
-    }
-    return l;
-}
-
-py::list print_dtypes() {
-    const auto dtypes = {
-        py::str(py::dtype::of<SimpleStruct>()),
-        py::str(py::dtype::of<PackedStruct>()),
-        py::str(py::dtype::of<NestedStruct>()),
-        py::str(py::dtype::of<PartialStruct>()),
-        py::str(py::dtype::of<PartialNestedStruct>()),
-        py::str(py::dtype::of<StringStruct>()),
-        py::str(py::dtype::of<EnumStruct>()),
-        py::str(py::dtype::of<StructWithUglyNames>())
-    };
-    auto l = py::list();
-    for (const auto &s : dtypes) {
-        l.append(s);
-    }
-    return l;
-}
-
 py::array_t<int32_t, 0> test_array_ctors(int i) {
     using arr_t = py::array_t<int32_t, 0>;
 
     std::vector<int32_t> data { 1, 2, 3, 4, 5, 6 };
-    std::vector<size_t> shape { 3, 2 };
-    std::vector<size_t> strides { 8, 4 };
+    std::vector<ssize_t> shape { 3, 2 };
+    std::vector<ssize_t> strides { 8, 4 };
 
     auto ptr = data.data();
     auto vptr = (void *) ptr;
@@ -287,77 +244,208 @@ py::list test_dtype_ctors() {
     return list;
 }
 
-struct TrailingPaddingStruct {
-    int32_t a;
-    char b;
-};
-
-py::dtype trailing_padding_dtype() {
-    return py::dtype::of<TrailingPaddingStruct>();
-}
-
-py::dtype buffer_to_dtype(py::buffer& buf) {
-    return py::dtype(buf.request());
-}
-
-py::list test_dtype_methods() {
-    py::list list;
-    auto dt1 = py::dtype::of<int32_t>();
-    auto dt2 = py::dtype::of<SimpleStruct>();
-    list.append(dt1); list.append(dt2);
-    list.append(py::bool_(dt1.has_fields())); list.append(py::bool_(dt2.has_fields()));
-    list.append(py::int_(dt1.itemsize())); list.append(py::int_(dt2.itemsize()));
-    return list;
-}
-
-test_initializer numpy_dtypes([](py::module &m) {
-    try {
-        py::module::import("numpy");
-    } catch (...) {
-        return;
-    }
+TEST_SUBMODULE(numpy_dtypes, m) {
+    try { py::module::import("numpy"); }
+    catch (...) { return; }
 
     // typeinfo may be registered before the dtype descriptor for scalar casts to work...
     py::class_<SimpleStruct>(m, "SimpleStruct");
 
-    PYBIND11_NUMPY_DTYPE(SimpleStruct, x, y, z);
-    PYBIND11_NUMPY_DTYPE(PackedStruct, x, y, z);
+    PYBIND11_NUMPY_DTYPE(SimpleStruct, bool_, uint_, float_, ldbl_);
+    PYBIND11_NUMPY_DTYPE(PackedStruct, bool_, uint_, float_, ldbl_);
     PYBIND11_NUMPY_DTYPE(NestedStruct, a, b);
-    PYBIND11_NUMPY_DTYPE(PartialStruct, x, y, z);
+    PYBIND11_NUMPY_DTYPE(PartialStruct, bool_, uint_, float_, ldbl_);
     PYBIND11_NUMPY_DTYPE(PartialNestedStruct, a);
     PYBIND11_NUMPY_DTYPE(StringStruct, a, b);
+    PYBIND11_NUMPY_DTYPE(ArrayStruct, a, b, c, d);
     PYBIND11_NUMPY_DTYPE(EnumStruct, e1, e2);
-    PYBIND11_NUMPY_DTYPE(TrailingPaddingStruct, a, b);
+    PYBIND11_NUMPY_DTYPE(ComplexStruct, cflt, cdbl);
 
     // ... or after
     py::class_<PackedStruct>(m, "PackedStruct");
 
     PYBIND11_NUMPY_DTYPE_EX(StructWithUglyNames, __x__, "x", __y__, "y");
 
+    // If uncommented, this should produce a static_assert failure telling the user that the struct
+    // is not a POD type
+//    struct NotPOD { std::string v; NotPOD() : v("hi") {}; };
+//    PYBIND11_NUMPY_DTYPE(NotPOD, v);
+
+    // test_recarray, test_scalar_conversion
     m.def("create_rec_simple", &create_recarray<SimpleStruct>);
     m.def("create_rec_packed", &create_recarray<PackedStruct>);
-    m.def("create_rec_nested", &create_nested);
+    m.def("create_rec_nested", [](size_t n) { // test_signature
+        py::array_t<NestedStruct, 0> arr = mkarray_via_buffer<NestedStruct>(n);
+        auto req = arr.request();
+        auto ptr = static_cast<NestedStruct*>(req.ptr);
+        for (size_t i = 0; i < n; i++) {
+            SET_TEST_VALS(ptr[i].a, i);
+            SET_TEST_VALS(ptr[i].b, i + 1);
+        }
+        return arr;
+    });
     m.def("create_rec_partial", &create_recarray<PartialStruct>);
-    m.def("create_rec_partial_nested", &create_partial_nested);
-    m.def("print_format_descriptors", &print_format_descriptors);
+    m.def("create_rec_partial_nested", [](size_t n) {
+        py::array_t<PartialNestedStruct, 0> arr = mkarray_via_buffer<PartialNestedStruct>(n);
+        auto req = arr.request();
+        auto ptr = static_cast<PartialNestedStruct*>(req.ptr);
+        for (size_t i = 0; i < n; i++) {
+            SET_TEST_VALS(ptr[i].a, i);
+        }
+        return arr;
+    });
     m.def("print_rec_simple", &print_recarray<SimpleStruct>);
     m.def("print_rec_packed", &print_recarray<PackedStruct>);
     m.def("print_rec_nested", &print_recarray<NestedStruct>);
-    m.def("print_dtypes", &print_dtypes);
-    m.def("get_format_unbound", &get_format_unbound);
-    m.def("create_string_array", &create_string_array);
+
+    // test_format_descriptors
+    m.def("get_format_unbound", []() { return py::format_descriptor<UnboundStruct>::format(); });
+    m.def("print_format_descriptors", []() {
+        py::list l;
+        for (const auto &fmt : {
+            py::format_descriptor<SimpleStruct>::format(),
+            py::format_descriptor<PackedStruct>::format(),
+            py::format_descriptor<NestedStruct>::format(),
+            py::format_descriptor<PartialStruct>::format(),
+            py::format_descriptor<PartialNestedStruct>::format(),
+            py::format_descriptor<StringStruct>::format(),
+            py::format_descriptor<ArrayStruct>::format(),
+            py::format_descriptor<EnumStruct>::format(),
+            py::format_descriptor<ComplexStruct>::format()
+        }) {
+            l.append(py::cast(fmt));
+        }
+        return l;
+    });
+
+    // test_dtype
+    m.def("print_dtypes", []() {
+        py::list l;
+        for (const py::handle &d : {
+            py::dtype::of<SimpleStruct>(),
+            py::dtype::of<PackedStruct>(),
+            py::dtype::of<NestedStruct>(),
+            py::dtype::of<PartialStruct>(),
+            py::dtype::of<PartialNestedStruct>(),
+            py::dtype::of<StringStruct>(),
+            py::dtype::of<ArrayStruct>(),
+            py::dtype::of<EnumStruct>(),
+            py::dtype::of<StructWithUglyNames>(),
+            py::dtype::of<ComplexStruct>()
+        })
+            l.append(py::str(d));
+        return l;
+    });
+    m.def("test_dtype_ctors", &test_dtype_ctors);
+    m.def("test_dtype_methods", []() {
+        py::list list;
+        auto dt1 = py::dtype::of<int32_t>();
+        auto dt2 = py::dtype::of<SimpleStruct>();
+        list.append(dt1); list.append(dt2);
+        list.append(py::bool_(dt1.has_fields())); list.append(py::bool_(dt2.has_fields()));
+        list.append(py::int_(dt1.itemsize())); list.append(py::int_(dt2.itemsize()));
+        return list;
+    });
+    struct TrailingPaddingStruct {
+        int32_t a;
+        char b;
+    };
+    PYBIND11_NUMPY_DTYPE(TrailingPaddingStruct, a, b);
+    m.def("trailing_padding_dtype", []() { return py::dtype::of<TrailingPaddingStruct>(); });
+
+    // test_string_array
+    m.def("create_string_array", [](bool non_empty) {
+        py::array_t<StringStruct, 0> arr = mkarray_via_buffer<StringStruct>(non_empty ? 4 : 0);
+        if (non_empty) {
+            auto req = arr.request();
+            auto ptr = static_cast<StringStruct*>(req.ptr);
+            for (ssize_t i = 0; i < req.size * req.itemsize; i++)
+                static_cast<char*>(req.ptr)[i] = 0;
+            ptr[1].a[0] = 'a'; ptr[1].b[0] = 'a';
+            ptr[2].a[0] = 'a'; ptr[2].b[0] = 'a';
+            ptr[3].a[0] = 'a'; ptr[3].b[0] = 'a';
+
+            ptr[2].a[1] = 'b'; ptr[2].b[1] = 'b';
+            ptr[3].a[1] = 'b'; ptr[3].b[1] = 'b';
+
+            ptr[3].a[2] = 'c'; ptr[3].b[2] = 'c';
+        }
+        return arr;
+    });
     m.def("print_string_array", &print_recarray<StringStruct>);
-    m.def("create_enum_array", &create_enum_array);
+
+    // test_array_array
+    m.def("create_array_array", [](size_t n) {
+        py::array_t<ArrayStruct, 0> arr = mkarray_via_buffer<ArrayStruct>(n);
+        auto ptr = (ArrayStruct *) arr.mutable_data();
+        for (size_t i = 0; i < n; i++) {
+            for (size_t j = 0; j < 3; j++)
+                for (size_t k = 0; k < 4; k++)
+                    ptr[i].a[j][k] = char('A' + (i * 100 + j * 10 + k) % 26);
+            for (size_t j = 0; j < 2; j++)
+                ptr[i].b[j] = int32_t(i * 1000 + j);
+            for (size_t j = 0; j < 3; j++)
+                ptr[i].c[j] = uint8_t(i * 10 + j);
+            for (size_t j = 0; j < 4; j++)
+                for (size_t k = 0; k < 2; k++)
+                    ptr[i].d[j][k] = float(i) * 100.0f + float(j) * 10.0f + float(k);
+        }
+        return arr;
+    });
+    m.def("print_array_array", &print_recarray<ArrayStruct>);
+
+    // test_enum_array
+    m.def("create_enum_array", [](size_t n) {
+        py::array_t<EnumStruct, 0> arr = mkarray_via_buffer<EnumStruct>(n);
+        auto ptr = (EnumStruct *) arr.mutable_data();
+        for (size_t i = 0; i < n; i++) {
+            ptr[i].e1 = static_cast<E1>(-1 + ((int) i % 2) * 2);
+            ptr[i].e2 = static_cast<E2>(1 + (i % 2));
+        }
+        return arr;
+    });
     m.def("print_enum_array", &print_recarray<EnumStruct>);
+
+    // test_complex_array
+    m.def("create_complex_array", [](size_t n) {
+        py::array_t<ComplexStruct, 0> arr = mkarray_via_buffer<ComplexStruct>(n);
+        auto ptr = (ComplexStruct *) arr.mutable_data();
+        for (size_t i = 0; i < n; i++) {
+            ptr[i].cflt.real(float(i));
+            ptr[i].cflt.imag(float(i) + 0.25f);
+            ptr[i].cdbl.real(double(i) + 0.5);
+            ptr[i].cdbl.imag(double(i) + 0.75);
+        }
+        return arr;
+    });
+    m.def("print_complex_array", &print_recarray<ComplexStruct>);
+
+    // test_array_constructors
     m.def("test_array_ctors", &test_array_ctors);
-    m.def("test_dtype_ctors", &test_dtype_ctors);
-    m.def("test_dtype_methods", &test_dtype_methods);
-    m.def("trailing_padding_dtype", &trailing_padding_dtype);
-    m.def("buffer_to_dtype", &buffer_to_dtype);
-    m.def("f_simple", [](SimpleStruct s) { return s.y * 10; });
-    m.def("f_packed", [](PackedStruct s) { return s.y * 10; });
-    m.def("f_nested", [](NestedStruct s) { return s.a.y * 10; });
-    m.def("register_dtype", []() { PYBIND11_NUMPY_DTYPE(SimpleStruct, x, y, z); });
-});
 
-#undef PYBIND11_PACKED
+    // test_compare_buffer_info
+    struct CompareStruct {
+        bool x;
+        uint32_t y;
+        float z;
+    };
+    PYBIND11_NUMPY_DTYPE(CompareStruct, x, y, z);
+    m.def("compare_buffer_info", []() {
+        py::list list;
+        list.append(py::bool_(py::detail::compare_buffer_info<float>::compare(py::buffer_info(nullptr, sizeof(float), "f", 1))));
+        list.append(py::bool_(py::detail::compare_buffer_info<unsigned>::compare(py::buffer_info(nullptr, sizeof(int), "I", 1))));
+        list.append(py::bool_(py::detail::compare_buffer_info<long>::compare(py::buffer_info(nullptr, sizeof(long), "l", 1))));
+        list.append(py::bool_(py::detail::compare_buffer_info<long>::compare(py::buffer_info(nullptr, sizeof(long), sizeof(long) == sizeof(int) ? "i" : "q", 1))));
+        list.append(py::bool_(py::detail::compare_buffer_info<CompareStruct>::compare(py::buffer_info(nullptr, sizeof(CompareStruct), "T{?:x:3xI:y:f:z:}", 1))));
+        return list;
+    });
+    m.def("buffer_to_dtype", [](py::buffer& buf) { return py::dtype(buf.request()); });
+
+    // test_scalar_conversion
+    m.def("f_simple", [](SimpleStruct s) { return s.uint_ * 10; });
+    m.def("f_packed", [](PackedStruct s) { return s.uint_ * 10; });
+    m.def("f_nested", [](NestedStruct s) { return s.a.uint_ * 10; });
+
+    // test_register_dtype
+    m.def("register_dtype", []() { PYBIND11_NUMPY_DTYPE(SimpleStruct, bool_, uint_, float_, ldbl_); });
+}
diff --git a/pybind11/tests/test_numpy_dtypes.py b/pybind11/tests/test_numpy_dtypes.py
index 52ebe0ede..5f9a95404 100644
--- a/pybind11/tests/test_numpy_dtypes.py
+++ b/pybind11/tests/test_numpy_dtypes.py
@@ -1,5 +1,8 @@
 import re
 import pytest
+from pybind11_tests import numpy_dtypes as m
+
+pytestmark = pytest.requires_numpy
 
 with pytest.suppress(ImportError):
     import numpy as np
@@ -7,78 +10,122 @@ with pytest.suppress(ImportError):
 
 @pytest.fixture(scope='module')
 def simple_dtype():
-    return np.dtype({'names': ['x', 'y', 'z'],
-                     'formats': ['?', 'u4', 'f4'],
-                     'offsets': [0, 4, 8]})
+    ld = np.dtype('longdouble')
+    return np.dtype({'names': ['bool_', 'uint_', 'float_', 'ldbl_'],
+                     'formats': ['?', 'u4', 'f4', 'f{}'.format(ld.itemsize)],
+                     'offsets': [0, 4, 8, (16 if ld.alignment > 4 else 12)]})
 
 
 @pytest.fixture(scope='module')
 def packed_dtype():
-    return np.dtype([('x', '?'), ('y', 'u4'), ('z', 'f4')])
+    return np.dtype([('bool_', '?'), ('uint_', 'u4'), ('float_', 'f4'), ('ldbl_', 'g')])
+
+
+def dt_fmt():
+    from sys import byteorder
+    e = '<' if byteorder == 'little' else '>'
+    return ("{{'names':['bool_','uint_','float_','ldbl_'],"
+            " 'formats':['?','" + e + "u4','" + e + "f4','" + e + "f{}'],"
+            " 'offsets':[0,4,8,{}], 'itemsize':{}}}")
+
+
+def simple_dtype_fmt():
+    ld = np.dtype('longdouble')
+    simple_ld_off = 12 + 4 * (ld.alignment > 4)
+    return dt_fmt().format(ld.itemsize, simple_ld_off, simple_ld_off + ld.itemsize)
+
+
+def packed_dtype_fmt():
+    from sys import byteorder
+    return "[('bool_', '?'), ('uint_', '{e}u4'), ('float_', '{e}f4'), ('ldbl_', '{e}f{}')]".format(
+        np.dtype('longdouble').itemsize, e='<' if byteorder == 'little' else '>')
+
+
+def partial_ld_offset():
+    return 12 + 4 * (np.dtype('uint64').alignment > 4) + 8 + 8 * (
+        np.dtype('longdouble').alignment > 8)
+
+
+def partial_dtype_fmt():
+    ld = np.dtype('longdouble')
+    partial_ld_off = partial_ld_offset()
+    return dt_fmt().format(ld.itemsize, partial_ld_off, partial_ld_off + ld.itemsize)
+
+
+def partial_nested_fmt():
+    ld = np.dtype('longdouble')
+    partial_nested_off = 8 + 8 * (ld.alignment > 8)
+    partial_ld_off = partial_ld_offset()
+    partial_nested_size = partial_nested_off * 2 + partial_ld_off + ld.itemsize
+    return "{{'names':['a'], 'formats':[{}], 'offsets':[{}], 'itemsize':{}}}".format(
+        partial_dtype_fmt(), partial_nested_off, partial_nested_size)
 
 
 def assert_equal(actual, expected_data, expected_dtype):
     np.testing.assert_equal(actual, np.array(expected_data, dtype=expected_dtype))
 
 
-@pytest.requires_numpy
 def test_format_descriptors():
-    from pybind11_tests import get_format_unbound, print_format_descriptors
-
     with pytest.raises(RuntimeError) as excinfo:
-        get_format_unbound()
+        m.get_format_unbound()
     assert re.match('^NumPy type info missing for .*UnboundStruct.*$', str(excinfo.value))
 
-    assert print_format_descriptors() == [
-        "T{?:x:3xI:y:f:z:}",
-        "T{?:x:=I:y:=f:z:}",
-        "T{T{?:x:3xI:y:f:z:}:a:T{?:x:=I:y:=f:z:}:b:}",
-        "T{?:x:3xI:y:f:z:12x}",
-        "T{8xT{?:x:3xI:y:f:z:12x}:a:8x}",
-        "T{3s:a:3s:b:}",
-        'T{q:e1:B:e2:}'
+    ld = np.dtype('longdouble')
+    ldbl_fmt = ('4x' if ld.alignment > 4 else '') + ld.char
+    ss_fmt = "^T{?:bool_:3xI:uint_:f:float_:" + ldbl_fmt + ":ldbl_:}"
+    dbl = np.dtype('double')
+    partial_fmt = ("^T{?:bool_:3xI:uint_:f:float_:" +
+                   str(4 * (dbl.alignment > 4) + dbl.itemsize + 8 * (ld.alignment > 8)) +
+                   "xg:ldbl_:}")
+    nested_extra = str(max(8, ld.alignment))
+    assert m.print_format_descriptors() == [
+        ss_fmt,
+        "^T{?:bool_:I:uint_:f:float_:g:ldbl_:}",
+        "^T{" + ss_fmt + ":a:^T{?:bool_:I:uint_:f:float_:g:ldbl_:}:b:}",
+        partial_fmt,
+        "^T{" + nested_extra + "x" + partial_fmt + ":a:" + nested_extra + "x}",
+        "^T{3s:a:3s:b:}",
+        "^T{(3)4s:a:(2)i:b:(3)B:c:1x(4, 2)f:d:}",
+        '^T{q:e1:B:e2:}',
+        '^T{Zf:cflt:Zd:cdbl:}'
     ]
 
 
-@pytest.requires_numpy
 def test_dtype(simple_dtype):
-    from pybind11_tests import (print_dtypes, test_dtype_ctors, test_dtype_methods,
-                                trailing_padding_dtype, buffer_to_dtype)
-
-    assert print_dtypes() == [
-        "{'names':['x','y','z'], 'formats':['?','<u4','<f4'], 'offsets':[0,4,8], 'itemsize':12}",
-        "[('x', '?'), ('y', '<u4'), ('z', '<f4')]",
-        "[('a', {'names':['x','y','z'], 'formats':['?','<u4','<f4'], 'offsets':[0,4,8],"
-        " 'itemsize':12}), ('b', [('x', '?'), ('y', '<u4'), ('z', '<f4')])]",
-        "{'names':['x','y','z'], 'formats':['?','<u4','<f4'], 'offsets':[0,4,8], 'itemsize':24}",
-        "{'names':['a'], 'formats':[{'names':['x','y','z'], 'formats':['?','<u4','<f4'],"
-        " 'offsets':[0,4,8], 'itemsize':24}], 'offsets':[8], 'itemsize':40}",
+    from sys import byteorder
+    e = '<' if byteorder == 'little' else '>'  # byte-order prefix expected in dtype strings
+
+    assert m.print_dtypes() == [
+        simple_dtype_fmt(),
+        packed_dtype_fmt(),
+        "[('a', {}), ('b', {})]".format(simple_dtype_fmt(), packed_dtype_fmt()),
+        partial_dtype_fmt(),
+        partial_nested_fmt(),
         "[('a', 'S3'), ('b', 'S3')]",
-        "[('e1', '<i8'), ('e2', 'u1')]",
-        "[('x', 'i1'), ('y', '<u8')]"
+        ("{{'names':['a','b','c','d'], " +
+         "'formats':[('S4', (3,)),('{e}i4', (2,)),('u1', (3,)),('{e}f4', (4, 2))], " +
+         "'offsets':[0,12,20,24], 'itemsize':56}}").format(e=e),
+        "[('e1', '" + e + "i8'), ('e2', 'u1')]",
+        "[('x', 'i1'), ('y', '" + e + "u8')]",
+        "[('cflt', '" + e + "c8'), ('cdbl', '" + e + "c16')]"
     ]
 
     d1 = np.dtype({'names': ['a', 'b'], 'formats': ['int32', 'float64'],
                    'offsets': [1, 10], 'itemsize': 20})
     d2 = np.dtype([('a', 'i4'), ('b', 'f4')])
-    assert test_dtype_ctors() == [np.dtype('int32'), np.dtype('float64'),
-                                  np.dtype('bool'), d1, d1, np.dtype('uint32'), d2]
+    assert m.test_dtype_ctors() == [np.dtype('int32'), np.dtype('float64'),
+                                    np.dtype('bool'), d1, d1, np.dtype('uint32'), d2]
 
-    assert test_dtype_methods() == [np.dtype('int32'), simple_dtype, False, True,
-                                    np.dtype('int32').itemsize, simple_dtype.itemsize]
+    assert m.test_dtype_methods() == [np.dtype('int32'), simple_dtype, False, True,
+                                      np.dtype('int32').itemsize, simple_dtype.itemsize]
 
-    assert trailing_padding_dtype() == buffer_to_dtype(np.zeros(1, trailing_padding_dtype()))
+    assert m.trailing_padding_dtype() == m.buffer_to_dtype(np.zeros(1, m.trailing_padding_dtype()))
 
 
-@pytest.requires_numpy
 def test_recarray(simple_dtype, packed_dtype):
-    from pybind11_tests import (create_rec_simple, create_rec_packed, create_rec_nested,
-                                print_rec_simple, print_rec_packed, print_rec_nested,
-                                create_rec_partial, create_rec_partial_nested)
-
-    elements = [(False, 0, 0.0), (True, 1, 1.5), (False, 2, 3.0)]
+    elements = [(False, 0, 0.0, -0.0), (True, 1, 1.5, -2.5), (False, 2, 3.0, -5.0)]
 
-    for func, dtype in [(create_rec_simple, simple_dtype), (create_rec_packed, packed_dtype)]:
+    for func, dtype in [(m.create_rec_simple, simple_dtype), (m.create_rec_packed, packed_dtype)]:
         arr = func(0)
         assert arr.dtype == dtype
         assert_equal(arr, [], simple_dtype)
@@ -90,74 +137,65 @@ def test_recarray(simple_dtype, packed_dtype):
         assert_equal(arr, elements, packed_dtype)
 
         if dtype == simple_dtype:
-            assert print_rec_simple(arr) == [
-                "s:0,0,0",
-                "s:1,1,1.5",
-                "s:0,2,3"
+            assert m.print_rec_simple(arr) == [
+                "s:0,0,0,-0",
+                "s:1,1,1.5,-2.5",
+                "s:0,2,3,-5"
             ]
         else:
-            assert print_rec_packed(arr) == [
-                "p:0,0,0",
-                "p:1,1,1.5",
-                "p:0,2,3"
+            assert m.print_rec_packed(arr) == [
+                "p:0,0,0,-0",
+                "p:1,1,1.5,-2.5",
+                "p:0,2,3,-5"
             ]
 
     nested_dtype = np.dtype([('a', simple_dtype), ('b', packed_dtype)])
 
-    arr = create_rec_nested(0)
+    arr = m.create_rec_nested(0)
     assert arr.dtype == nested_dtype
     assert_equal(arr, [], nested_dtype)
 
-    arr = create_rec_nested(3)
+    arr = m.create_rec_nested(3)
     assert arr.dtype == nested_dtype
-    assert_equal(arr, [((False, 0, 0.0), (True, 1, 1.5)),
-                       ((True, 1, 1.5), (False, 2, 3.0)),
-                       ((False, 2, 3.0), (True, 3, 4.5))], nested_dtype)
-    assert print_rec_nested(arr) == [
-        "n:a=s:0,0,0;b=p:1,1,1.5",
-        "n:a=s:1,1,1.5;b=p:0,2,3",
-        "n:a=s:0,2,3;b=p:1,3,4.5"
+    assert_equal(arr, [((False, 0, 0.0, -0.0), (True, 1, 1.5, -2.5)),
+                       ((True, 1, 1.5, -2.5), (False, 2, 3.0, -5.0)),
+                       ((False, 2, 3.0, -5.0), (True, 3, 4.5, -7.5))], nested_dtype)
+    assert m.print_rec_nested(arr) == [
+        "n:a=s:0,0,0,-0;b=p:1,1,1.5,-2.5",
+        "n:a=s:1,1,1.5,-2.5;b=p:0,2,3,-5",
+        "n:a=s:0,2,3,-5;b=p:1,3,4.5,-7.5"
     ]
 
-    arr = create_rec_partial(3)
-    assert str(arr.dtype) == \
-        "{'names':['x','y','z'], 'formats':['?','<u4','<f4'], 'offsets':[0,4,8], 'itemsize':24}"
+    arr = m.create_rec_partial(3)
+    assert str(arr.dtype) == partial_dtype_fmt()
     partial_dtype = arr.dtype
     assert '' not in arr.dtype.fields
     assert partial_dtype.itemsize > simple_dtype.itemsize
     assert_equal(arr, elements, simple_dtype)
     assert_equal(arr, elements, packed_dtype)
 
-    arr = create_rec_partial_nested(3)
-    assert str(arr.dtype) == \
-        "{'names':['a'], 'formats':[{'names':['x','y','z'], 'formats':['?','<u4','<f4']," \
-        " 'offsets':[0,4,8], 'itemsize':24}], 'offsets':[8], 'itemsize':40}"
+    arr = m.create_rec_partial_nested(3)
+    assert str(arr.dtype) == partial_nested_fmt()
     assert '' not in arr.dtype.fields
     assert '' not in arr.dtype.fields['a'][0].fields
     assert arr.dtype.itemsize > partial_dtype.itemsize
-    np.testing.assert_equal(arr['a'], create_rec_partial(3))
+    np.testing.assert_equal(arr['a'], m.create_rec_partial(3))
 
 
-@pytest.requires_numpy
 def test_array_constructors():
-    from pybind11_tests import test_array_ctors
-
     data = np.arange(1, 7, dtype='int32')
     for i in range(8):
-        np.testing.assert_array_equal(test_array_ctors(10 + i), data.reshape((3, 2)))
-        np.testing.assert_array_equal(test_array_ctors(20 + i), data.reshape((3, 2)))
+        np.testing.assert_array_equal(m.test_array_ctors(10 + i), data.reshape((3, 2)))
+        np.testing.assert_array_equal(m.test_array_ctors(20 + i), data.reshape((3, 2)))
     for i in range(5):
-        np.testing.assert_array_equal(test_array_ctors(30 + i), data)
-        np.testing.assert_array_equal(test_array_ctors(40 + i), data)
+        np.testing.assert_array_equal(m.test_array_ctors(30 + i), data)
+        np.testing.assert_array_equal(m.test_array_ctors(40 + i), data)
 
 
-@pytest.requires_numpy
 def test_string_array():
-    from pybind11_tests import create_string_array, print_string_array
-
-    arr = create_string_array(True)
+    arr = m.create_string_array(True)
     assert str(arr.dtype) == "[('a', 'S3'), ('b', 'S3')]"
-    assert print_string_array(arr) == [
+    assert m.print_string_array(arr) == [
         "a='',b=''",
         "a='a',b='a'",
         "a='ab',b='ab'",
@@ -166,45 +204,78 @@ def test_string_array():
     dtype = arr.dtype
     assert arr['a'].tolist() == [b'', b'a', b'ab', b'abc']
     assert arr['b'].tolist() == [b'', b'a', b'ab', b'abc']
-    arr = create_string_array(False)
+    arr = m.create_string_array(False)
     assert dtype == arr.dtype
 
 
-@pytest.requires_numpy
+def test_array_array():
+    from sys import byteorder
+    e = '<' if byteorder == 'little' else '>'  # byte-order prefix expected in dtype strings
+
+    arr = m.create_array_array(3)
+    assert str(arr.dtype) == (
+        "{{'names':['a','b','c','d'], " +
+        "'formats':[('S4', (3,)),('{e}i4', (2,)),('u1', (3,)),('{e}f4', (4, 2))], " +
+        "'offsets':[0,12,20,24], 'itemsize':56}}").format(e=e)
+    assert m.print_array_array(arr) == [
+        "a={{A,B,C,D},{K,L,M,N},{U,V,W,X}},b={0,1}," +
+        "c={0,1,2},d={{0,1},{10,11},{20,21},{30,31}}",
+        "a={{W,X,Y,Z},{G,H,I,J},{Q,R,S,T}},b={1000,1001}," +
+        "c={10,11,12},d={{100,101},{110,111},{120,121},{130,131}}",
+        "a={{S,T,U,V},{C,D,E,F},{M,N,O,P}},b={2000,2001}," +
+        "c={20,21,22},d={{200,201},{210,211},{220,221},{230,231}}",
+    ]
+    assert arr['a'].tolist() == [[b'ABCD', b'KLMN', b'UVWX'],
+                                 [b'WXYZ', b'GHIJ', b'QRST'],
+                                 [b'STUV', b'CDEF', b'MNOP']]
+    assert arr['b'].tolist() == [[0, 1], [1000, 1001], [2000, 2001]]
+    assert m.create_array_array(0).dtype == arr.dtype
+
+
 def test_enum_array():
-    from pybind11_tests import create_enum_array, print_enum_array
+    from sys import byteorder
+    e = '<' if byteorder == 'little' else '>'
 
-    arr = create_enum_array(3)
+    arr = m.create_enum_array(3)
     dtype = arr.dtype
-    assert dtype == np.dtype([('e1', '<i8'), ('e2', 'u1')])
-    assert print_enum_array(arr) == [
+    assert dtype == np.dtype([('e1', e + 'i8'), ('e2', 'u1')])
+    assert m.print_enum_array(arr) == [
         "e1=A,e2=X",
         "e1=B,e2=Y",
         "e1=A,e2=X"
     ]
     assert arr['e1'].tolist() == [-1, 1, -1]
     assert arr['e2'].tolist() == [1, 2, 1]
-    assert create_enum_array(0).dtype == dtype
+    assert m.create_enum_array(0).dtype == dtype
 
 
-@pytest.requires_numpy
-def test_signature(doc):
-    from pybind11_tests import create_rec_nested
+def test_complex_array():
+    from sys import byteorder
+    e = '<' if byteorder == 'little' else '>'
+
+    arr = m.create_complex_array(3)
+    dtype = arr.dtype
+    assert dtype == np.dtype([('cflt', e + 'c8'), ('cdbl', e + 'c16')])
+    assert m.print_complex_array(arr) == [
+        "c:(0,0.25),(0.5,0.75)",
+        "c:(1,1.25),(1.5,1.75)",
+        "c:(2,2.25),(2.5,2.75)"
+    ]
+    assert arr['cflt'].tolist() == [0.0 + 0.25j, 1.0 + 1.25j, 2.0 + 2.25j]
+    assert arr['cdbl'].tolist() == [0.5 + 0.75j, 1.5 + 1.75j, 2.5 + 2.75j]
+    assert m.create_complex_array(0).dtype == dtype
 
-    assert doc(create_rec_nested) == "create_rec_nested(arg0: int) -> numpy.ndarray[NestedStruct]"
 
+def test_signature(doc):
+    assert doc(m.create_rec_nested) == \
+        "create_rec_nested(arg0: int) -> numpy.ndarray[NestedStruct]"
 
-@pytest.requires_numpy
-def test_scalar_conversion():
-    from pybind11_tests import (create_rec_simple, f_simple,
-                                create_rec_packed, f_packed,
-                                create_rec_nested, f_nested,
-                                create_enum_array)
 
+def test_scalar_conversion():
     n = 3
-    arrays = [create_rec_simple(n), create_rec_packed(n),
-              create_rec_nested(n), create_enum_array(n)]
-    funcs = [f_simple, f_packed, f_nested]
+    arrays = [m.create_rec_simple(n), m.create_rec_packed(n),
+              m.create_rec_nested(n), m.create_enum_array(n)]
+    funcs = [m.f_simple, m.f_packed, m.f_nested]
 
     for i, func in enumerate(funcs):
         for j, arr in enumerate(arrays):
@@ -216,10 +287,12 @@ def test_scalar_conversion():
                 assert 'incompatible function arguments' in str(excinfo.value)
 
 
-@pytest.requires_numpy
 def test_register_dtype():
-    from pybind11_tests import register_dtype
-
     with pytest.raises(RuntimeError) as excinfo:
-        register_dtype()
+        m.register_dtype()
     assert 'dtype is already registered' in str(excinfo.value)
+
+
+@pytest.requires_numpy
+def test_compare_buffer_info():
+    assert all(m.compare_buffer_info())
diff --git a/pybind11/tests/test_numpy_vectorize.cpp b/pybind11/tests/test_numpy_vectorize.cpp
index 6d94db2a1..a875a74b9 100644
--- a/pybind11/tests/test_numpy_vectorize.cpp
+++ b/pybind11/tests/test_numpy_vectorize.cpp
@@ -16,11 +16,11 @@ double my_func(int x, float y, double z) {
     return (float) x*y*z;
 }
 
-std::complex<double> my_func3(std::complex<double> c) {
-    return c * std::complex<double>(2.f);
-}
+TEST_SUBMODULE(numpy_vectorize, m) {
+    try { py::module::import("numpy"); }
+    catch (...) { return; }
 
-test_initializer numpy_vectorize([](py::module &m) {
+    // test_vectorize, test_docs, test_array_collapse
     // Vectorize all arguments of a function (though non-vector arguments are also allowed)
     m.def("vectorized_func", py::vectorize(my_func));
 
@@ -32,10 +32,58 @@ test_initializer numpy_vectorize([](py::module &m) {
     );
 
     // Vectorize a complex-valued function
-    m.def("vectorized_func3", py::vectorize(my_func3));
+    m.def("vectorized_func3", py::vectorize(
+        [](std::complex<double> c) { return c * std::complex<double>(2.f); }
+    ));
 
-    /// Numpy function which only accepts specific data types
+    // test_type_selection
+    // Numpy function which only accepts specific data types
     m.def("selective_func", [](py::array_t<int, py::array::c_style>) { return "Int branch taken."; });
     m.def("selective_func", [](py::array_t<float, py::array::c_style>) { return "Float branch taken."; });
     m.def("selective_func", [](py::array_t<std::complex<float>, py::array::c_style>) { return "Complex float branch taken."; });
-});
+
+
+    // test_passthrough_arguments
+    // Passthrough test: references and non-pod types should be automatically passed through (in the
+    // function definition below, only `b`, `d`, and `g` are vectorized):
+    struct NonPODClass {
+        NonPODClass(int v) : value{v} {}
+        int value;
+    };
+    py::class_<NonPODClass>(m, "NonPODClass").def(py::init<int>());
+    m.def("vec_passthrough", py::vectorize(
+        [](double *a, double b, py::array_t<double> c, const int &d, int &e, NonPODClass f, const double g) {
+            return *a + b + c.at(0) + d + e + f.value + g;
+        }
+    ));
+
+    // test_method_vectorization
+    struct VectorizeTestClass {
+        VectorizeTestClass(int v) : value{v} {};
+        float method(int x, float y) { return y + (float) (x + value); }
+        int value = 0;
+    };
+    py::class_<VectorizeTestClass> vtc(m, "VectorizeTestClass");
+    vtc .def(py::init<int>())
+        .def_readwrite("value", &VectorizeTestClass::value);
+
+    // Automatic vectorizing of methods
+    vtc.def("method", py::vectorize(&VectorizeTestClass::method));
+
+    // test_trivial_broadcasting
+    // Internal optimization test for whether the input is trivially broadcastable:
+    py::enum_<py::detail::broadcast_trivial>(m, "trivial")
+        .value("f_trivial", py::detail::broadcast_trivial::f_trivial)
+        .value("c_trivial", py::detail::broadcast_trivial::c_trivial)
+        .value("non_trivial", py::detail::broadcast_trivial::non_trivial);
+    m.def("vectorized_is_trivial", [](
+                py::array_t<int, py::array::forcecast> arg1,
+                py::array_t<float, py::array::forcecast> arg2,
+                py::array_t<double, py::array::forcecast> arg3
+                ) {
+        ssize_t ndim;
+        std::vector<ssize_t> shape;
+        std::array<py::buffer_info, 3> buffers {{ arg1.request(), arg2.request(), arg3.request() }};
+        return py::detail::broadcast(buffers, ndim, shape);
+    });
+}
diff --git a/pybind11/tests/test_numpy_vectorize.py b/pybind11/tests/test_numpy_vectorize.py
index 718646efa..0e9c88397 100644
--- a/pybind11/tests/test_numpy_vectorize.py
+++ b/pybind11/tests/test_numpy_vectorize.py
@@ -1,16 +1,16 @@
 import pytest
+from pybind11_tests import numpy_vectorize as m
+
+pytestmark = pytest.requires_numpy
 
 with pytest.suppress(ImportError):
     import numpy as np
 
 
-@pytest.requires_numpy
 def test_vectorize(capture):
-    from pybind11_tests import vectorized_func, vectorized_func2, vectorized_func3
-
-    assert np.isclose(vectorized_func3(np.array(3 + 7j)), [6 + 14j])
+    assert np.isclose(m.vectorized_func3(np.array(3 + 7j)), [6 + 14j])
 
-    for f in [vectorized_func, vectorized_func2]:
+    for f in [m.vectorized_func, m.vectorized_func2]:
         with capture:
             assert np.isclose(f(1, 2, 3), 6)
         assert capture == "my_func(x:int=1, y:float=2, z:float=3)"
@@ -23,6 +23,20 @@ def test_vectorize(capture):
             my_func(x:int=1, y:float=2, z:float=3)
             my_func(x:int=3, y:float=4, z:float=3)
         """
+        with capture:
+            a = np.array([[1, 2], [3, 4]], order='F')
+            b = np.array([[10, 20], [30, 40]], order='F')
+            c = 3
+            result = f(a, b, c)
+            assert np.allclose(result, a * b * c)
+            assert result.flags.f_contiguous
+        # All inputs are F order and full or singletons, so the result is in col-major order:
+        assert capture == """
+            my_func(x:int=1, y:float=10, z:float=3)
+            my_func(x:int=3, y:float=30, z:float=3)
+            my_func(x:int=2, y:float=20, z:float=3)
+            my_func(x:int=4, y:float=40, z:float=3)
+        """
         with capture:
             a, b, c = np.array([[1, 3, 5], [7, 9, 11]]), np.array([[2, 4, 6], [8, 10, 12]]), 3
             assert np.allclose(f(a, b, c), a * b * c)
@@ -56,21 +70,127 @@ def test_vectorize(capture):
             my_func(x:int=5, y:float=3, z:float=2)
             my_func(x:int=6, y:float=3, z:float=2)
         """
+        with capture:
+            a, b, c = np.array([[1, 2, 3], [4, 5, 6]], order='F'), np.array([[2], [3]]), 2
+            assert np.allclose(f(a, b, c), a * b * c)
+        assert capture == """
+            my_func(x:int=1, y:float=2, z:float=2)
+            my_func(x:int=2, y:float=2, z:float=2)
+            my_func(x:int=3, y:float=2, z:float=2)
+            my_func(x:int=4, y:float=3, z:float=2)
+            my_func(x:int=5, y:float=3, z:float=2)
+            my_func(x:int=6, y:float=3, z:float=2)
+        """
+        with capture:
+            a, b, c = np.array([[1, 2, 3], [4, 5, 6]])[::, ::2], np.array([[2], [3]]), 2
+            assert np.allclose(f(a, b, c), a * b * c)
+        assert capture == """
+            my_func(x:int=1, y:float=2, z:float=2)
+            my_func(x:int=3, y:float=2, z:float=2)
+            my_func(x:int=4, y:float=3, z:float=2)
+            my_func(x:int=6, y:float=3, z:float=2)
+        """
+        with capture:
+            a, b, c = np.array([[1, 2, 3], [4, 5, 6]], order='F')[::, ::2], np.array([[2], [3]]), 2
+            assert np.allclose(f(a, b, c), a * b * c)
+        assert capture == """
+            my_func(x:int=1, y:float=2, z:float=2)
+            my_func(x:int=3, y:float=2, z:float=2)
+            my_func(x:int=4, y:float=3, z:float=2)
+            my_func(x:int=6, y:float=3, z:float=2)
+        """
 
 
-@pytest.requires_numpy
 def test_type_selection():
-    from pybind11_tests import selective_func
-
-    assert selective_func(np.array([1], dtype=np.int32)) == "Int branch taken."
-    assert selective_func(np.array([1.0], dtype=np.float32)) == "Float branch taken."
-    assert selective_func(np.array([1.0j], dtype=np.complex64)) == "Complex float branch taken."
+    assert m.selective_func(np.array([1], dtype=np.int32)) == "Int branch taken."
+    assert m.selective_func(np.array([1.0], dtype=np.float32)) == "Float branch taken."
+    assert m.selective_func(np.array([1.0j], dtype=np.complex64)) == "Complex float branch taken."
 
 
-@pytest.requires_numpy
 def test_docs(doc):
-    from pybind11_tests import vectorized_func
-
-    assert doc(vectorized_func) == """
-        vectorized_func(arg0: numpy.ndarray[int], arg1: numpy.ndarray[float], arg2: numpy.ndarray[float]) -> object
+    assert doc(m.vectorized_func) == """
+        vectorized_func(arg0: numpy.ndarray[int32], arg1: numpy.ndarray[float32], arg2: numpy.ndarray[float64]) -> object
     """  # noqa: E501 line too long
+
+
+def test_trivial_broadcasting():
+    trivial, vectorized_is_trivial = m.trivial, m.vectorized_is_trivial
+
+    assert vectorized_is_trivial(1, 2, 3) == trivial.c_trivial
+    assert vectorized_is_trivial(np.array(1), np.array(2), 3) == trivial.c_trivial
+    assert vectorized_is_trivial(np.array([1, 3]), np.array([2, 4]), 3) == trivial.c_trivial
+    assert trivial.c_trivial == vectorized_is_trivial(
+        np.array([[1, 3, 5], [7, 9, 11]]), np.array([[2, 4, 6], [8, 10, 12]]), 3)
+    assert vectorized_is_trivial(
+        np.array([[1, 2, 3], [4, 5, 6]]), np.array([2, 3, 4]), 2) == trivial.non_trivial
+    assert vectorized_is_trivial(
+        np.array([[1, 2, 3], [4, 5, 6]]), np.array([[2], [3]]), 2) == trivial.non_trivial
+    z1 = np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype='int32')
+    z2 = np.array(z1, dtype='float32')
+    z3 = np.array(z1, dtype='float64')
+    assert vectorized_is_trivial(z1, z2, z3) == trivial.c_trivial
+    assert vectorized_is_trivial(1, z2, z3) == trivial.c_trivial
+    assert vectorized_is_trivial(z1, 1, z3) == trivial.c_trivial
+    assert vectorized_is_trivial(z1, z2, 1) == trivial.c_trivial
+    assert vectorized_is_trivial(z1[::2, ::2], 1, 1) == trivial.non_trivial
+    assert vectorized_is_trivial(1, 1, z1[::2, ::2]) == trivial.c_trivial
+    assert vectorized_is_trivial(1, 1, z3[::2, ::2]) == trivial.non_trivial
+    assert vectorized_is_trivial(z1, 1, z3[1::4, 1::4]) == trivial.c_trivial
+
+    y1 = np.array(z1, order='F')
+    y2 = np.array(y1)
+    y3 = np.array(y1)
+    assert vectorized_is_trivial(y1, y2, y3) == trivial.f_trivial
+    assert vectorized_is_trivial(y1, 1, 1) == trivial.f_trivial
+    assert vectorized_is_trivial(1, y2, 1) == trivial.f_trivial
+    assert vectorized_is_trivial(1, 1, y3) == trivial.f_trivial
+    assert vectorized_is_trivial(y1, z2, 1) == trivial.non_trivial
+    assert vectorized_is_trivial(z1[1::4, 1::4], y2, 1) == trivial.f_trivial
+    assert vectorized_is_trivial(y1[1::4, 1::4], z2, 1) == trivial.c_trivial
+
+    assert m.vectorized_func(z1, z2, z3).flags.c_contiguous
+    assert m.vectorized_func(y1, y2, y3).flags.f_contiguous
+    assert m.vectorized_func(z1, 1, 1).flags.c_contiguous
+    assert m.vectorized_func(1, y2, 1).flags.f_contiguous
+    assert m.vectorized_func(z1[1::4, 1::4], y2, 1).flags.f_contiguous
+    assert m.vectorized_func(y1[1::4, 1::4], z2, 1).flags.c_contiguous
+
+
+def test_passthrough_arguments(doc):
+    assert doc(m.vec_passthrough) == (
+        "vec_passthrough(" + ", ".join([
+            "arg0: float",
+            "arg1: numpy.ndarray[float64]",
+            "arg2: numpy.ndarray[float64]",
+            "arg3: numpy.ndarray[int32]",
+            "arg4: int",
+            "arg5: m.numpy_vectorize.NonPODClass",
+            "arg6: numpy.ndarray[float64]"]) + ") -> object")
+
+    b = np.array([[10, 20, 30]], dtype='float64')
+    c = np.array([100, 200])  # NOT a vectorized argument
+    d = np.array([[1000], [2000], [3000]], dtype='int')
+    g = np.array([[1000000, 2000000, 3000000]], dtype='int')  # requires casting
+    assert np.all(
+        m.vec_passthrough(1, b, c, d, 10000, m.NonPODClass(100000), g) ==
+        np.array([[1111111, 2111121, 3111131],
+                  [1112111, 2112121, 3112131],
+                  [1113111, 2113121, 3113131]]))
+
+
+def test_method_vectorization():
+    o = m.VectorizeTestClass(3)
+    x = np.array([1, 2], dtype='int')
+    y = np.array([[10], [20]], dtype='float32')
+    assert np.all(o.method(x, y) == [[14, 15], [24, 25]])
+
+
+def test_array_collapse():
+    assert not isinstance(m.vectorized_func(1, 2, 3), np.ndarray)
+    assert not isinstance(m.vectorized_func(np.array(1), 2, 3), np.ndarray)
+    z = m.vectorized_func([1], 2, 3)
+    assert isinstance(z, np.ndarray)
+    assert z.shape == (1, )
+    z = m.vectorized_func(1, [[[2]]], 3)
+    assert isinstance(z, np.ndarray)
+    assert z.shape == (1, 1, 1)
diff --git a/pybind11/tests/test_opaque_types.cpp b/pybind11/tests/test_opaque_types.cpp
index 54f4dc7a5..5e83df0f6 100644
--- a/pybind11/tests/test_opaque_types.cpp
+++ b/pybind11/tests/test_opaque_types.cpp
@@ -11,17 +11,13 @@
 #include <pybind11/stl.h>
 #include <vector>
 
-typedef std::vector<std::string> StringList;
-
-class ClassWithSTLVecProperty {
-public:
-    StringList stringList;
-};
+using StringList = std::vector<std::string>;
 
 /* IMPORTANT: Disable internal pybind11 translation mechanisms for STL data structures */
 PYBIND11_MAKE_OPAQUE(StringList);
 
-test_initializer opaque_types([](py::module &m) {
+TEST_SUBMODULE(opaque_types, m) {
+    // test_string_list
     py::class_<StringList>(m, "StringList")
         .def(py::init<>())
         .def("pop_back", &StringList::pop_back)
@@ -33,6 +29,10 @@ test_initializer opaque_types([](py::module &m) {
            return py::make_iterator(v.begin(), v.end());
         }, py::keep_alive<0, 1>());
 
+    class ClassWithSTLVecProperty {
+    public:
+        StringList stringList;
+    };
     py::class_<ClassWithSTLVecProperty>(m, "ClassWithSTLVecProperty")
         .def(py::init<>())
         .def_readwrite("stringList", &ClassWithSTLVecProperty::stringList);
@@ -49,6 +49,7 @@ test_initializer opaque_types([](py::module &m) {
         return ret + "]";
     });
 
+    // test_pointers
     m.def("return_void_ptr", []() { return (void *) 0x1234; });
     m.def("get_void_ptr_value", [](void *ptr) { return reinterpret_cast<std::intptr_t>(ptr); });
     m.def("return_null_str", []() { return (char *) nullptr; });
@@ -59,4 +60,4 @@ test_initializer opaque_types([](py::module &m) {
         result->push_back("some value");
         return std::unique_ptr<StringList>(result);
     });
-});
+}
diff --git a/pybind11/tests/test_opaque_types.py b/pybind11/tests/test_opaque_types.py
index 7781943b4..2d3aef5d1 100644
--- a/pybind11/tests/test_opaque_types.py
+++ b/pybind11/tests/test_opaque_types.py
@@ -1,39 +1,36 @@
 import pytest
+from pybind11_tests import opaque_types as m
+from pybind11_tests import ConstructorStats, UserType
 
 
 def test_string_list():
-    from pybind11_tests import StringList, ClassWithSTLVecProperty, print_opaque_list
-
-    l = StringList()
+    l = m.StringList()
     l.push_back("Element 1")
     l.push_back("Element 2")
-    assert print_opaque_list(l) == "Opaque list: [Element 1, Element 2]"
+    assert m.print_opaque_list(l) == "Opaque list: [Element 1, Element 2]"
     assert l.back() == "Element 2"
 
     for i, k in enumerate(l, start=1):
         assert k == "Element {}".format(i)
     l.pop_back()
-    assert print_opaque_list(l) == "Opaque list: [Element 1]"
+    assert m.print_opaque_list(l) == "Opaque list: [Element 1]"
 
-    cvp = ClassWithSTLVecProperty()
-    assert print_opaque_list(cvp.stringList) == "Opaque list: []"
+    cvp = m.ClassWithSTLVecProperty()
+    assert m.print_opaque_list(cvp.stringList) == "Opaque list: []"
 
     cvp.stringList = l
     cvp.stringList.push_back("Element 3")
-    assert print_opaque_list(cvp.stringList) == "Opaque list: [Element 1, Element 3]"
+    assert m.print_opaque_list(cvp.stringList) == "Opaque list: [Element 1, Element 3]"
 
 
 def test_pointers(msg):
-    from pybind11_tests import (return_void_ptr, get_void_ptr_value, ExampleMandA,
-                                print_opaque_list, return_null_str, get_null_str_value,
-                                return_unique_ptr, ConstructorStats)
-
-    assert get_void_ptr_value(return_void_ptr()) == 0x1234
-    assert get_void_ptr_value(ExampleMandA())  # Should also work for other C++ types
-    assert ConstructorStats.get(ExampleMandA).alive() == 0
+    living_before = ConstructorStats.get(UserType).alive()
+    assert m.get_void_ptr_value(m.return_void_ptr()) == 0x1234
+    assert m.get_void_ptr_value(UserType())  # Should also work for other C++ types
+    assert ConstructorStats.get(UserType).alive() == living_before
 
     with pytest.raises(TypeError) as excinfo:
-        get_void_ptr_value([1, 2, 3])  # This should not work
+        m.get_void_ptr_value([1, 2, 3])  # This should not work
     assert msg(excinfo.value) == """
         get_void_ptr_value(): incompatible function arguments. The following argument types are supported:
             1. (arg0: capsule) -> int
@@ -41,9 +38,9 @@ def test_pointers(msg):
         Invoked with: [1, 2, 3]
     """  # noqa: E501 line too long
 
-    assert return_null_str() is None
-    assert get_null_str_value(return_null_str()) is not None
+    assert m.return_null_str() is None
+    assert m.get_null_str_value(m.return_null_str()) is not None
 
-    ptr = return_unique_ptr()
+    ptr = m.return_unique_ptr()
     assert "StringList" in repr(ptr)
-    assert print_opaque_list(ptr) == "Opaque list: [some value]"
+    assert m.print_opaque_list(ptr) == "Opaque list: [some value]"
diff --git a/pybind11/tests/test_operator_overloading.cpp b/pybind11/tests/test_operator_overloading.cpp
index 93aea8010..4ad34d104 100644
--- a/pybind11/tests/test_operator_overloading.cpp
+++ b/pybind11/tests/test_operator_overloading.cpp
@@ -10,28 +10,18 @@
 #include "pybind11_tests.h"
 #include "constructor_stats.h"
 #include <pybind11/operators.h>
+#include <functional>
 
 class Vector2 {
 public:
     Vector2(float x, float y) : x(x), y(y) { print_created(this, toString()); }
     Vector2(const Vector2 &v) : x(v.x), y(v.y) { print_copy_created(this); }
     Vector2(Vector2 &&v) : x(v.x), y(v.y) { print_move_created(this); v.x = v.y = 0; }
+    Vector2 &operator=(const Vector2 &v) { x = v.x; y = v.y; print_copy_assigned(this); return *this; }
+    Vector2 &operator=(Vector2 &&v) { x = v.x; y = v.y; v.x = v.y = 0; print_move_assigned(this); return *this; }
     ~Vector2() { print_destroyed(this); }
 
-    std::string toString() const {
-        return "[" + std::to_string(x) + ", " + std::to_string(y) + "]";
-    }
-
-    void operator=(const Vector2 &v) {
-        print_copy_assigned(this);
-        x = v.x;
-        y = v.y;
-    }
-
-    void operator=(Vector2 &&v) {
-        print_move_assigned(this);
-        x = v.x; y = v.y; v.x = v.y = 0;
-    }
+    std::string toString() const { return "[" + std::to_string(x) + ", " + std::to_string(y) + "]"; }
 
     Vector2 operator+(const Vector2 &v) const { return Vector2(x + v.x, y + v.y); }
     Vector2 operator-(const Vector2 &v) const { return Vector2(x - v.x, y - v.y); }
@@ -39,10 +29,14 @@ public:
     Vector2 operator+(float value) const { return Vector2(x + value, y + value); }
     Vector2 operator*(float value) const { return Vector2(x * value, y * value); }
     Vector2 operator/(float value) const { return Vector2(x / value, y / value); }
+    Vector2 operator*(const Vector2 &v) const { return Vector2(x * v.x, y * v.y); }
+    Vector2 operator/(const Vector2 &v) const { return Vector2(x / v.x, y / v.y); }
     Vector2& operator+=(const Vector2 &v) { x += v.x; y += v.y; return *this; }
     Vector2& operator-=(const Vector2 &v) { x -= v.x; y -= v.y; return *this; }
     Vector2& operator*=(float v) { x *= v; y *= v; return *this; }
     Vector2& operator/=(float v) { x /= v; y /= v; return *this; }
+    Vector2& operator*=(const Vector2 &v) { x *= v.x; y *= v.y; return *this; }
+    Vector2& operator/=(const Vector2 &v) { x /= v.x; y /= v.y; return *this; }
 
     friend Vector2 operator+(float f, const Vector2 &v) { return Vector2(f + v.x, f + v.y); }
     friend Vector2 operator-(float f, const Vector2 &v) { return Vector2(f - v.x, f - v.y); }
@@ -52,7 +46,25 @@ private:
     float x, y;
 };
 
-test_initializer operator_overloading([](py::module &m) {
+class C1 { };
+class C2 { };
+
+int operator+(const C1 &, const C1 &) { return 11; }
+int operator+(const C2 &, const C2 &) { return 22; }
+int operator+(const C2 &, const C1 &) { return 21; }
+int operator+(const C1 &, const C2 &) { return 12; }
+
+namespace std {
+    template<>
+    struct hash<Vector2> {
+        // Not a good hash function, but easy to test; const-qualified so a const hasher can call it
+        size_t operator()(const Vector2 &) const { return 4; }
+    };
+}
+
+TEST_SUBMODULE(operators, m) {
+
+    // test_operator_overloading
     py::class_<Vector2>(m, "Vector2")
         .def(py::init<float, float>())
         .def(py::self + py::self)
@@ -61,16 +73,74 @@ test_initializer operator_overloading([](py::module &m) {
         .def(py::self - float())
         .def(py::self * float())
         .def(py::self / float())
+        .def(py::self * py::self)
+        .def(py::self / py::self)
         .def(py::self += py::self)
         .def(py::self -= py::self)
         .def(py::self *= float())
         .def(py::self /= float())
+        .def(py::self *= py::self)
+        .def(py::self /= py::self)
         .def(float() + py::self)
         .def(float() - py::self)
         .def(float() * py::self)
         .def(float() / py::self)
         .def("__str__", &Vector2::toString)
+        .def(hash(py::self))
         ;
 
     m.attr("Vector") = m.attr("Vector2");
-});
+
+    // test_operators_notimplemented
+    // #393: need to return NotImplemented to ensure correct arithmetic operator behavior
+    py::class_<C1>(m, "C1")
+        .def(py::init<>())
+        .def(py::self + py::self);
+
+    py::class_<C2>(m, "C2")
+        .def(py::init<>())
+        .def(py::self + py::self)
+        .def("__add__", [](const C2& c2, const C1& c1) { return c2 + c1; })
+        .def("__radd__", [](const C2& c2, const C1& c1) { return c1 + c2; });
+
+    // test_nested
+    // #328: first member in a class can't be used in operators
+    struct NestABase { int value = -2; };
+    py::class_<NestABase>(m, "NestABase")
+        .def(py::init<>())
+        .def_readwrite("value", &NestABase::value);
+
+    struct NestA : NestABase {
+        int value = 3;
+        NestA& operator+=(int i) { value += i; return *this; }
+    };
+    py::class_<NestA>(m, "NestA")
+        .def(py::init<>())
+        .def(py::self += int())
+        .def("as_base", [](NestA &a) -> NestABase& {
+            return (NestABase&) a;
+        }, py::return_value_policy::reference_internal);
+    m.def("get_NestA", [](const NestA &a) { return a.value; });
+
+    struct NestB {
+        NestA a;
+        int value = 4;
+        NestB& operator-=(int i) { value -= i; return *this; }
+    };
+    py::class_<NestB>(m, "NestB")
+        .def(py::init<>())
+        .def(py::self -= int())
+        .def_readwrite("a", &NestB::a);
+    m.def("get_NestB", [](const NestB &b) { return b.value; });
+
+    struct NestC {
+        NestB b;
+        int value = 5;
+        NestC& operator*=(int i) { value *= i; return *this; }
+    };
+    py::class_<NestC>(m, "NestC")
+        .def(py::init<>())
+        .def(py::self *= int())
+        .def_readwrite("b", &NestC::b);
+    m.def("get_NestC", [](const NestC &c) { return c.value; });
+}
diff --git a/pybind11/tests/test_operator_overloading.py b/pybind11/tests/test_operator_overloading.py
index 02ccb9633..0d80e5ed3 100644
--- a/pybind11/tests/test_operator_overloading.py
+++ b/pybind11/tests/test_operator_overloading.py
@@ -1,8 +1,11 @@
-def test_operator_overloading():
-    from pybind11_tests import Vector2, Vector, ConstructorStats
+import pytest
+from pybind11_tests import operators as m
+from pybind11_tests import ConstructorStats
+
 
-    v1 = Vector2(1, 2)
-    v2 = Vector(3, -1)
+def test_operator_overloading():
+    v1 = m.Vector2(1, 2)
+    v2 = m.Vector(3, -1)
     assert str(v1) == "[1.000000, 2.000000]"
     assert str(v2) == "[3.000000, -1.000000]"
 
@@ -16,12 +19,25 @@ def test_operator_overloading():
     assert str(8 + v1) == "[9.000000, 10.000000]"
     assert str(8 * v1) == "[8.000000, 16.000000]"
     assert str(8 / v1) == "[8.000000, 4.000000]"
+    assert str(v1 * v2) == "[3.000000, -2.000000]"
+    assert str(v2 / v1) == "[3.000000, -0.500000]"
 
-    v1 += v2
+    v1 += 2 * v2
+    assert str(v1) == "[7.000000, 0.000000]"
+    v1 -= v2
+    assert str(v1) == "[4.000000, 1.000000]"
     v1 *= 2
     assert str(v1) == "[8.000000, 2.000000]"
+    v1 /= 16
+    assert str(v1) == "[0.500000, 0.125000]"
+    v1 *= v2
+    assert str(v1) == "[1.500000, -0.125000]"
+    v2 /= v1
+    assert str(v2) == "[2.000000, 8.000000]"
+
+    assert hash(v1) == 4
 
-    cstats = ConstructorStats.get(Vector2)
+    cstats = ConstructorStats.get(m.Vector2)
     assert cstats.alive() == 2
     del v1
     assert cstats.alive() == 1
@@ -32,9 +48,59 @@ def test_operator_overloading():
                                '[-7.000000, -6.000000]', '[9.000000, 10.000000]',
                                '[8.000000, 16.000000]', '[0.125000, 0.250000]',
                                '[7.000000, 6.000000]', '[9.000000, 10.000000]',
-                               '[8.000000, 16.000000]', '[8.000000, 4.000000]']
+                               '[8.000000, 16.000000]', '[8.000000, 4.000000]',
+                               '[3.000000, -2.000000]', '[3.000000, -0.500000]',
+                               '[6.000000, -2.000000]']
     assert cstats.default_constructions == 0
     assert cstats.copy_constructions == 0
     assert cstats.move_constructions >= 10
     assert cstats.copy_assignments == 0
     assert cstats.move_assignments == 0
+
+
+def test_operators_notimplemented():
+    """#393: need to return NotImplemented to ensure correct arithmetic operator behavior"""
+
+    c1, c2 = m.C1(), m.C2()
+    assert c1 + c1 == 11
+    assert c2 + c2 == 22
+    assert c2 + c1 == 21
+    assert c1 + c2 == 12
+
+
+def test_nested():
+    """#328: first member in a class can't be used in operators"""
+
+    a = m.NestA()
+    b = m.NestB()
+    c = m.NestC()
+
+    a += 10
+    assert m.get_NestA(a) == 13
+    b.a += 100
+    assert m.get_NestA(b.a) == 103
+    c.b.a += 1000
+    assert m.get_NestA(c.b.a) == 1003
+    b -= 1
+    assert m.get_NestB(b) == 3
+    c.b -= 3
+    assert m.get_NestB(c.b) == 1
+    c *= 7
+    assert m.get_NestC(c) == 35
+
+    abase = a.as_base()
+    assert abase.value == -2
+    a.as_base().value += 44
+    assert abase.value == 42
+    assert c.b.a.as_base().value == -2
+    c.b.a.as_base().value += 44
+    assert c.b.a.as_base().value == 42
+
+    del c
+    pytest.gc_collect()
+    del a  # Shouldn't delete while abase is still alive
+    pytest.gc_collect()
+
+    assert abase.value == 42
+    del abase, b
+    pytest.gc_collect()
diff --git a/pybind11/tests/test_pickling.cpp b/pybind11/tests/test_pickling.cpp
index 52b1dbc30..9dc63bda3 100644
--- a/pybind11/tests/test_pickling.cpp
+++ b/pybind11/tests/test_pickling.cpp
@@ -9,30 +9,28 @@
 
 #include "pybind11_tests.h"
 
-class Pickleable {
-public:
-    Pickleable(const std::string &value) : m_value(value) { }
-    const std::string &value() const { return m_value; }
-
-    void setExtra1(int extra1) { m_extra1 = extra1; }
-    void setExtra2(int extra2) { m_extra2 = extra2; }
-    int extra1() const { return m_extra1; }
-    int extra2() const { return m_extra2; }
-private:
-    std::string m_value;
-    int m_extra1 = 0;
-    int m_extra2 = 0;
-};
-
-class PickleableWithDict {
-public:
-    PickleableWithDict(const std::string &value) : value(value) { }
-
-    std::string value;
-    int extra;
-};
-
-test_initializer pickling([](py::module &m) {
+TEST_SUBMODULE(pickling, m) {
+    // test_roundtrip
+    class Pickleable {
+    public:
+        Pickleable(const std::string &value) : m_value(value) { }
+        const std::string &value() const { return m_value; }
+
+        void setExtra1(int extra1) { m_extra1 = extra1; }
+        void setExtra2(int extra2) { m_extra2 = extra2; }
+        int extra1() const { return m_extra1; }
+        int extra2() const { return m_extra2; }
+    private:
+        std::string m_value;
+        int m_extra1 = 0;
+        int m_extra2 = 0;
+    };
+
+    class PickleableNew : public Pickleable {
+    public:
+        using Pickleable::Pickleable;
+    };
+
     py::class_<Pickleable>(m, "Pickleable")
         .def(py::init<std::string>())
         .def("value", &Pickleable::value)
@@ -57,7 +55,38 @@ test_initializer pickling([](py::module &m) {
             p.setExtra2(t[2].cast<int>());
         });
 
+    py::class_<PickleableNew, Pickleable>(m, "PickleableNew")
+        .def(py::init<std::string>())
+        .def(py::pickle(
+            [](const PickleableNew &p) {
+                return py::make_tuple(p.value(), p.extra1(), p.extra2());
+            },
+            [](py::tuple t) {
+                if (t.size() != 3)
+                    throw std::runtime_error("Invalid state!");
+                auto p = PickleableNew(t[0].cast<std::string>());
+
+                p.setExtra1(t[1].cast<int>());
+                p.setExtra2(t[2].cast<int>());
+                return p;
+            }
+        ));
+
 #if !defined(PYPY_VERSION)
+    // test_roundtrip_with_dict
+    class PickleableWithDict {
+    public:
+        PickleableWithDict(const std::string &value) : value(value) { }
+
+        std::string value;
+        int extra = 0;  // zero-initialised so reading it before assignment is well-defined
+    };
+
+    class PickleableWithDictNew : public PickleableWithDict {
+    public:
+        using PickleableWithDict::PickleableWithDict;
+    };
+
     py::class_<PickleableWithDict>(m, "PickleableWithDict", py::dynamic_attr())
         .def(py::init<std::string>())
         .def_readwrite("value", &PickleableWithDict::value)
@@ -79,5 +108,23 @@ test_initializer pickling([](py::module &m) {
             /* Assign Python state */
             self.attr("__dict__") = t[2];
         });
+
+    py::class_<PickleableWithDictNew, PickleableWithDict>(m, "PickleableWithDictNew")
+        .def(py::init<std::string>())
+        .def(py::pickle(
+            [](py::object self) {
+                return py::make_tuple(self.attr("value"), self.attr("extra"), self.attr("__dict__"));
+            },
+            [](const py::tuple &t) {
+                if (t.size() != 3)
+                    throw std::runtime_error("Invalid state!");
+
+                auto cpp_state = PickleableWithDictNew(t[0].cast<std::string>());
+                cpp_state.extra = t[1].cast<int>();
+
+                auto py_state = t[2].cast<py::dict>();
+                return std::make_pair(cpp_state, py_state);
+            }
+        ));
 #endif
-});
+}
diff --git a/pybind11/tests/test_pickling.py b/pybind11/tests/test_pickling.py
index 548c618af..707d34786 100644
--- a/pybind11/tests/test_pickling.py
+++ b/pybind11/tests/test_pickling.py
@@ -1,4 +1,5 @@
 import pytest
+from pybind11_tests import pickling as m
 
 try:
     import cPickle as pickle  # Use cPickle on Python 2.7
@@ -6,10 +7,10 @@ except ImportError:
     import pickle
 
 
-def test_roundtrip():
-    from pybind11_tests import Pickleable
-
-    p = Pickleable("test_value")
+@pytest.mark.parametrize("cls_name", ["Pickleable", "PickleableNew"])
+def test_roundtrip(cls_name):
+    cls = getattr(m, cls_name)
+    p = cls("test_value")
     p.setExtra1(15)
     p.setExtra2(48)
 
@@ -21,10 +22,10 @@ def test_roundtrip():
 
 
 @pytest.unsupported_on_pypy
-def test_roundtrip_with_dict():
-    from pybind11_tests import PickleableWithDict
-
-    p = PickleableWithDict("test_value")
+@pytest.mark.parametrize("cls_name", ["PickleableWithDict", "PickleableWithDictNew"])
+def test_roundtrip_with_dict(cls_name):
+    cls = getattr(m, cls_name)
+    p = cls("test_value")
     p.extra = 15
     p.dynamic = "Attribute"
 
diff --git a/pybind11/tests/test_python_types.cpp b/pybind11/tests/test_python_types.cpp
deleted file mode 100644
index e1598e9ef..000000000
--- a/pybind11/tests/test_python_types.cpp
+++ /dev/null
@@ -1,429 +0,0 @@
-/*
-    tests/test_python_types.cpp -- singleton design pattern, static functions and
-    variables, passing and interacting with Python types
-
-    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
-
-    All rights reserved. Use of this source code is governed by a
-    BSD-style license that can be found in the LICENSE file.
-*/
-
-#include "pybind11_tests.h"
-#include "constructor_stats.h"
-#include <pybind11/stl.h>
-
-#ifdef _WIN32
-#  include <io.h>
-#  include <fcntl.h>
-#endif
-
-class ExamplePythonTypes {
-public:
-    static ExamplePythonTypes *new_instance() {
-        auto *ptr = new ExamplePythonTypes();
-        print_created(ptr, "via new_instance");
-        return ptr;
-    }
-    ~ExamplePythonTypes() { print_destroyed(this); }
-
-    /* Create and return a Python dictionary */
-    py::dict get_dict() {
-        py::dict dict;
-        dict[py::str("key")] = py::str("value");
-        return dict;
-    }
-
-    /* Create and return a Python set */
-    py::set get_set() {
-        py::set set;
-        set.add(py::str("key1"));
-        set.add("key2");
-        set.add(std::string("key3"));
-        return set;
-    }
-
-    /* Create and return a C++ dictionary */
-    std::map<std::string, std::string> get_dict_2() {
-        std::map<std::string, std::string> result;
-        result["key"] = "value";
-        return result;
-    }
-
-    /* Create and return a C++ set */
-    std::set<std::string> get_set_2() {
-        std::set<std::string> result;
-        result.insert("key1");
-        result.insert("key2");
-        return result;
-    }
-
-    /* Create, manipulate, and return a Python list */
-    py::list get_list() {
-        py::list list;
-        list.append("value");
-        py::print("Entry at position 0:", list[0]);
-        list[0] = py::str("overwritten");
-        return list;
-    }
-
-    /* C++ STL data types are automatically casted */
-    std::vector<std::wstring> get_list_2() {
-        std::vector<std::wstring> list;
-        list.push_back(L"value");
-        return list;
-    }
-
-    /* C++ STL data types are automatically casted */
-    std::array<std::string, 2> get_array() {
-        return std::array<std::string, 2> {{ "array entry 1" , "array entry 2"}};
-    }
-
-    std::valarray<int> get_valarray() {
-        return std::valarray<int>({ 1, 4, 9 });
-    }
-
-    /* Easily iterate over a dictionary using a C++11 range-based for loop */
-    void print_dict(py::dict dict) {
-        for (auto item : dict)
-            py::print("key: {}, value={}"_s.format(item.first, item.second));
-    }
-
-    /* Easily iterate over a set using a C++11 range-based for loop */
-    void print_set(py::set set) {
-        for (auto item : set)
-            py::print("key:", item);
-    }
-
-    /* Easily iterate over a list using a C++11 range-based for loop */
-    void print_list(py::list list) {
-        int index = 0;
-        for (auto item : list)
-            py::print("list item {}: {}"_s.format(index++, item));
-    }
-
-    /* STL data types (such as maps) are automatically casted from Python */
-    void print_dict_2(const std::map<std::string, std::string> &dict) {
-        for (auto item : dict)
-            py::print("key: {}, value={}"_s.format(item.first, item.second));
-    }
-
-    /* STL data types (such as sets) are automatically casted from Python */
-    void print_set_2(const std::set<std::string> &set) {
-        for (auto item : set)
-            py::print("key:", item);
-    }
-
-    /* STL data types (such as vectors) are automatically casted from Python */
-    void print_list_2(std::vector<std::wstring> &list) {
-        int index = 0;
-        for (auto item : list)
-            py::print("list item {}: {}"_s.format(index++, item));
-    }
-
-    /* pybind automatically translates between C++11 and Python tuples */
-    std::pair<std::string, bool> pair_passthrough(std::pair<bool, std::string> input) {
-        return std::make_pair(input.second, input.first);
-    }
-
-    /* pybind automatically translates between C++11 and Python tuples */
-    std::tuple<int, std::string, bool> tuple_passthrough(std::tuple<bool, std::string, int> input) {
-        return std::make_tuple(std::get<2>(input), std::get<1>(input), std::get<0>(input));
-    }
-
-    /* STL data types (such as arrays) are automatically casted from Python */
-    void print_array(std::array<std::string, 2> &array) {
-        int index = 0;
-        for (auto item : array)
-            py::print("array item {}: {}"_s.format(index++, item));
-    }
-
-    void print_valarray(std::valarray<int> &varray) {
-        int index = 0;
-        for (auto item : varray)
-            py::print("valarray item {}: {}"_s.format(index++, item));
-    }
-
-    void throw_exception() {
-        throw std::runtime_error("This exception was intentionally thrown.");
-    }
-
-    py::bytes get_bytes_from_string() {
-        return (py::bytes) std::string("foo");
-    }
-
-    py::bytes get_bytes_from_str() {
-        return (py::bytes) py::str("bar", 3);
-    }
-
-    py::str get_str_from_string() {
-        return (py::str) std::string("baz");
-    }
-
-    py::str get_str_from_bytes() {
-        return (py::str) py::bytes("boo", 3);
-    }
-
-    void test_print(const py::object& obj) {
-        py::print(py::str(obj));
-        py::print(py::repr(obj));
-    }
-
-    static int value;
-    static const int value2;
-};
-
-int ExamplePythonTypes::value = 0;
-const int ExamplePythonTypes::value2 = 5;
-
-struct MoveOutContainer {
-    struct Value { int value; };
-
-    std::list<Value> move_list() const { return {{0}, {1}, {2}}; }
-};
-
-
-test_initializer python_types([](py::module &m) {
-    /* No constructor is explicitly defined below. An exception is raised when
-       trying to construct it directly from Python */
-    py::class_<ExamplePythonTypes>(m, "ExamplePythonTypes", "Example 2 documentation", py::metaclass())
-        .def("get_dict", &ExamplePythonTypes::get_dict, "Return a Python dictionary")
-        .def("get_dict_2", &ExamplePythonTypes::get_dict_2, "Return a C++ dictionary")
-        .def("get_list", &ExamplePythonTypes::get_list, "Return a Python list")
-        .def("get_list_2", &ExamplePythonTypes::get_list_2, "Return a C++ list")
-        .def("get_set", &ExamplePythonTypes::get_set, "Return a Python set")
-        .def("get_set2", &ExamplePythonTypes::get_set_2, "Return a C++ set")
-        .def("get_array", &ExamplePythonTypes::get_array, "Return a C++ array")
-        .def("get_valarray", &ExamplePythonTypes::get_valarray, "Return a C++ valarray")
-        .def("print_dict", &ExamplePythonTypes::print_dict, "Print entries of a Python dictionary")
-        .def("print_dict_2", &ExamplePythonTypes::print_dict_2, "Print entries of a C++ dictionary")
-        .def("print_set", &ExamplePythonTypes::print_set, "Print entries of a Python set")
-        .def("print_set_2", &ExamplePythonTypes::print_set_2, "Print entries of a C++ set")
-        .def("print_list", &ExamplePythonTypes::print_list, "Print entries of a Python list")
-        .def("print_list_2", &ExamplePythonTypes::print_list_2, "Print entries of a C++ list")
-        .def("print_array", &ExamplePythonTypes::print_array, "Print entries of a C++ array")
-        .def("print_valarray", &ExamplePythonTypes::print_valarray, "Print entries of a C++ valarray")
-        .def("pair_passthrough", &ExamplePythonTypes::pair_passthrough, "Return a pair in reversed order")
-        .def("tuple_passthrough", &ExamplePythonTypes::tuple_passthrough, "Return a triple in reversed order")
-        .def("throw_exception", &ExamplePythonTypes::throw_exception, "Throw an exception")
-        .def("get_bytes_from_string", &ExamplePythonTypes::get_bytes_from_string, "py::bytes from std::string")
-        .def("get_bytes_from_str", &ExamplePythonTypes::get_bytes_from_str, "py::bytes from py::str")
-        .def("get_str_from_string", &ExamplePythonTypes::get_str_from_string, "py::str from std::string")
-        .def("get_str_from_bytes", &ExamplePythonTypes::get_str_from_bytes, "py::str from py::bytes")
-        .def("test_print", &ExamplePythonTypes::test_print, "test the print function")
-        .def_static("new_instance", &ExamplePythonTypes::new_instance, "Return an instance")
-        .def_readwrite_static("value", &ExamplePythonTypes::value, "Static value member")
-        .def_readonly_static("value2", &ExamplePythonTypes::value2, "Static value member (readonly)");
-
-    m.def("test_print_function", []() {
-        py::print("Hello, World!");
-        py::print(1, 2.0, "three", true, std::string("-- multiple args"));
-        auto args = py::make_tuple("and", "a", "custom", "separator");
-        py::print("*args", *args, "sep"_a="-");
-        py::print("no new line here", "end"_a=" -- ");
-        py::print("next print");
-
-        auto py_stderr = py::module::import("sys").attr("stderr");
-        py::print("this goes to stderr", "file"_a=py_stderr);
-
-        py::print("flush", "flush"_a=true);
-
-        py::print("{a} + {b} = {c}"_s.format("a"_a="py::print", "b"_a="str.format", "c"_a="this"));
-    });
-
-    m.def("test_str_format", []() {
-        auto s1 = "{} + {} = {}"_s.format(1, 2, 3);
-        auto s2 = "{a} + {b} = {c}"_s.format("a"_a=1, "b"_a=2, "c"_a=3);
-        return py::make_tuple(s1, s2);
-    });
-
-    m.def("test_dict_keyword_constructor", []() {
-        auto d1 = py::dict("x"_a=1, "y"_a=2);
-        auto d2 = py::dict("z"_a=3, **d1);
-        return d2;
-    });
-
-    m.def("test_accessor_api", [](py::object o) {
-        auto d = py::dict();
-
-        d["basic_attr"] = o.attr("basic_attr");
-
-        auto l = py::list();
-        for (const auto &item : o.attr("begin_end")) {
-            l.append(item);
-        }
-        d["begin_end"] = l;
-
-        d["operator[object]"] = o.attr("d")["operator[object]"_s];
-        d["operator[char *]"] = o.attr("d")["operator[char *]"];
-
-        d["attr(object)"] = o.attr("sub").attr("attr_obj");
-        d["attr(char *)"] = o.attr("sub").attr("attr_char");
-        try {
-            o.attr("sub").attr("missing").ptr();
-        } catch (const py::error_already_set &) {
-            d["missing_attr_ptr"] = "raised"_s;
-        }
-        try {
-            o.attr("missing").attr("doesn't matter");
-        } catch (const py::error_already_set &) {
-            d["missing_attr_chain"] = "raised"_s;
-        }
-
-        d["is_none"] = o.attr("basic_attr").is_none();
-
-        d["operator()"] = o.attr("func")(1);
-        d["operator*"] = o.attr("func")(*o.attr("begin_end"));
-
-        return d;
-    });
-
-    m.def("test_tuple_accessor", [](py::tuple existing_t) {
-        try {
-            existing_t[0] = 1;
-        } catch (const py::error_already_set &) {
-            // --> Python system error
-            // Only new tuples (refcount == 1) are mutable
-            auto new_t = py::tuple(3);
-            for (size_t i = 0; i < new_t.size(); ++i) {
-                new_t[i] = i;
-            }
-            return new_t;
-        }
-        return py::tuple();
-    });
-
-    m.def("test_accessor_assignment", []() {
-        auto l = py::list(1);
-        l[0] = 0;
-
-        auto d = py::dict();
-        d["get"] = l[0];
-        auto var = l[0];
-        d["deferred_get"] = var;
-        l[0] = 1;
-        d["set"] = l[0];
-        var = 99; // this assignment should not overwrite l[0]
-        d["deferred_set"] = l[0];
-        d["var"] = var;
-
-        return d;
-    });
-
-    bool has_optional = false, has_exp_optional = false;
-#ifdef PYBIND11_HAS_OPTIONAL
-    has_optional = true;
-    using opt_int = std::optional<int>;
-    m.def("double_or_zero", [](const opt_int& x) -> int {
-        return x.value_or(0) * 2;
-    });
-    m.def("half_or_none", [](int x) -> opt_int {
-        return x ? opt_int(x / 2) : opt_int();
-    });
-    m.def("test_nullopt", [](opt_int x) {
-        return x.value_or(42);
-    }, py::arg_v("x", std::nullopt, "None"));
-#endif
-
-#ifdef PYBIND11_HAS_EXP_OPTIONAL
-    has_exp_optional = true;
-    using exp_opt_int = std::experimental::optional<int>;
-    m.def("double_or_zero_exp", [](const exp_opt_int& x) -> int {
-        return x.value_or(0) * 2;
-    });
-    m.def("half_or_none_exp", [](int x) -> exp_opt_int {
-        return x ? exp_opt_int(x / 2) : exp_opt_int();
-    });
-    m.def("test_nullopt_exp", [](exp_opt_int x) {
-        return x.value_or(42);
-    }, py::arg_v("x", std::experimental::nullopt, "None"));
-#endif
-
-    m.attr("has_optional") = has_optional;
-    m.attr("has_exp_optional") = has_exp_optional;
-
-    m.def("test_default_constructors", []() {
-        return py::dict(
-            "str"_a=py::str(),
-            "bool"_a=py::bool_(),
-            "int"_a=py::int_(),
-            "float"_a=py::float_(),
-            "tuple"_a=py::tuple(),
-            "list"_a=py::list(),
-            "dict"_a=py::dict(),
-            "set"_a=py::set()
-        );
-    });
-
-    m.def("test_converting_constructors", [](py::dict d) {
-        return py::dict(
-            "str"_a=py::str(d["str"]),
-            "bool"_a=py::bool_(d["bool"]),
-            "int"_a=py::int_(d["int"]),
-            "float"_a=py::float_(d["float"]),
-            "tuple"_a=py::tuple(d["tuple"]),
-            "list"_a=py::list(d["list"]),
-            "dict"_a=py::dict(d["dict"]),
-            "set"_a=py::set(d["set"]),
-            "memoryview"_a=py::memoryview(d["memoryview"])
-        );
-    });
-
-    m.def("test_cast_functions", [](py::dict d) {
-        // When converting between Python types, obj.cast<T>() should be the same as T(obj)
-        return py::dict(
-            "str"_a=d["str"].cast<py::str>(),
-            "bool"_a=d["bool"].cast<py::bool_>(),
-            "int"_a=d["int"].cast<py::int_>(),
-            "float"_a=d["float"].cast<py::float_>(),
-            "tuple"_a=d["tuple"].cast<py::tuple>(),
-            "list"_a=d["list"].cast<py::list>(),
-            "dict"_a=d["dict"].cast<py::dict>(),
-            "set"_a=d["set"].cast<py::set>(),
-            "memoryview"_a=d["memoryview"].cast<py::memoryview>()
-        );
-    });
-
-    py::class_<MoveOutContainer::Value>(m, "MoveOutContainerValue")
-        .def_readonly("value", &MoveOutContainer::Value::value);
-
-    py::class_<MoveOutContainer>(m, "MoveOutContainer")
-        .def(py::init<>())
-        .def_property_readonly("move_list", &MoveOutContainer::move_list);
-
-    m.def("get_implicit_casting", []() {
-        py::dict d;
-        d["char*_i1"] = "abc";
-        const char *c2 = "abc";
-        d["char*_i2"] = c2;
-        d["char*_e"] = py::cast(c2);
-        d["char*_p"] = py::str(c2);
-
-        d["int_i1"] = 42;
-        int i = 42;
-        d["int_i2"] = i;
-        i++;
-        d["int_e"] = py::cast(i);
-        i++;
-        d["int_p"] = py::int_(i);
-
-        d["str_i1"] = std::string("str");
-        std::string s2("str1");
-        d["str_i2"] = s2;
-        s2[3] = '2';
-        d["str_e"] = py::cast(s2);
-        s2[3] = '3';
-        d["str_p"] = py::str(s2);
-
-        py::list l(2);
-        l[0] = 3;
-        l[1] = py::cast(6);
-        l.append(9);
-        l.append(py::cast(12));
-        l.append(py::int_(15));
-
-        return py::dict(
-            "d"_a=d,
-            "l"_a=l
-        );
-    });
-});
diff --git a/pybind11/tests/test_python_types.py b/pybind11/tests/test_python_types.py
deleted file mode 100644
index cb28e1ff1..000000000
--- a/pybind11/tests/test_python_types.py
+++ /dev/null
@@ -1,412 +0,0 @@
-import pytest
-
-from pybind11_tests import ExamplePythonTypes, ConstructorStats, has_optional, has_exp_optional
-
-
-def test_repr():
-    # In Python 3.3+, repr() accesses __qualname__
-    assert "ExamplePythonTypes__Meta" in repr(type(ExamplePythonTypes))
-    assert "ExamplePythonTypes" in repr(ExamplePythonTypes)
-
-
-def test_static():
-    ExamplePythonTypes.value = 15
-    assert ExamplePythonTypes.value == 15
-    assert ExamplePythonTypes.value2 == 5
-
-    with pytest.raises(AttributeError) as excinfo:
-        ExamplePythonTypes.value2 = 15
-    assert str(excinfo.value) == "can't set attribute"
-
-
-def test_instance(capture):
-    with pytest.raises(TypeError) as excinfo:
-        ExamplePythonTypes()
-    assert str(excinfo.value) == "pybind11_tests.ExamplePythonTypes: No constructor defined!"
-
-    instance = ExamplePythonTypes.new_instance()
-
-    with capture:
-        dict_result = instance.get_dict()
-        dict_result['key2'] = 'value2'
-        instance.print_dict(dict_result)
-    assert capture.unordered == """
-        key: key, value=value
-        key: key2, value=value2
-    """
-    with capture:
-        dict_result = instance.get_dict_2()
-        dict_result['key2'] = 'value2'
-        instance.print_dict_2(dict_result)
-    assert capture.unordered == """
-        key: key, value=value
-        key: key2, value=value2
-    """
-    with capture:
-        set_result = instance.get_set()
-        set_result.add('key4')
-        instance.print_set(set_result)
-    assert capture.unordered == """
-        key: key1
-        key: key2
-        key: key3
-        key: key4
-    """
-    with capture:
-        set_result = instance.get_set2()
-        set_result.add('key3')
-        instance.print_set_2(set_result)
-    assert capture.unordered == """
-        key: key1
-        key: key2
-        key: key3
-    """
-    with capture:
-        list_result = instance.get_list()
-        list_result.append('value2')
-        instance.print_list(list_result)
-    assert capture.unordered == """
-        Entry at position 0: value
-        list item 0: overwritten
-        list item 1: value2
-    """
-    with capture:
-        list_result = instance.get_list_2()
-        list_result.append('value2')
-        instance.print_list_2(list_result)
-    assert capture.unordered == """
-        list item 0: value
-        list item 1: value2
-    """
-    with capture:
-        list_result = instance.get_list_2()
-        list_result.append('value2')
-        instance.print_list_2(tuple(list_result))
-    assert capture.unordered == """
-        list item 0: value
-        list item 1: value2
-    """
-    array_result = instance.get_array()
-    assert array_result == ['array entry 1', 'array entry 2']
-    with capture:
-        instance.print_array(array_result)
-    assert capture.unordered == """
-        array item 0: array entry 1
-        array item 1: array entry 2
-    """
-    varray_result = instance.get_valarray()
-    assert varray_result == [1, 4, 9]
-    with capture:
-        instance.print_valarray(varray_result)
-    assert capture.unordered == """
-        valarray item 0: 1
-        valarray item 1: 4
-        valarray item 2: 9
-    """
-    with pytest.raises(RuntimeError) as excinfo:
-        instance.throw_exception()
-    assert str(excinfo.value) == "This exception was intentionally thrown."
-
-    assert instance.pair_passthrough((True, "test")) == ("test", True)
-    assert instance.tuple_passthrough((True, "test", 5)) == (5, "test", True)
-    # Any sequence can be cast to a std::pair or std::tuple
-    assert instance.pair_passthrough([True, "test"]) == ("test", True)
-    assert instance.tuple_passthrough([True, "test", 5]) == (5, "test", True)
-
-    assert instance.get_bytes_from_string().decode() == "foo"
-    assert instance.get_bytes_from_str().decode() == "bar"
-    assert instance.get_str_from_string().encode().decode() == "baz"
-    assert instance.get_str_from_bytes().encode().decode() == "boo"
-
-    class A(object):
-        def __str__(self):
-            return "this is a str"
-
-        def __repr__(self):
-            return "this is a repr"
-
-    with capture:
-        instance.test_print(A())
-    assert capture == """
-        this is a str
-        this is a repr
-    """
-
-    cstats = ConstructorStats.get(ExamplePythonTypes)
-    assert cstats.alive() == 1
-    del instance
-    assert cstats.alive() == 0
-
-
-# PyPy does not seem to propagate the tp_docs field at the moment
-def test_class_docs(doc):
-    assert doc(ExamplePythonTypes) == "Example 2 documentation"
-
-
-def test_method_docs(doc):
-    assert doc(ExamplePythonTypes.get_dict) == """
-        get_dict(self: m.ExamplePythonTypes) -> dict
-
-        Return a Python dictionary
-    """
-    assert doc(ExamplePythonTypes.get_dict_2) == """
-        get_dict_2(self: m.ExamplePythonTypes) -> Dict[str, str]
-
-        Return a C++ dictionary
-    """
-    assert doc(ExamplePythonTypes.get_list) == """
-        get_list(self: m.ExamplePythonTypes) -> list
-
-        Return a Python list
-    """
-    assert doc(ExamplePythonTypes.get_list_2) == """
-        get_list_2(self: m.ExamplePythonTypes) -> List[str]
-
-        Return a C++ list
-    """
-    assert doc(ExamplePythonTypes.get_dict) == """
-        get_dict(self: m.ExamplePythonTypes) -> dict
-
-        Return a Python dictionary
-    """
-    assert doc(ExamplePythonTypes.get_set) == """
-        get_set(self: m.ExamplePythonTypes) -> set
-
-        Return a Python set
-    """
-    assert doc(ExamplePythonTypes.get_set2) == """
-        get_set2(self: m.ExamplePythonTypes) -> Set[str]
-
-        Return a C++ set
-    """
-    assert doc(ExamplePythonTypes.get_array) == """
-        get_array(self: m.ExamplePythonTypes) -> List[str[2]]
-
-        Return a C++ array
-    """
-    assert doc(ExamplePythonTypes.get_valarray) == """
-        get_valarray(self: m.ExamplePythonTypes) -> List[int]
-
-        Return a C++ valarray
-    """
-    assert doc(ExamplePythonTypes.print_dict) == """
-        print_dict(self: m.ExamplePythonTypes, arg0: dict) -> None
-
-        Print entries of a Python dictionary
-    """
-    assert doc(ExamplePythonTypes.print_dict_2) == """
-        print_dict_2(self: m.ExamplePythonTypes, arg0: Dict[str, str]) -> None
-
-        Print entries of a C++ dictionary
-    """
-    assert doc(ExamplePythonTypes.print_set) == """
-        print_set(self: m.ExamplePythonTypes, arg0: set) -> None
-
-        Print entries of a Python set
-    """
-    assert doc(ExamplePythonTypes.print_set_2) == """
-        print_set_2(self: m.ExamplePythonTypes, arg0: Set[str]) -> None
-
-        Print entries of a C++ set
-    """
-    assert doc(ExamplePythonTypes.print_list) == """
-        print_list(self: m.ExamplePythonTypes, arg0: list) -> None
-
-        Print entries of a Python list
-    """
-    assert doc(ExamplePythonTypes.print_list_2) == """
-        print_list_2(self: m.ExamplePythonTypes, arg0: List[str]) -> None
-
-        Print entries of a C++ list
-    """
-    assert doc(ExamplePythonTypes.print_array) == """
-        print_array(self: m.ExamplePythonTypes, arg0: List[str[2]]) -> None
-
-        Print entries of a C++ array
-    """
-    assert doc(ExamplePythonTypes.pair_passthrough) == """
-        pair_passthrough(self: m.ExamplePythonTypes, arg0: Tuple[bool, str]) -> Tuple[str, bool]
-
-        Return a pair in reversed order
-    """
-    assert doc(ExamplePythonTypes.tuple_passthrough) == """
-        tuple_passthrough(self: m.ExamplePythonTypes, arg0: Tuple[bool, str, int]) -> Tuple[int, str, bool]
-
-        Return a triple in reversed order
-    """  # noqa: E501 line too long
-    assert doc(ExamplePythonTypes.throw_exception) == """
-        throw_exception(self: m.ExamplePythonTypes) -> None
-
-        Throw an exception
-    """
-    assert doc(ExamplePythonTypes.new_instance) == """
-        new_instance() -> m.ExamplePythonTypes
-
-        Return an instance
-    """
-
-
-def test_module():
-    import pybind11_tests
-
-    assert pybind11_tests.__name__ == "pybind11_tests"
-    assert ExamplePythonTypes.__name__ == "ExamplePythonTypes"
-    assert ExamplePythonTypes.__module__ == "pybind11_tests"
-    assert ExamplePythonTypes.get_set.__name__ == "get_set"
-    assert ExamplePythonTypes.get_set.__module__ == "pybind11_tests"
-
-
-def test_print(capture):
-    from pybind11_tests import test_print_function
-
-    with capture:
-        test_print_function()
-    assert capture == """
-        Hello, World!
-        1 2.0 three True -- multiple args
-        *args-and-a-custom-separator
-        no new line here -- next print
-        flush
-        py::print + str.format = this
-    """
-    assert capture.stderr == "this goes to stderr"
-
-
-def test_str_api():
-    from pybind11_tests import test_str_format
-
-    s1, s2 = test_str_format()
-    assert s1 == "1 + 2 = 3"
-    assert s1 == s2
-
-
-def test_dict_api():
-    from pybind11_tests import test_dict_keyword_constructor
-
-    assert test_dict_keyword_constructor() == {"x": 1, "y": 2, "z": 3}
-
-
-def test_accessors():
-    from pybind11_tests import test_accessor_api, test_tuple_accessor, test_accessor_assignment
-
-    class SubTestObject:
-        attr_obj = 1
-        attr_char = 2
-
-    class TestObject:
-        basic_attr = 1
-        begin_end = [1, 2, 3]
-        d = {"operator[object]": 1, "operator[char *]": 2}
-        sub = SubTestObject()
-
-        def func(self, x, *args):
-            return self.basic_attr + x + sum(args)
-
-    d = test_accessor_api(TestObject())
-    assert d["basic_attr"] == 1
-    assert d["begin_end"] == [1, 2, 3]
-    assert d["operator[object]"] == 1
-    assert d["operator[char *]"] == 2
-    assert d["attr(object)"] == 1
-    assert d["attr(char *)"] == 2
-    assert d["missing_attr_ptr"] == "raised"
-    assert d["missing_attr_chain"] == "raised"
-    assert d["is_none"] is False
-    assert d["operator()"] == 2
-    assert d["operator*"] == 7
-
-    assert test_tuple_accessor(tuple()) == (0, 1, 2)
-
-    d = test_accessor_assignment()
-    assert d["get"] == 0
-    assert d["deferred_get"] == 0
-    assert d["set"] == 1
-    assert d["deferred_set"] == 1
-    assert d["var"] == 99
-
-
-@pytest.mark.skipif(not has_optional, reason='no <optional>')
-def test_optional():
-    from pybind11_tests import double_or_zero, half_or_none, test_nullopt
-
-    assert double_or_zero(None) == 0
-    assert double_or_zero(42) == 84
-    pytest.raises(TypeError, double_or_zero, 'foo')
-
-    assert half_or_none(0) is None
-    assert half_or_none(42) == 21
-    pytest.raises(TypeError, half_or_none, 'foo')
-
-    assert test_nullopt() == 42
-    assert test_nullopt(None) == 42
-    assert test_nullopt(42) == 42
-    assert test_nullopt(43) == 43
-
-
-@pytest.mark.skipif(not has_exp_optional, reason='no <experimental/optional>')
-def test_exp_optional():
-    from pybind11_tests import double_or_zero_exp, half_or_none_exp, test_nullopt_exp
-
-    assert double_or_zero_exp(None) == 0
-    assert double_or_zero_exp(42) == 84
-    pytest.raises(TypeError, double_or_zero_exp, 'foo')
-
-    assert half_or_none_exp(0) is None
-    assert half_or_none_exp(42) == 21
-    pytest.raises(TypeError, half_or_none_exp, 'foo')
-
-    assert test_nullopt_exp() == 42
-    assert test_nullopt_exp(None) == 42
-    assert test_nullopt_exp(42) == 42
-    assert test_nullopt_exp(43) == 43
-
-
-def test_constructors():
-    """C++ default and converting constructors are equivalent to type calls in Python"""
-    from pybind11_tests import (test_default_constructors, test_converting_constructors,
-                                test_cast_functions)
-
-    types = [str, bool, int, float, tuple, list, dict, set]
-    expected = {t.__name__: t() for t in types}
-    assert test_default_constructors() == expected
-
-    data = {
-        str: 42,
-        bool: "Not empty",
-        int: "42",
-        float: "+1e3",
-        tuple: range(3),
-        list: range(3),
-        dict: [("two", 2), ("one", 1), ("three", 3)],
-        set: [4, 4, 5, 6, 6, 6],
-        memoryview: b'abc'
-    }
-    inputs = {k.__name__: v for k, v in data.items()}
-    expected = {k.__name__: k(v) for k, v in data.items()}
-    assert test_converting_constructors(inputs) == expected
-    assert test_cast_functions(inputs) == expected
-
-
-def test_move_out_container():
-    """Properties use the `reference_internal` policy by default. If the underlying function
-    returns an rvalue, the policy is automatically changed to `move` to avoid referencing
-    a temporary. In case the return value is a container of user-defined types, the policy
-    also needs to be applied to the elements, not just the container."""
-    from pybind11_tests import MoveOutContainer
-
-    c = MoveOutContainer()
-    moved_out_list = c.move_list
-    assert [x.value for x in moved_out_list] == [0, 1, 2]
-
-
-def test_implicit_casting():
-    """Tests implicit casting when assigning or appending to dicts and lists."""
-    from pybind11_tests import get_implicit_casting
-
-    z = get_implicit_casting()
-    assert z['d'] == {
-        'char*_i1': 'abc', 'char*_i2': 'abc', 'char*_e': 'abc', 'char*_p': 'abc',
-        'str_i1': 'str', 'str_i2': 'str1', 'str_e': 'str2', 'str_p': 'str3',
-        'int_i1': 42, 'int_i2': 42, 'int_e': 43, 'int_p': 44
-    }
-    assert z['l'] == [3, 6, 9, 12, 15]
diff --git a/pybind11/tests/test_pytypes.cpp b/pybind11/tests/test_pytypes.cpp
new file mode 100644
index 000000000..a962f0ccc
--- /dev/null
+++ b/pybind11/tests/test_pytypes.cpp
@@ -0,0 +1,272 @@
+/*
+    tests/test_pytypes.cpp -- Python type casters
+
+    Copyright (c) 2017 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+
+
+TEST_SUBMODULE(pytypes, m) {
+    // test_list
+    m.def("get_list", []() {
+        py::list list;
+        list.append("value");
+        py::print("Entry at position 0:", list[0]);
+        list[0] = py::str("overwritten");
+        return list;
+    });
+    m.def("print_list", [](py::list list) {
+        int index = 0;
+        for (auto item : list)
+            py::print("list item {}: {}"_s.format(index++, item));
+    });
+
+    // test_set
+    m.def("get_set", []() {
+        py::set set;
+        set.add(py::str("key1"));
+        set.add("key2");
+        set.add(std::string("key3"));
+        return set;
+    });
+    m.def("print_set", [](py::set set) {
+        for (auto item : set)
+            py::print("key:", item);
+    });
+
+    // test_dict
+    m.def("get_dict", []() { return py::dict("key"_a="value"); });
+    m.def("print_dict", [](py::dict dict) {
+        for (auto item : dict)
+            py::print("key: {}, value={}"_s.format(item.first, item.second));
+    });
+    m.def("dict_keyword_constructor", []() {
+        auto d1 = py::dict("x"_a=1, "y"_a=2);
+        auto d2 = py::dict("z"_a=3, **d1);
+        return d2;
+    });
+
+    // test_str
+    m.def("str_from_string", []() { return py::str(std::string("baz")); });
+    m.def("str_from_bytes", []() { return py::str(py::bytes("boo", 3)); });
+    m.def("str_from_object", [](const py::object& obj) { return py::str(obj); });
+    m.def("repr_from_object", [](const py::object& obj) { return py::repr(obj); });
+
+    m.def("str_format", []() {
+        auto s1 = "{} + {} = {}"_s.format(1, 2, 3);
+        auto s2 = "{a} + {b} = {c}"_s.format("a"_a=1, "b"_a=2, "c"_a=3);
+        return py::make_tuple(s1, s2);
+    });
+
+    // test_bytes
+    m.def("bytes_from_string", []() { return py::bytes(std::string("foo")); });
+    m.def("bytes_from_str", []() { return py::bytes(py::str("bar", 3)); });
+
+    // test_capsule
+    m.def("return_capsule_with_destructor", []() {
+        py::print("creating capsule");
+        return py::capsule([]() {
+            py::print("destructing capsule");
+        });
+    });
+
+    m.def("return_capsule_with_destructor_2", []() {
+        py::print("creating capsule");
+        return py::capsule((void *) 1234, [](void *ptr) {
+            py::print("destructing capsule: {}"_s.format((size_t) ptr));
+        });
+    });
+
+    m.def("return_capsule_with_name_and_destructor", []() {
+        auto capsule = py::capsule((void *) 1234, "pointer type description", [](PyObject *ptr) {
+            if (ptr) {
+                auto name = PyCapsule_GetName(ptr);
+                py::print("destructing capsule ({}, '{}')"_s.format(
+                    (size_t) PyCapsule_GetPointer(ptr, name), name
+                ));
+            }
+        });
+        void *contents = capsule;
+        py::print("created capsule ({}, '{}')"_s.format((size_t) contents, capsule.name()));
+        return capsule;
+    });
+
+    // test_accessors
+    m.def("accessor_api", [](py::object o) {
+        auto d = py::dict();
+
+        d["basic_attr"] = o.attr("basic_attr");
+
+        auto l = py::list();
+        for (const auto &item : o.attr("begin_end")) {
+            l.append(item);
+        }
+        d["begin_end"] = l;
+
+        d["operator[object]"] = o.attr("d")["operator[object]"_s];
+        d["operator[char *]"] = o.attr("d")["operator[char *]"];
+
+        d["attr(object)"] = o.attr("sub").attr("attr_obj");
+        d["attr(char *)"] = o.attr("sub").attr("attr_char");
+        try {
+            o.attr("sub").attr("missing").ptr();
+        } catch (const py::error_already_set &) {
+            d["missing_attr_ptr"] = "raised"_s;
+        }
+        try {
+            o.attr("missing").attr("doesn't matter");
+        } catch (const py::error_already_set &) {
+            d["missing_attr_chain"] = "raised"_s;
+        }
+
+        d["is_none"] = o.attr("basic_attr").is_none();
+
+        d["operator()"] = o.attr("func")(1);
+        d["operator*"] = o.attr("func")(*o.attr("begin_end"));
+
+        // Test implicit conversion
+        py::list implicit_list = o.attr("begin_end");
+        d["implicit_list"] = implicit_list;
+        py::dict implicit_dict = o.attr("__dict__");
+        d["implicit_dict"] = implicit_dict;
+
+        return d;
+    });
+
+    m.def("tuple_accessor", [](py::tuple existing_t) {
+        try {
+            existing_t[0] = 1;
+        } catch (const py::error_already_set &) {
+            // --> Python system error
+            // Only new tuples (refcount == 1) are mutable
+            auto new_t = py::tuple(3);
+            for (size_t i = 0; i < new_t.size(); ++i) {
+                new_t[i] = i;
+            }
+            return new_t;
+        }
+        return py::tuple();
+    });
+
+    m.def("accessor_assignment", []() {
+        auto l = py::list(1);
+        l[0] = 0;
+
+        auto d = py::dict();
+        d["get"] = l[0];
+        auto var = l[0];
+        d["deferred_get"] = var;
+        l[0] = 1;
+        d["set"] = l[0];
+        var = 99; // this assignment should not overwrite l[0]
+        d["deferred_set"] = l[0];
+        d["var"] = var;
+
+        return d;
+    });
+
+    // test_constructors
+    m.def("default_constructors", []() {
+        return py::dict(
+            "str"_a=py::str(),
+            "bool"_a=py::bool_(),
+            "int"_a=py::int_(),
+            "float"_a=py::float_(),
+            "tuple"_a=py::tuple(),
+            "list"_a=py::list(),
+            "dict"_a=py::dict(),
+            "set"_a=py::set()
+        );
+    });
+
+    m.def("converting_constructors", [](py::dict d) {
+        return py::dict(
+            "str"_a=py::str(d["str"]),
+            "bool"_a=py::bool_(d["bool"]),
+            "int"_a=py::int_(d["int"]),
+            "float"_a=py::float_(d["float"]),
+            "tuple"_a=py::tuple(d["tuple"]),
+            "list"_a=py::list(d["list"]),
+            "dict"_a=py::dict(d["dict"]),
+            "set"_a=py::set(d["set"]),
+            "memoryview"_a=py::memoryview(d["memoryview"])
+        );
+    });
+
+    m.def("cast_functions", [](py::dict d) {
+        // When converting between Python types, obj.cast<T>() should be the same as T(obj)
+        return py::dict(
+            "str"_a=d["str"].cast<py::str>(),
+            "bool"_a=d["bool"].cast<py::bool_>(),
+            "int"_a=d["int"].cast<py::int_>(),
+            "float"_a=d["float"].cast<py::float_>(),
+            "tuple"_a=d["tuple"].cast<py::tuple>(),
+            "list"_a=d["list"].cast<py::list>(),
+            "dict"_a=d["dict"].cast<py::dict>(),
+            "set"_a=d["set"].cast<py::set>(),
+            "memoryview"_a=d["memoryview"].cast<py::memoryview>()
+        );
+    });
+
+    m.def("get_implicit_casting", []() {
+        py::dict d;
+        d["char*_i1"] = "abc";
+        const char *c2 = "abc";
+        d["char*_i2"] = c2;
+        d["char*_e"] = py::cast(c2);
+        d["char*_p"] = py::str(c2);
+
+        d["int_i1"] = 42;
+        int i = 42;
+        d["int_i2"] = i;
+        i++;
+        d["int_e"] = py::cast(i);
+        i++;
+        d["int_p"] = py::int_(i);
+
+        d["str_i1"] = std::string("str");
+        std::string s2("str1");
+        d["str_i2"] = s2;
+        s2[3] = '2';
+        d["str_e"] = py::cast(s2);
+        s2[3] = '3';
+        d["str_p"] = py::str(s2);
+
+        py::list l(2);
+        l[0] = 3;
+        l[1] = py::cast(6);
+        l.append(9);
+        l.append(py::cast(12));
+        l.append(py::int_(15));
+
+        return py::dict(
+            "d"_a=d,
+            "l"_a=l
+        );
+    });
+
+    // test_print
+    m.def("print_function", []() {
+        py::print("Hello, World!");
+        py::print(1, 2.0, "three", true, std::string("-- multiple args"));
+        auto args = py::make_tuple("and", "a", "custom", "separator");
+        py::print("*args", *args, "sep"_a="-");
+        py::print("no new line here", "end"_a=" -- ");
+        py::print("next print");
+
+        auto py_stderr = py::module::import("sys").attr("stderr");
+        py::print("this goes to stderr", "file"_a=py_stderr);
+
+        py::print("flush", "flush"_a=true);
+
+        py::print("{a} + {b} = {c}"_s.format("a"_a="py::print", "b"_a="str.format", "c"_a="this"));
+    });
+
+    m.def("print_failure", []() { py::print(42, UnregisteredType()); });
+
+    m.def("hash_function", [](py::object obj) { return py::hash(obj); });
+}
diff --git a/pybind11/tests/test_pytypes.py b/pybind11/tests/test_pytypes.py
new file mode 100644
index 000000000..94c90a909
--- /dev/null
+++ b/pybind11/tests/test_pytypes.py
@@ -0,0 +1,240 @@
+import pytest
+import sys
+
+from pybind11_tests import pytypes as m
+from pybind11_tests import debug_enabled
+
+
+def test_list(capture, doc):
+    with capture:
+        l = m.get_list()
+        assert l == ["overwritten"]
+
+        l.append("value2")
+        m.print_list(l)
+    assert capture.unordered == """
+        Entry at position 0: value
+        list item 0: overwritten
+        list item 1: value2
+    """
+
+    assert doc(m.get_list) == "get_list() -> list"
+    assert doc(m.print_list) == "print_list(arg0: list) -> None"
+
+
+def test_set(capture, doc):
+    s = m.get_set()
+    assert s == {"key1", "key2", "key3"}
+
+    with capture:
+        s.add("key4")
+        m.print_set(s)
+    assert capture.unordered == """
+        key: key1
+        key: key2
+        key: key3
+        key: key4
+    """
+
+    assert doc(m.get_set) == "get_set() -> set"
+    assert doc(m.print_set) == "print_set(arg0: set) -> None"
+
+
+def test_dict(capture, doc):
+    d = m.get_dict()
+    assert d == {"key": "value"}
+
+    with capture:
+        d["key2"] = "value2"
+        m.print_dict(d)
+    assert capture.unordered == """
+        key: key, value=value
+        key: key2, value=value2
+    """
+
+    assert doc(m.get_dict) == "get_dict() -> dict"
+    assert doc(m.print_dict) == "print_dict(arg0: dict) -> None"
+
+    assert m.dict_keyword_constructor() == {"x": 1, "y": 2, "z": 3}
+
+
+def test_str(doc):
+    assert m.str_from_string().encode().decode() == "baz"
+    assert m.str_from_bytes().encode().decode() == "boo"
+
+    assert doc(m.str_from_bytes) == "str_from_bytes() -> str"
+
+    class A(object):
+        def __str__(self):
+            return "this is a str"
+
+        def __repr__(self):
+            return "this is a repr"
+
+    assert m.str_from_object(A()) == "this is a str"
+    assert m.repr_from_object(A()) == "this is a repr"
+
+    s1, s2 = m.str_format()
+    assert s1 == "1 + 2 = 3"
+    assert s1 == s2
+
+
+def test_bytes(doc):
+    assert m.bytes_from_string().decode() == "foo"
+    assert m.bytes_from_str().decode() == "bar"
+
+    assert doc(m.bytes_from_str) == "bytes_from_str() -> {}".format(
+        "bytes" if sys.version_info[0] == 3 else "str"
+    )
+
+
+def test_capsule(capture):
+    pytest.gc_collect()
+    with capture:
+        a = m.return_capsule_with_destructor()
+        del a
+        pytest.gc_collect()
+    assert capture.unordered == """
+        creating capsule
+        destructing capsule
+    """
+
+    with capture:
+        a = m.return_capsule_with_destructor_2()
+        del a
+        pytest.gc_collect()
+    assert capture.unordered == """
+        creating capsule
+        destructing capsule: 1234
+    """
+
+    with capture:
+        a = m.return_capsule_with_name_and_destructor()
+        del a
+        pytest.gc_collect()
+    assert capture.unordered == """
+        created capsule (1234, 'pointer type description')
+        destructing capsule (1234, 'pointer type description')
+    """
+
+
+def test_accessors():
+    class SubTestObject:
+        attr_obj = 1
+        attr_char = 2
+
+    class TestObject:
+        basic_attr = 1
+        begin_end = [1, 2, 3]
+        d = {"operator[object]": 1, "operator[char *]": 2}
+        sub = SubTestObject()
+
+        def func(self, x, *args):
+            return self.basic_attr + x + sum(args)
+
+    d = m.accessor_api(TestObject())
+    assert d["basic_attr"] == 1
+    assert d["begin_end"] == [1, 2, 3]
+    assert d["operator[object]"] == 1
+    assert d["operator[char *]"] == 2
+    assert d["attr(object)"] == 1
+    assert d["attr(char *)"] == 2
+    assert d["missing_attr_ptr"] == "raised"
+    assert d["missing_attr_chain"] == "raised"
+    assert d["is_none"] is False
+    assert d["operator()"] == 2
+    assert d["operator*"] == 7
+    assert d["implicit_list"] == [1, 2, 3]
+    assert all(x in TestObject.__dict__ for x in d["implicit_dict"])
+
+    assert m.tuple_accessor(tuple()) == (0, 1, 2)
+
+    d = m.accessor_assignment()
+    assert d["get"] == 0
+    assert d["deferred_get"] == 0
+    assert d["set"] == 1
+    assert d["deferred_set"] == 1
+    assert d["var"] == 99
+
+
+def test_constructors():
+    """C++ default and converting constructors are equivalent to type calls in Python"""
+    types = [str, bool, int, float, tuple, list, dict, set]
+    expected = {t.__name__: t() for t in types}
+    assert m.default_constructors() == expected
+
+    data = {
+        str: 42,
+        bool: "Not empty",
+        int: "42",
+        float: "+1e3",
+        tuple: range(3),
+        list: range(3),
+        dict: [("two", 2), ("one", 1), ("three", 3)],
+        set: [4, 4, 5, 6, 6, 6],
+        memoryview: b'abc'
+    }
+    inputs = {k.__name__: v for k, v in data.items()}
+    expected = {k.__name__: k(v) for k, v in data.items()}
+
+    assert m.converting_constructors(inputs) == expected
+    assert m.cast_functions(inputs) == expected
+
+    # Converting constructors and cast functions should just reference rather
+    # than copy when no conversion is needed:
+    noconv1 = m.converting_constructors(expected)
+    for k in noconv1:
+        assert noconv1[k] is expected[k]
+
+    noconv2 = m.cast_functions(expected)
+    for k in noconv2:
+        assert noconv2[k] is expected[k]
+
+
+def test_implicit_casting():
+    """Tests implicit casting when assigning or appending to dicts and lists."""
+    z = m.get_implicit_casting()
+    assert z['d'] == {
+        'char*_i1': 'abc', 'char*_i2': 'abc', 'char*_e': 'abc', 'char*_p': 'abc',
+        'str_i1': 'str', 'str_i2': 'str1', 'str_e': 'str2', 'str_p': 'str3',
+        'int_i1': 42, 'int_i2': 42, 'int_e': 43, 'int_p': 44
+    }
+    assert z['l'] == [3, 6, 9, 12, 15]
+
+
+def test_print(capture):
+    with capture:
+        m.print_function()
+    assert capture == """
+        Hello, World!
+        1 2.0 three True -- multiple args
+        *args-and-a-custom-separator
+        no new line here -- next print
+        flush
+        py::print + str.format = this
+    """
+    assert capture.stderr == "this goes to stderr"
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.print_failure()
+    assert str(excinfo.value) == "make_tuple(): unable to convert " + (
+        "argument of type 'UnregisteredType' to Python object"
+        if debug_enabled else
+        "arguments to Python object (compile in debug mode for details)"
+    )
+
+
+def test_hash():
+    class Hashable(object):
+        def __init__(self, value):
+            self.value = value
+
+        def __hash__(self):
+            return self.value
+
+    class Unhashable(object):
+        __hash__ = None
+
+    assert m.hash_function(Hashable(42)) == 42
+    with pytest.raises(TypeError):
+        m.hash_function(Unhashable())
diff --git a/pybind11/tests/test_sequences_and_iterators.cpp b/pybind11/tests/test_sequences_and_iterators.cpp
index 323b4bf00..a45521256 100644
--- a/pybind11/tests/test_sequences_and_iterators.cpp
+++ b/pybind11/tests/test_sequences_and_iterators.cpp
@@ -13,146 +13,6 @@
 #include <pybind11/operators.h>
 #include <pybind11/stl.h>
 
-class Sequence {
-public:
-    Sequence(size_t size) : m_size(size) {
-        print_created(this, "of size", m_size);
-        m_data = new float[size];
-        memset(m_data, 0, sizeof(float) * size);
-    }
-
-    Sequence(const std::vector<float> &value) : m_size(value.size()) {
-        print_created(this, "of size", m_size, "from std::vector");
-        m_data = new float[m_size];
-        memcpy(m_data, &value[0], sizeof(float) * m_size);
-    }
-
-    Sequence(const Sequence &s) : m_size(s.m_size) {
-        print_copy_created(this);
-        m_data = new float[m_size];
-        memcpy(m_data, s.m_data, sizeof(float)*m_size);
-    }
-
-    Sequence(Sequence &&s) : m_size(s.m_size), m_data(s.m_data) {
-        print_move_created(this);
-        s.m_size = 0;
-        s.m_data = nullptr;
-    }
-
-    ~Sequence() {
-        print_destroyed(this);
-        delete[] m_data;
-    }
-
-    Sequence &operator=(const Sequence &s) {
-        if (&s != this) {
-            delete[] m_data;
-            m_size = s.m_size;
-            m_data = new float[m_size];
-            memcpy(m_data, s.m_data, sizeof(float)*m_size);
-        }
-
-        print_copy_assigned(this);
-
-        return *this;
-    }
-
-    Sequence &operator=(Sequence &&s) {
-        if (&s != this) {
-            delete[] m_data;
-            m_size = s.m_size;
-            m_data = s.m_data;
-            s.m_size = 0;
-            s.m_data = nullptr;
-        }
-
-        print_move_assigned(this);
-
-        return *this;
-    }
-
-    bool operator==(const Sequence &s) const {
-        if (m_size != s.size())
-            return false;
-        for (size_t i=0; i<m_size; ++i)
-            if (m_data[i] != s[i])
-                return false;
-        return true;
-    }
-
-    bool operator!=(const Sequence &s) const {
-        return !operator==(s);
-    }
-
-    float operator[](size_t index) const {
-        return m_data[index];
-    }
-
-    float &operator[](size_t index) {
-        return m_data[index];
-    }
-
-    bool contains(float v) const {
-        for (size_t i=0; i<m_size; ++i)
-            if (v == m_data[i])
-                return true;
-        return false;
-    }
-
-    Sequence reversed() const {
-        Sequence result(m_size);
-        for (size_t i=0; i<m_size; ++i)
-            result[m_size-i-1] = m_data[i];
-        return result;
-    }
-
-    size_t size() const { return m_size; }
-
-    const float *begin() const { return m_data; }
-    const float *end() const { return m_data+m_size; }
-
-private:
-    size_t m_size;
-    float *m_data;
-};
-
-class IntPairs {
-public:
-    IntPairs(std::vector<std::pair<int, int>> data) : data_(std::move(data)) {}
-    const std::pair<int, int>* begin() const { return data_.data(); }
-
-private:
-    std::vector<std::pair<int, int>> data_;
-};
-
-// Interface of a map-like object that isn't (directly) an unordered_map, but provides some basic
-// map-like functionality.
-class StringMap {
-public:
-    StringMap() = default;
-    StringMap(std::unordered_map<std::string, std::string> init)
-        : map(std::move(init)) {}
-
-    void set(std::string key, std::string val) {
-        map[key] = val;
-    }
-
-    std::string get(std::string key) const {
-        return map.at(key);
-    }
-
-    size_t size() const {
-        return map.size();
-    }
-
-private:
-    std::unordered_map<std::string, std::string> map;
-
-public:
-    decltype(map.cbegin()) begin() const { return map.cbegin(); }
-    decltype(map.cend()) end() const { return map.cend(); }
-};
-
 template<typename T>
 class NonZeroIterator {
     const T* ptr_;
@@ -169,65 +29,205 @@ bool operator==(const NonZeroIterator<std::pair<A, B>>& it, const NonZeroSentine
     return !(*it).first || !(*it).second;
 }
 
-test_initializer sequences_and_iterators([](py::module &m) {
+template <typename PythonType>
+py::list test_random_access_iterator(PythonType x) {
+    if (x.size() < 5)
+        throw py::value_error("Please provide at least 5 elements for testing.");
 
-    py::class_<Sequence> seq(m, "Sequence");
+    auto checks = py::list();
+    auto assert_equal = [&checks](py::handle a, py::handle b) {
+        auto result = PyObject_RichCompareBool(a.ptr(), b.ptr(), Py_EQ);
+        if (result == -1) { throw py::error_already_set(); }
+        checks.append(result != 0);
+    };
+
+    auto it = x.begin();
+    assert_equal(x[0], *it);
+    assert_equal(x[0], it[0]);
+    assert_equal(x[1], it[1]);
+
+    assert_equal(x[1], *(++it));
+    assert_equal(x[1], *(it++));
+    assert_equal(x[2], *it);
+    assert_equal(x[3], *(it += 1));
+    assert_equal(x[2], *(--it));
+    assert_equal(x[2], *(it--));
+    assert_equal(x[1], *it);
+    assert_equal(x[0], *(it -= 1));
+
+    assert_equal(it->attr("real"), x[0].attr("real"));
+    assert_equal((it + 1)->attr("real"), x[1].attr("real"));
+
+    assert_equal(x[1], *(it + 1));
+    assert_equal(x[1], *(1 + it));
+    it += 3;
+    assert_equal(x[1], *(it - 2));
+
+    checks.append(static_cast<std::size_t>(x.end() - x.begin()) == x.size());
+    checks.append((x.begin() + static_cast<std::ptrdiff_t>(x.size())) == x.end());
+    checks.append(x.begin() < x.end());
+
+    return checks;
+}
 
-    seq.def(py::init<size_t>())
-       .def(py::init<const std::vector<float>&>())
-       /// Bare bones interface
-       .def("__getitem__", [](const Sequence &s, size_t i) {
-            if (i >= s.size())
-                throw py::index_error();
+TEST_SUBMODULE(sequences_and_iterators, m) {
+
+    // test_sequence
+    class Sequence {
+    public:
+        Sequence(size_t size) : m_size(size) {
+            print_created(this, "of size", m_size);
+            m_data = new float[size];
+            memset(m_data, 0, sizeof(float) * size);
+        }
+        Sequence(const std::vector<float> &value) : m_size(value.size()) { // deep-copies `value`
+            print_created(this, "of size", m_size, "from std::vector");
+            m_data = new float[m_size];
+            if (m_size != 0) memcpy(m_data, value.data(), sizeof(float) * m_size); // &value[0] is UB when empty
+        }
+        Sequence(const Sequence &s) : m_size(s.m_size) {
+            print_copy_created(this);
+            m_data = new float[m_size];
+            memcpy(m_data, s.m_data, sizeof(float)*m_size);
+        }
+        Sequence(Sequence &&s) : m_size(s.m_size), m_data(s.m_data) {
+            print_move_created(this);
+            s.m_size = 0;
+            s.m_data = nullptr;
+        }
+
+        ~Sequence() { print_destroyed(this); delete[] m_data; }
+
+        Sequence &operator=(const Sequence &s) { // copy-assign: deep-copies s's buffer
+            if (&s != this) {
+                float *fresh = new float[s.m_size]; // allocate first: if this throws, *this is untouched
+                if (s.m_size != 0) memcpy(fresh, s.m_data, sizeof(float) * s.m_size); // moved-from s has null m_data
+                delete[] m_data;
+                m_size = s.m_size; m_data = fresh;
+            }
+            print_copy_assigned(this);
+            return *this;
+        }
+
+        Sequence &operator=(Sequence &&s) {
+            if (&s != this) {
+                delete[] m_data;
+                m_size = s.m_size;
+                m_data = s.m_data;
+                s.m_size = 0;
+                s.m_data = nullptr;
+            }
+            print_move_assigned(this);
+            return *this;
+        }
+
+        bool operator==(const Sequence &s) const {
+            if (m_size != s.size()) return false;
+            for (size_t i = 0; i < m_size; ++i)
+                if (m_data[i] != s[i])
+                    return false;
+            return true;
+        }
+        bool operator!=(const Sequence &s) const { return !operator==(s); }
+
+        float operator[](size_t index) const { return m_data[index]; }
+        float &operator[](size_t index) { return m_data[index]; }
+
+        bool contains(float v) const {
+            for (size_t i = 0; i < m_size; ++i)
+                if (v == m_data[i])
+                    return true;
+            return false;
+        }
+
+        Sequence reversed() const {
+            Sequence result(m_size);
+            for (size_t i = 0; i < m_size; ++i)
+                result[m_size - i - 1] = m_data[i];
+            return result;
+        }
+
+        size_t size() const { return m_size; }
+
+        const float *begin() const { return m_data; }
+        const float *end() const { return m_data+m_size; }
+
+    private:
+        size_t m_size;
+        float *m_data;
+    };
+    py::class_<Sequence>(m, "Sequence")
+        .def(py::init<size_t>())
+        .def(py::init<const std::vector<float>&>())
+        /// Bare bones interface
+        .def("__getitem__", [](const Sequence &s, size_t i) {
+            if (i >= s.size()) throw py::index_error();
             return s[i];
         })
-       .def("__setitem__", [](Sequence &s, size_t i, float v) {
-            if (i >= s.size())
-                throw py::index_error();
+        .def("__setitem__", [](Sequence &s, size_t i, float v) {
+            if (i >= s.size()) throw py::index_error();
             s[i] = v;
         })
-       .def("__len__", &Sequence::size)
-       /// Optional sequence protocol operations
-       .def("__iter__", [](const Sequence &s) { return py::make_iterator(s.begin(), s.end()); },
-                        py::keep_alive<0, 1>() /* Essential: keep object alive while iterator exists */)
-       .def("__contains__", [](const Sequence &s, float v) { return s.contains(v); })
-       .def("__reversed__", [](const Sequence &s) -> Sequence { return s.reversed(); })
-       /// Slicing protocol (optional)
-       .def("__getitem__", [](const Sequence &s, py::slice slice) -> Sequence* {
+        .def("__len__", &Sequence::size)
+        /// Optional sequence protocol operations
+        .def("__iter__", [](const Sequence &s) { return py::make_iterator(s.begin(), s.end()); },
+                         py::keep_alive<0, 1>() /* Essential: keep object alive while iterator exists */)
+        .def("__contains__", [](const Sequence &s, float v) { return s.contains(v); })
+        .def("__reversed__", [](const Sequence &s) -> Sequence { return s.reversed(); })
+        /// Slicing protocol (optional)
+        .def("__getitem__", [](const Sequence &s, py::slice slice) -> Sequence* {
             size_t start, stop, step, slicelength;
             if (!slice.compute(s.size(), &start, &stop, &step, &slicelength))
                 throw py::error_already_set();
             Sequence *seq = new Sequence(slicelength);
-            for (size_t i=0; i<slicelength; ++i) {
+            for (size_t i = 0; i < slicelength; ++i) {
                 (*seq)[i] = s[start]; start += step;
             }
             return seq;
         })
-       .def("__setitem__", [](Sequence &s, py::slice slice, const Sequence &value) {
+        .def("__setitem__", [](Sequence &s, py::slice slice, const Sequence &value) {
             size_t start, stop, step, slicelength;
             if (!slice.compute(s.size(), &start, &stop, &step, &slicelength))
                 throw py::error_already_set();
             if (slicelength != value.size())
                 throw std::runtime_error("Left and right hand size of slice assignment have different sizes!");
-            for (size_t i=0; i<slicelength; ++i) {
+            for (size_t i = 0; i < slicelength; ++i) {
                 s[start] = value[i]; start += step;
             }
         })
-       /// Comparisons
-       .def(py::self == py::self)
-       .def(py::self != py::self);
-       // Could also define py::self + py::self for concatenation, etc.
-
-    py::class_<StringMap> map(m, "StringMap");
+        /// Comparisons
+        .def(py::self == py::self)
+        .def(py::self != py::self)
+        // Could also define py::self + py::self for concatenation, etc.
+        ;
 
-    map .def(py::init<>())
+    // test_map_iterator
+    // Interface of a map-like object that isn't (directly) an unordered_map, but provides some basic
+    // map-like functionality.
+    class StringMap {
+    public:
+        StringMap() = default;
+        StringMap(std::unordered_map<std::string, std::string> init)
+            : map(std::move(init)) {}
+
+        void set(std::string key, std::string val) { map[key] = val; }
+        std::string get(std::string key) const { return map.at(key); }
+        size_t size() const { return map.size(); }
+    private:
+        std::unordered_map<std::string, std::string> map;
+    public:
+        decltype(map.cbegin()) begin() const { return map.cbegin(); }
+        decltype(map.cend()) end() const { return map.cend(); }
+    };
+    py::class_<StringMap>(m, "StringMap")
+        .def(py::init<>())
         .def(py::init<std::unordered_map<std::string, std::string>>())
         .def("__getitem__", [](const StringMap &map, std::string key) {
                 try { return map.get(key); }
                 catch (const std::out_of_range&) {
                     throw py::key_error("key '" + key + "' does not exist");
                 }
-                })
+        })
         .def("__setitem__", &StringMap::set)
         .def("__len__", &StringMap::size)
         .def("__iter__", [](const StringMap &map) { return py::make_key_iterator(map.begin(), map.end()); },
@@ -236,14 +236,23 @@ test_initializer sequences_and_iterators([](py::module &m) {
                 py::keep_alive<0, 1>())
         ;
 
+    // test_generalized_iterators
+    class IntPairs {
+    public:
+        IntPairs(std::vector<std::pair<int, int>> data) : data_(std::move(data)) {}
+        const std::pair<int, int>* begin() const { return data_.data(); }
+    private:
+        std::vector<std::pair<int, int>> data_;
+    };
     py::class_<IntPairs>(m, "IntPairs")
         .def(py::init<std::vector<std::pair<int, int>>>())
         .def("nonzero", [](const IntPairs& s) {
                 return py::make_iterator(NonZeroIterator<std::pair<int, int>>(s.begin()), NonZeroSentinel());
-            }, py::keep_alive<0, 1>())
+        }, py::keep_alive<0, 1>())
         .def("nonzero_keys", [](const IntPairs& s) {
             return py::make_key_iterator(NonZeroIterator<std::pair<int, int>>(s.begin()), NonZeroSentinel());
-        }, py::keep_alive<0, 1>());
+        }, py::keep_alive<0, 1>())
+        ;
 
 
 #if 0
@@ -272,4 +281,54 @@ test_initializer sequences_and_iterators([](py::module &m) {
     On the actual Sequence object, the iterator would be constructed as follows:
     .def("__iter__", [](py::object s) { return PySequenceIterator(s.cast<const Sequence &>(), s); })
 #endif
-});
+
+    // test_python_iterator_in_cpp
+    m.def("object_to_list", [](py::object o) {
+        auto l = py::list();
+        for (auto item : o) {
+            l.append(item);
+        }
+        return l;
+    });
+
+    m.def("iterator_to_list", [](py::iterator it) {
+        auto l = py::list();
+        while (it != py::iterator::sentinel()) {
+            l.append(*it);
+            ++it;
+        }
+        return l;
+    });
+
+    // Make sure that py::iterator works with std algorithms
+    m.def("count_none", [](py::object o) {
+        return std::count_if(o.begin(), o.end(), [](py::handle h) { return h.is_none(); });
+    });
+
+    m.def("find_none", [](py::object o) {
+        auto it = std::find_if(o.begin(), o.end(), [](py::handle h) { return h.is_none(); });
+        return it->is_none();
+    });
+
+    m.def("count_nonzeros", [](py::dict d) {
+       return std::count_if(d.begin(), d.end(), [](std::pair<py::handle, py::handle> p) {
+           return p.second.cast<int>() != 0;
+       });
+    });
+
+    m.def("tuple_iterator", &test_random_access_iterator<py::tuple>);
+    m.def("list_iterator", &test_random_access_iterator<py::list>);
+    m.def("sequence_iterator", &test_random_access_iterator<py::sequence>);
+
+    // test_iterator_passthrough
+    // #181: iterator passthrough did not compile
+    m.def("iterator_passthrough", [](py::iterator s) -> py::iterator {
+        return py::make_iterator(std::begin(s), std::end(s));
+    });
+
+    // test_iterator_rvp
+    // #388: Can't make iterators via make_iterator() with different r/v policies
+    static std::vector<int> list = { 1, 2, 3 };
+    m.def("make_iterator_1", []() { return py::make_iterator<py::return_value_policy::copy>(list); });
+    m.def("make_iterator_2", []() { return py::make_iterator<py::return_value_policy::automatic>(list); });
+}
diff --git a/pybind11/tests/test_sequences_and_iterators.py b/pybind11/tests/test_sequences_and_iterators.py
index 76b9f43f6..640ca07bd 100644
--- a/pybind11/tests/test_sequences_and_iterators.py
+++ b/pybind11/tests/test_sequences_and_iterators.py
@@ -1,4 +1,6 @@
 import pytest
+from pybind11_tests import sequences_and_iterators as m
+from pybind11_tests import ConstructorStats
 
 
 def isclose(a, b, rel_tol=1e-05, abs_tol=0.0):
@@ -11,23 +13,30 @@ def allclose(a_list, b_list, rel_tol=1e-05, abs_tol=0.0):
 
 
 def test_generalized_iterators():
-    from pybind11_tests import IntPairs
+    assert list(m.IntPairs([(1, 2), (3, 4), (0, 5)]).nonzero()) == [(1, 2), (3, 4)]
+    assert list(m.IntPairs([(1, 2), (2, 0), (0, 3), (4, 5)]).nonzero()) == [(1, 2)]
+    assert list(m.IntPairs([(0, 3), (1, 2), (3, 4)]).nonzero()) == []
 
-    assert list(IntPairs([(1, 2), (3, 4), (0, 5)]).nonzero()) == [(1, 2), (3, 4)]
-    assert list(IntPairs([(1, 2), (2, 0), (0, 3), (4, 5)]).nonzero()) == [(1, 2)]
-    assert list(IntPairs([(0, 3), (1, 2), (3, 4)]).nonzero()) == []
+    assert list(m.IntPairs([(1, 2), (3, 4), (0, 5)]).nonzero_keys()) == [1, 3]
+    assert list(m.IntPairs([(1, 2), (2, 0), (0, 3), (4, 5)]).nonzero_keys()) == [1]
+    assert list(m.IntPairs([(0, 3), (1, 2), (3, 4)]).nonzero_keys()) == []
 
-    assert list(IntPairs([(1, 2), (3, 4), (0, 5)]).nonzero_keys()) == [1, 3]
-    assert list(IntPairs([(1, 2), (2, 0), (0, 3), (4, 5)]).nonzero_keys()) == [1]
-    assert list(IntPairs([(0, 3), (1, 2), (3, 4)]).nonzero_keys()) == []
+    # __next__ must continue to raise StopIteration
+    it = m.IntPairs([(0, 0)]).nonzero()
+    for _ in range(3):
+        with pytest.raises(StopIteration):
+            next(it)
 
+    it = m.IntPairs([(0, 0)]).nonzero_keys()
+    for _ in range(3):
+        with pytest.raises(StopIteration):
+            next(it)
 
-def test_sequence():
-    from pybind11_tests import Sequence, ConstructorStats
 
-    cstats = ConstructorStats.get(Sequence)
+def test_sequence():
+    cstats = ConstructorStats.get(m.Sequence)
 
-    s = Sequence(5)
+    s = m.Sequence(5)
     assert cstats.values() == ['of size', '5']
 
     assert "Sequence" in repr(s)
@@ -44,16 +53,24 @@ def test_sequence():
     rev2 = s[::-1]
     assert cstats.values() == ['of size', '5']
 
+    it = iter(m.Sequence(0))
+    for _ in range(3):  # __next__ must continue to raise StopIteration
+        with pytest.raises(StopIteration):
+            next(it)
+    assert cstats.values() == ['of size', '0']
+
     expected = [0, 56.78, 0, 0, 12.34]
     assert allclose(rev, expected)
     assert allclose(rev2, expected)
     assert rev == rev2
 
-    rev[0::2] = Sequence([2.0, 2.0, 2.0])
+    rev[0::2] = m.Sequence([2.0, 2.0, 2.0])
     assert cstats.values() == ['of size', '3', 'from std::vector']
 
     assert allclose(rev, [2, 56.78, 2, 0, 2])
 
+    assert cstats.alive() == 4
+    del it
     assert cstats.alive() == 3
     del s
     assert cstats.alive() == 2
@@ -71,20 +88,71 @@ def test_sequence():
 
 
 def test_map_iterator():
-    from pybind11_tests import StringMap
-
-    m = StringMap({'hi': 'bye', 'black': 'white'})
-    assert m['hi'] == 'bye'
-    assert len(m) == 2
-    assert m['black'] == 'white'
+    sm = m.StringMap({'hi': 'bye', 'black': 'white'})
+    assert sm['hi'] == 'bye'
+    assert len(sm) == 2
+    assert sm['black'] == 'white'
 
     with pytest.raises(KeyError):
-        assert m['orange']
-    m['orange'] = 'banana'
-    assert m['orange'] == 'banana'
+        assert sm['orange']
+    sm['orange'] = 'banana'
+    assert sm['orange'] == 'banana'
 
     expected = {'hi': 'bye', 'black': 'white', 'orange': 'banana'}
-    for k in m:
-        assert m[k] == expected[k]
-    for k, v in m.items():
+    for k in sm:
+        assert sm[k] == expected[k]
+    for k, v in sm.items():
         assert v == expected[k]
+
+    it = iter(m.StringMap({}))
+    for _ in range(3):  # __next__ must continue to raise StopIteration
+        with pytest.raises(StopIteration):
+            next(it)
+
+
+def test_python_iterator_in_cpp():
+    t = (1, 2, 3)  # accepted both as an iterable and, via iter(), as an iterator
+    assert m.object_to_list(t) == [1, 2, 3]
+    assert m.object_to_list(iter(t)) == [1, 2, 3]
+    assert m.iterator_to_list(iter(t)) == [1, 2, 3]
+
+    with pytest.raises(TypeError) as excinfo:
+        m.object_to_list(1)
+    assert "object is not iterable" in str(excinfo.value)
+
+    with pytest.raises(TypeError) as excinfo:
+        m.iterator_to_list(1)
+    assert "incompatible function arguments" in str(excinfo.value)
+
+    def bad_next_call():
+        raise RuntimeError("py::iterator::advance() should propagate errors")
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.iterator_to_list(iter(bad_next_call, None))
+    assert str(excinfo.value) == "py::iterator::advance() should propagate errors"
+
+    lst = [1, None, 0, None]  # named `lst`, not `l`, which pycodestyle flags as ambiguous (E741)
+    assert m.count_none(lst) == 2
+    assert m.find_none(lst) is True
+    assert m.count_nonzeros({"a": 0, "b": 1, "c": 2}) == 2
+
+    r = range(5)  # the C++ checks raise ValueError for fewer than 5 elements
+    assert all(m.tuple_iterator(tuple(r)))
+    assert all(m.list_iterator(list(r)))
+    assert all(m.sequence_iterator(r))
+
+
+def test_iterator_passthrough():
+    """#181: iterator passthrough did not compile"""
+    values = [3, 5, 7, 9, 11, 13, 15]
+    # The iterator handed back by C++ must yield exactly the items of the one passed in.
+    assert list(m.iterator_passthrough(iter(values))) == values
+
+
+def test_iterator_rvp():
+    """#388: Can't make iterators via make_iterator() with different r/v policies"""
+    assert list(m.make_iterator_1()) == [1, 2, 3]
+    assert list(m.make_iterator_2()) == [1, 2, 3]
+    # make_iterator_1 is bound with the `copy` policy and make_iterator_2 with `automatic`;
+    # the two must therefore surface as distinct Python iterator types.
+    assert not isinstance(m.make_iterator_1(), type(m.make_iterator_2()))
diff --git a/pybind11/tests/test_smart_ptr.cpp b/pybind11/tests/test_smart_ptr.cpp
index 4d1e77e32..dccb1e9be 100644
--- a/pybind11/tests/test_smart_ptr.cpp
+++ b/pybind11/tests/test_smart_ptr.cpp
@@ -8,210 +8,178 @@
     BSD-style license that can be found in the LICENSE file.
 */
 
+#if defined(_MSC_VER) && _MSC_VER < 1910
+#  pragma warning(disable: 4702) // unreachable code in system header
+#endif
+
 #include "pybind11_tests.h"
 #include "object.h"
 
-/// Custom object with builtin reference counting (see 'object.h' for the implementation)
-class MyObject1 : public Object {
-public:
-    MyObject1(int value) : value(value) {
-        print_created(this, toString());
-    }
-
-    std::string toString() const {
-        return "MyObject1[" + std::to_string(value) + "]";
-    }
-
-protected:
-    virtual ~MyObject1() {
-        print_destroyed(this);
-    }
-
-private:
-    int value;
-};
-
-/// Object managed by a std::shared_ptr<>
-class MyObject2 {
-public:
-    MyObject2(int value) : value(value) {
-        print_created(this, toString());
-    }
-
-    std::string toString() const {
-        return "MyObject2[" + std::to_string(value) + "]";
-    }
-
-    virtual ~MyObject2() {
-        print_destroyed(this);
-    }
-
-private:
-    int value;
-};
-
-/// Object managed by a std::shared_ptr<>, additionally derives from std::enable_shared_from_this<>
-class MyObject3 : public std::enable_shared_from_this<MyObject3> {
-public:
-    MyObject3(int value) : value(value) {
-        print_created(this, toString());
-    }
-
-    std::string toString() const {
-        return "MyObject3[" + std::to_string(value) + "]";
-    }
-
-    virtual ~MyObject3() {
-        print_destroyed(this);
-    }
-
-private:
-    int value;
-};
-
-class MyObject4 {
-public:
-    MyObject4(int value) : value{value} {
-        print_created(this);
-    }
-    int value;
-private:
-    ~MyObject4() {
-        print_destroyed(this);
-    }
-};
-
-/// Make pybind aware of the ref-counted wrapper type (s)
+// Make pybind11 aware of the ref-counted wrapper type(s):
 
 // ref<T> is a wrapper for 'Object' which uses intrusive reference counting
 // It is always possible to construct a ref<T> from an Object* pointer without
 // possible incosistencies, hence the 'true' argument at the end.
 PYBIND11_DECLARE_HOLDER_TYPE(T, ref<T>, true);
-PYBIND11_DECLARE_HOLDER_TYPE(T, std::shared_ptr<T>); // Not required any more for std::shared_ptr,
-                                                     // but it should compile without error
-
-Object *make_object_1() { return new MyObject1(1); }
-ref<Object> make_object_2() { return new MyObject1(2); }
-
-MyObject1 *make_myobject1_1() { return new MyObject1(4); }
-ref<MyObject1> make_myobject1_2() { return new MyObject1(5); }
+// Make pybind11 aware of the non-standard getter member function
+namespace pybind11 { namespace detail {
+    template <typename T>
+    struct holder_helper<ref<T>> {
+        static const T *get(const ref<T> &p) { return p.get_ptr(); }
+    };
+}}
 
-MyObject2 *make_myobject2_1() { return new MyObject2(6); }
-std::shared_ptr<MyObject2> make_myobject2_2() { return std::make_shared<MyObject2>(7); }
+// The following is not required anymore for std::shared_ptr, but it should compile without error:
+PYBIND11_DECLARE_HOLDER_TYPE(T, std::shared_ptr<T>);
 
-MyObject3 *make_myobject3_1() { return new MyObject3(8); }
-std::shared_ptr<MyObject3> make_myobject3_2() { return std::make_shared<MyObject3>(9); }
+// Thin wrapper around std::unique_ptr whose only purpose is its size: the unused `padding` array
+// deliberately bloats the holder so that it triggers the non-simple-layout internal instance layout
+// for single inheritance with a large holder type.
+template <typename T> class huge_unique_ptr {
+    std::unique_ptr<T> ptr;
+    uint64_t padding[10]; // never accessed; exists solely to enlarge the holder
+public:
+    huge_unique_ptr(T *p) : ptr(p) {} // takes ownership of p
+    T *get() const { return ptr.get(); } // const, like custom_unique_ptr::get() below
+};
+PYBIND11_DECLARE_HOLDER_TYPE(T, huge_unique_ptr<T>);
 
-void print_object_1(const Object *obj) { py::print(obj->toString()); }
-void print_object_2(ref<Object> obj) { py::print(obj->toString()); }
-void print_object_3(const ref<Object> &obj) { py::print(obj->toString()); }
-void print_object_4(const ref<Object> *obj) { py::print((*obj)->toString()); }
+// Simple custom holder that works like unique_ptr
+template <typename T>
+class custom_unique_ptr {
+    std::unique_ptr<T> impl;
+public:
+    custom_unique_ptr(T* p) : impl(p) { }
+    T* get() const { return impl.get(); }
+    T* release_ptr() { return impl.release(); }
+};
+PYBIND11_DECLARE_HOLDER_TYPE(T, custom_unique_ptr<T>);
 
-void print_myobject1_1(const MyObject1 *obj) { py::print(obj->toString()); }
-void print_myobject1_2(ref<MyObject1> obj) { py::print(obj->toString()); }
-void print_myobject1_3(const ref<MyObject1> &obj) { py::print(obj->toString()); }
-void print_myobject1_4(const ref<MyObject1> *obj) { py::print((*obj)->toString()); }
 
-void print_myobject2_1(const MyObject2 *obj) { py::print(obj->toString()); }
-void print_myobject2_2(std::shared_ptr<MyObject2> obj) { py::print(obj->toString()); }
-void print_myobject2_3(const std::shared_ptr<MyObject2> &obj) { py::print(obj->toString()); }
-void print_myobject2_4(const std::shared_ptr<MyObject2> *obj) { py::print((*obj)->toString()); }
+TEST_SUBMODULE(smart_ptr, m) {
 
-void print_myobject3_1(const MyObject3 *obj) { py::print(obj->toString()); }
-void print_myobject3_2(std::shared_ptr<MyObject3> obj) { py::print(obj->toString()); }
-void print_myobject3_3(const std::shared_ptr<MyObject3> &obj) { py::print(obj->toString()); }
-void print_myobject3_4(const std::shared_ptr<MyObject3> *obj) { py::print((*obj)->toString()); }
+    // test_smart_ptr
 
-test_initializer smart_ptr([](py::module &m) {
+    // Object implementation in `object.h`
     py::class_<Object, ref<Object>> obj(m, "Object");
     obj.def("getRefCount", &Object::getRefCount);
 
+    // Custom object with builtin reference counting (see 'object.h' for the implementation)
+    class MyObject1 : public Object {
+    public:
+        MyObject1(int value) : value(value) { print_created(this, toString()); }
+        std::string toString() const { return "MyObject1[" + std::to_string(value) + "]"; }
+    protected:
+        virtual ~MyObject1() { print_destroyed(this); }
+    private:
+        int value;
+    };
     py::class_<MyObject1, ref<MyObject1>>(m, "MyObject1", obj)
         .def(py::init<int>());
+    py::implicitly_convertible<py::int_, MyObject1>();
+
+    m.def("make_object_1", []() -> Object * { return new MyObject1(1); });
+    m.def("make_object_2", []() -> ref<Object> { return new MyObject1(2); });
+    m.def("make_myobject1_1", []() -> MyObject1 * { return new MyObject1(4); });
+    m.def("make_myobject1_2", []() -> ref<MyObject1> { return new MyObject1(5); });
+    m.def("print_object_1", [](const Object *obj) { py::print(obj->toString()); });
+    m.def("print_object_2", [](ref<Object> obj) { py::print(obj->toString()); });
+    m.def("print_object_3", [](const ref<Object> &obj) { py::print(obj->toString()); });
+    m.def("print_object_4", [](const ref<Object> *obj) { py::print((*obj)->toString()); });
+    m.def("print_myobject1_1", [](const MyObject1 *obj) { py::print(obj->toString()); });
+    m.def("print_myobject1_2", [](ref<MyObject1> obj) { py::print(obj->toString()); });
+    m.def("print_myobject1_3", [](const ref<MyObject1> &obj) { py::print(obj->toString()); });
+    m.def("print_myobject1_4", [](const ref<MyObject1> *obj) { py::print((*obj)->toString()); });
 
-    m.def("test_object1_refcounting",
-        []() -> bool {
-            ref<MyObject1> o = new MyObject1(0);
-            bool good = o->getRefCount() == 1;
-            py::object o2 = py::cast(o, py::return_value_policy::reference);
-            // always request (partial) ownership for objects with intrusive
-            // reference counting even when using the 'reference' RVP
-            good &= o->getRefCount() == 2;
-            return good;
-        }
-    );
-
-    m.def("make_object_1", &make_object_1);
-    m.def("make_object_2", &make_object_2);
-    m.def("make_myobject1_1", &make_myobject1_1);
-    m.def("make_myobject1_2", &make_myobject1_2);
-    m.def("print_object_1", &print_object_1);
-    m.def("print_object_2", &print_object_2);
-    m.def("print_object_3", &print_object_3);
-    m.def("print_object_4", &print_object_4);
-    m.def("print_myobject1_1", &print_myobject1_1);
-    m.def("print_myobject1_2", &print_myobject1_2);
-    m.def("print_myobject1_3", &print_myobject1_3);
-    m.def("print_myobject1_4", &print_myobject1_4);
+    // Expose constructor stats for the ref type
+    m.def("cstats_ref", &ConstructorStats::get<ref_tag>);
 
+
+    // Object managed by a std::shared_ptr<>
+    class MyObject2 {
+    public:
+        MyObject2(int value) : value(value) { print_created(this, toString()); }
+        std::string toString() const { return "MyObject2[" + std::to_string(value) + "]"; }
+        virtual ~MyObject2() { print_destroyed(this); }
+    private:
+        int value;
+    };
     py::class_<MyObject2, std::shared_ptr<MyObject2>>(m, "MyObject2")
         .def(py::init<int>());
-    m.def("make_myobject2_1", &make_myobject2_1);
-    m.def("make_myobject2_2", &make_myobject2_2);
-    m.def("print_myobject2_1", &print_myobject2_1);
-    m.def("print_myobject2_2", &print_myobject2_2);
-    m.def("print_myobject2_3", &print_myobject2_3);
-    m.def("print_myobject2_4", &print_myobject2_4);
-
+    m.def("make_myobject2_1", []() { return new MyObject2(6); });
+    m.def("make_myobject2_2", []() { return std::make_shared<MyObject2>(7); });
+    m.def("print_myobject2_1", [](const MyObject2 *obj) { py::print(obj->toString()); });
+    m.def("print_myobject2_2", [](std::shared_ptr<MyObject2> obj) { py::print(obj->toString()); });
+    m.def("print_myobject2_3", [](const std::shared_ptr<MyObject2> &obj) { py::print(obj->toString()); });
+    m.def("print_myobject2_4", [](const std::shared_ptr<MyObject2> *obj) { py::print((*obj)->toString()); });
+
+    // Object managed by a std::shared_ptr<>, additionally derives from std::enable_shared_from_this<>
+    class MyObject3 : public std::enable_shared_from_this<MyObject3> {
+    public:
+        MyObject3(int value) : value(value) { print_created(this, toString()); }
+        std::string toString() const { return "MyObject3[" + std::to_string(value) + "]"; }
+        virtual ~MyObject3() { print_destroyed(this); }
+    private:
+        int value;
+    };
     py::class_<MyObject3, std::shared_ptr<MyObject3>>(m, "MyObject3")
         .def(py::init<int>());
-    m.def("make_myobject3_1", &make_myobject3_1);
-    m.def("make_myobject3_2", &make_myobject3_2);
-    m.def("print_myobject3_1", &print_myobject3_1);
-    m.def("print_myobject3_2", &print_myobject3_2);
-    m.def("print_myobject3_3", &print_myobject3_3);
-    m.def("print_myobject3_4", &print_myobject3_4);
-
+    m.def("make_myobject3_1", []() { return new MyObject3(8); });
+    m.def("make_myobject3_2", []() { return std::make_shared<MyObject3>(9); });
+    m.def("print_myobject3_1", [](const MyObject3 *obj) { py::print(obj->toString()); });
+    m.def("print_myobject3_2", [](std::shared_ptr<MyObject3> obj) { py::print(obj->toString()); });
+    m.def("print_myobject3_3", [](const std::shared_ptr<MyObject3> &obj) { py::print(obj->toString()); });
+    m.def("print_myobject3_4", [](const std::shared_ptr<MyObject3> *obj) { py::print((*obj)->toString()); });
+
+    // test_smart_ptr_refcounting
+    m.def("test_object1_refcounting", []() {
+        ref<MyObject1> o = new MyObject1(0);
+        bool good = o->getRefCount() == 1;
+        py::object o2 = py::cast(o, py::return_value_policy::reference);
+        // always request (partial) ownership for objects with intrusive
+        // reference counting even when using the 'reference' RVP
+        good &= o->getRefCount() == 2;
+        return good;
+    });
+
+    // test_unique_nodelete
+    // Object with a private destructor
+    class MyObject4 {
+    public:
+        MyObject4(int value) : value{value} { print_created(this); }
+        int value;
+    private:
+        ~MyObject4() { print_destroyed(this); }
+    };
     py::class_<MyObject4, std::unique_ptr<MyObject4, py::nodelete>>(m, "MyObject4")
         .def(py::init<int>())
         .def_readwrite("value", &MyObject4::value);
 
-    py::implicitly_convertible<py::int_, MyObject1>();
-
-    // Expose constructor stats for the ref type
-    m.def("cstats_ref", &ConstructorStats::get<ref_tag>);
-});
-
-struct SharedPtrRef {
-    struct A {
-        A() { print_created(this); }
-        A(const A &) { print_copy_created(this); }
-        A(A &&) { print_move_created(this); }
-        ~A() { print_destroyed(this); }
+    // test_large_holder
+    class MyObject5 { // managed by huge_unique_ptr
+    public:
+        MyObject5(int value) : value{value} { print_created(this); }
+        ~MyObject5() { print_destroyed(this); }
+        int value;
     };
-
-    A value = {};
-    std::shared_ptr<A> shared = std::make_shared<A>();
-};
-
-struct SharedFromThisRef {
-    struct B : std::enable_shared_from_this<B> {
-        B() { print_created(this); }
-        B(const B &) : std::enable_shared_from_this<B>() { print_copy_created(this); }
-        B(B &&) : std::enable_shared_from_this<B>() { print_move_created(this); }
-        ~B() { print_destroyed(this); }
+    py::class_<MyObject5, huge_unique_ptr<MyObject5>>(m, "MyObject5")
+        .def(py::init<int>())
+        .def_readwrite("value", &MyObject5::value);
+
+    // test_shared_ptr_and_references
+    struct SharedPtrRef {
+        struct A {
+            A() { print_created(this); }
+            A(const A &) { print_copy_created(this); }
+            A(A &&) { print_move_created(this); }
+            ~A() { print_destroyed(this); }
+        };
+
+        A value = {};
+        std::shared_ptr<A> shared = std::make_shared<A>();
     };
-
-    B value = {};
-    std::shared_ptr<B> shared = std::make_shared<B>();
-};
-
-test_initializer smart_ptr_and_references([](py::module &pm) {
-    auto m = pm.def_submodule("smart_ptr");
-
     using A = SharedPtrRef::A;
     py::class_<A, std::shared_ptr<A>>(m, "A");
-
     py::class_<SharedPtrRef>(m, "SharedPtrRef")
         .def(py::init<>())
         .def_readonly("ref", &SharedPtrRef::value)
@@ -223,9 +191,20 @@ test_initializer smart_ptr_and_references([](py::module &pm) {
         .def("set_ref", [](SharedPtrRef &, const A &) { return true; })
         .def("set_holder", [](SharedPtrRef &, std::shared_ptr<A>) { return true; });
 
+    // test_shared_ptr_from_this_and_references
+    struct SharedFromThisRef {
+        struct B : std::enable_shared_from_this<B> {
+            B() { print_created(this); }
+            B(const B &) : std::enable_shared_from_this<B>() { print_copy_created(this); }
+            B(B &&) : std::enable_shared_from_this<B>() { print_move_created(this); }
+            ~B() { print_destroyed(this); }
+        };
+
+        B value = {};
+        std::shared_ptr<B> shared = std::make_shared<B>();
+    };
     using B = SharedFromThisRef::B;
     py::class_<B, std::shared_ptr<B>>(m, "B");
-
     py::class_<SharedFromThisRef>(m, "SharedFromThisRef")
         .def(py::init<>())
         .def_readonly("bad_wp", &SharedFromThisRef::value)
@@ -237,4 +216,55 @@ test_initializer smart_ptr_and_references([](py::module &pm) {
                                py::return_value_policy::copy)
         .def("set_ref", [](SharedFromThisRef &, const B &) { return true; })
         .def("set_holder", [](SharedFromThisRef &, std::shared_ptr<B>) { return true; });
-});
+
+    // Issue #865: shared_from_this doesn't work with virtual inheritance
+    struct SharedFromThisVBase : std::enable_shared_from_this<SharedFromThisVBase> {
+        virtual ~SharedFromThisVBase() = default;
+    };
+    struct SharedFromThisVirt : virtual SharedFromThisVBase {};
+    static std::shared_ptr<SharedFromThisVirt> sft(new SharedFromThisVirt());
+    py::class_<SharedFromThisVirt, std::shared_ptr<SharedFromThisVirt>>(m, "SharedFromThisVirt")
+        .def_static("get", []() { return sft.get(); });
+
+    // test_move_only_holder
+    struct C {
+        C() { print_created(this); }
+        ~C() { print_destroyed(this); }
+    };
+    py::class_<C, custom_unique_ptr<C>>(m, "TypeWithMoveOnlyHolder")
+        .def_static("make", []() { return custom_unique_ptr<C>(new C); });
+
+    // test_smart_ptr_from_default
+    struct HeldByDefaultHolder { };
+    py::class_<HeldByDefaultHolder>(m, "HeldByDefaultHolder")
+        .def(py::init<>())
+        .def_static("load_shared_ptr", [](std::shared_ptr<HeldByDefaultHolder>) {});
+
+    // test_shared_ptr_gc
+    // #187: issue involving std::shared_ptr<> return value policy & garbage collection
+    struct ElementBase { virtual void foo() { } /* Force creation of virtual table */ };
+    py::class_<ElementBase, std::shared_ptr<ElementBase>>(m, "ElementBase");
+
+    struct ElementA : ElementBase {
+        ElementA(int v) : v(v) { }
+        int value() { return v; }
+        int v;
+    };
+    py::class_<ElementA, ElementBase, std::shared_ptr<ElementA>>(m, "ElementA")
+        .def(py::init<int>())
+        .def("value", &ElementA::value);
+
+    struct ElementList {
+        void add(std::shared_ptr<ElementBase> e) { l.push_back(e); }
+        std::vector<std::shared_ptr<ElementBase>> l;
+    };
+    py::class_<ElementList, std::shared_ptr<ElementList>>(m, "ElementList")
+        .def(py::init<>())
+        .def("add", &ElementList::add)
+        .def("get", [](ElementList &el) {
+            py::list list;
+            for (auto &e : el.l)
+                list.append(py::cast(e));
+            return list;
+        });
+}
diff --git a/pybind11/tests/test_smart_ptr.py b/pybind11/tests/test_smart_ptr.py
index a6867b485..4dfe0036f 100644
--- a/pybind11/tests/test_smart_ptr.py
+++ b/pybind11/tests/test_smart_ptr.py
@@ -1,40 +1,35 @@
 import pytest
+from pybind11_tests import smart_ptr as m
 from pybind11_tests import ConstructorStats
 
 
 def test_smart_ptr(capture):
     # Object1
-    from pybind11_tests import (MyObject1, make_object_1, make_object_2,
-                                print_object_1, print_object_2, print_object_3, print_object_4)
-
-    for i, o in enumerate([make_object_1(), make_object_2(), MyObject1(3)], start=1):
+    for i, o in enumerate([m.make_object_1(), m.make_object_2(), m.MyObject1(3)], start=1):
         assert o.getRefCount() == 1
         with capture:
-            print_object_1(o)
-            print_object_2(o)
-            print_object_3(o)
-            print_object_4(o)
+            m.print_object_1(o)
+            m.print_object_2(o)
+            m.print_object_3(o)
+            m.print_object_4(o)
         assert capture == "MyObject1[{i}]\n".format(i=i) * 4
 
-    from pybind11_tests import (make_myobject1_1, make_myobject1_2,
-                                print_myobject1_1, print_myobject1_2,
-                                print_myobject1_3, print_myobject1_4)
-
-    for i, o in enumerate([make_myobject1_1(), make_myobject1_2(), MyObject1(6), 7], start=4):
+    for i, o in enumerate([m.make_myobject1_1(), m.make_myobject1_2(), m.MyObject1(6), 7],
+                          start=4):
         print(o)
         with capture:
             if not isinstance(o, int):
-                print_object_1(o)
-                print_object_2(o)
-                print_object_3(o)
-                print_object_4(o)
-            print_myobject1_1(o)
-            print_myobject1_2(o)
-            print_myobject1_3(o)
-            print_myobject1_4(o)
+                m.print_object_1(o)
+                m.print_object_2(o)
+                m.print_object_3(o)
+                m.print_object_4(o)
+            m.print_myobject1_1(o)
+            m.print_myobject1_2(o)
+            m.print_myobject1_3(o)
+            m.print_myobject1_4(o)
         assert capture == "MyObject1[{i}]\n".format(i=i) * (4 if isinstance(o, int) else 8)
 
-    cstats = ConstructorStats.get(MyObject1)
+    cstats = ConstructorStats.get(m.MyObject1)
     assert cstats.alive() == 0
     expected_values = ['MyObject1[{}]'.format(i) for i in range(1, 7)] + ['MyObject1[7]'] * 4
     assert cstats.values() == expected_values
@@ -45,21 +40,16 @@ def test_smart_ptr(capture):
     assert cstats.move_assignments == 0
 
     # Object2
-    from pybind11_tests import (MyObject2, make_myobject2_1, make_myobject2_2,
-                                make_myobject3_1, make_myobject3_2,
-                                print_myobject2_1, print_myobject2_2,
-                                print_myobject2_3, print_myobject2_4)
-
-    for i, o in zip([8, 6, 7], [MyObject2(8), make_myobject2_1(), make_myobject2_2()]):
+    for i, o in zip([8, 6, 7], [m.MyObject2(8), m.make_myobject2_1(), m.make_myobject2_2()]):
         print(o)
         with capture:
-            print_myobject2_1(o)
-            print_myobject2_2(o)
-            print_myobject2_3(o)
-            print_myobject2_4(o)
+            m.print_myobject2_1(o)
+            m.print_myobject2_2(o)
+            m.print_myobject2_3(o)
+            m.print_myobject2_4(o)
         assert capture == "MyObject2[{i}]\n".format(i=i) * 4
 
-    cstats = ConstructorStats.get(MyObject2)
+    cstats = ConstructorStats.get(m.MyObject2)
     assert cstats.alive() == 1
     o = None
     assert cstats.alive() == 0
@@ -71,19 +61,16 @@ def test_smart_ptr(capture):
     assert cstats.move_assignments == 0
 
     # Object3
-    from pybind11_tests import (MyObject3, print_myobject3_1, print_myobject3_2,
-                                print_myobject3_3, print_myobject3_4)
-
-    for i, o in zip([9, 8, 9], [MyObject3(9), make_myobject3_1(), make_myobject3_2()]):
+    for i, o in zip([9, 8, 9], [m.MyObject3(9), m.make_myobject3_1(), m.make_myobject3_2()]):
         print(o)
         with capture:
-            print_myobject3_1(o)
-            print_myobject3_2(o)
-            print_myobject3_3(o)
-            print_myobject3_4(o)
+            m.print_myobject3_1(o)
+            m.print_myobject3_2(o)
+            m.print_myobject3_3(o)
+            m.print_myobject3_4(o)
         assert capture == "MyObject3[{i}]\n".format(i=i) * 4
 
-    cstats = ConstructorStats.get(MyObject3)
+    cstats = ConstructorStats.get(m.MyObject3)
     assert cstats.alive() == 1
     o = None
     assert cstats.alive() == 0
@@ -94,10 +81,8 @@ def test_smart_ptr(capture):
     assert cstats.copy_assignments == 0
     assert cstats.move_assignments == 0
 
-    # Object and ref
-    from pybind11_tests import Object, cstats_ref
-
-    cstats = ConstructorStats.get(Object)
+    # Object
+    cstats = ConstructorStats.get(m.Object)
     assert cstats.alive() == 0
     assert cstats.values() == []
     assert cstats.default_constructions == 10
@@ -106,7 +91,8 @@ def test_smart_ptr(capture):
     assert cstats.copy_assignments == 0
     assert cstats.move_assignments == 0
 
-    cstats = cstats_ref()
+    # ref<>
+    cstats = m.cstats_ref()
     assert cstats.alive() == 0
     assert cstats.values() == ['from pointer'] * 10
     assert cstats.default_constructions == 30
@@ -117,26 +103,30 @@ def test_smart_ptr(capture):
 
 
 def test_smart_ptr_refcounting():
-    from pybind11_tests import test_object1_refcounting
-    assert test_object1_refcounting()
+    assert m.test_object1_refcounting()
 
 
 def test_unique_nodelete():
-    from pybind11_tests import MyObject4
-    o = MyObject4(23)
+    o = m.MyObject4(23)
     assert o.value == 23
-    cstats = ConstructorStats.get(MyObject4)
+    cstats = ConstructorStats.get(m.MyObject4)
     assert cstats.alive() == 1
     del o
-    cstats = ConstructorStats.get(MyObject4)
     assert cstats.alive() == 1  # Leak, but that's intentional
 
 
-def test_shared_ptr_and_references():
-    from pybind11_tests.smart_ptr import SharedPtrRef, A
+def test_large_holder():
+    o = m.MyObject5(5)  # MyObject5 is bound with huge_unique_ptr as its holder
+    assert o.value == 5
+    cstats = ConstructorStats.get(m.MyObject5)
+    assert cstats.alive() == 1
+    del o  # the only Python reference goes away here
+    assert cstats.alive() == 0  # the C++ instance was destroyed along with it
+
 
-    s = SharedPtrRef()
-    stats = ConstructorStats.get(A)
+def test_shared_ptr_and_references():
+    s = m.SharedPtrRef()
+    stats = ConstructorStats.get(m.A)
     assert stats.alive() == 2
 
     ref = s.ref  # init_holder_helper(holder_ptr=false, owned=false)
@@ -166,10 +156,8 @@ def test_shared_ptr_and_references():
 
 
 def test_shared_ptr_from_this_and_references():
-    from pybind11_tests.smart_ptr import SharedFromThisRef, B
-
-    s = SharedFromThisRef()
-    stats = ConstructorStats.get(B)
+    s = m.SharedFromThisRef()
+    stats = ConstructorStats.get(m.B)
     assert stats.alive() == 2
 
     ref = s.ref  # init_holder_helper(holder_ptr=false, owned=false, bad_wp=false)
@@ -201,3 +189,32 @@ def test_shared_ptr_from_this_and_references():
 
     del ref, bad_wp, copy, holder_ref, holder_copy, s
     assert stats.alive() == 0
+
+    z = m.SharedFromThisVirt.get()
+    y = m.SharedFromThisVirt.get()
+    assert y is z
+
+
+def test_move_only_holder():
+    a = m.TypeWithMoveOnlyHolder.make()  # C++ side returns a custom_unique_ptr<C>
+    stats = ConstructorStats.get(m.TypeWithMoveOnlyHolder)
+    assert stats.alive() == 1
+    del a  # the only Python reference goes away here
+    assert stats.alive() == 0  # the C++ instance was destroyed along with it
+
+
+def test_smart_ptr_from_default():
+    instance = m.HeldByDefaultHolder()  # class is bound without an explicit holder type
+    with pytest.raises(RuntimeError) as excinfo:  # excinfo.value is the raised exception
+        m.HeldByDefaultHolder.load_shared_ptr(instance)  # C++ side takes a std::shared_ptr
+    assert "Unable to load a custom holder type from a default-holder instance" in str(excinfo.value)  # noqa: E501
+
+
+def test_shared_ptr_gc():
+    """#187: std::shared_ptr<> elements must keep their values across a garbage collection"""
+    elements = m.ElementList()
+    for number in range(10):
+        elements.add(m.ElementA(number))
+    pytest.gc_collect()
+    for position, element in enumerate(elements.get()):
+        assert position == element.value()
diff --git a/pybind11/tests/test_stl.cpp b/pybind11/tests/test_stl.cpp
new file mode 100644
index 000000000..7d53e9c18
--- /dev/null
+++ b/pybind11/tests/test_stl.cpp
@@ -0,0 +1,238 @@
+/*
+    tests/test_stl.cpp -- STL type casters
+
+    Copyright (c) 2017 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#include "pybind11_tests.h"
+#include <pybind11/stl.h>
+
+// Test with `std::variant` in C++17 mode, or with `boost::variant` in C++11/14
+#if PYBIND11_HAS_VARIANT
+using std::variant;
+#elif defined(PYBIND11_TEST_BOOST) && (!defined(_MSC_VER) || _MSC_VER >= 1910)
+#  include <boost/variant.hpp>
+#  define PYBIND11_HAS_VARIANT 1
+using boost::variant;
+
+namespace pybind11 { namespace detail {
+template <typename... Ts>
+struct type_caster<boost::variant<Ts...>> : variant_caster<boost::variant<Ts...>> {};
+
+template <>
+struct visit_helper<boost::variant> {
+    template <typename... Args>
+    static auto call(Args &&...args) -> decltype(boost::apply_visitor(args...)) {
+        return boost::apply_visitor(args...);
+    }
+};
+}} // namespace pybind11::detail
+#endif
+
+/// Issue #528: a class whose templated constructor accepts any argument type
+struct TplCtorClass {
+    template <typename T> TplCtorClass(const T &) { } // constructible from any `const T &`
+    bool operator==(const TplCtorClass &) const { return true; } // every instance compares equal
+};
+
+namespace std {
+    template <>
+    struct hash<TplCtorClass> { size_t operator()(const TplCtorClass &) const { return 0; } };
+}
+
+
+TEST_SUBMODULE(stl, m) {
+    // test_vector
+    m.def("cast_vector", []() { return std::vector<int>{1}; });
+    m.def("load_vector", [](const std::vector<int> &v) { return v.at(0) == 1 && v.at(1) == 2; });
+    // `std::vector<bool>` is special because it returns proxy objects instead of references
+    m.def("cast_bool_vector", []() { return std::vector<bool>{true, false}; });
+    m.def("load_bool_vector", [](const std::vector<bool> &v) {
+        return v.at(0) == true && v.at(1) == false;
+    });
+    // Unnumbered regression (caused by #936): pointers to stl containers aren't castable
+    static std::vector<RValueCaster> lvv{2};
+    m.def("cast_ptr_vector", []() { return &lvv; });
+
+    // test_array
+    m.def("cast_array", []() { return std::array<int, 2> {{1 , 2}}; });
+    m.def("load_array", [](const std::array<int, 2> &a) { return a[0] == 1 && a[1] == 2; });
+
+    // test_valarray
+    m.def("cast_valarray", []() { return std::valarray<int>{1, 4, 9}; });
+    m.def("load_valarray", [](const std::valarray<int>& v) {
+        return v.size() == 3 && v[0] == 1 && v[1] == 4 && v[2] == 9;
+    });
+
+    // test_map
+    m.def("cast_map", []() { return std::map<std::string, std::string>{{"key", "value"}}; });
+    m.def("load_map", [](const std::map<std::string, std::string> &map) {
+        return map.at("key") == "value" && map.at("key2") == "value2";
+    });
+
+    // test_set
+    m.def("cast_set", []() { return std::set<std::string>{"key1", "key2"}; });
+    m.def("load_set", [](const std::set<std::string> &set) {
+        return set.count("key1") && set.count("key2") && set.count("key3");
+    });
+
+    // test_recursive_casting
+    m.def("cast_rv_vector", []() { return std::vector<RValueCaster>{2}; });
+    m.def("cast_rv_array", []() { return std::array<RValueCaster, 3>(); });
+    // NB: map and set keys are `const`, so while we technically do move them (as `const Type &&`),
+    // casters don't typically do anything with that, which means they fall to the `const Type &`
+    // caster.
+    m.def("cast_rv_map", []() { return std::unordered_map<std::string, RValueCaster>{{"a", RValueCaster{}}}; });
+    m.def("cast_rv_nested", []() {
+        std::vector<std::array<std::list<std::unordered_map<std::string, RValueCaster>>, 2>> v;
+        v.emplace_back(); // add an array
+        v.back()[0].emplace_back(); // add a map to the array
+        v.back()[0].back().emplace("b", RValueCaster{});
+        v.back()[0].back().emplace("c", RValueCaster{});
+        v.back()[1].emplace_back(); // add a map to the array
+        v.back()[1].back().emplace("a", RValueCaster{});
+        return v;
+    });
+    static std::array<RValueCaster, 2> lva;
+    static std::unordered_map<std::string, RValueCaster> lvm{{"a", RValueCaster{}}, {"b", RValueCaster{}}};
+    static std::unordered_map<std::string, std::vector<std::list<std::array<RValueCaster, 2>>>> lvn;
+    lvn["a"].emplace_back(); // add a list
+    lvn["a"].back().emplace_back(); // add an array
+    lvn["a"].emplace_back(); // another list
+    lvn["a"].back().emplace_back(); // add an array
+    lvn["b"].emplace_back(); // add a list
+    lvn["b"].back().emplace_back(); // add an array
+    lvn["b"].back().emplace_back(); // add another array
+    m.def("cast_lv_vector", []() -> const decltype(lvv) & { return lvv; });
+    m.def("cast_lv_array", []() -> const decltype(lva) & { return lva; });
+    m.def("cast_lv_map", []() -> const decltype(lvm) & { return lvm; });
+    m.def("cast_lv_nested", []() -> const decltype(lvn) & { return lvn; });
+    // #853:
+    m.def("cast_unique_ptr_vector", []() {
+        std::vector<std::unique_ptr<UserType>> v;
+        v.emplace_back(new UserType{7});
+        v.emplace_back(new UserType{42});
+        return v;
+    });
+
+    // test_move_out_container
+    struct MoveOutContainer {
+        struct Value { int value; };
+        std::list<Value> move_list() const { return {{0}, {1}, {2}}; }
+    };
+    py::class_<MoveOutContainer::Value>(m, "MoveOutContainerValue")
+        .def_readonly("value", &MoveOutContainer::Value::value);
+    py::class_<MoveOutContainer>(m, "MoveOutContainer")
+        .def(py::init<>())
+        .def_property_readonly("move_list", &MoveOutContainer::move_list);
+
+    // Class that can be move- and copy-constructed, but not assigned
+    struct NoAssign {
+        int value;
+
+        explicit NoAssign(int value = 0) : value(value) { }
+        NoAssign(const NoAssign &) = default;
+        NoAssign(NoAssign &&) = default;
+
+        NoAssign &operator=(const NoAssign &) = delete;
+        NoAssign &operator=(NoAssign &&) = delete;
+    };
+    py::class_<NoAssign>(m, "NoAssign", "Class with no C++ assignment operators")
+        .def(py::init<>())
+        .def(py::init<int>());
+
+#ifdef PYBIND11_HAS_OPTIONAL
+    // test_optional
+    m.attr("has_optional") = true;
+
+    using opt_int = std::optional<int>;
+    using opt_no_assign = std::optional<NoAssign>;
+    m.def("double_or_zero", [](const opt_int& x) -> int {
+        return x.value_or(0) * 2;
+    });
+    m.def("half_or_none", [](int x) -> opt_int {
+        return x ? opt_int(x / 2) : opt_int();
+    });
+    m.def("test_nullopt", [](opt_int x) {
+        return x.value_or(42);
+    }, py::arg_v("x", std::nullopt, "None"));
+    m.def("test_no_assign", [](const opt_no_assign &x) {
+        return x ? x->value : 42;
+    }, py::arg_v("x", std::nullopt, "None"));
+
+    m.def("nodefer_none_optional", [](std::optional<int>) { return true; });
+    m.def("nodefer_none_optional", [](py::none) { return false; });
+#endif
+
+#ifdef PYBIND11_HAS_EXP_OPTIONAL
+    // test_exp_optional
+    m.attr("has_exp_optional") = true;
+
+    using exp_opt_int = std::experimental::optional<int>;
+    using exp_opt_no_assign = std::experimental::optional<NoAssign>;
+    m.def("double_or_zero_exp", [](const exp_opt_int& x) -> int {
+        return x.value_or(0) * 2;
+    });
+    m.def("half_or_none_exp", [](int x) -> exp_opt_int {
+        return x ? exp_opt_int(x / 2) : exp_opt_int();
+    });
+    m.def("test_nullopt_exp", [](exp_opt_int x) {
+        return x.value_or(42);
+    }, py::arg_v("x", std::experimental::nullopt, "None"));
+    m.def("test_no_assign_exp", [](const exp_opt_no_assign &x) {
+        return x ? x->value : 42;
+    }, py::arg_v("x", std::experimental::nullopt, "None"));
+#endif
+
+#ifdef PYBIND11_HAS_VARIANT
+    static_assert(std::is_same<py::detail::variant_caster_visitor::result_type, py::handle>::value,
+                  "visitor::result_type is required by boost::variant in C++11 mode");
+
+    struct visitor {
+        using result_type = const char *;
+
+        result_type operator()(int) { return "int"; }
+        result_type operator()(std::string) { return "std::string"; }
+        result_type operator()(double) { return "double"; }
+        result_type operator()(std::nullptr_t) { return "std::nullptr_t"; }
+    };
+
+    // test_variant
+    m.def("load_variant", [](variant<int, std::string, double, std::nullptr_t> v) {
+        return py::detail::visit_helper<variant>::call(visitor(), v);
+    });
+    m.def("load_variant_2pass", [](variant<double, int> v) {
+        return py::detail::visit_helper<variant>::call(visitor(), v);
+    });
+    m.def("cast_variant", []() {
+        using V = variant<int, std::string>;
+        return py::make_tuple(V(5), V("Hello"));
+    });
+#endif
+
+    // #528: templated constructor
+    // (no python tests: the test here is that this compiles)
+    m.def("tpl_ctor_vector", [](std::vector<TplCtorClass> &) {});
+    m.def("tpl_ctor_map", [](std::unordered_map<TplCtorClass, TplCtorClass> &) {});
+    m.def("tpl_ctor_set", [](std::unordered_set<TplCtorClass> &) {});
+#if defined(PYBIND11_HAS_OPTIONAL)
+    m.def("tpl_constr_optional", [](std::optional<TplCtorClass> &) {});
+#elif defined(PYBIND11_HAS_EXP_OPTIONAL)
+    m.def("tpl_constr_optional", [](std::experimental::optional<TplCtorClass> &) {});
+#endif
+
+    // test_vec_of_reference_wrapper
+    // #171: Can't return STL structures containing reference wrapper
+    m.def("return_vec_of_reference_wrapper", [](std::reference_wrapper<UserType> p4) {
+        static UserType p1{1}, p2{2}, p3{3};
+        return std::vector<std::reference_wrapper<UserType>> {
+            std::ref(p1), std::ref(p2), std::ref(p3), p4
+        };
+    });
+
+    // test_stl_pass_by_pointer
+    m.def("stl_pass_by_pointer", [](std::vector<int>* v) { return *v; }, "v"_a=nullptr);
+}
diff --git a/pybind11/tests/test_stl.py b/pybind11/tests/test_stl.py
new file mode 100644
index 000000000..db8515e7a
--- /dev/null
+++ b/pybind11/tests/test_stl.py
@@ -0,0 +1,200 @@
+import pytest
+
+from pybind11_tests import stl as m
+from pybind11_tests import UserType
+
+
+def test_vector(doc):
+    """std::vector casts to a list; lists and tuples both load back into a std::vector"""
+    values = m.cast_vector()
+    assert values == [1]
+    values.append(2)
+    assert m.load_vector(values)
+    assert m.load_vector(tuple(values))
+
+    assert m.cast_bool_vector() == [True, False]
+    assert m.load_bool_vector([True, False])
+
+    assert doc(m.cast_vector) == "cast_vector() -> List[int]"
+    assert doc(m.load_vector) == "load_vector(arg0: List[int]) -> bool"
+
+    # Regression caused by #936: a pointer to an STL container must still be castable
+    assert m.cast_ptr_vector() == ["lvalue", "lvalue"]
+
+
+def test_array(doc):
+    """std::array<int, 2> casts to a two-element list, and that list loads back"""
+    pair = m.cast_array()
+    assert pair == [1, 2]
+    assert m.load_array(pair)
+
+    assert doc(m.cast_array) == "cast_array() -> List[int[2]]"
+    assert doc(m.load_array) == "load_array(arg0: List[int[2]]) -> bool"
+
+
+def test_valarray(doc):
+    """std::valarray<int> casts to a list, and that list loads back"""
+    squares = m.cast_valarray()
+    assert squares == [1, 4, 9]
+    assert m.load_valarray(squares)
+
+    assert doc(m.cast_valarray) == "cast_valarray() -> List[int]"
+    assert doc(m.load_valarray) == "load_valarray(arg0: List[int]) -> bool"
+
+
+def test_map(doc):
+    """std::map casts to a dict; the dict loads back once the second key is added"""
+    mapping = m.cast_map()
+    assert mapping == {"key": "value"}
+    mapping["key2"] = "value2"
+    assert m.load_map(mapping)
+
+    assert doc(m.cast_map) == "cast_map() -> Dict[str, str]"
+    assert doc(m.load_map) == "load_map(arg0: Dict[str, str]) -> bool"
+
+
+def test_set(doc):
+    """std::set casts to a set; the set loads back once the third key is added"""
+    keys = m.cast_set()
+    assert keys == {"key1", "key2"}
+    keys.add("key3")
+    assert m.load_set(keys)
+
+    assert doc(m.cast_set) == "cast_set() -> Set[str]"
+    assert doc(m.load_set) == "load_set(arg0: Set[str]) -> bool"
+
+
+def test_recursive_casting():
+    """Tests that stl casters preserve lvalue/rvalue context for container values"""
+    assert m.cast_rv_vector() == ["rvalue", "rvalue"]
+    assert m.cast_lv_vector() == ["lvalue", "lvalue"]
+    assert m.cast_rv_array() == ["rvalue", "rvalue", "rvalue"]
+    assert m.cast_lv_array() == ["lvalue", "lvalue"]
+    assert m.cast_rv_map() == {"a": "rvalue"}
+    assert m.cast_lv_map() == {"a": "lvalue", "b": "lvalue"}
+    assert m.cast_rv_nested() == [[[{"b": "rvalue", "c": "rvalue"}], [{"a": "rvalue"}]]]
+    assert m.cast_lv_nested() == {
+        "a": [[["lvalue", "lvalue"]], [["lvalue", "lvalue"]]],
+        "b": [[["lvalue", "lvalue"], ["lvalue", "lvalue"]]]
+    }
+
+    # Issue #853 test case:
+    z = m.cast_unique_ptr_vector()
+    assert z[0].value == 7 and z[1].value == 42
+
+
+def test_move_out_container():
+    """Properties use the `reference_internal` policy by default. If the underlying function
+    returns an rvalue, the policy is automatically changed to `move` to avoid referencing
+    a temporary. In case the return value is a container of user-defined types, the policy
+    also needs to be applied to the elements, not just the container."""
+    c = m.MoveOutContainer()
+    moved_out_list = c.move_list
+    assert [x.value for x in moved_out_list] == [0, 1, 2]
+
+
+@pytest.mark.skipif(not hasattr(m, "has_optional"), reason='no <optional>')
+def test_optional():
+    assert m.double_or_zero(None) == 0
+    assert m.double_or_zero(42) == 84
+    pytest.raises(TypeError, m.double_or_zero, 'foo')
+
+    assert m.half_or_none(0) is None
+    assert m.half_or_none(42) == 21
+    pytest.raises(TypeError, m.half_or_none, 'foo')
+
+    assert m.test_nullopt() == 42
+    assert m.test_nullopt(None) == 42
+    assert m.test_nullopt(42) == 42
+    assert m.test_nullopt(43) == 43
+
+    assert m.test_no_assign() == 42
+    assert m.test_no_assign(None) == 42
+    assert m.test_no_assign(m.NoAssign(43)) == 43
+    pytest.raises(TypeError, m.test_no_assign, 43)
+
+    assert m.nodefer_none_optional(None)
+
+
+@pytest.mark.skipif(not hasattr(m, "has_exp_optional"), reason='no <experimental/optional>')
+def test_exp_optional():
+    assert m.double_or_zero_exp(None) == 0
+    assert m.double_or_zero_exp(42) == 84
+    pytest.raises(TypeError, m.double_or_zero_exp, 'foo')
+
+    assert m.half_or_none_exp(0) is None
+    assert m.half_or_none_exp(42) == 21
+    pytest.raises(TypeError, m.half_or_none_exp, 'foo')
+
+    assert m.test_nullopt_exp() == 42
+    assert m.test_nullopt_exp(None) == 42
+    assert m.test_nullopt_exp(42) == 42
+    assert m.test_nullopt_exp(43) == 43
+
+    assert m.test_no_assign_exp() == 42
+    assert m.test_no_assign_exp(None) == 42
+    assert m.test_no_assign_exp(m.NoAssign(43)) == 43
+    pytest.raises(TypeError, m.test_no_assign_exp, 43)
+
+
+@pytest.mark.skipif(not hasattr(m, "load_variant"), reason='no <variant>')
+def test_variant(doc):
+    assert m.load_variant(1) == "int"
+    assert m.load_variant("1") == "std::string"
+    assert m.load_variant(1.0) == "double"
+    assert m.load_variant(None) == "std::nullptr_t"
+
+    assert m.load_variant_2pass(1) == "int"
+    assert m.load_variant_2pass(1.0) == "double"
+
+    assert m.cast_variant() == (5, "Hello")
+
+    assert doc(m.load_variant) == "load_variant(arg0: Union[int, str, float, None]) -> str"
+
+
+def test_vec_of_reference_wrapper():
+    """#171: Can't return reference wrappers (or STL structures containing them)"""
+    assert str(m.return_vec_of_reference_wrapper(UserType(4))) == \
+        "[UserType(1), UserType(2), UserType(3), UserType(4)]"
+
+
+def test_stl_pass_by_pointer(msg):
+    """Passing nullptr or None to an STL container pointer is not expected to work"""
+    with pytest.raises(TypeError) as excinfo:
+        m.stl_pass_by_pointer()  # default value is `nullptr`
+    assert msg(excinfo.value) == """
+        stl_pass_by_pointer(): incompatible function arguments. The following argument types are supported:
+            1. (v: List[int]=None) -> List[int]
+
+        Invoked with:
+    """  # noqa: E501 line too long
+
+    with pytest.raises(TypeError) as excinfo:
+        m.stl_pass_by_pointer(None)
+    assert msg(excinfo.value) == """
+        stl_pass_by_pointer(): incompatible function arguments. The following argument types are supported:
+            1. (v: List[int]=None) -> List[int]
+
+        Invoked with: None
+    """  # noqa: E501 line too long
+
+    assert m.stl_pass_by_pointer([1, 2, 3]) == [1, 2, 3]
+
+
+def test_missing_header_message():
+    """Trying to convert `list` to a `std::vector`, or vice versa, without including
+    <pybind11/stl.h> should result in a helpful suggestion in the error message"""
+    import pybind11_cross_module_tests as cm
+
+    expected_message = ("Did you forget to `#include <pybind11/stl.h>`? Or <pybind11/complex.h>,\n"
+                        "<pybind11/functional.h>, <pybind11/chrono.h>, etc. Some automatic\n"
+                        "conversions are optional and require extra headers to be included\n"
+                        "when compiling your pybind11 module.")
+
+    with pytest.raises(TypeError) as excinfo:
+        cm.missing_header_arg([1.0, 2.0, 3.0])
+    assert expected_message in str(excinfo.value)
+
+    with pytest.raises(TypeError) as excinfo:
+        cm.missing_header_return()
+    assert expected_message in str(excinfo.value)
diff --git a/pybind11/tests/test_stl_binders.cpp b/pybind11/tests/test_stl_binders.cpp
index ce0b33257..a88b589e1 100644
--- a/pybind11/tests/test_stl_binders.cpp
+++ b/pybind11/tests/test_stl_binders.cpp
@@ -10,15 +10,11 @@
 #include "pybind11_tests.h"
 
 #include <pybind11/stl_bind.h>
+#include <pybind11/numpy.h>
 #include <map>
 #include <deque>
 #include <unordered_map>
 
-#ifdef _MSC_VER
-// We get some really long type names here which causes MSVC to emit warnings
-#  pragma warning(disable: 4503) // warning C4503: decorated name length exceeded, name was truncated
-#endif
-
 class El {
 public:
     El() = delete;
@@ -58,43 +54,54 @@ template <class Map> Map *times_ten(int n) {
     return m;
 }
 
-test_initializer stl_binder_vector([](py::module &m) {
+TEST_SUBMODULE(stl_binders, m) {
+    // test_vector_int
+    py::bind_vector<std::vector<unsigned int>>(m, "VectorInt", py::buffer_protocol());
+
+    // test_vector_custom
     py::class_<El>(m, "El")
         .def(py::init<int>());
-
-    py::bind_vector<std::vector<unsigned int>>(m, "VectorInt");
-    py::bind_vector<std::vector<bool>>(m, "VectorBool");
-
     py::bind_vector<std::vector<El>>(m, "VectorEl");
-
     py::bind_vector<std::vector<std::vector<El>>>(m, "VectorVectorEl");
 
-});
-
-test_initializer stl_binder_map([](py::module &m) {
+    // test_map_string_double
     py::bind_map<std::map<std::string, double>>(m, "MapStringDouble");
     py::bind_map<std::unordered_map<std::string, double>>(m, "UnorderedMapStringDouble");
 
+    // test_map_string_double_const
     py::bind_map<std::map<std::string, double const>>(m, "MapStringDoubleConst");
     py::bind_map<std::unordered_map<std::string, double const>>(m, "UnorderedMapStringDoubleConst");
 
-});
-
-test_initializer stl_binder_noncopyable([](py::module &m) {
     py::class_<E_nc>(m, "ENC")
         .def(py::init<int>())
         .def_readwrite("value", &E_nc::value);
 
+    // test_noncopyable_containers
     py::bind_vector<std::vector<E_nc>>(m, "VectorENC");
     m.def("get_vnc", &one_to_n<std::vector<E_nc>>, py::return_value_policy::reference);
-
     py::bind_vector<std::deque<E_nc>>(m, "DequeENC");
     m.def("get_dnc", &one_to_n<std::deque<E_nc>>, py::return_value_policy::reference);
-
     py::bind_map<std::map<int, E_nc>>(m, "MapENC");
     m.def("get_mnc", &times_ten<std::map<int, E_nc>>, py::return_value_policy::reference);
-
     py::bind_map<std::unordered_map<int, E_nc>>(m, "UmapENC");
     m.def("get_umnc", &times_ten<std::unordered_map<int, E_nc>>, py::return_value_policy::reference);
-});
 
+    // test_vector_buffer
+    py::bind_vector<std::vector<unsigned char>>(m, "VectorUChar", py::buffer_protocol());
+    // no dtype declared for this version:
+    struct VUndeclStruct { bool w; uint32_t x; double y; bool z; };
+    m.def("create_undeclstruct", [m] () mutable {
+        py::bind_vector<std::vector<VUndeclStruct>>(m, "VectorUndeclStruct", py::buffer_protocol());
+    });
+
+    // The rest depends on numpy:
+    try { py::module::import("numpy"); }
+    catch (...) { return; }
+
+    // test_vector_buffer_numpy
+    struct VStruct { bool w; uint32_t x; double y; bool z; };
+    PYBIND11_NUMPY_DTYPE(VStruct, w, x, y, z);
+    py::class_<VStruct>(m, "VStruct").def_readwrite("x", &VStruct::x);
+    py::bind_vector<std::vector<VStruct>>(m, "VectorStruct", py::buffer_protocol());
+    m.def("get_vectorstruct", [] {return std::vector<VStruct> {{0, 5, 3.0, 1}, {1, 30, -1e4, 0}};});
+}
diff --git a/pybind11/tests/test_stl_binders.py b/pybind11/tests/test_stl_binders.py
index c9bcc7935..bf1aa674c 100644
--- a/pybind11/tests/test_stl_binders.py
+++ b/pybind11/tests/test_stl_binders.py
@@ -1,49 +1,94 @@
-def test_vector_int():
-    from pybind11_tests import VectorInt
+import pytest
+import sys
+from pybind11_tests import stl_binders as m
+
+with pytest.suppress(ImportError):
+    import numpy as np
 
-    v_int = VectorInt([0, 0])
+
+def test_vector_int():
+    v_int = m.VectorInt([0, 0])
     assert len(v_int) == 2
     assert bool(v_int) is True
 
-    v_int2 = VectorInt([0, 0])
+    v_int2 = m.VectorInt([0, 0])
     assert v_int == v_int2
     v_int2[1] = 1
     assert v_int != v_int2
 
     v_int2.append(2)
-    v_int2.append(3)
     v_int2.insert(0, 1)
     v_int2.insert(0, 2)
     v_int2.insert(0, 3)
+    v_int2.insert(6, 3)
     assert str(v_int2) == "VectorInt[3, 2, 1, 0, 1, 2, 3]"
+    with pytest.raises(IndexError):
+        v_int2.insert(8, 4)
 
     v_int.append(99)
     v_int2[2:-2] = v_int
-    assert v_int2 == VectorInt([3, 2, 0, 0, 99, 2, 3])
+    assert v_int2 == m.VectorInt([3, 2, 0, 0, 99, 2, 3])
     del v_int2[1:3]
-    assert v_int2 == VectorInt([3, 0, 99, 2, 3])
+    assert v_int2 == m.VectorInt([3, 0, 99, 2, 3])
     del v_int2[0]
-    assert v_int2 == VectorInt([0, 99, 2, 3])
-
-
-def test_vector_custom():
-    from pybind11_tests import El, VectorEl, VectorVectorEl
-
-    v_a = VectorEl()
-    v_a.append(El(1))
-    v_a.append(El(2))
-    assert str(v_a) == "VectorEl[El{1}, El{2}]"
-
-    vv_a = VectorVectorEl()
-    vv_a.append(v_a)
-    vv_b = vv_a[0]
-    assert str(vv_b) == "VectorEl[El{1}, El{2}]"
+    assert v_int2 == m.VectorInt([0, 99, 2, 3])
+
+
+# Unsupported on PyPy: related to PyPy's buffer protocol.
+@pytest.unsupported_on_pypy
+def test_vector_buffer():
+    b = bytearray([1, 2, 3, 4])
+    v = m.VectorUChar(b)
+    assert v[1] == 2
+    v[2] = 5
+    mv = memoryview(v)  # We expose the buffer interface
+    if sys.version_info.major > 2:
+        assert mv[2] == 5
+        mv[2] = 6
+    else:
+        assert mv[2] == '\x05'
+        mv[2] = '\x06'
+    assert v[2] == 6
+
+    with pytest.raises(RuntimeError) as excinfo:
+        m.create_undeclstruct()  # Undeclared struct contents, no buffer interface
+    assert "NumPy type info missing for " in str(excinfo.value)
+
+
+@pytest.unsupported_on_pypy
+@pytest.requires_numpy
+def test_vector_buffer_numpy():
+    a = np.array([1, 2, 3, 4], dtype=np.int32)
+    with pytest.raises(TypeError):
+        m.VectorInt(a)
+
+    a = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], dtype=np.uintc)
+    v = m.VectorInt(a[0, :])
+    assert len(v) == 4
+    assert v[2] == 3
+    ma = np.asarray(v)
+    ma[2] = 5
+    assert v[2] == 5
+
+    v = m.VectorInt(a[:, 1])
+    assert len(v) == 3
+    assert v[2] == 10
+
+    v = m.get_vectorstruct()
+    assert v[0].x == 5
+    ma = np.asarray(v)
+    ma[1]['x'] = 99
+    assert v[1].x == 99
+
+    v = m.VectorStruct(np.zeros(3, dtype=np.dtype([('w', 'bool'), ('x', 'I'),
+                                                   ('y', 'float64'), ('z', 'bool')], align=True)))
+    assert len(v) == 3
 
 
 def test_vector_bool():
-    from pybind11_tests import VectorBool
+    import pybind11_cross_module_tests as cm
 
-    vv_c = VectorBool()
+    vv_c = cm.VectorBool()
     for i in range(10):
         vv_c.append(i % 2 == 0)
     for i in range(10):
@@ -51,18 +96,28 @@ def test_vector_bool():
     assert str(vv_c) == "VectorBool[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]"
 
 
-def test_map_string_double():
-    from pybind11_tests import MapStringDouble, UnorderedMapStringDouble
+def test_vector_custom():
+    v_a = m.VectorEl()
+    v_a.append(m.El(1))
+    v_a.append(m.El(2))
+    assert str(v_a) == "VectorEl[El{1}, El{2}]"
+
+    vv_a = m.VectorVectorEl()
+    vv_a.append(v_a)
+    vv_b = vv_a[0]
+    assert str(vv_b) == "VectorEl[El{1}, El{2}]"
+
 
-    m = MapStringDouble()
-    m['a'] = 1
-    m['b'] = 2.5
+def test_map_string_double():
+    mm = m.MapStringDouble()
+    mm['a'] = 1
+    mm['b'] = 2.5
 
-    assert list(m) == ['a', 'b']
-    assert list(m.items()) == [('a', 1), ('b', 2.5)]
-    assert str(m) == "MapStringDouble{a: 1, b: 2.5}"
+    assert list(mm) == ['a', 'b']
+    assert list(mm.items()) == [('a', 1), ('b', 2.5)]
+    assert str(mm) == "MapStringDouble{a: 1, b: 2.5}"
 
-    um = UnorderedMapStringDouble()
+    um = m.UnorderedMapStringDouble()
     um['ua'] = 1.1
     um['ub'] = 2.6
 
@@ -72,35 +127,29 @@ def test_map_string_double():
 
 
 def test_map_string_double_const():
-    from pybind11_tests import MapStringDoubleConst, UnorderedMapStringDoubleConst
-
-    mc = MapStringDoubleConst()
+    mc = m.MapStringDoubleConst()
     mc['a'] = 10
     mc['b'] = 20.5
     assert str(mc) == "MapStringDoubleConst{a: 10, b: 20.5}"
 
-    umc = UnorderedMapStringDoubleConst()
+    umc = m.UnorderedMapStringDoubleConst()
     umc['a'] = 11
     umc['b'] = 21.5
 
     str(umc)
 
 
-def test_noncopyable_vector():
-    from pybind11_tests import get_vnc
-
-    vnc = get_vnc(5)
+def test_noncopyable_containers():
+    # std::vector
+    vnc = m.get_vnc(5)
     for i in range(0, 5):
         assert vnc[i].value == i + 1
 
     for i, j in enumerate(vnc, start=1):
         assert j.value == i
 
-
-def test_noncopyable_deque():
-    from pybind11_tests import get_dnc
-
-    dnc = get_dnc(5)
+    # std::deque
+    dnc = m.get_dnc(5)
     for i in range(0, 5):
         assert dnc[i].value == i + 1
 
@@ -109,11 +158,8 @@ def test_noncopyable_deque():
         assert(j.value == i)
         i += 1
 
-
-def test_noncopyable_map():
-    from pybind11_tests import get_mnc
-
-    mnc = get_mnc(5)
+    # std::map
+    mnc = m.get_mnc(5)
     for i in range(1, 6):
         assert mnc[i].value == 10 * i
 
@@ -124,11 +170,8 @@ def test_noncopyable_map():
 
     assert vsum == 150
 
-
-def test_noncopyable_unordered_map():
-    from pybind11_tests import get_umnc
-
-    mnc = get_umnc(5)
+    # std::unordered_map
+    mnc = m.get_umnc(5)
     for i in range(1, 6):
         assert mnc[i].value == 10 * i
 
diff --git a/pybind11/tests/test_virtual_functions.cpp b/pybind11/tests/test_virtual_functions.cpp
index 0f8ed2afb..953b390b8 100644
--- a/pybind11/tests/test_virtual_functions.cpp
+++ b/pybind11/tests/test_virtual_functions.cpp
@@ -145,16 +145,150 @@ class NCVirtTrampoline : public NCVirt {
     }
 };
 
-int runExampleVirt(ExampleVirt *ex, int value) {
-    return ex->run(value);
-}
+struct Base {
+    /* for some reason MSVC2015 can't compile this if the function is pure virtual */
+    virtual std::string dispatch() const { return {}; };
+    virtual ~Base() = default;
+};
 
-bool runExampleVirtBool(ExampleVirt* ex) {
-    return ex->run_bool();
-}
+struct DispatchIssue : Base {
+    virtual std::string dispatch() const {
+        PYBIND11_OVERLOAD_PURE(std::string, Base, dispatch, /* no arguments */);
+    }
+};
+
+// Forward declaration (so that we can put the main tests here; the inherited virtual approaches are
+// rather long).
+void initialize_inherited_virtuals(py::module &m);
+
+TEST_SUBMODULE(virtual_functions, m) {
+    // test_override
+    py::class_<ExampleVirt, PyExampleVirt>(m, "ExampleVirt")
+        .def(py::init<int>())
+        /* Reference original class in function definitions */
+        .def("run", &ExampleVirt::run)
+        .def("run_bool", &ExampleVirt::run_bool)
+        .def("pure_virtual", &ExampleVirt::pure_virtual);
+
+    py::class_<NonCopyable>(m, "NonCopyable")
+        .def(py::init<int, int>());
+
+    py::class_<Movable>(m, "Movable")
+        .def(py::init<int, int>());
+
+    // test_move_support
+#if !defined(__INTEL_COMPILER)
+    py::class_<NCVirt, NCVirtTrampoline>(m, "NCVirt")
+        .def(py::init<>())
+        .def("get_noncopyable", &NCVirt::get_noncopyable)
+        .def("get_movable", &NCVirt::get_movable)
+        .def("print_nc", &NCVirt::print_nc)
+        .def("print_movable", &NCVirt::print_movable);
+#endif
+
+    m.def("runExampleVirt", [](ExampleVirt *ex, int value) { return ex->run(value); });
+    m.def("runExampleVirtBool", [](ExampleVirt* ex) { return ex->run_bool(); });
+    m.def("runExampleVirtVirtual", [](ExampleVirt *ex) { ex->pure_virtual(); });
+
+    m.def("cstats_debug", &ConstructorStats::get<ExampleVirt>);
+    initialize_inherited_virtuals(m);
 
-void runExampleVirtVirtual(ExampleVirt *ex) {
-    ex->pure_virtual();
+    // test_alias_delay_initialization1
+    // don't invoke Python dispatch classes by default when instantiating C++ classes
+    // that were not extended on the Python side
+    struct A {
+        virtual ~A() {}
+        virtual void f() { py::print("A.f()"); }
+    };
+
+    struct PyA : A {
+        PyA() { py::print("PyA.PyA()"); }
+        ~PyA() { py::print("PyA.~PyA()"); }
+
+        void f() override {
+            py::print("PyA.f()");
+            PYBIND11_OVERLOAD(void, A, f);
+        }
+    };
+
+    py::class_<A, PyA>(m, "A")
+        .def(py::init<>())
+        .def("f", &A::f);
+
+    m.def("call_f", [](A *a) { a->f(); });
+
+    // test_alias_delay_initialization2
+    // ... unless we explicitly request it, as in this example:
+    struct A2 {
+        virtual ~A2() {}
+        virtual void f() { py::print("A2.f()"); }
+    };
+
+    struct PyA2 : A2 {
+        PyA2() { py::print("PyA2.PyA2()"); }
+        ~PyA2() { py::print("PyA2.~PyA2()"); }
+        void f() override {
+            py::print("PyA2.f()");
+            PYBIND11_OVERLOAD(void, A2, f);
+        }
+    };
+
+    py::class_<A2, PyA2>(m, "A2")
+        .def(py::init_alias<>())
+        .def(py::init([](int) { return new PyA2(); }))
+        .def("f", &A2::f);
+
+    m.def("call_f", [](A2 *a2) { a2->f(); });
+
+    // test_dispatch_issue
+    // #159: virtual function dispatch has problems with similar-named functions
+    py::class_<Base, DispatchIssue>(m, "DispatchIssue")
+        .def(py::init<>())
+        .def("dispatch", &Base::dispatch);
+
+    m.def("dispatch_issue_go", [](const Base * b) { return b->dispatch(); });
+
+    // test_override_ref
+    // #392/397: overriding reference-returning functions
+    class OverrideTest {
+    public:
+        struct A { std::string value = "hi"; };
+        std::string v;
+        A a;
+        explicit OverrideTest(const std::string &v) : v{v} {}
+        virtual std::string str_value() { return v; }
+        virtual std::string &str_ref() { return v; }
+        virtual A A_value() { return a; }
+        virtual A &A_ref() { return a; }
+        virtual ~OverrideTest() = default;
+    };
+
+    class PyOverrideTest : public OverrideTest {
+    public:
+        using OverrideTest::OverrideTest;
+        std::string str_value() override { PYBIND11_OVERLOAD(std::string, OverrideTest, str_value); }
+        // Not allowed (uncommenting should hit a static_assert failure): we can't get a reference
+        // to a python numeric value, since we only copy values in the numeric type caster:
+//      std::string &str_ref() override { PYBIND11_OVERLOAD(std::string &, OverrideTest, str_ref); }
+        // But we can work around it like this:
+    private:
+        std::string _tmp;
+        std::string str_ref_helper() { PYBIND11_OVERLOAD(std::string, OverrideTest, str_ref); }
+    public:
+        std::string &str_ref() override { return _tmp = str_ref_helper(); }
+
+        A A_value() override { PYBIND11_OVERLOAD(A, OverrideTest, A_value); }
+        A &A_ref() override { PYBIND11_OVERLOAD(A &, OverrideTest, A_ref); }
+    };
+
+    py::class_<OverrideTest::A>(m, "OverrideTest_A")
+        .def_readwrite("value", &OverrideTest::A::value);
+    py::class_<OverrideTest, PyOverrideTest>(m, "OverrideTest")
+        .def(py::init<const std::string &>())
+        .def("str_value", &OverrideTest::str_value)
+//      .def("str_ref", &OverrideTest::str_ref)
+        .def("A_value", &OverrideTest::A_value)
+        .def("A_ref", &OverrideTest::A_ref);
 }
 
 
@@ -179,6 +313,7 @@ public: \
         return say_something(1) + " " + std::to_string(unlucky_number()); \
     }
 A_METHODS
+    virtual ~A_Repeat() = default;
 };
 class B_Repeat : public A_Repeat {
 #define B_METHODS \
@@ -203,7 +338,7 @@ D_METHODS
 };
 
 // Base classes for templated inheritance trampolines.  Identical to the repeat-everything version:
-class A_Tpl { A_METHODS };
+class A_Tpl { A_METHODS; virtual ~A_Tpl() = default; };
 class B_Tpl : public A_Tpl { B_METHODS };
 class C_Tpl : public B_Tpl { C_METHODS };
 class D_Tpl : public C_Tpl { D_METHODS };
@@ -281,6 +416,8 @@ public:
 
 
 void initialize_inherited_virtuals(py::module &m) {
+    // test_inherited_virtuals
+
     // Method 1: repeat
     py::class_<A_Repeat, PyA_Repeat>(m, "A_Repeat")
         .def(py::init<>())
@@ -295,6 +432,7 @@ void initialize_inherited_virtuals(py::module &m) {
     py::class_<D_Repeat, C_Repeat, PyD_Repeat>(m, "D_Repeat")
         .def(py::init<>());
 
+    // test_inherited_virtuals (continued)
     // Method 2: Templated trampolines
     py::class_<A_Tpl, PyA_Tpl<>>(m, "A_Tpl")
         .def(py::init<>())
@@ -310,38 +448,3 @@ void initialize_inherited_virtuals(py::module &m) {
         .def(py::init<>());
 
 };
-
-
-test_initializer virtual_functions([](py::module &m) {
-    /* Important: indicate the trampoline class PyExampleVirt using the third
-       argument to py::class_. The second argument with the unique pointer
-       is simply the default holder type used by pybind11. */
-    py::class_<ExampleVirt, PyExampleVirt>(m, "ExampleVirt")
-        .def(py::init<int>())
-        /* Reference original class in function definitions */
-        .def("run", &ExampleVirt::run)
-        .def("run_bool", &ExampleVirt::run_bool)
-        .def("pure_virtual", &ExampleVirt::pure_virtual);
-
-    py::class_<NonCopyable>(m, "NonCopyable")
-        .def(py::init<int, int>());
-
-    py::class_<Movable>(m, "Movable")
-        .def(py::init<int, int>());
-
-#if !defined(__INTEL_COMPILER)
-    py::class_<NCVirt, NCVirtTrampoline>(m, "NCVirt")
-        .def(py::init<>())
-        .def("get_noncopyable", &NCVirt::get_noncopyable)
-        .def("get_movable", &NCVirt::get_movable)
-        .def("print_nc", &NCVirt::print_nc)
-        .def("print_movable", &NCVirt::print_movable);
-#endif
-
-    m.def("runExampleVirt", &runExampleVirt);
-    m.def("runExampleVirtBool", &runExampleVirtBool);
-    m.def("runExampleVirtVirtual", &runExampleVirtVirtual);
-
-    m.def("cstats_debug", &ConstructorStats::get<ExampleVirt>);
-    initialize_inherited_virtuals(m);
-});
diff --git a/pybind11/tests/test_virtual_functions.py b/pybind11/tests/test_virtual_functions.py
index b11c699df..b91ebfa3e 100644
--- a/pybind11/tests/test_virtual_functions.py
+++ b/pybind11/tests/test_virtual_functions.py
@@ -1,13 +1,11 @@
 import pytest
-import pybind11_tests
+
+from pybind11_tests import virtual_functions as m
 from pybind11_tests import ConstructorStats
 
 
 def test_override(capture, msg):
-    from pybind11_tests import (ExampleVirt, runExampleVirt, runExampleVirtVirtual,
-                                runExampleVirtBool)
-
-    class ExtendedExampleVirt(ExampleVirt):
+    class ExtendedExampleVirt(m.ExampleVirt):
         def __init__(self, state):
             super(ExtendedExampleVirt, self).__init__(state + 1)
             self.data = "Hello world"
@@ -33,40 +31,40 @@ def test_override(capture, msg):
         def get_string2(self):
             return "override2"
 
-    ex12 = ExampleVirt(10)
+    ex12 = m.ExampleVirt(10)
     with capture:
-        assert runExampleVirt(ex12, 20) == 30
+        assert m.runExampleVirt(ex12, 20) == 30
     assert capture == """
         Original implementation of ExampleVirt::run(state=10, value=20, str1=default1, str2=default2)
     """  # noqa: E501 line too long
 
     with pytest.raises(RuntimeError) as excinfo:
-        runExampleVirtVirtual(ex12)
+        m.runExampleVirtVirtual(ex12)
     assert msg(excinfo.value) == 'Tried to call pure virtual function "ExampleVirt::pure_virtual"'
 
     ex12p = ExtendedExampleVirt(10)
     with capture:
-        assert runExampleVirt(ex12p, 20) == 32
+        assert m.runExampleVirt(ex12p, 20) == 32
     assert capture == """
         ExtendedExampleVirt::run(20), calling parent..
         Original implementation of ExampleVirt::run(state=11, value=21, str1=override1, str2=default2)
     """  # noqa: E501 line too long
     with capture:
-        assert runExampleVirtBool(ex12p) is False
+        assert m.runExampleVirtBool(ex12p) is False
     assert capture == "ExtendedExampleVirt::run_bool()"
     with capture:
-        runExampleVirtVirtual(ex12p)
+        m.runExampleVirtVirtual(ex12p)
     assert capture == "ExtendedExampleVirt::pure_virtual(): Hello world"
 
     ex12p2 = ExtendedExampleVirt2(15)
     with capture:
-        assert runExampleVirt(ex12p2, 50) == 68
+        assert m.runExampleVirt(ex12p2, 50) == 68
     assert capture == """
         ExtendedExampleVirt::run(50), calling parent..
         Original implementation of ExampleVirt::run(state=17, value=51, str1=override1, str2=override2)
     """  # noqa: E501 line too long
 
-    cstats = ConstructorStats.get(ExampleVirt)
+    cstats = ConstructorStats.get(m.ExampleVirt)
     assert cstats.alive() == 3
     del ex12, ex12p, ex12p2
     assert cstats.alive() == 0
@@ -75,14 +73,181 @@ def test_override(capture, msg):
     assert cstats.move_constructions >= 0
 
 
-def test_inheriting_repeat():
-    from pybind11_tests import A_Repeat, B_Repeat, C_Repeat, D_Repeat, A_Tpl, B_Tpl, C_Tpl, D_Tpl
+def test_alias_delay_initialization1(capture):
+    """`A` only initializes its trampoline class when we inherit from it
+
+    If we just create and use an A instance directly, the trampoline initialization is
+    bypassed and we only initialize an A() instead (for performance reasons).
+    """
+    class B(m.A):
+        def __init__(self):
+            super(B, self).__init__()
+
+        def f(self):
+            print("In python f()")
+
+    # C++ version
+    with capture:
+        a = m.A()
+        m.call_f(a)
+        del a
+        pytest.gc_collect()
+    assert capture == "A.f()"
+
+    # Python version
+    with capture:
+        b = B()
+        m.call_f(b)
+        del b
+        pytest.gc_collect()
+    assert capture == """
+        PyA.PyA()
+        PyA.f()
+        In python f()
+        PyA.~PyA()
+    """
+
+
+def test_alias_delay_initialization2(capture):
+    """`A2`, unlike the above, is configured to always initialize the alias
+
+    While the extra initialization and extra class layer have a small virtual dispatch
+    performance penalty, it also allows us to do more things with the trampoline
+    class such as defining local variables and performing construction/destruction.
+    """
+    class B2(m.A2):
+        def __init__(self):
+            super(B2, self).__init__()
+
+        def f(self):
+            print("In python B2.f()")
+
+    # No python subclass version
+    with capture:
+        a2 = m.A2()
+        m.call_f(a2)
+        del a2
+        pytest.gc_collect()
+        a3 = m.A2(1)
+        m.call_f(a3)
+        del a3
+        pytest.gc_collect()
+    assert capture == """
+        PyA2.PyA2()
+        PyA2.f()
+        A2.f()
+        PyA2.~PyA2()
+        PyA2.PyA2()
+        PyA2.f()
+        A2.f()
+        PyA2.~PyA2()
+    """
+
+    # Python subclass version
+    with capture:
+        b2 = B2()
+        m.call_f(b2)
+        del b2
+        pytest.gc_collect()
+    assert capture == """
+        PyA2.PyA2()
+        PyA2.f()
+        In python B2.f()
+        PyA2.~PyA2()
+    """
+
+
+# PyPy: Reference count > 1 causes call with noncopyable instance
+# to fail in ncv1.print_nc()
+@pytest.unsupported_on_pypy
+@pytest.mark.skipif(not hasattr(m, "NCVirt"), reason="NCVirt test broken on ICPC")
+def test_move_support():
+    class NCVirtExt(m.NCVirt):
+        def get_noncopyable(self, a, b):
+            # Constructs and returns a new instance:
+            nc = m.NonCopyable(a * a, b * b)
+            return nc
+
+        def get_movable(self, a, b):
+            # Return a referenced copy
+            self.movable = m.Movable(a, b)
+            return self.movable
+
+    class NCVirtExt2(m.NCVirt):
+        def get_noncopyable(self, a, b):
+            # Keep a reference: this is going to throw an exception
+            self.nc = m.NonCopyable(a, b)
+            return self.nc
+
+        def get_movable(self, a, b):
+            # Return a new instance without storing it
+            return m.Movable(a, b)
+
+    ncv1 = NCVirtExt()
+    assert ncv1.print_nc(2, 3) == "36"
+    assert ncv1.print_movable(4, 5) == "9"
+    ncv2 = NCVirtExt2()
+    assert ncv2.print_movable(7, 7) == "14"
+    # Don't check the exception message here because it differs under debug/non-debug mode
+    with pytest.raises(RuntimeError):
+        ncv2.print_nc(9, 9)
 
-    class AR(A_Repeat):
+    nc_stats = ConstructorStats.get(m.NonCopyable)
+    mv_stats = ConstructorStats.get(m.Movable)
+    assert nc_stats.alive() == 1
+    assert mv_stats.alive() == 1
+    del ncv1, ncv2
+    assert nc_stats.alive() == 0
+    assert mv_stats.alive() == 0
+    assert nc_stats.values() == ['4', '9', '9', '9']
+    assert mv_stats.values() == ['4', '5', '7', '7']
+    assert nc_stats.copy_constructions == 0
+    assert mv_stats.copy_constructions == 1
+    assert nc_stats.move_constructions >= 0
+    assert mv_stats.move_constructions >= 0
+
+
+def test_dispatch_issue(msg):
+    """#159: virtual function dispatch has problems with similar-named functions"""
+    class PyClass1(m.DispatchIssue):
+        def dispatch(self):
+            return "Yay.."
+
+    class PyClass2(m.DispatchIssue):
+        def dispatch(self):
+            with pytest.raises(RuntimeError) as excinfo:
+                super(PyClass2, self).dispatch()
+            assert msg(excinfo.value) == 'Tried to call pure virtual function "Base::dispatch"'
+
+            p = PyClass1()
+            return m.dispatch_issue_go(p)
+
+    b = PyClass2()
+    assert m.dispatch_issue_go(b) == "Yay.."
+
+
+def test_override_ref():
+    """#392/397: overriding reference-returning functions"""
+    o = m.OverrideTest("asdf")
+
+    # Not allowed (see associated .cpp comment)
+    # i = o.str_ref()
+    # assert o.str_ref() == "asdf"
+    assert o.str_value() == "asdf"
+
+    assert o.A_value().value == "hi"
+    a = o.A_ref()
+    assert a.value == "hi"
+    a.value = "bye"
+    assert a.value == "bye"
+
+
+def test_inherited_virtuals():
+    class AR(m.A_Repeat):
         def unlucky_number(self):
             return 99
 
-    class AT(A_Tpl):
+    class AT(m.A_Tpl):
         def unlucky_number(self):
             return 999
 
@@ -96,21 +261,21 @@ def test_inheriting_repeat():
     assert obj.unlucky_number() == 999
     assert obj.say_everything() == "hi 999"
 
-    for obj in [B_Repeat(), B_Tpl()]:
+    for obj in [m.B_Repeat(), m.B_Tpl()]:
         assert obj.say_something(3) == "B says hi 3 times"
         assert obj.unlucky_number() == 13
         assert obj.lucky_number() == 7.0
         assert obj.say_everything() == "B says hi 1 times 13"
 
-    for obj in [C_Repeat(), C_Tpl()]:
+    for obj in [m.C_Repeat(), m.C_Tpl()]:
         assert obj.say_something(3) == "B says hi 3 times"
         assert obj.unlucky_number() == 4444
         assert obj.lucky_number() == 888.0
         assert obj.say_everything() == "B says hi 1 times 4444"
 
-    class CR(C_Repeat):
+    class CR(m.C_Repeat):
         def lucky_number(self):
-            return C_Repeat.lucky_number(self) + 1.25
+            return m.C_Repeat.lucky_number(self) + 1.25
 
     obj = CR()
     assert obj.say_something(3) == "B says hi 3 times"
@@ -118,7 +283,7 @@ def test_inheriting_repeat():
     assert obj.lucky_number() == 889.25
     assert obj.say_everything() == "B says hi 1 times 4444"
 
-    class CT(C_Tpl):
+    class CT(m.C_Tpl):
         pass
 
     obj = CT()
@@ -147,14 +312,14 @@ def test_inheriting_repeat():
     assert obj.lucky_number() == 888000.0
     assert obj.say_everything() == "B says hi 1 times 4444"
 
-    class DR(D_Repeat):
+    class DR(m.D_Repeat):
         def unlucky_number(self):
             return 123
 
         def lucky_number(self):
             return 42.0
 
-    for obj in [D_Repeat(), D_Tpl()]:
+    for obj in [m.D_Repeat(), m.D_Tpl()]:
         assert obj.say_something(3) == "B says hi 3 times"
         assert obj.unlucky_number() == 4444
         assert obj.lucky_number() == 888.0
@@ -166,7 +331,7 @@ def test_inheriting_repeat():
     assert obj.lucky_number() == 42.0
     assert obj.say_everything() == "B says hi 1 times 123"
 
-    class DT(D_Tpl):
+    class DT(m.D_Tpl):
         def say_something(self, times):
             return "DT says:" + (' quack' * times)
 
@@ -189,7 +354,7 @@ def test_inheriting_repeat():
         def unlucky_number(self):
             return -3
 
-    class BT(B_Tpl):
+    class BT(m.B_Tpl):
         def say_something(self, times):
             return "BT" * times
 
@@ -204,56 +369,3 @@ def test_inheriting_repeat():
     assert obj.unlucky_number() == -7
     assert obj.lucky_number() == -1.375
     assert obj.say_everything() == "BT -7"
-
-
-# PyPy: Reference count > 1 causes call with noncopyable instance
-# to fail in ncv1.print_nc()
-@pytest.unsupported_on_pypy
-@pytest.mark.skipif(not hasattr(pybind11_tests, 'NCVirt'),
-                    reason="NCVirt test broken on ICPC")
-def test_move_support():
-    from pybind11_tests import NCVirt, NonCopyable, Movable
-
-    class NCVirtExt(NCVirt):
-        def get_noncopyable(self, a, b):
-            # Constructs and returns a new instance:
-            nc = NonCopyable(a * a, b * b)
-            return nc
-
-        def get_movable(self, a, b):
-            # Return a referenced copy
-            self.movable = Movable(a, b)
-            return self.movable
-
-    class NCVirtExt2(NCVirt):
-        def get_noncopyable(self, a, b):
-            # Keep a reference: this is going to throw an exception
-            self.nc = NonCopyable(a, b)
-            return self.nc
-
-        def get_movable(self, a, b):
-            # Return a new instance without storing it
-            return Movable(a, b)
-
-    ncv1 = NCVirtExt()
-    assert ncv1.print_nc(2, 3) == "36"
-    assert ncv1.print_movable(4, 5) == "9"
-    ncv2 = NCVirtExt2()
-    assert ncv2.print_movable(7, 7) == "14"
-    # Don't check the exception message here because it differs under debug/non-debug mode
-    with pytest.raises(RuntimeError):
-        ncv2.print_nc(9, 9)
-
-    nc_stats = ConstructorStats.get(NonCopyable)
-    mv_stats = ConstructorStats.get(Movable)
-    assert nc_stats.alive() == 1
-    assert mv_stats.alive() == 1
-    del ncv1, ncv2
-    assert nc_stats.alive() == 0
-    assert mv_stats.alive() == 0
-    assert nc_stats.values() == ['4', '9', '9', '9']
-    assert mv_stats.values() == ['4', '5', '7', '7']
-    assert nc_stats.copy_constructions == 0
-    assert mv_stats.copy_constructions == 1
-    assert nc_stats.move_constructions >= 0
-    assert mv_stats.move_constructions >= 0
diff --git a/pybind11/tools/FindCatch.cmake b/pybind11/tools/FindCatch.cmake
new file mode 100644
index 000000000..9d490c5aa
--- /dev/null
+++ b/pybind11/tools/FindCatch.cmake
@@ -0,0 +1,57 @@
+# - Find the Catch test framework or download it (single header)
+#
+# This is a quick module for internal use. It assumes that Catch is
+# not REQUIRED and that a minimum version is provided (not EXACT). If
+# no suitable version is found locally and DOWNLOAD_CATCH is enabled, the
+# single header file is downloaded to ${PROJECT_BINARY_DIR}/catch.
+#
+# This code sets the following variables:
+#  CATCH_INCLUDE_DIR      - path to catch.hpp
+#  CATCH_VERSION          - version number
+
+if(NOT Catch_FIND_VERSION)
+  message(FATAL_ERROR "A version number must be specified.")
+elseif(Catch_FIND_REQUIRED)
+  message(FATAL_ERROR "This module assumes Catch is not required.")
+elseif(Catch_FIND_VERSION_EXACT)
+  message(FATAL_ERROR "Exact version numbers are not supported, only minimum.")
+endif()
+
+# Extract the version number from catch.hpp
+function(_get_catch_version)
+  file(STRINGS "${CATCH_INCLUDE_DIR}/catch.hpp" version_line REGEX "Catch v.*" LIMIT_COUNT 1)
+  if(version_line MATCHES "Catch v([0-9]+)\\.([0-9]+)\\.([0-9]+)")
+    set(CATCH_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}" PARENT_SCOPE)
+  endif()
+endfunction()
+
+# Download the single-header version of Catch
+function(_download_catch version destination_dir)
+  message(STATUS "Downloading catch v${version}...")
+  set(url https://github.com/philsquared/Catch/releases/download/v${version}/catch.hpp)
+  file(DOWNLOAD ${url} "${destination_dir}/catch.hpp" STATUS status)
+  list(GET status 0 error)
+  if(error)
+    message(FATAL_ERROR "Could not download ${url}")
+  endif()
+  set(CATCH_INCLUDE_DIR "${destination_dir}" CACHE INTERNAL "")
+endfunction()
+
+# Look for catch locally
+find_path(CATCH_INCLUDE_DIR NAMES catch.hpp PATH_SUFFIXES catch)
+if(CATCH_INCLUDE_DIR)
+  _get_catch_version()
+endif()
+
+# Download the header if it wasn't found or if it's outdated
+if(NOT CATCH_VERSION OR CATCH_VERSION VERSION_LESS ${Catch_FIND_VERSION})
+  if(DOWNLOAD_CATCH)
+    _download_catch(${Catch_FIND_VERSION} "${PROJECT_BINARY_DIR}/catch/")
+    _get_catch_version()
+  else()
+    set(CATCH_FOUND FALSE)
+    return()
+  endif()
+endif()
+
+set(CATCH_FOUND TRUE)
diff --git a/pybind11/tools/FindPythonLibsNew.cmake b/pybind11/tools/FindPythonLibsNew.cmake
index 278d2ad19..ad3ed48fa 100644
--- a/pybind11/tools/FindPythonLibsNew.cmake
+++ b/pybind11/tools/FindPythonLibsNew.cmake
@@ -50,7 +50,8 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #=============================================================================
 
-if(PYTHONLIBS_FOUND)
+# Checking for the extension makes sure that `LibsNew` was found and not just `Libs`.
+if(PYTHONLIBS_FOUND AND PYTHON_MODULE_EXTENSION)
     return()
 endif()
 
@@ -187,11 +188,6 @@ SET(PYTHON_INCLUDE_DIRS "${PYTHON_INCLUDE_DIR}")
 SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}")
 SET(PYTHON_DEBUG_LIBRARIES "${PYTHON_DEBUG_LIBRARY}")
 
-dune_register_package_flags(
-    INCLUDE_DIRS ${PYTHON_INCLUDE_DIRS}
-    LIBRARIES ${PYTHON_LIBRARIES}
-)
-
 find_package_message(PYTHON
     "Found PythonLibs: ${PYTHON_LIBRARY}"
     "${PYTHON_EXECUTABLE}${PYTHON_VERSION}")
diff --git a/pybind11/tools/check-style.sh b/pybind11/tools/check-style.sh
index b87cb16e6..a9eeb170b 100755
--- a/pybind11/tools/check-style.sh
+++ b/pybind11/tools/check-style.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
-# 
+#
 # Script to check include/test code for common pybind11 code style errors.
-# 
+#
 # This script currently checks for
 #
 # 1. use of tabs instead of spaces
@@ -11,73 +11,60 @@
 # 5. Missing space between right parenthesis and brace, e.g. 'for (...){'
 # 6. opening brace on its own line. It should always be on the same line as the
 #    if/while/for/do statment.
-# 
+#
 # Invoke as: tools/check-style.sh
 #
 
-errors=0
+check_style_errors=0
 IFS=$'\n'
-found=
-# The mt=41 sets a red background for matched tabs:
-exec 3< <(GREP_COLORS='mt=41' grep $'\t' include/ tests/*.{cpp,py,h} docs/*.rst -rn --color=always)
-while read -u 3 f; do
-    if [ -z "$found" ]; then
-        echo -e '\e[31m\e[01mError: found tabs instead of spaces in the following files:\e[0m'
-        found=1
-        errors=1
-    fi
-
-    echo "    $f"
-done
-
-found=
-# The mt=41 sets a red background for matched MS-DOS CRLF line endings
-exec 3< <(GREP_COLORS='mt=41' grep -IUlr $'\r' include/ tests/*.{cpp,py,h} docs/*.rst --color=always)
-while read -u 3 f; do
-    if [ -z "$found" ]; then
-        echo -e '\e[31m\e[01mError: found CRLF characters in the following files:\e[0m'
-        found=1
-        errors=1
-    fi
-
-    echo "    $f"
-done
 
-found=
-# The mt=41 sets a red background for matched trailing spaces
-exec 3< <(GREP_COLORS='mt=41' grep '\s\+$' include/ tests/*.{cpp,py,h} docs/*.rst -rn --color=always)
-while read -u 3 f; do
-    if [ -z "$found" ]; then
-        echo -e '\e[31m\e[01mError: found trailing spaces in the following files:\e[0m'
-        found=1
-        errors=1
-    fi
+found="$( GREP_COLORS='mt=41' GREP_COLOR='41' grep $'\t' include tests/*.{cpp,py,h} docs/*.rst -rn --color=always )"
+if [ -n "$found" ]; then
+    # The mt=41 sets a red background for matched tabs:
+    echo -e '\033[31;01mError: found tab characters in the following files:\033[0m'
+    check_style_errors=1
+    echo "$found" | sed -e 's/^/    /'
+fi
 
-    echo "    $f"
-done
 
-found=
-exec 3< <(grep '\<\(if\|for\|while\|catch\)(\|){' include/ tests/*.{cpp,py,h} -rn --color=always)
-while read -u 3 line; do
-    if [ -z "$found" ]; then
-        echo -e '\e[31m\e[01mError: found the following coding style problems:\e[0m'
-        found=1
-        errors=1
-    fi
+found="$( grep -IUlr $'\r' include tests/*.{cpp,py,h} docs/*.rst --color=always )"
+if [ -n "$found" ]; then
+    echo -e '\033[31;01mError: found CRLF characters in the following files:\033[0m'
+    check_style_errors=1
+    echo "$found" | sed -e 's/^/    /'
+fi
 
-    echo "    $line"
-done
+found="$(GREP_COLORS='mt=41' GREP_COLOR='41' grep '[[:blank:]]\+$' include tests/*.{cpp,py,h} docs/*.rst -rn --color=always )"
+if [ -n "$found" ]; then
+    # The mt=41 sets a red background for matched trailing spaces
+    echo -e '\033[31;01mError: found trailing spaces in the following files:\033[0m'
+    check_style_errors=1
+    echo "$found" | sed -e 's/^/    /'
+fi
 
-found=
-exec 3< <(GREP_COLORS='mt=41' grep '^\s*{\s*$' include/ docs/*.rst -rn --color=always)
-while read -u 3 f; do
-    if [ -z "$found" ]; then
-        echo -e '\e[31m\e[01mError: braces should occur on the same line as the if/while/.. statement. Found issues in the following files: \e[0m'
-        found=1
-        errors=1
-    fi
+found="$(grep '\<\(if\|for\|while\|catch\)(\|){' include tests/*.{cpp,h} -rn --color=always)"
+if [ -n "$found" ]; then
+    echo -e '\033[31;01mError: found the following coding style problems:\033[0m'
+    check_style_errors=1
+    echo "$found" | sed -e 's/^/    /'
+fi
 
-    echo "    $f"
-done
+found="$(awk '
+function prefix(filename, lineno) {
+    return "    \033[35m" filename "\033[36m:\033[32m" lineno "\033[36m:\033[0m"
+}
+function mark(pattern, string) { sub(pattern, "\033[01;31m&\033[0m", string); return string }
+last && /^[[:space:]]*{/ {  # opening brace on the line after a remembered control statement
+    print prefix(FILENAME, FNR-1) mark("\\)[[:space:]]*$", last)
+    print prefix(FILENAME, FNR)   mark("^[[:space:]]*{", $0)
+    last=""
+}
+{ last = /(if|for|while|catch|switch)[[:space:]]*\(.*\)[[:space:]]*$/ ? $0 : "" }  # POSIX class, not GNU-only \s
+' $(find include -type f) tests/*.{cpp,h} docs/*.rst)"
+if [ -n "$found" ]; then
+    check_style_errors=1
+    echo -e '\033[31;01mError: braces should occur on the same line as the if/while/.. statement. Found issues in the following files:\033[0m'
+    echo "$found"
+fi
 
-exit $errors
+exit $check_style_errors
diff --git a/pybind11/tools/clang/cindex.py b/pybind11/tools/clang/cindex.py
index d9c5da937..3a083de0d 100644
--- a/pybind11/tools/clang/cindex.py
+++ b/pybind11/tools/clang/cindex.py
@@ -1084,6 +1084,126 @@ CursorKind.NULL_STMT = CursorKind(230)
 # Adaptor class for mixing declarations with statements and expressions.
 CursorKind.DECL_STMT = CursorKind(231)
 
+# OpenMP parallel directive.
+CursorKind.OMP_PARALLEL_DIRECTIVE = CursorKind(232)
+
+# OpenMP SIMD directive.
+CursorKind.OMP_SIMD_DIRECTIVE = CursorKind(233)
+
+# OpenMP for directive.
+CursorKind.OMP_FOR_DIRECTIVE = CursorKind(234)
+
+# OpenMP sections directive.
+CursorKind.OMP_SECTIONS_DIRECTIVE = CursorKind(235)
+
+# OpenMP section directive.
+CursorKind.OMP_SECTION_DIRECTIVE = CursorKind(236)
+
+# OpenMP single directive.
+CursorKind.OMP_SINGLE_DIRECTIVE = CursorKind(237)
+
+# OpenMP parallel for directive.
+CursorKind.OMP_PARALLEL_FOR_DIRECTIVE = CursorKind(238)
+
+# OpenMP parallel sections directive.
+CursorKind.OMP_PARALLEL_SECTIONS_DIRECTIVE = CursorKind(239)
+
+# OpenMP task directive.
+CursorKind.OMP_TASK_DIRECTIVE = CursorKind(240)
+
+# OpenMP master directive.
+CursorKind.OMP_MASTER_DIRECTIVE = CursorKind(241)
+
+# OpenMP critical directive.
+CursorKind.OMP_CRITICAL_DIRECTIVE = CursorKind(242)
+
+# OpenMP taskyield directive.
+CursorKind.OMP_TASKYIELD_DIRECTIVE = CursorKind(243)
+
+# OpenMP barrier directive.
+CursorKind.OMP_BARRIER_DIRECTIVE = CursorKind(244)
+
+# OpenMP taskwait directive.
+CursorKind.OMP_TASKWAIT_DIRECTIVE = CursorKind(245)
+
+# OpenMP flush directive.
+CursorKind.OMP_FLUSH_DIRECTIVE = CursorKind(246)
+
+# Windows Structured Exception Handling's leave statement.
+CursorKind.SEH_LEAVE_STMT = CursorKind(247)
+
+# OpenMP ordered directive.
+CursorKind.OMP_ORDERED_DIRECTIVE = CursorKind(248)
+
+# OpenMP atomic directive.
+CursorKind.OMP_ATOMIC_DIRECTIVE = CursorKind(249)
+
+# OpenMP for SIMD directive.
+CursorKind.OMP_FOR_SIMD_DIRECTIVE = CursorKind(250)
+
+# OpenMP parallel for SIMD directive.
+CursorKind.OMP_PARALLELFORSIMD_DIRECTIVE = CursorKind(251)
+
+# OpenMP target directive.
+CursorKind.OMP_TARGET_DIRECTIVE = CursorKind(252)
+
+# OpenMP teams directive.
+CursorKind.OMP_TEAMS_DIRECTIVE = CursorKind(253)
+
+# OpenMP taskgroup directive.
+CursorKind.OMP_TASKGROUP_DIRECTIVE = CursorKind(254)
+
+# OpenMP cancellation point directive.
+CursorKind.OMP_CANCELLATION_POINT_DIRECTIVE = CursorKind(255)
+
+# OpenMP cancel directive.
+CursorKind.OMP_CANCEL_DIRECTIVE = CursorKind(256)
+
+# OpenMP target data directive.
+CursorKind.OMP_TARGET_DATA_DIRECTIVE = CursorKind(257)
+
+# OpenMP taskloop directive.
+CursorKind.OMP_TASK_LOOP_DIRECTIVE = CursorKind(258)
+
+# OpenMP taskloop simd directive.
+CursorKind.OMP_TASK_LOOP_SIMD_DIRECTIVE = CursorKind(259)
+
+# OpenMP distribute directive.
+CursorKind.OMP_DISTRIBUTE_DIRECTIVE = CursorKind(260)
+
+# OpenMP target enter data directive.
+CursorKind.OMP_TARGET_ENTER_DATA_DIRECTIVE = CursorKind(261)
+
+# OpenMP target exit data directive.
+CursorKind.OMP_TARGET_EXIT_DATA_DIRECTIVE = CursorKind(262)
+
+# OpenMP target parallel directive.
+CursorKind.OMP_TARGET_PARALLEL_DIRECTIVE = CursorKind(263)
+
+# OpenMP target parallel for directive.
+CursorKind.OMP_TARGET_PARALLELFOR_DIRECTIVE = CursorKind(264)
+
+# OpenMP target update directive.
+CursorKind.OMP_TARGET_UPDATE_DIRECTIVE = CursorKind(265)
+
+# OpenMP distribute parallel for directive.
+CursorKind.OMP_DISTRIBUTE_PARALLELFOR_DIRECTIVE = CursorKind(266)
+
+# OpenMP distribute parallel for simd directive.
+CursorKind.OMP_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE = CursorKind(267)
+
+# OpenMP distribute simd directive.
+CursorKind.OMP_DISTRIBUTE_SIMD_DIRECTIVE = CursorKind(268)
+
+# OpenMP target parallel for simd directive.
+CursorKind.OMP_TARGET_PARALLEL_FOR_SIMD_DIRECTIVE = CursorKind(269)
+
+# OpenMP target simd directive.
+CursorKind.OMP_TARGET_SIMD_DIRECTIVE = CursorKind(270)
+
+# OpenMP teams distribute directive.
+CursorKind.OMP_TEAMS_DISTRIBUTE_DIRECTIVE = CursorKind(271)
+
 ###
 # Other Kinds
 
@@ -1136,6 +1256,10 @@ CursorKind.INCLUSION_DIRECTIVE = CursorKind(503)
 CursorKind.MODULE_IMPORT_DECL = CursorKind(600)
 # A type alias template declaration
 CursorKind.TYPE_ALIAS_TEMPLATE_DECL = CursorKind(601)
+# A static_assert or _Static_assert node
+CursorKind.STATIC_ASSERT = CursorKind(602)
+# A friend declaration
+CursorKind.FRIEND_DECL = CursorKind(603)
 
 # A code completion overload candidate.
 CursorKind.OVERLOAD_CANDIDATE = CursorKind(700)
@@ -1732,6 +1856,7 @@ TypeKind.OBJCID = TypeKind(27)
 TypeKind.OBJCCLASS = TypeKind(28)
 TypeKind.OBJCSEL = TypeKind(29)
 TypeKind.FLOAT128 = TypeKind(30)
+TypeKind.HALF = TypeKind(31)
 TypeKind.COMPLEX = TypeKind(100)
 TypeKind.POINTER = TypeKind(101)
 TypeKind.BLOCKPOINTER = TypeKind(102)
@@ -3016,6 +3141,10 @@ functionList = [
    [Cursor],
    bool),
 
+  ("clang_defaultDiagnosticDisplayOptions",
+   [],
+   c_uint),
+
   ("clang_defaultSaveOptions",
    [TranslationUnit],
    c_uint),
@@ -3057,6 +3186,10 @@ functionList = [
    [Type, Type],
    bool),
 
+  ("clang_formatDiagnostic",
+   [Diagnostic, c_uint],
+   _CXString),
+
   ("clang_getArgType",
    [Type, c_uint],
    Type,
diff --git a/pybind11/tools/mkdoc.py b/pybind11/tools/mkdoc.py
index 400fb05da..1fd8cceed 100644
--- a/pybind11/tools/mkdoc.py
+++ b/pybind11/tools/mkdoc.py
@@ -56,26 +56,19 @@ CPP_OPERATORS = OrderedDict(
 job_count = cpu_count()
 job_semaphore = Semaphore(job_count)
 
-registered_names = dict()
-
+output = []
 
 def d(s):
     return s.decode('utf8')
 
 
 def sanitize_name(name):
-    global registered_names
     name = re.sub(r'type-parameter-0-([0-9]+)', r'T\1', name)
     for k, v in CPP_OPERATORS.items():
         name = name.replace('operator%s' % k, 'operator_%s' % v)
     name = re.sub('<.*>', '', name)
     name = ''.join([ch if ch.isalnum() else '_' for ch in name])
     name = re.sub('_$', '', re.sub('_+', '_', name))
-    if name in registered_names:
-        registered_names[name] += 1
-        name += '_' + str(registered_names[name])
-    else:
-        registered_names[name] = 1
     return '__doc_' + name
 
 
@@ -189,8 +182,7 @@ def process_comment(comment):
     return result.rstrip().lstrip('\n')
 
 
-def extract(filename, node, prefix, output):
-    num_extracted = 0
+def extract(filename, node, prefix):
     if not (node.location.file is None or
             os.path.samefile(d(node.location.file.name), filename)):
         return 0
@@ -201,9 +193,7 @@ def extract(filename, node, prefix, output):
                 sub_prefix += '_'
             sub_prefix += d(node.spelling)
         for i in node.get_children():
-            num_extracted += extract(filename, i, sub_prefix, output)
-        if num_extracted == 0:
-            return 0
+            extract(filename, i, sub_prefix)
     if node.kind in PRINT_LIST:
         comment = d(node.raw_comment) if node.raw_comment is not None else ''
         comment = process_comment(comment)
@@ -212,18 +202,15 @@ def extract(filename, node, prefix, output):
             sub_prefix += '_'
         if len(node.spelling) > 0:
             name = sanitize_name(sub_prefix + d(node.spelling))
-            output.append('\nstatic const char *%s =%sR"doc(%s)doc";' %
-                (name, '\n' if '\n' in comment else ' ', comment))
-            num_extracted += 1
-    return num_extracted
+            global output
+            output.append((name, filename, comment))
 
 
 class ExtractionThread(Thread):
-    def __init__(self, filename, parameters, output):
+    def __init__(self, filename, parameters):
         Thread.__init__(self)
         self.filename = filename
         self.parameters = parameters
-        self.output = output
         job_semaphore.acquire()
 
     def run(self):
@@ -232,7 +219,7 @@ class ExtractionThread(Thread):
             index = cindex.Index(
                 cindex.conf.lib.clang_createIndex(False, True))
             tu = index.parse(self.filename, self.parameters)
-            extract(self.filename, tu.cursor, '', self.output)
+            extract(self.filename, tu.cursor, '')
         finally:
             job_semaphore.release()
 
@@ -289,18 +276,26 @@ if __name__ == '__main__':
 #endif
 ''')
 
-    output = []
+    output.clear()
     for filename in filenames:
-        thr = ExtractionThread(filename, parameters, output)
+        thr = ExtractionThread(filename, parameters)
         thr.start()
 
     print('Waiting for jobs to finish ..', file=sys.stderr)
     for i in range(job_count):
         job_semaphore.acquire()
 
-    output.sort()
-    for l in output:
-        print(l)
+    name_ctr = 1
+    name_prev = None
+    for name, _, comment in list(sorted(output, key=lambda x: (x[0], x[1]))):
+        if name == name_prev:
+            name_ctr += 1
+            name = name + "_%i" % name_ctr
+        else:
+            name_prev = name
+            name_ctr = 1
+        print('\nstatic const char *%s =%sR"doc(%s)doc";' %
+              (name, '\n' if '\n' in comment else ' ', comment))
 
     print('''
 #if defined(__GNUG__)
diff --git a/pybind11/tools/pybind11Config.cmake.in b/pybind11/tools/pybind11Config.cmake.in
index a4206c166..3dd1b2c1a 100644
--- a/pybind11/tools/pybind11Config.cmake.in
+++ b/pybind11/tools/pybind11Config.cmake.in
@@ -21,18 +21,27 @@
 # Exported targets::
 #
 # If pybind11 is found, this module defines the following :prop_tgt:`IMPORTED`
-# target. Python headers, libraries (as needed by platform), and C++ standard
+# interface library targets::
+#
+#   pybind11::module - for extension modules
+#   pybind11::embed - for embedding the Python interpreter
+#
+# Python headers, libraries (as needed by platform), and the C++ standard
 # are attached to the target. Set PythonLibsNew variables to influence
 # python detection and PYBIND11_CPP_STANDARD (-std=c++11 or -std=c++14) to
 # influence standard setting. ::
 #
-#   pybind11::module - the main pybind11 interface library for extension modules (i.e., headers)
-#
 #   find_package(pybind11 CONFIG REQUIRED)
-#   message(STATUS "Found pybind11: ${pybind11_INCLUDE_DIR} (found version ${pybind11_VERSION} & Py${PYTHON_VERSION_STRING})")
+#   message(STATUS "Found pybind11 v${pybind11_VERSION}: ${pybind11_INCLUDE_DIRS}")
+#
+#   # Create an extension module
 #   add_library(mylib MODULE main.cpp)
 #   target_link_libraries(mylib pybind11::module)
 #
+#   # Or embed the Python interpreter into an executable
+#   add_executable(myexe main.cpp)
+#   target_link_libraries(myexe pybind11::embed)
+#
 # Suggested usage::
 #
 # find_package with version info is not recommended except for release versions. ::
@@ -75,17 +84,16 @@ if(NOT TARGET ${PN}::pybind11)
     include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake")
 
     find_package(PythonLibsNew ${PYBIND11_PYTHON_VERSION} MODULE REQUIRED)
-    set_property(TARGET ${PN}::module APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${PYTHON_INCLUDE_DIRS})
+    set_property(TARGET ${PN}::pybind11 APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${PYTHON_INCLUDE_DIRS})
+    set_property(TARGET ${PN}::embed APPEND PROPERTY INTERFACE_LINK_LIBRARIES ${PYTHON_LIBRARIES})
     if(WIN32 OR CYGWIN)
       set_property(TARGET ${PN}::module APPEND PROPERTY INTERFACE_LINK_LIBRARIES ${PYTHON_LIBRARIES})
     endif()
 
-    select_cxx_standard()
-    set_property(TARGET ${PN}::module APPEND PROPERTY INTERFACE_COMPILE_OPTIONS "${PYBIND11_CPP_STANDARD}")
+    set_property(TARGET ${PN}::pybind11 APPEND PROPERTY INTERFACE_COMPILE_OPTIONS "${PYBIND11_CPP_STANDARD}")
 
-    get_property(_iid TARGET ${PN}::module PROPERTY INTERFACE_INCLUDE_DIRECTORIES)
+    get_property(_iid TARGET ${PN}::pybind11 PROPERTY INTERFACE_INCLUDE_DIRECTORIES)
     get_property(_ill TARGET ${PN}::module PROPERTY INTERFACE_LINK_LIBRARIES)
-    get_property(_ico TARGET ${PN}::module PROPERTY INTERFACE_COMPILE_OPTIONS)
     set(${PN}_INCLUDE_DIRS ${_iid})
     set(${PN}_LIBRARIES ${_ico} ${_ill})
 endif()
diff --git a/pybind11/tools/pybind11Tools.cmake b/pybind11/tools/pybind11Tools.cmake
index fba76adee..a7c471a07 100644
--- a/pybind11/tools/pybind11Tools.cmake
+++ b/pybind11/tools/pybind11Tools.cmake
@@ -8,7 +8,9 @@
 cmake_minimum_required(VERSION 2.8.12)
 
 # Add a CMake parameter for choosing a desired Python version
-set(PYBIND11_PYTHON_VERSION "" CACHE STRING "Python version to use for compiling modules")
+if(NOT PYBIND11_PYTHON_VERSION)
+  set(PYBIND11_PYTHON_VERSION "" CACHE STRING "Python version to use for compiling modules")
+endif()
 
 set(Python_ADDITIONAL_VERSIONS 3.7 3.6 3.5 3.4)
 find_package(PythonLibsNew ${PYBIND11_PYTHON_VERSION} REQUIRED)
@@ -16,44 +18,95 @@ find_package(PythonLibsNew ${PYBIND11_PYTHON_VERSION} REQUIRED)
 include(CheckCXXCompilerFlag)
 include(CMakeParseArguments)
 
-function(select_cxx_standard)
-  if(NOT MSVC AND NOT PYBIND11_CPP_STANDARD)
+if(NOT PYBIND11_CPP_STANDARD AND NOT CMAKE_CXX_STANDARD)
+  if(NOT MSVC)
     check_cxx_compiler_flag("-std=c++14" HAS_CPP14_FLAG)
-    check_cxx_compiler_flag("-std=c++11" HAS_CPP11_FLAG)
 
     if (HAS_CPP14_FLAG)
       set(PYBIND11_CPP_STANDARD -std=c++14)
-    elseif (HAS_CPP11_FLAG)
-      set(PYBIND11_CPP_STANDARD -std=c++11)
     else()
-      message(FATAL_ERROR "Unsupported compiler -- pybind11 requires C++11 support!")
+      check_cxx_compiler_flag("-std=c++11" HAS_CPP11_FLAG)
+      if (HAS_CPP11_FLAG)
+        set(PYBIND11_CPP_STANDARD -std=c++11)
+      else()
+        message(FATAL_ERROR "Unsupported compiler -- pybind11 requires C++11 support!")
+      endif()
     endif()
+  elseif(MSVC)
+    set(PYBIND11_CPP_STANDARD /std:c++14)
+  endif()
 
-    set(PYBIND11_CPP_STANDARD ${PYBIND11_CPP_STANDARD} CACHE STRING
-        "C++ standard flag, e.g. -std=c++11 or -std=c++14. Defaults to latest available." FORCE)
+  set(PYBIND11_CPP_STANDARD ${PYBIND11_CPP_STANDARD} CACHE STRING
+      "C++ standard flag, e.g. -std=c++11, -std=c++14, /std:c++14.  Defaults to C++14 mode." FORCE)
+endif()
+
+# Checks whether the given CXX/linker flags can compile and link a cxx file.  cxxflags and
+# linkerflags are lists of flags to use.  The result variable is a unique variable name for each set
+# of flags: the compilation result will be cached based on the result variable.  If the flags work,
+# sets them in cxxflags_out/linkerflags_out internal cache variables (in addition to ${result}).
+function(_pybind11_return_if_cxx_and_linker_flags_work result cxxflags linkerflags cxxflags_out linkerflags_out)
+  set(CMAKE_REQUIRED_LIBRARIES ${linkerflags})
+  check_cxx_compiler_flag("${cxxflags}" ${result})
+  if (${result})
+    set(${cxxflags_out} "${cxxflags}" CACHE INTERNAL "" FORCE)
+    set(${linkerflags_out} "${linkerflags}" CACHE INTERNAL "" FORCE)
   endif()
 endfunction()
 
-# Internal: find the appropriate LTO flag for this compiler
-macro(_pybind11_find_lto_flag output_var prefer_thin_lto)
-  if(${prefer_thin_lto})
-    # Check for ThinLTO support (Clang)
-    check_cxx_compiler_flag("-flto=thin" HAS_THIN_LTO_FLAG)
-    set(${output_var} $<${HAS_THIN_LTO_FLAG}:-flto=thin>)
-  endif()
+# Internal: find the appropriate link time optimization flags for this compiler
+function(_pybind11_add_lto_flags target_name prefer_thin_lto)
+  if (NOT DEFINED PYBIND11_LTO_CXX_FLAGS)
+    set(PYBIND11_LTO_CXX_FLAGS "" CACHE INTERNAL "")
+    set(PYBIND11_LTO_LINKER_FLAGS "" CACHE INTERNAL "")
+
+    if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
+      set(cxx_append "")
+      set(linker_append "")
+      if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NOT APPLE)
+        # Clang Gold plugin does not support -Os; append -O3 to MinSizeRel builds to override it
+        set(linker_append ";$<$<CONFIG:MinSizeRel>:-O3>")
+      elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+        set(cxx_append ";-fno-fat-lto-objects")
+      endif()
 
-  if(NOT ${prefer_thin_lto} OR NOT HAS_THIN_LTO_FLAG)
-    if(NOT CMAKE_CXX_COMPILER_ID MATCHES "Intel")
-      # Check for Link Time Optimization support (GCC/Clang)
-      check_cxx_compiler_flag("-flto" HAS_LTO_FLAG)
-      set(${output_var} $<${HAS_LTO_FLAG}:-flto>)
-    else()
+      if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND prefer_thin_lto)
+        _pybind11_return_if_cxx_and_linker_flags_work(HAS_FLTO_THIN
+          "-flto=thin${cxx_append}" "-flto=thin${linker_append}"
+          PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
+      endif()
+
+      if (NOT HAS_FLTO_THIN)
+        _pybind11_return_if_cxx_and_linker_flags_work(HAS_FLTO
+          "-flto${cxx_append}" "-flto${linker_append}"
+          PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
+      endif()
+    elseif (CMAKE_CXX_COMPILER_ID MATCHES "Intel")
       # Intel equivalent to LTO is called IPO
-      check_cxx_compiler_flag("-ipo" HAS_IPO_FLAG)
-      set(${output_var} $<${HAS_IPO_FLAG}:-ipo>)
+      _pybind11_return_if_cxx_and_linker_flags_work(HAS_INTEL_IPO
+      "-ipo" "-ipo" PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
+    elseif(MSVC)
+      # cmake only interprets libraries as linker flags when they start with a - (otherwise it
+      # converts /LTCG to \LTCG as if it was a Windows path).  Luckily MSVC supports passing flags
+      # with - instead of /, even if it is a bit non-standard:
+      _pybind11_return_if_cxx_and_linker_flags_work(HAS_MSVC_GL_LTCG
+        "/GL" "-LTCG" PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
     endif()
+
+    if (PYBIND11_LTO_CXX_FLAGS)
+      message(STATUS "LTO enabled")
+    else()
+      message(STATUS "LTO disabled (not supported by the compiler and/or linker)")
+    endif()
+  endif()
+
+  # Enable LTO flags if found, except for Debug builds
+  if (PYBIND11_LTO_CXX_FLAGS)
+    target_compile_options(${target_name} PRIVATE "$<$<NOT:$<CONFIG:Debug>>:${PYBIND11_LTO_CXX_FLAGS}>")
+  endif()
+  if (PYBIND11_LTO_LINKER_FLAGS)
+    target_link_libraries(${target_name} PRIVATE "$<$<NOT:$<CONFIG:Debug>>:${PYBIND11_LTO_LINKER_FLAGS}>")
   endif()
-endmacro()
+endfunction()
 
 # Build a Python extension module:
 # pybind11_add_module(<name> [MODULE | SHARED] [EXCLUDE_FROM_ALL]
@@ -86,6 +139,13 @@ function(pybind11_add_module target_name)
   set_target_properties(${target_name} PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}")
   set_target_properties(${target_name} PROPERTIES SUFFIX "${PYTHON_MODULE_EXTENSION}")
 
+  # -fvisibility=hidden is required to allow multiple modules compiled against
+  # different pybind versions to work properly, and for some features (e.g.
+  # py::module_local).  We force it on everything inside the `pybind11`
+  # namespace; also turning it on for a pybind module compilation here avoids
+  # potential warnings or issues from having mixed hidden/non-hidden types.
+  set_target_properties(${target_name} PROPERTIES CXX_VISIBILITY_PRESET "hidden")
+
   if(WIN32 OR CYGWIN)
     # Link against the Python shared library on Windows
     target_link_libraries(${target_name} PRIVATE ${PYTHON_LIBRARIES})
@@ -112,52 +172,31 @@ function(pybind11_add_module target_name)
     endif()
   endif()
 
-  select_cxx_standard()
-  if(NOT MSVC)
-    # Make sure C++11/14 are enabled
-    target_compile_options(${target_name} PUBLIC ${PYBIND11_CPP_STANDARD})
-  endif()
+  # Make sure C++11/14 are enabled
+  target_compile_options(${target_name} PUBLIC ${PYBIND11_CPP_STANDARD})
 
   if(ARG_NO_EXTRAS)
     return()
   endif()
 
-  if(NOT MSVC)
-    # Enable link time optimization and set the default symbol
-    # visibility to hidden (very important to obtain small binaries)
-    string(TOUPPER "${CMAKE_BUILD_TYPE}" U_CMAKE_BUILD_TYPE)
-    if (NOT ${U_CMAKE_BUILD_TYPE} MATCHES DEBUG)
-      # Link Time Optimization
-      if(NOT CYGWIN)
-        _pybind11_find_lto_flag(lto_flag ARG_THIN_LTO)
-        target_compile_options(${target_name} PRIVATE ${lto_flag})
-      endif()
-
-      # Default symbol visibility
-      target_compile_options(${target_name} PRIVATE "-fvisibility=hidden")
-
-      # Strip unnecessary sections of the binary on Linux/Mac OS
-      if(CMAKE_STRIP)
-        if(APPLE)
-          add_custom_command(TARGET ${target_name} POST_BUILD
-                             COMMAND ${CMAKE_STRIP} -u -r $<TARGET_FILE:${target_name}>)
-        else()
-          add_custom_command(TARGET ${target_name} POST_BUILD
-                             COMMAND ${CMAKE_STRIP} $<TARGET_FILE:${target_name}>)
-        endif()
+  _pybind11_add_lto_flags(${target_name} ${ARG_THIN_LTO})
+
+  if (NOT MSVC AND NOT "${CMAKE_BUILD_TYPE}" MATCHES Debug)
+    # Strip unnecessary sections of the binary on Linux/Mac OS (build type is quoted: it may be empty)
+    if(CMAKE_STRIP)
+      if(APPLE)
+        add_custom_command(TARGET ${target_name} POST_BUILD
+                           COMMAND ${CMAKE_STRIP} -x $<TARGET_FILE:${target_name}>)
+      else()
+        add_custom_command(TARGET ${target_name} POST_BUILD
+                           COMMAND ${CMAKE_STRIP} $<TARGET_FILE:${target_name}>)
       endif()
     endif()
-  elseif(MSVC)
+  endif()
+
+  if(MSVC)
     # /MP enables multithreaded builds (relevant when there are many files), /bigobj is
     # needed for bigger binding projects due to the limit to 64k addressable sections
     target_compile_options(${target_name} PRIVATE /MP /bigobj)
-
-    # Enforce link time code generation on MSVC, except in debug mode
-    target_compile_options(${target_name} PRIVATE $<$<NOT:$<CONFIG:Debug>>:/GL>)
-
-    # Fancy generator expressions don't work with linker flags, for reasons unknown
-    set_property(TARGET ${target_name} APPEND_STRING PROPERTY LINK_FLAGS_RELEASE /LTCG)
-    set_property(TARGET ${target_name} APPEND_STRING PROPERTY LINK_FLAGS_MINSIZEREL /LTCG)
-    set_property(TARGET ${target_name} APPEND_STRING PROPERTY LINK_FLAGS_RELWITHDEBINFO /LTCG)
   endif()
 endfunction()
-- 
GitLab