Commit c88b1585 authored by Alexander Gerwing

Merge branch 'bucket_colouring' into 'master'

Bucket colouring

See merge request !99
parents 17ce6f00 d9d19560
Pipeline #38407 failed in 11 seconds
......@@ -81,7 +81,7 @@ dune_add_test(
set_property(TARGET solution01-host PROPERTY EXCLUDE_FROM_ALL 0)
# Test simulation code on device
-foreach(flavour IN ITEMS device device-scatterkernel device-scatterkernelreorder)
+foreach(flavour IN ITEMS device device-scatterkernel device-scatterkernelreorder device-colouring)
add_executable(solution01-${flavour} solution01-${flavour}.cc)
dune_add_test(
......
......@@ -17,8 +17,8 @@ wtf() {
#
# flavours and executors
#
-declare -a available_flavours=(host device device-scatterkernel device-scatterkernelreorder)
-declare -a default_flavours=(device device-scatterkernel device-scatterkernelreorder)
+declare -a available_flavours=(host device device-scatterkernel device-scatterkernelreorder device-colouring)
+declare -a default_flavours=(device device-scatterkernel device-scatterkernelreorder device-colouring)
declare -a available_executors=(cuda norv rv)
declare -a default_executors=(cuda norv)
......
#ifndef PACXX_PROJECTSEMINAR_2019_SRC_COLOURING_GRIDOPERATOR_HH
#define PACXX_PROJECTSEMINAR_2019_SRC_COLOURING_GRIDOPERATOR_HH
#include <array>
#include <cstddef>
#include <future>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include <dune/common/classname.hh>
#include <dune/common/fvector.hh>
#include <dune/common/rangeutilities.hh>
#include <dune/common/typelist.hh>
#include "common-gridoperator.hh"
#include "gpu/colouring-gpucontext.hh"
#include "multilineargeometry.hh"
#include "timings.hh"
#include <PACXX.h>
#include <pacxx/detail/device/DeviceCode.h>
namespace PPS {
namespace colouring {
/**
* \brief Grid Operator
*
* Assumes Q1/P1 scalar finite elements and DoF layout
*
* Assumes BlockVector/BCRSMatrix.
*
* \tparam RF Element type of vectors and type used for computations
*/
template<class GridView, class LocalOperator, class RF>
class GridOperator :
public CommonGridOperator<GridView, LocalOperator, RF>
{
using Base = CommonGridOperator<GridView, LocalOperator, RF>;
using DF = typename Base::DF;
using LB = typename Base::LB;
using Base::gv_;
using Base::lop_;
using Base::lb_;
using Base::cv_;
// Gpu context
std::shared_ptr<PPS::colouring::GpuContext<GridView, LocalOperator, RF>> gpu_context_;
public:
//! Constructor for non-trivial constraints
GridOperator(GridView gv, LocalOperator lop, std::set<std::size_t> cv,
std::shared_ptr<Timings> timings)
: Base{ std::move(gv), std::move(lop), std::move(cv), std::move(timings) }
{
gpu_context_ = std::make_shared<PPS::colouring::GpuContext<GridView, LocalOperator, RF>>();
gpu_context_->z_changed = true;
}
// Notifies the grid operator that the linearization point has changed
void notify_linearization_point_change() const
{
gpu_context_->z_changed = true;
}
//! Apply Jacobian(z) to w
/**
* \param z Point where to linearize
* \param w Point to apply the Jacobian to
* \param y Where to add the result
* \param trafo A functor used to transform values before accumulation
* into y (usually to apply a factor)
*
* Basically computes the derivative of the residual `r(z)` with respect to
* each component of `z`, i.e. a_ij = dr_i/dz_j, and applies the result to
* `w` on the fly, i.e. y_i += sum_j (dr_i/dz_j)*w_j
*/
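// Illustration only (hypothetical caller, not part of this interface): the
// trafo functor receives each locally accumulated value and returns what is
// actually added into y, so scaling the product by a factor `a` could look
// like the following, where `go` is some GridOperator instance:
//   go.nonlinear_jacobian_apply(z, w, y, [a](RF v) { return a * v; });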
template<class Domain, class Range, class Trafo>
void nonlinear_jacobian_apply(const Domain &z, const Domain &w,
Range &y, Trafo trafo) const
{
// Transform residual into owner form. Do this outside of
// timer so timings don't include waiting for other ranks.
y = this->consistentToOwner(std::move(y));
static const auto table =
std::string(__func__) + '(' + Dune::className<Trafo>() + ')';
auto mainTimer = this->makeTimer(table);
// Create context
gpu_context_->template create<Domain, Range>(z, w, y, gv_, lop_, lb_);
// number of basis functions
auto size = lb_.size();
// Upload z (if necessary)
std::vector<RF> tmp_array;
if (gpu_context_->z_changed)
{
gpu_context_->z_changed = false;
tmp_array.resize(z.size());
for (size_t i = 0; i < z.size(); ++i)
tmp_array[i] = z[i];
gpu_context_->z_array.upload(tmp_array.data(), tmp_array.size());
}
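// z is cached on the device between calls; notify_linearization_point_change()
// sets z_changed so that the next apply re-uploads the linearization point.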
// Upload w
tmp_array.resize(w.size());
for (size_t i = 0; i < w.size(); ++i)
tmp_array[i] = w[i];
gpu_context_->w_array.upload(tmp_array.data(), tmp_array.size());
// Get pointer
auto dp_z = gpu_context_->z_array.getInstance();
auto dp_w = gpu_context_->w_array.getInstance();
auto dp_y = gpu_context_->y_array.getInstance();
auto dp_yl = gpu_context_->yl_array.getInstance();
auto dp_rule = gpu_context_->quadrature_rule.getInstance();
auto dp_lb = gpu_context_->localbase.getInstance();
auto dp_lop = gpu_context_->local_operator.getInstance();
auto dp_grid = gpu_context_->grid.getInstance();
auto dp_bucket_sizes = gpu_context_->bucket_sizes.getInstance();
auto dp_bucket_offsets = gpu_context_->bucket_offsets.getInstance();
auto dp_buckets_flattened = gpu_context_->buckets_flattened.getInstance();
// Create gpu kernel
auto gpu_kernel = [=](pacxx::v2::range &config, size_t bucket_num)
{
size_t worker_id = config.get_global(0);
if (worker_id < dp_bucket_sizes->get(bucket_num))
{
size_t element = dp_buckets_flattened->get(dp_bucket_offsets->get(bucket_num) + worker_id);
// Per-element trial coefficient vectors
std::array<typename Domain::field_type, LB::size()> zl;
std::array<typename Domain::field_type, LB::size()> wl;
std::array<typename Range::field_type, LB::size()> yl;
// make sure local containers are initialized where needed
// gather coefficients from global into local z and w
std::array<size_t, LB::size()> gis;
dp_grid->subIndices(element, gis);
for (size_t i = 0; i < LB::size(); ++i)
{
zl[i] = dp_z->get(gis[i]);
wl[i] = dp_w->get(gis[i]);
yl[i] = 0;
}
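// gis now maps each local basis function to its global DOF index; zl/wl hold
// element-local copies of z and w, and yl accumulates the element-local result.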
// Create the MultiLinearGeometry
std::array<Dune::FieldVector<typename Domain::field_type, GridView::dimension>, LB::size()> corners;
dp_grid->getCorners(element, corners);
PPS::MultiLinearGeometry<typename Domain::field_type,
GridView::dimension,
GridView::dimension,
GpuGeometryTraits<typename Domain::field_type >> multi_linear_geometry(corners);
// Apply volume
// use subscripting in place of dereference to avoid a pacxx warning
dp_lop->jacobian_apply_volume(multi_linear_geometry, dp_rule[0], dp_lb[0], zl, wl, yl);
for (size_t i = 0; i < LB::size(); ++i)
dp_y->data()[dp_grid->subIndex(element, i)] += trafo(yl[i]);
}
};
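// Within a single bucket no two elements share a vertex (see
// arrange_in_buckets() in colouring-gpucontext.hh), so the unsynchronized +=
// scatter into dp_y above cannot race as long as each kernel launch processes
// only one bucket.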
// Upload y
tmp_array.resize(y.size());
for (size_t i = 0; i < y.size(); ++i)
tmp_array[i] = y[i];
gpu_context_->y_array.upload(tmp_array.data(), tmp_array.size());
for(size_t i=0; i < gpu_context_->num_buckets; i++){
// Execute gpu kernel async
size_t num_workitems = 32;
size_t num_elems_in_bucket = gpu_context_->bucket_sizes_[i];
size_t num_workgroups = (num_elems_in_bucket / num_workitems) +
(num_elems_in_bucket % num_workitems != 0 ? 1 : 0);
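// ceil division: e.g. a bucket of 100 elements with 32 work-items per group
// yields 4 workgroups; the 28 surplus workers are idled by the bucket-size
// check inside the kernel.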
//std::cout << "running kernel for bucket # " << i << " containing " << num_elems_in_bucket << " elements" << std::endl;
auto &executor = pacxx::v2::Executor::get(0);
executor.launch(gpu_kernel, {{num_workgroups},{num_workitems}}, i);
}
// Download y
gpu_context_->y_array.download(tmp_array.data(), tmp_array.size());
for (size_t i = 0; i < tmp_array.size(); ++i)
y[i] = tmp_array[i];
// Apply dirichlet constraints:
// simply set each constrained coefficient to 0
for (auto gi : cv_)
y[gi] = 0;
mainTimer.stop();
// Transform back into consistent form. Do this outside of
// timer so timings don't include waiting for other ranks.
y = this->additiveToConsistent(std::move(y));
}
};
struct GridOperatorFactory {
std::shared_ptr<Timings> timings;
bool supportsNonoverlapping() const { return true; }
auto operator()(Dune::SolverCategory::Category) const
{
return [timings=timings](auto gv, auto lop, auto rfType,
std::set<std::size_t> cv)
{
using GO =
PPS::colouring::GridOperator<decltype(gv), decltype(lop),
typename decltype(rfType)::type>;
return GO{ std::move(gv), std::move(lop), std::move(cv),
timings };
};
}
};
} // namespace colouring
} // namespace PPS
#endif // PACXX_PROJECTSEMINAR_2019_SRC_COLOURING_GRIDOPERATOR_HH
#ifndef PACXX_PROJECTSEMINAR_2019_SRC_COLOURING_GPU_CONTEXT_HH
#define PACXX_PROJECTSEMINAR_2019_SRC_COLOURING_GPU_CONTEXT_HH
#include "../q1localbasis.hh"
#include "gpuarray.hh"
#include "gpuobject.hh"
#include "gpugeometry.hh"
#include "gpuquadrature.hh"
#include "scatterkernel-gpugrid.hh"
namespace PPS {
namespace colouring {
template<class GridView, class LocalOperator, class RF>
class GpuContext
{
using LB = Q1LocalBasis<typename GridView::ctype, RF, GridView::dimension>;
public:
GpuContext() : initialized_(false) { }
bool z_changed;
PPS::GpuArrayHandle<RF> z_array;
PPS::GpuArrayHandle<RF> w_array;
PPS::GpuArrayHandle<RF> y_array;
PPS::GpuArrayHandle<RF> yl_array;
PPS::GpuObjectHandle<LB> localbase;
PPS::GpuObjectHandle<LocalOperator> local_operator;
PPS::scatterkernel::GpuGridHandle<RF, GridView::dimension, LB> grid;
PPS::GpuQuadratureRuleHandle<RF, GridView::dimension> quadrature_rule;
size_t num_buckets;
std::vector<size_t> bucket_sizes_;
std::vector<size_t> bucket_offsets_;
PPS::GpuArrayHandle<size_t> bucket_sizes;
PPS::GpuArrayHandle<size_t> bucket_offsets;
PPS::GpuArrayHandle<size_t> buckets_flattened;
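// Builds all device-side data on first use; subsequent calls are no-ops
// (guarded by initialized_), so grid, quadrature rule and bucket layout are
// reused across Jacobian applications.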
template<class Domain, class Range>
void create(const Domain& z, const Domain& w, const Range& y, const GridView& gv, const LocalOperator& lop, const LB& lb)
{
if (!initialized_)
{
grid.create(gv);
localbase.create(lb);
local_operator.create(lop);
const int order = std::max(2 * lb.jorder(), lb.order()*lop.param().qprimeorder() + 2 * lb.order());
quadrature_rule.create(order);
z_array.create(z.size());
w_array.create(w.size());
y_array.create(y.size());
yl_array.create(grid.numElements() * lb.size());
arrange_in_buckets(gv, lb);
initialized_ = true;
}
}
private:
bool initialized_;
std::vector<std::vector<size_t>> y_id_buckets_elements_;
std::vector<std::vector<size_t>> y_id_buckets_vertices_;
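// Greedy element colouring: sweep over all elements repeatedly, adding an
// unassigned element to the current bucket only if none of its vertices is
// already claimed by that bucket; stop once every element is assigned.
// Illustrative example (assumed 1D grid, elements e0..e3 sharing endpoint
// vertices): bucket 0 = {e0, e2}, bucket 1 = {e1, e3}.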
void arrange_in_buckets(const GridView& gv, const LB& lb){
size_t current_bucket = 0;
std::vector<bool> assigned_elems(grid.numElements(), false);
const auto &iset = gv.indexSet();
//as long as not all elements are assigned to a bucket:
while(true){
//open a new bucket:
//std::cout << "################## filling bucket " << current_bucket << std::endl;
y_id_buckets_elements_.resize(current_bucket+1);
y_id_buckets_vertices_.resize(current_bucket+1);
size_t i = 0;
//iterate over all elements:
for (auto& element : elements(gv)){
//pick only elements that are not in a bucket yet:
if (assigned_elems[i] == false){
//std::cout << "testing element #" << i << std::endl;
// auto size = lb.size();
bool no_overlap = true;
//check if the vertices of this element are already contained in the current bucket:
for (size_t j=0; j<LB::size(); j++){
//std::cout << "element has " << element.dimension << " corners" << std::endl;
auto global_vertex_index = iset.subIndex(element, j, element.dimension);
if(std::find(y_id_buckets_vertices_[current_bucket].begin(), y_id_buckets_vertices_[current_bucket].end(), global_vertex_index)
!=y_id_buckets_vertices_[current_bucket].end()){
//std::cout << "element is overlapping with bucket..." << std::endl;
no_overlap = false;
break;
}
}
//if none of the vertices are in the current bucket, the element can be added to the bucket:
if(no_overlap == true){
//std::cout << "element can be added to the bucket..." << std::endl;
y_id_buckets_elements_[current_bucket].push_back(i);
//add all the elements' vertices to the vertex list of the current bucket:
for (size_t j=0; j<LB::size(); j++){
auto global_vertex_index = iset.subIndex(element, j, element.dimension);
y_id_buckets_vertices_[current_bucket].push_back(global_vertex_index);
}
//mark element as assigned!
assigned_elems[i] = true;
}
}
i++;
}
//added all non-conflicting elements
// if all elements are marked as assigned, break the while loop...
if (std::find(assigned_elems.begin(), assigned_elems.end(), false) == assigned_elems.end())
break;
current_bucket++;
}
std::cout << "number of buckets: " << (current_bucket-1) << std::endl;
for( size_t i =0; i< y_id_buckets_elements_.size(); i++){
std::cout << "contents of bucket: " << i << std::endl;
for( size_t j =0; j< y_id_buckets_elements_[i].size();j++){
std::cout << y_id_buckets_elements_[i][j] << " ";
}
std::cout << std::endl;
}
num_buckets = current_bucket+1;
bucket_sizes_ = std::vector<size_t>(num_buckets);
bucket_offsets_= std::vector<size_t>(num_buckets);
std::vector<size_t> buckets_flattened_;
size_t offset = 0;
for(size_t i=0; i<num_buckets;i++){
bucket_sizes_[i] = y_id_buckets_elements_[i].size();
bucket_offsets_[i] = offset;
for(size_t j=0; j<y_id_buckets_elements_[i].size();j++){
buckets_flattened_.push_back(y_id_buckets_elements_[i][j]);
}
offset += bucket_sizes_[i];
}
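// The buckets now form a CSR-like layout: bucket i occupies
// buckets_flattened_[bucket_offsets_[i] .. bucket_offsets_[i]+bucket_sizes_[i]);
// e.g. (illustrative) sizes {3, 2} give offsets {0, 3} and a flattened list of
// length 5.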
bucket_sizes.create(bucket_sizes_.data(), num_buckets);
bucket_offsets.create(bucket_offsets_.data(), num_buckets);
buckets_flattened.create(buckets_flattened_.data(), buckets_flattened_.size());
//Make sure that we've got as many elements in the flattened list as elements initially in the grid
assert(grid.numElements() == buckets_flattened_.size());
}
};
} // namespace colouring
} // namespace PPS
#endif // PACXX_PROJECTSEMINAR_2019_SRC_COLOURING_GPU_CONTEXT_HH
#include "config.h"
#include <dune/common/parallel/mpihelper.hh>
#include <dune/grid/uggrid.hh>
#include "driver.hh"
#include "factories.hh"
#include "grid.hh"
#include "colouring-gridoperator.hh"
#include "scatterkernelreorder-linearoperator-matrixfree.hh"
#include "linearsolver-richardson.hh"
#include "params.hh"
#include "timings.hh"
//===============================================================
// Main program with grid setup
//===============================================================
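// This flavour wires the colouring GridOperator into the common driver,
// combined with the scatterkernelreorder matrix-free linear operator and the
// Richardson solver via the factories below.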
int main(int argc, char** argv)
{
// Maybe initialize MPI
auto &mpiHelper = Dune::MPIHelper::instance(argc, argv);
auto ptree = PPS::readParams(argc, argv, "tutorial01.ini");
auto timings = PPS::timings(ptree.sub("timings"),
mpiHelper.getCollectiveCommunication());
typedef Dune::UGGrid<2> Grid;
auto factories =
PPS::makeFactories(PPS::GridFactory<Grid>{ptree.sub("grid")},
PPS::colouring::GridOperatorFactory{ timings },
PPS::scatterkernelreorder::MatrixFreeOperatorFactory{ptree.sub("operator")},
PPS::RichardsonSolverFactory{ptree.sub("solver")});
return PPS::driver(std::move(factories), ptree);
}