Commit d9d19560 authored by Alexander Gerwing's avatar Alexander Gerwing

Removed colorbalanced flavour

parent 414c0b03
#ifndef PACXX_PROJECTSEMINAR_2019_SRC_COLOURINGBALANCED_GRIDOPERATOR_HH
#define PACXX_PROJECTSEMINAR_2019_SRC_COLOURINGBALANCED_GRIDOPERATOR_HH
#include <array>
#include <cstddef>
#include <future>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include <dune/common/classname.hh>
#include <dune/common/fvector.hh>
#include <dune/common/rangeutilities.hh>
#include <dune/common/typelist.hh>
#include "common-gridoperator.hh"
#include "gpu/colouringbalanced-gpucontext.hh"
#include "multilineargeometry.hh"
#include "timings.hh"
#include <PACXX.h>
#include <pacxx/detail/device/DeviceCode.h>
namespace PPS {
namespace colouringbalanced {
/**
* \brief Grid Operator
*
* Assumes a Q1/P1 scalar finite element and DOF layout
*
* Assumes BlockVector/BCRSMatrix.
*
* \tparam RF Element type of vectors and type used for computations
*/
template<class GridView, class LocalOperator, class RF>
class GridOperator :
public CommonGridOperator<GridView, LocalOperator, RF>
{
using Base = CommonGridOperator<GridView, LocalOperator, RF>;
using DF = typename Base::DF;
using LB = typename Base::LB;
using Base::gv_;
using Base::lop_;
using Base::lb_;
using Base::cv_;
// GPU context
std::shared_ptr<PPS::colouringbalanced::GpuContext<GridView, LocalOperator, RF>> gpu_context_;
public:
//! Constructor for non-trivial constraints
GridOperator(GridView gv, LocalOperator lop, std::set<std::size_t> cv,
std::shared_ptr<Timings> timings)
: Base{ std::move(gv), std::move(lop), std::move(cv), std::move(timings) }
{
gpu_context_ = std::make_shared<PPS::colouringbalanced::GpuContext<GridView, LocalOperator, RF>>();
gpu_context_->z_changed = true;
}
//! Notify the grid operator that the linearization point has changed
void notify_linearization_point_change() const
{
gpu_context_->z_changed = true;
}
//! Apply Jacobian(z) to w
/**
* \param z Point where to linearize
* \param w Point to apply the Jacobian to
* \param y Where to add the result
* \param trafo A functor used to transform values before accumulation
* into y (usually to apply a factor)
*
* Essentially computes the derivative of the residual `r(z)` with
* respect to each component of `z`, i.e. a_ij = dr_i/dz_j, and applies
* the result to `w` on the fly, i.e. y_i += sum_j (dr_i/dz_j)*w_j.
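*
* For example, a trafo applying a hypothetical factor alpha,
* `[alpha](auto v) { return alpha*v; }`, accumulates y += alpha*J(z)*w.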
*/
template<class Domain, class Range, class Trafo>
void nonlinear_jacobian_apply(const Domain &z, const Domain &w,
Range &y, Trafo trafo) const
{
// Transform residual into owner form. Do this outside of
// timer so timings don't include waiting for other ranks.
y = this->consistentToOwner(std::move(y));
static const auto table =
std::string(__func__) + '(' + Dune::className<Trafo>() + ')';
auto mainTimer = this->makeTimer(table);
// Create context
gpu_context_->template create<Domain, Range>(z, w, y, gv_, lop_, lb_);
// number of basis functions
auto size = lb_.size();
// Upload z (if necessary)
std::vector<RF> tmp_array;
if (gpu_context_->z_changed)
{
gpu_context_->z_changed = false;
tmp_array.resize(z.size());
for (size_t i = 0; i < z.size(); ++i)
tmp_array[i] = z[i];
gpu_context_->z_array.upload(tmp_array.data(), tmp_array.size());
}
// Upload w
tmp_array.resize(w.size());
for (size_t i = 0; i < w.size(); ++i)
tmp_array[i] = w[i];
gpu_context_->w_array.upload(tmp_array.data(), tmp_array.size());
// Get pointer
auto dp_z = gpu_context_->z_array.getInstance();
auto dp_w = gpu_context_->w_array.getInstance();
auto dp_y = gpu_context_->y_array.getInstance();
auto dp_yl = gpu_context_->yl_array.getInstance();
auto dp_rule = gpu_context_->quadrature_rule.getInstance();
auto dp_lb = gpu_context_->localbase.getInstance();
auto dp_lop = gpu_context_->local_operator.getInstance();
auto dp_grid = gpu_context_->grid.getInstance();
auto dp_bucket_sizes = gpu_context_->bucket_sizes.getInstance();
auto dp_bucket_offsets = gpu_context_->bucket_offsets.getInstance();
auto dp_buckets_flattened = gpu_context_->buckets_flattened.getInstance();
// Create gpu kernel
auto gpu_kernel = [=](pacxx::v2::range &config, size_t bucket_num)
{
size_t worker_id = config.get_global(0);
if (worker_id < dp_bucket_sizes->get(bucket_num))
{
size_t element = dp_buckets_flattened->get(dp_bucket_offsets->get(bucket_num) + worker_id);
// Per-element trial coefficient vectors
std::array<typename Domain::field_type, LB::size()> zl;
std::array<typename Domain::field_type, LB::size()> wl;
std::array<typename Range::field_type, LB::size()> yl;
// make sure local containers are initialized where needed
// gather coefficients from global into local z and w
std::array<size_t, LB::size()> gis;
dp_grid->subIndices(element, gis);
for (size_t i = 0; i < LB::size(); ++i)
{
zl[i] = dp_z->get(gis[i]);
wl[i] = dp_w->get(gis[i]);
yl[i] = 0;
}
// Create the MultiLinearGeometry
std::array<Dune::FieldVector<typename Domain::field_type, GridView::dimension>, LB::size()> corners;
dp_grid->getCorners(element, corners);
PPS::MultiLinearGeometry<typename Domain::field_type,
GridView::dimension,
GridView::dimension,
GpuGeometryTraits<typename Domain::field_type >> multi_linear_geometry(corners);
// Apply volume
// use subscripting instead of dereferencing to avoid a pacxx warning
dp_lop->jacobian_apply_volume(multi_linear_geometry, dp_rule[0], dp_lb[0], zl, wl, yl);
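// Scatter into the global y. Elements within one bucket do not share
// DOFs by construction of the colouring, so no atomic updates are
// needed here.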
for (size_t i = 0; i < LB::size(); ++i)
dp_y->data()[dp_grid->subIndex(element, i)] += trafo(yl[i]);
}
};
// Upload y
tmp_array.resize(y.size());
for (size_t i = 0; i < y.size(); ++i)
tmp_array[i] = y[i];
gpu_context_->y_array.upload(tmp_array.data(), tmp_array.size());
for (size_t i = 0; i < gpu_context_->num_buckets; ++i)
{
// Execute gpu kernel async
size_t num_workitems = 32;
size_t num_elems_in_bucket = gpu_context_->bucket_sizes_[i];
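// launch enough work-groups to cover the whole bucket (divide, rounding up)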
size_t num_workgroups = (num_elems_in_bucket / num_workitems) +
(num_elems_in_bucket % num_workitems != 0 ? 1 : 0);
//std::cout << "running kernel for bucket # " << i << " containing " << num_elems_in_bucket << " elements" << std::endl;
auto &executor = pacxx::v2::Executor::get(0);
executor.launch(gpu_kernel, {{num_workgroups},{num_workitems}}, i);
}
// Download y
gpu_context_->y_array.download(tmp_array.data(), tmp_array.size());
for (size_t i = 0; i < tmp_array.size(); ++i)
y[i] = tmp_array[i];
// Apply dirichlet constraints:
// simply set each constrained coefficient to 0
for (auto gi : cv_)
y[gi] = 0;
mainTimer.stop();
// Transform back into consistent form. Do this outside of
// timer so timings don't include waiting for other ranks.
y = this->additiveToConsistent(std::move(y));
}
};
struct GridOperatorFactory {
std::shared_ptr<Timings> timings;
bool supportsNonoverlapping() const { return true; }
auto operator()(Dune::SolverCategory::Category) const
{
return [timings=timings](auto gv, auto lop, auto rfType,
std::set<std::size_t> cv)
{
using GO =
PPS::colouringbalanced::GridOperator<decltype(gv), decltype(lop),
typename decltype(rfType)::type>;
return GO{ std::move(gv), std::move(lop), std::move(cv),
timings };
};
}
};
} // namespace colouringbalanced
} // namespace PPS
#endif // PACXX_PROJECTSEMINAR_2019_SRC_COLOURINGBALANCED_GRIDOPERATOR_HH
#ifndef PACXX_PROJECTSEMINAR_2019_SRC_COLOURINGBALANCED_GPU_CONTEXT_HH
#define PACXX_PROJECTSEMINAR_2019_SRC_COLOURINGBALANCED_GPU_CONTEXT_HH
#include "../q1localbasis.hh"
#include "gpuarray.hh"
#include "gpuobject.hh"
#include "gpugeometry.hh"
#include "gpuquadrature.hh"
#include "scatterkernel-gpugrid.hh"
namespace PPS {
namespace colouringbalanced {
template<class GridView, class LocalOperator, class RF>
class GpuContext
{
using LB = Q1LocalBasis<typename GridView::ctype, RF, GridView::dimension>;
public:
GpuContext() : initialized_(false) { }
bool z_changed;
PPS::GpuArrayHandle<RF> z_array;
PPS::GpuArrayHandle<RF> w_array;
PPS::GpuArrayHandle<RF> y_array;
PPS::GpuArrayHandle<RF> yl_array;
PPS::GpuObjectHandle<LB> localbase;
PPS::GpuObjectHandle<LocalOperator> local_operator;
PPS::scatterkernel::GpuGridHandle<RF, GridView::dimension, LB> grid;
PPS::GpuQuadratureRuleHandle<RF, GridView::dimension> quadrature_rule;
size_t num_buckets;
std::vector<size_t> bucket_sizes_;
std::vector<size_t> bucket_offsets_;
PPS::GpuArrayHandle<size_t> bucket_sizes;
PPS::GpuArrayHandle<size_t> bucket_offsets;
PPS::GpuArrayHandle<size_t> buckets_flattened;
template<class Domain, class Range>
void create(const Domain& z, const Domain& w, const Range& y, const GridView& gv, const LocalOperator& lop, const LB& lb)
{
if (!initialized_)
{
grid.create(gv);
localbase.create(lb);
local_operator.create(lop);
const int order = std::max(2 * lb.jorder(), lb.order()*lop.param().qprimeorder() + 2 * lb.order());
quadrature_rule.create(order);
z_array.create(z.size());
w_array.create(w.size());
y_array.create(y.size());
yl_array.create(grid.numElements() * lb.size());
arrange_in_buckets(gv, lb);
initialized_ = true;
}
}
private:
bool initialized_;
std::vector<std::vector<size_t>> y_id_buckets_elements_;
std::vector<std::vector<size_t>> y_id_buckets_vertices_;
void arrange_in_buckets(const GridView& gv, const LB& lb)
{
// Algorithm description (a simplified sketch follows below):
// 1. Identify the maximum number of elements in any group of strongly connected elements
// Def: In this context, strongly connected elements are elements which cannot
// be assigned the same colour
// - For each element E do
// - gather the connected elements (using dune grid connectivity)
// - for each connected element do
// - count how many of the gathered elements cannot be assigned the same colour
// - store this number if it is a new maximum
// - Assign the above maximum to E
// - Return the maximum as c
// 2. Sort the elements by their assigned number in descending order
// 3. Create c buckets
// 4. Assign the elements to the buckets
// - For each element do
// - Identify the smallest bucket into which the element can be inserted
// - Insert it into that bucket
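// A simplified sketch of the steps above, meant as an illustration
// rather than the original implementation: it grows buckets on demand
// instead of precomputing the bound c (step 1) and omits the
// descending sort (step 2). It assumes that two elements conflict iff
// they share a vertex (they then scatter into the same entries of y),
// that the usual Dune range generators (elements()) are available, and
// that GpuArrayHandle::create()/upload() behave as used in create()
// above. Requires <vector>, <set> and <limits> in addition to the
// includes above.
constexpr int dim = GridView::dimension;
const auto &indexSet = gv.indexSet();
const std::size_t invalid = std::numeric_limits<std::size_t>::max();
// vertex index -> indices of the elements touching that vertex
std::vector<std::vector<std::size_t>> elementsAtVertex(indexSet.size(dim));
for (const auto &e : elements(gv))
for (unsigned i = 0; i < e.subEntities(dim); ++i)
elementsAtVertex[indexSet.subIndex(e, i, dim)].push_back(indexSet.index(e));
std::vector<std::size_t> colour(indexSet.size(0), invalid);
std::vector<std::vector<std::size_t>> buckets;
for (const auto &e : elements(gv))
{
const std::size_t ei = indexSet.index(e);
// colours already taken by conflicting elements
std::set<std::size_t> used;
for (unsigned i = 0; i < e.subEntities(dim); ++i)
for (std::size_t n : elementsAtVertex[indexSet.subIndex(e, i, dim)])
if (colour[n] != invalid)
used.insert(colour[n]);
// among the admissible buckets pick the currently smallest
// ("balanced"); open a new bucket if none is admissible
std::size_t best = buckets.size();
for (std::size_t c = 0; c < buckets.size(); ++c)
if (!used.count(c) && (best == buckets.size() || buckets[c].size() < buckets[best].size()))
best = c;
if (best == buckets.size())
buckets.emplace_back();
buckets[best].push_back(ei);
colour[ei] = best;
}
// flatten the buckets and upload them to the device
num_buckets = buckets.size();
bucket_sizes_.clear();
bucket_offsets_.clear();
std::vector<std::size_t> flat;
for (const auto &b : buckets)
{
bucket_offsets_.push_back(flat.size());
bucket_sizes_.push_back(b.size());
flat.insert(flat.end(), b.begin(), b.end());
}
bucket_sizes.create(bucket_sizes_.size());
bucket_sizes.upload(bucket_sizes_.data(), bucket_sizes_.size());
bucket_offsets.create(bucket_offsets_.size());
bucket_offsets.upload(bucket_offsets_.data(), bucket_offsets_.size());
buckets_flattened.create(flat.size());
buckets_flattened.upload(flat.data(), flat.size());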
}
};
} // namespace colouringbalanced
} // namespace PPS
#endif // PACXX_PROJECTSEMINAR_2019_SRC_COLOURINGBALANCED_GPU_CONTEXT_HH
#include "config.h"
#include <dune/common/parallel/mpihelper.hh>
#include <dune/grid/uggrid.hh>
#include "driver.hh"
#include "factories.hh"
#include "grid.hh"
#include "colouringbalanced-gridoperator.hh"
#include "scatterkernelreorder-linearoperator-matrixfree.hh"
#include "linearsolver-richardson.hh"
#include "params.hh"
#include "timings.hh"
//===============================================================
// Main program with grid setup
//===============================================================
int main(int argc, char** argv)
{
// Maybe initialize MPI
auto &mpiHelper = Dune::MPIHelper::instance(argc, argv);
auto ptree = PPS::readParams(argc, argv, "tutorial01.ini");
auto timings = PPS::timings(ptree.sub("timings"),
mpiHelper.getCollectiveCommunication());
using Grid = Dune::UGGrid<2>;
auto factories =
PPS::makeFactories(PPS::GridFactory<Grid>{ptree.sub("grid")},
PPS::colouringbalanced::GridOperatorFactory{ timings },
PPS::scatterkernelreorder::MatrixFreeOperatorFactory{ptree.sub("operator")},
PPS::RichardsonSolverFactory{ptree.sub("solver")});
return PPS::driver(std::move(factories), ptree);
}