diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1fc0d4a49aa60f1ea8382ddcb963dcbf97736b42..d037dec9a0b8e395a8a3d8313cec7f72fa30dd04 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,10 @@ SPDX-License-Identifier: LicenseRef-GPL-2.0-only-with-DUNE-exception
 
 # Master (will become release 2.10)
 
+- `UMFPACK` is extended to arbitrary blocked matrix structures. This includes `MultiTypeBlockMatrix`. The external interface is unchanged.
+  However, internally the `BCCSMatrixInitializer` is replaced by direct calls of `flatMatrixForEach` similar to `Cholmod`. This requires a
+  compatible vector field of "ignored" degrees of freedom. The method `setSubMatrix` with a top-level index set is preserved for compatibility.
+
 - The internal storage in `MatrixIndexSet` was changed from `std::set` to a sorted `std::vector`
   (with a `std::set` fallback for very dense rows) to improve performance. The stored index
   type was changed from `std::size_t` to `uint32_t` to reduce memory consumption and improve
diff --git a/dune/istl/test/umfpacktest.cc b/dune/istl/test/umfpacktest.cc
index 334107cfac40ddeed9263f074cd52896979733fe..dcfadbc716970634ca9eef5eb15da2248ebabddc 100644
--- a/dune/istl/test/umfpacktest.cc
+++ b/dune/istl/test/umfpacktest.cc
@@ -8,23 +8,47 @@
 
 #include <dune/common/fmatrix.hh>
 #include <dune/common/fvector.hh>
+#include <dune/common/indices.hh>
 #include <dune/common/timer.hh>
+#include <dune/common/test/testsuite.hh>
 #include <dune/istl/bvector.hh>
 #include <dune/istl/umfpack.hh>
 
 #include "laplacian.hh"
 
+using namespace Dune;
 
-template<typename Matrix, typename Vector>
-void runUMFPack(std::size_t N)
+// helper to add something to the diagonal of a BCRSMatrix with blocks
+auto addToDiagonal = [](auto&& matrix, const auto& value) {
+  for( auto rowIt=matrix.begin(); rowIt!=matrix.end(); rowIt++ )
+  {
+    for( auto entryIt=rowIt->begin(); entryIt!=rowIt->end(); entryIt++ )
+    {
+      if ( entryIt.index() == rowIt.index() )
+      {
+        auto&& block = Dune::Impl::asMatrix(*entryIt);
+        for (auto it=block.begin(); it!=block.end(); ++it)
+        {
+          (*it)[it.index()] += value;
+        }
+      }
+    }
+  }
+};
+
+template<typename Matrix, typename Vector, typename BitVector>
+TestSuite runUMFPack(std::size_t N)
 {
-  typedef Dune::MatrixAdapter<Matrix,Vector,Vector> Operator;
+  TestSuite t;
 
   Matrix mat;
-  Operator fop(mat);
   Vector b(N*N), x(N*N), b1(N/2), x1(N/2);
 
   setupLaplacian(mat,N);
+
+  // make this regular
+  addToDiagonal(mat,100.0);
+
   b=1;
   b1=1;
   x=0;
@@ -41,26 +65,170 @@ void runUMFPack(std::size_t N)
   solver.apply(x, b, res);
   solver.free();
 
+  // test
+  auto norm = b.two_norm();
+  mat.mmv(x,b);
+  t.check( b.two_norm()/norm < 1e-11 ) << " Error in UMFPACK, relative residual is too large: " << b.two_norm()/norm;
+
   Dune::UMFPack<Matrix> solver1;
 
   std::set<std::size_t> mrs;
+  BitVector bitVector(N*N);
+  bitVector = true;
   for(std::size_t s=0; s < N/2; ++s)
+  {
     mrs.insert(s);
+    bitVector[s] = false;
+  }
+
+  solver1.setMatrix(mat,bitVector);
+  solver1.apply(x1,b1, res);
 
+  // test
+  x=0;
+  b=0;
+  for( std::size_t i=0; i<N/2; i++ )
+  {
+    // set subindices
+    x[i] = x1[i];
+    b[i] = b1[i];
+  }
+
+  norm = b.two_norm();
+  mat.mmv(x,b);
+
+  // truncate deactivated indices
+  for( std::size_t i=N/2; i<N*N; i++ )
+    b[i] = 0;
+
+  t.check( b.two_norm()/norm < 1e-11 ) << " Error in UMFPACK, relative residual is too large: " << b.two_norm()/norm;
+
+  // compare with setSubMatrix
   solver1.setSubMatrix(mat,mrs);
   solver1.setVerbosity(true);
 
-  solver1.apply(x1,b1, res);
+  auto x2 = x1;
+  x2 = 0;
+  solver1.apply(x2,b1, res);
+
+  x2 -= x1;
+  t.check( x2.two_norm()/x1.two_norm() < 1e-11 ) << " Error in UMFPACK, setSubMatrix yields different result as setMatrix with BitVector, relative diff: " << x2.two_norm()/x1.two_norm();
+
+
   solver1.apply(reinterpret_cast<typename Matrix::field_type*>(&x1[0]),
                 reinterpret_cast<typename Matrix::field_type*>(&b1[0]));
 
   Dune::UMFPack<Matrix> save_solver(mat,"umfpack_decomp",0);
   Dune::UMFPack<Matrix> load_solver(mat,"umfpack_decomp",0);
+
+  return t;
+}
+
+
+
+template<typename Matrix, typename Vector, typename BitVector>
+TestSuite runUMFPackMultitype2x2(std::size_t N)
+{
+  TestSuite t;
+
+  using namespace Dune::Indices;
+
+  Matrix mat;
+
+  Vector b,x,b1,x1;
+
+  b[_0].resize(N*N);
+  b[_1].resize(N*N);
+
+  x[_0].resize(N*N);
+  x[_1].resize(N*N);
+
+  b1[_0].resize(N/2);
+  b1[_1].resize(N/2);
+  x1[_0].resize(N/2);
+  x1[_1].resize(N/2);
+
+
+  // initialize all 4 matrix blocks
+  setupLaplacian(mat[_0][_0],N);
+  setupLaplacian(mat[_0][_1],N);
+  setupLaplacian(mat[_1][_0],N);
+  setupLaplacian(mat[_1][_1],N);
+
+  // make this regular
+  addToDiagonal(mat[_0][_0],200.0);
+  addToDiagonal(mat[_1][_1],100.0);
+
+  b=1.0;
+  b1=1.0;
+  x=0.0;
+  x1=0.0;
+
+  Dune::Timer watch;
+
+  watch.reset();
+
+  Dune::UMFPack<Matrix> solver(mat,1);
+
+  Dune::InverseOperatorResult res;
+
+  solver.apply(x, b, res);
+  solver.free();
+
+  // test
+  auto norm = b.two_norm();
+  mat.mmv(x,b);
+  t.check( b.two_norm()/norm < 1e-11 ) << " Error in UMFPACK, relative residual is too large: " << b.two_norm()/norm;
+
+  Dune::UMFPack<Matrix> solver1;
+
+  BitVector bitVector;
+  bitVector[_0].resize(N*N);
+  bitVector[_1].resize(N*N);
+
+  bitVector = true;
+  for(std::size_t s=0; s < N/2; ++s)
+  {
+    bitVector[_0][s] = false;
+    bitVector[_1][s] = false;
+  }
+
+  solver1.setMatrix(mat,bitVector);
+  solver1.apply(x1,b1, res);
+
+  // test
+  x=0;
+  b=0;
+  for( std::size_t i=0; i<N/2; i++ )
+  {
+    // set subindices
+    x[_0][i] = x1[_0][i];
+    x[_1][i] = x1[_1][i];
+    b[_0][i] = b1[_0][i];
+    b[_1][i] = b1[_1][i];
+  }
+
+  norm = b.two_norm();
+  mat.mmv(x,b);
+
+  // truncate deactivated indices
+  for( std::size_t i=N/2; i<N*N; i++ )
+  {
+    b[_0][i] = 0;
+    b[_1][i] = 0;
+  }
+
+  t.check( b.two_norm()/norm < 1e-11 ) << " Error in UMFPACK, relative residual is too large: " << b.two_norm()/norm;
+
+  return t;
 }
 
+
 int main(int argc, char** argv) try
 {
 #if HAVE_SUITESPARSE_UMFPACK
+
+  TestSuite t;
   std::size_t N=100;
 
   if(argc>1)
@@ -69,28 +237,49 @@ int main(int argc, char** argv) try
   // ------------------------------------------------------------------------------
   std::cout<<"testing for N="<<N<<" BCRSMatrix<double>"<<std::endl;
   {
-    using Matrix = Dune::BCRSMatrix<double>;
-    using Vector = Dune::BlockVector<double>;
-    runUMFPack<Matrix,Vector>(N);
+    using Matrix    = Dune::BCRSMatrix<double>;
+    using Vector    = Dune::BlockVector<double>;
+    using BitVector = Dune::BlockVector<int>;
+    t.subTest(runUMFPack<Matrix,Vector,BitVector>(N));
   }
 
   // ------------------------------------------------------------------------------
   std::cout<<"testing for N="<<N<<" BCRSMatrix<FieldMatrix<double,1,1> >"<<std::endl;
   {
-    using Matrix = Dune::BCRSMatrix<Dune::FieldMatrix<double,1,1> >;
-    using Vector = Dune::BlockVector<Dune::FieldVector<double,1> >;
-    runUMFPack<Matrix,Vector>(N);
+    using Matrix    = Dune::BCRSMatrix<Dune::FieldMatrix<double,1,1> >;
+    using Vector    = Dune::BlockVector<Dune::FieldVector<double,1> >;
+    using BitVector = Dune::BlockVector<Dune::FieldVector<int,1> >;
+    t.subTest(runUMFPack<Matrix,Vector,BitVector>(N));
   }
 
   // ------------------------------------------------------------------------------
   std::cout<<"testing for N="<<N<<" BCRSMatrix<FieldMatrix<double,2,2> >"<<std::endl;
   {
-    using Matrix = Dune::BCRSMatrix<Dune::FieldMatrix<double,2,2> >;
-    using Vector = Dune::BlockVector<Dune::FieldVector<double,2> >;
-    runUMFPack<Matrix,Vector>(N);
+    using Matrix    = Dune::BCRSMatrix<Dune::FieldMatrix<double,2,2> >;
+    using Vector    = Dune::BlockVector<Dune::FieldVector<double,2> >;
+    using BitVector = Dune::BlockVector<Dune::FieldVector<int,2> >;
+    t.subTest(runUMFPack<Matrix,Vector,BitVector>(N));
+  }
+
+  // ------------------------------------------------------------------------------
+  std::cout<<"testing for N="<<N<<" MultiTypeBlockMatrix<BCRSMatrix<FieldMatrix<double,2,2>> ... >"<<std::endl;
+  {
+    using MatrixBlock    = Dune::BCRSMatrix<Dune::FieldMatrix<double,2,2> >;
+    using BlockRow       = Dune::MultiTypeBlockVector<MatrixBlock,MatrixBlock>;
+    using Matrix         = Dune::MultiTypeBlockMatrix<BlockRow,BlockRow>;
+
+    using VectorBlock    = Dune::BlockVector<Dune::FieldVector<double,2> >;
+    using Vector         = Dune::MultiTypeBlockVector<VectorBlock,VectorBlock>;
+
+    using BitVectorBlock = Dune::BlockVector<Dune::FieldVector<int,2> >;
+    using BitVector      = Dune::MultiTypeBlockVector<BitVectorBlock,BitVectorBlock>;
+
+    t.subTest(runUMFPackMultitype2x2<Matrix,Vector,BitVector>(N));
   }
 
-  return 0;
+
+
+  return t.exit();
 #else // HAVE_SUITESPARSE_UMFPACK
   std::cerr << "You need SuiteSparse's UMFPack to run this test." << std::endl;
   return 77;
diff --git a/dune/istl/umfpack.hh b/dune/istl/umfpack.hh
index 20d72de5839b636da4b40dbf6885e34c6fdd1ec5..63fa05dcd353da99f39b8a46c61a479840e2b647 100644
--- a/dune/istl/umfpack.hh
+++ b/dune/istl/umfpack.hh
@@ -17,6 +17,9 @@
 #include<dune/common/fvector.hh>
 #include<dune/istl/bccsmatrixinitializer.hh>
 #include<dune/istl/bcrsmatrix.hh>
+#include<dune/istl/foreach.hh>
+#include<dune/istl/multitypeblockmatrix.hh>
+#include<dune/istl/multitypeblockvector.hh>
 #include<dune/istl/solvers.hh>
 #include<dune/istl/solvertype.hh>
 #include <dune/istl/solverfactory.hh>
@@ -193,6 +196,32 @@ namespace Dune {
       /** @brief The type of the range of the solver */
       using range_type  = BlockVector<T, A>;
     };
+
+    // to make the `UMFPackVectorChooser` work with `MultiTypeBlockMatrix`, we need to add an intermediate step for the rows, which are typically `MultiTypeBlockVector`
+    template<typename FirstBlock, typename... Blocks>
+    struct UMFPackVectorChooser<MultiTypeBlockVector<FirstBlock, Blocks...> >
+    {
+      /** @brief The type of the domain of the solver */
+      using domain_type = MultiTypeBlockVector<typename UMFPackVectorChooser<FirstBlock>::domain_type,typename UMFPackVectorChooser<Blocks>::domain_type... >;
+      /** @brief The type of the range of the solver */
+      using range_type  = typename UMFPackVectorChooser<FirstBlock>::range_type;
+    };
+
+    // specialization for `MultiTypeBlockMatrix` with `MultiTypeBlockVector` rows
+    template<typename FirstRow, typename... Rows>
+    struct UMFPackVectorChooser<MultiTypeBlockMatrix<FirstRow, Rows...> >
+    {
+      /** @brief The type of the domain of the solver */
+      using domain_type = typename UMFPackVectorChooser<FirstRow>::domain_type;
+      /** @brief The type of the range of the solver */
+      using range_type  = MultiTypeBlockVector< typename UMFPackVectorChooser<FirstRow>::range_type, typename UMFPackVectorChooser<Rows>::range_type... >;
+    };
+
+    // dummy class to represent no BitVector
+    struct NoBitVector
+    {};
+
+
   }
 
   /** @brief The %UMFPack direct sparse solver
@@ -223,7 +252,7 @@ namespace Dune {
     /** @brief The matrix type. */
     using Matrix = M;
     using matrix_type = M;
-    /** @brief The corresponding UMFPack matrix type.*/
+    /** @brief The corresponding (scalar) UMFPack matrix type.*/
     using UMFPackMatrix = ISTL::Impl::BCCSMatrix<typename Matrix::field_type, size_type>;
     /** @brief Type of an associated initializer class. */
     using MatrixInitializer = ISTL::Impl::BCCSMatrixInitializer<M, size_type>;
@@ -368,17 +397,37 @@ namespace Dune {
       if (b.size() == 0)
         return;
 
+      // we have to convert x and b into flat structures
+      // however, this is linear in time
+      std::vector<T> xFlat(x.dim()), bFlat(b.dim());
+
+      flatVectorForEach(x, [&](auto&& entry, auto&& offset)
+      {
+        xFlat[ offset ] = entry;
+      });
+
+      flatVectorForEach(b, [&](auto&& entry, auto&& offset)
+      {
+        bFlat[ offset ] = entry;
+      });
+
       double UMF_Apply_Info[UMFPACK_INFO];
       Caller::solve(UMFPACK_A,
                     umfpackMatrix_.getColStart(),
                     umfpackMatrix_.getRowIndex(),
                     umfpackMatrix_.getValues(),
-                    reinterpret_cast<double*>(&x[0]),
-                    reinterpret_cast<double*>(&b[0]),
+                    reinterpret_cast<double*>(&xFlat[0]),
+                    reinterpret_cast<double*>(&bFlat[0]),
                     UMF_Numeric,
                     UMF_Control,
                     UMF_Apply_Info);
 
+      // copy back to blocked vector
+      flatVectorForEach(x, [&](auto&& entry, auto offset)
+      {
+        entry = xFlat[offset];
+      });
+
       //this is a direct solver
       res.iterations = 1;
       res.converged = true;
@@ -399,6 +448,8 @@ namespace Dune {
      * @brief additional apply method with c-arrays in analogy to superlu
      * @param x solution array
      * @param b rhs array
+     *
+     * NOTE If the user hands over a pure pointer, we assume that they know what they are doing, hence no copy to flat structures
      */
     void apply(T* x, T* b)
     {
@@ -444,8 +495,17 @@ namespace Dune {
         DUNE_THROW(Dune::Exception,"IO ERROR while trying to save UMFPack decomposition");
     }
 
-    /** @brief Initialize data from given matrix. */
-    void setMatrix(const Matrix& matrix)
+    /** @brief Initialize data from given matrix.
+     *
+     * \tparam BitVector a compatible bitvector to the domain_type/range_type.
+     *         Defaults to NoBitVector for backwards compatibility
+     *
+     *  A positive bit indices that the corresponding matrix row/column is excluded from the
+     *  UMFPACK decomposition.
+     *  WARNING This is an opposite behavior of the previous implementation in `setSubMatrix`.
+     */
+    template<class BitVector = Impl::NoBitVector>
+    void setMatrix(const Matrix& matrix, const BitVector& bitVector = {})
     {
       if ((umfpackMatrix_.N() + umfpackMatrix_.M() > 0) || matrixIsLoaded_)
         free();
@@ -454,16 +514,128 @@ namespace Dune {
 
       if (umfpackMatrix_.N() + umfpackMatrix_.M() + umfpackMatrix_.nonzeroes() != 0)
         umfpackMatrix_.free();
-      umfpackMatrix_.setSize(MatrixDimension<Matrix>::rowdim(matrix),
-                             MatrixDimension<Matrix>::coldim(matrix));
-      ISTL::Impl::BCCSMatrixInitializer<Matrix, size_type> initializer(umfpackMatrix_);
 
-      copyToBCCSMatrix(initializer, matrix);
+      constexpr bool useBitVector = not std::is_same_v<BitVector,Impl::NoBitVector>;
+
+      // use a dynamic flat vector for the bitset
+      std::vector<bool> flatBitVector;
+      // and a mapping from the compressed indices
+      std::vector<size_type> subIndices;
+
+      int numberOfIgnoredDofs = 0;
+      int nonZeros = 0;
+
+      if constexpr ( useBitVector )
+      {
+        auto flatSize = flatVectorForEach(bitVector, [](auto&&, auto&&){});
+        flatBitVector.resize(flatSize);
+
+        flatVectorForEach(bitVector, [&](auto&& entry, auto&& offset)
+        {
+          flatBitVector[ offset ] = entry;
+          if ( entry )
+          {
+            numberOfIgnoredDofs++;
+          }
+        });
+      }
+
+      // compute the flat dimension and the number of nonzeros of the matrix
+      auto [flatRows,flatCols] = flatMatrixForEach( matrix, [&](auto&& /*entry*/, auto&& row, auto&& col){
+        // do not count ignored entries
+        if constexpr ( useBitVector )
+          if ( flatBitVector[row] or flatBitVector[col] )
+            return;
+
+        nonZeros++;
+      });
+
+      if constexpr ( useBitVector )
+      {
+        // use the original flatRows!
+        subIndices.resize(flatRows,std::numeric_limits<std::size_t>::max());
+
+        size_type subIndexCounter = 0;
+        for ( size_type i=0; i<size_type(flatRows); i++ )
+          if ( not  flatBitVector[ i ] )
+            subIndices[ i ] = subIndexCounter++;
+
+        // update the original matrix size
+        flatRows -= numberOfIgnoredDofs;
+        flatCols -= numberOfIgnoredDofs;
+      }
+
+
+      umfpackMatrix_.setSize(flatRows,flatCols);
+      umfpackMatrix_.Nnz_ = nonZeros;
+
+      // prepare the arrays
+      umfpackMatrix_.colstart = new size_type[flatCols+1];
+      umfpackMatrix_.rowindex = new size_type[nonZeros];
+      umfpackMatrix_.values   = new T[nonZeros];
+
+      for ( size_type i=0; i<size_type(flatCols+1); i++ )
+      {
+        umfpackMatrix_.colstart[i] = 0;
+      }
+
+      // at first, we need to compute the column start indices
+      // therefore, we count all entries in each column and in the end we accumulate everything
+      flatMatrixForEach(matrix, [&](auto&& /*entry*/, auto&& flatRowIndex, auto&& flatColIndex)
+      {
+        // do nothing if entry is excluded
+        if constexpr ( useBitVector )
+          if ( flatBitVector[flatRowIndex] or flatBitVector[flatColIndex] )
+            return;
+
+        // pick compressed or uncompressed index
+        // compiler will hopefully do some constexpr optimization here
+        auto colIdx = useBitVector ? subIndices[flatColIndex] : flatColIndex;
+
+        umfpackMatrix_.colstart[colIdx+1]++;
+      });
+
+      // now accumulate
+      for ( size_type i=0; i<(size_type)flatCols; i++ )
+      {
+        umfpackMatrix_.colstart[i+1] += umfpackMatrix_.colstart[i];
+      }
+
+      // we need a compressed position counter in each column
+      std::vector<size_type> colPosition(flatCols,0);
+
+      // now we can set the entries: the procedure below works with both row- or column major index ordering
+      flatMatrixForEach(matrix, [&](auto&& entry, auto&& flatRowIndex, auto&& flatColIndex)
+      {
+        // do nothing if entry is excluded
+        if constexpr ( useBitVector )
+          if ( flatBitVector[flatRowIndex] or flatBitVector[flatColIndex] )
+            return;
+
+        // pick compressed or uncompressed index
+        // compiler will hopefully do some constexpr optimization here
+        auto rowIdx = useBitVector ? subIndices[flatRowIndex] : flatRowIndex;
+        auto colIdx = useBitVector ? subIndices[flatColIndex] : flatColIndex;
+
+        // the start index of each column is already fixed
+        auto colStart = umfpackMatrix_.colstart[colIdx];
+        // get the current number of picked elements in this column
+        auto colPos   = colPosition[colIdx];
+        // assign the corresponding row index and the value of this element
+        umfpackMatrix_.rowindex[ colStart + colPos ] = rowIdx;
+        umfpackMatrix_.values[ colStart + colPos ] = entry;
+        // increase the number of picked elements in this column
+        colPosition[colIdx]++;
+      });
 
       decompose();
     }
 
-    template<class S>
+    // Keep legacy version using a set of scalar indices
+    // The new version using a bitVector type for marking the active matrix indices is
+    // directly given in `setMatrix` with an additional BitVector argument.
+    // The new version is more flexible and allows, e.g., marking single components of a matrix block.
+    template<typename S>
     void setSubMatrix(const Matrix& _mat, const S& rowIndexSet)
     {
       if ((umfpackMatrix_.N() + umfpackMatrix_.M() > 0) || matrixIsLoaded_)