ALP User Documentation  0.8.preview
Algebraic Programming User Documentation
kmeans.hpp
Go to the documentation of this file.
1 
2 /*
3  * Copyright 2021 Huawei Technologies Co., Ltd.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
27 #ifndef _H_GRB_KMEANS
28 #define _H_GRB_KMEANS
29 
30 #include <chrono>
31 #include <random>
32 
33 #include <assert.h>
34 
35 #include <graphblas.hpp>
36 
37 
38 namespace grb {
39 
40  namespace algorithms {
41 
55  template<
57  typename IOType = double,
58  class Operator = operators::square_diff< IOType, IOType, IOType >
59  >
62  const Matrix< IOType > &X,
63  const Operator &dist_op = Operator()
64  ) {
65  // declare monoids and semirings
67  Monoid<
70  > min_monoid;
71  Semiring<
75  > pattern_sum;
76 
77  // runtime sanity checks: the row dimension of X should match the column
78  // dimension of K
79  if( ncols( K ) != nrows( X ) ) {
80  return MISMATCH;
81  }
82 
83  // running error code
84  RC ret = SUCCESS;
85 
86  // get problem dimensions
87  const size_t n = ncols( X );
88  const size_t m = nrows( X );
89  const size_t k = nrows( K );
90 
91  // declare vector of indices of columns of X selected as the initial
92  // centroids
93  Vector< size_t > selected_indices( k );
94 
95  // declare column selection vector
96  Vector< bool > col_select( n );
97 
98  // declare selected point
99  Vector< IOType > selected( m );
100 
101  // declare vector of distances from the selected point
102  Vector< IOType > selected_distances( n );
103 
104  // declare vector of minimum distances to all points selected so far
105  Vector< IOType > min_distances( n );
106  ret = ret ? ret : grb::set( min_distances,
108  );
109 
110  // generate first centroid by selecting a column of X uniformly at random
111 
112  size_t i;
113  {
114  const size_t seed_uniform =
115  std::chrono::system_clock::now().time_since_epoch().count();
116  std::default_random_engine random_generator( seed_uniform );
117  std::uniform_int_distribution< size_t > uniform( 0, n - 1 );
118  i = uniform( random_generator );
119  }
120 
121  for( size_t l = 0; ret == SUCCESS && l < k; ++l ) {
122 
123  ret = grb::clear( col_select );
124  ret = ret ? ret : grb::clear( selected );
125  ret = ret ? ret : grb::clear( selected_distances );
126 
127  ret = ret ? ret : grb::setElement( selected_indices, i, l );
128 
129  ret = ret ? ret : grb::setElement( col_select, true, i );
130 
131  ret = ret ? ret : grb::vxm< grb::descriptors::transpose_matrix >(
132  selected, col_select, X, pattern_sum );
133 
134  ret = ret ? ret : grb::vxm( selected_distances, selected, X, add_monoid,
135  dist_op );
136 
137  ret = ret ? ret : grb::foldl( min_distances, selected_distances,
138  min_monoid );
139 
140  // TODO the remaining part of the loop should be replaced with the alias
141  // algorithm
142 
143  IOType range = add_monoid.template getIdentity< IOType >();
144  ret = ret ? ret : grb::foldl( range, min_distances, add_monoid );
145 
146  double sample = -1;
147  if( ret == SUCCESS ) {
148  {
149  const size_t seed =
150  std::chrono::system_clock::now().time_since_epoch().count();
151  std::default_random_engine generator( seed );
152  std::uniform_real_distribution< double > uniform( 0, 1 );
153  sample = uniform( generator );
154  }
155  ret = grb::collectives<>::broadcast( sample, 0 );
156  }
157  assert( sample >= 0 );
158 
159  // The following is not standard ALP/GraphBLAS and does not work for P>1
160  // (TODO internal issue #320)
161  if( ret == SUCCESS ) {
162  assert( grb::spmd<>::nprocs() == 1 );
163  IOType * const raw = internal::getRaw( selected_distances );
164  IOType running_sum = 0;
165  i = 0;
166  do {
167  running_sum += static_cast< double >( raw[ i ] ) / range;
168  } while( running_sum < sample && ++i < n );
169  i = ( i == n ) ? n - 1 : i;
170  }
171  }
172 
173  // create the matrix K by selecting the columns of X indexed by
174  // selected_indices
175 
176  // declare pattern matrix
177  Matrix< void > M( k, n );
178  ret = ret ? ret : grb::resize( M, n );
179 
180  if( ret == SUCCESS ) {
181  auto converter = grb::utils::makeVectorToMatrixConverter< void, size_t >(
182  selected_indices, [](const size_t &ind, const size_t &val ) {
183  return std::make_pair( ind, val );
184  }
185  );
186  ret = grb::buildMatrixUnique( M, converter.begin(), converter.end(),
187  PARALLEL );
188  }
189 
190  ret = ret ? ret : grb::mxm< descriptors::transpose_right >( K, M, X,
191  pattern_sum, RESIZE );
192  ret = ret ? ret : grb::mxm< descriptors::transpose_right >( K, M, X,
193  pattern_sum );
194 
195  if( ret != SUCCESS ) {
196  std::cout << "\tkpp finished with unexpected return code!" << std::endl;
197  }
198 
199  return ret;
200  }
201 
219  template<
221  typename IOType = double,
223  >
225  Matrix< IOType > &K,
226  Vector< std::pair< size_t, IOType > > &clusters_and_distances,
227  const Matrix< IOType > &X,
228  const size_t max_iter = 1000,
229  const Operator &dist_op = Operator()
230  ) {
231  // declare monoids and semirings
232 
233  typedef std::pair< size_t, IOType > indexIOType;
234 
236 
237  Monoid<
240  > argmin_monoid;
241 
242  Monoid<
245  > comparison_monoid;
246 
247  Semiring<
251  > pattern_sum;
252 
253  Semiring<
257  > pattern_count;
258 
259  // runtime sanity checks: the row dimension of X should match the column
260  // dimension of K
261  if( ncols( K ) != nrows( X ) ) {
262  return MISMATCH;
263  }
264  if( size( clusters_and_distances ) != ncols( X ) ) {
265  return MISMATCH;
266  }
267 
268  // running error code
269  RC ret = SUCCESS;
270 
271  // get problem dimensions
272  const size_t n = ncols( X );
273  const size_t m = nrows( X );
274  const size_t k = nrows( K );
275 
276  // declare distance matrix
277  Matrix< IOType > Dist( k, n );
278 
279  // declare and initialise labels vector and ones vectors
280  Vector< size_t > labels( k );
281  Vector< bool > n_ones( n ), m_ones( m );
282 
283  ret = ret ? ret : grb::set< grb::descriptors::use_index >( labels, 0 );
284  ret = ret ? ret : grb::set( n_ones, true );
285  ret = ret ? ret : grb::set( m_ones, true );
286 
287  // declare pattern matrix
288  Matrix< void > M( k, n );
289  ret = ret ? ret : grb::resize( M, n );
290 
291  // declare the sizes vector
292  Vector< size_t > sizes( k );
293 
294  // declare auxiliary vectors and matrices
295  Matrix< IOType > K_aux( k, m );
296  Matrix< size_t > V_aux( k, m );
297 
298  // control variables
299  size_t iter = 0;
300  Vector< indexIOType > clusters_and_distances_prev( n );
301  bool converged;
302 
303  do {
304  (void) ++iter;
305 
306  ret = ret ? ret : grb::set( clusters_and_distances_prev,
307  clusters_and_distances );
308 
309  ret = ret ? ret : mxm( Dist, K, X, add_monoid, dist_op, RESIZE );
310  ret = ret ? ret : mxm( Dist, K, X, add_monoid, dist_op );
311 
312  ret = ret ? ret : vxm( clusters_and_distances, labels, Dist, argmin_monoid,
314 
315  auto converter = grb::utils::makeVectorToMatrixConverter<
316  void, indexIOType
317  > (
318  clusters_and_distances,
319  []( const size_t &ind, const indexIOType &pair ) {
320  return std::make_pair( pair.first, ind );
321  }
322  );
323 
324  ret = ret ? ret : grb::buildMatrixUnique( M, converter.begin(),
325  converter.end(), PARALLEL );
326 
327  ret = ret ? ret : grb::mxm< descriptors::transpose_right >( K_aux, M, X,
328  pattern_sum, RESIZE );
329  ret = ret ? ret : grb::mxm< descriptors::transpose_right >( K_aux, M, X,
330  pattern_sum );
331 
332  ret = ret ? ret : grb::mxv( sizes, M, n_ones, pattern_count );
333 
334  ret = ret ? ret : grb::outer( V_aux, sizes, m_ones,
336  ret = ret ? ret : grb::outer( V_aux, sizes, m_ones,
338 
339  ret = ret ? ret : eWiseApply( K, V_aux, K_aux,
341  ret = ret ? ret : eWiseApply( K, V_aux, K_aux,
343 
344  converged = true;
345  ret = ret ? ret : grb::dot(
346  converged,
347  clusters_and_distances_prev, clusters_and_distances,
348  comparison_monoid,
350  );
351 
352  } while( ret == SUCCESS && !converged && iter < max_iter );
353 
354  if( iter == max_iter ) {
355  std::cout << "\tkmeans reached maximum number of iterations!" << std::endl;
356  return FAILED;
357  }
358  if( converged ) {
359  std::cout << "\tkmeans converged successfully after " << iter
360  << " iterations." << std::endl;
361  return SUCCESS;
362  }
363 
364  std::cout << "\tkmeans finished with unexpected return code!" << std::endl;
365  return ret;
366  }
367 
368  } // namespace algorithms
369 
370 } // namespace grb
371 
372 #endif // end _H_GRB_KMEANS
373 
RC eWiseApply(Vector< OutputType, backend, Coords > &z, const InputType1 alpha, const InputType2 beta, const OP &op=OP(), const Phase &phase=EXECUTE, const typename std::enable_if< !grb::is_object< OutputType >::value &&!grb::is_object< InputType1 >::value &&!grb::is_object< InputType2 >::value &&grb::is_operator< OP >::value, void >::type *const =nullptr)
Computes , out of place, operator version.
Definition: blas1.hpp:208
RC set(Vector< DataType, backend, Coords > &x, const T val, const Phase &phase=EXECUTE, const typename std::enable_if< !grb::is_object< DataType >::value &&!grb::is_object< T >::value, void >::type *const =nullptr) noexcept
Sets all elements of a vector to the given value.
Definition: io.hpp:858
Standard identity for numerical addition.
Definition: identities.hpp:57
An ALP/GraphBLAS matrix.
Definition: matrix.hpp:72
RC
Return codes of ALP primitives.
Definition: rc.hpp:47
Standard identity for the minimum operator.
Definition: identities.hpp:101
Compares std::pair inputs taking the first entry in every pair as the comparison key,...
Definition: ops.hpp:678
static RC broadcast(IOType &inout, const size_t root=0)
Schedules a broadcast operation of a single object of type IOType per process.
Definition: collectives.hpp:244
const_iterator begin() const
Same as cbegin().
Definition: matrix.hpp:359
static constexpr Descriptor no_operation
Indicates no additional pre- or post-processing on any of the GraphBLAS function arguments.
Definition: descriptors.hpp:63
unsigned int Descriptor
Descriptors indicate pre- or post-processing for some or all of the arguments to an ALP/GraphBLAS cal...
Definition: descriptors.hpp:54
size_t nrows(const Matrix< InputType, backend, RIT, CIT, NIT > &A) noexcept
Requests the row size of a given matrix.
Definition: io.hpp:286
The zip operator that operators on keys as a left-hand input and values as a right hand input,...
Definition: ops.hpp:649
RC buildMatrixUnique(Matrix< InputType, implementation, RIT, CIT, NIT > &A, fwd_iterator1 I, const fwd_iterator1 I_end, fwd_iterator2 J, const fwd_iterator2 J_end, fwd_iterator3 V, const fwd_iterator3 V_end, const IOMode mode)
Assigns nonzeroes to the matrix from a coordinate format.
Definition: io.hpp:1340
This operator assigns the left-hand input if the right-hand input evaluates true.
Definition: ops.hpp:85
size_t ncols(const Matrix< InputType, backend, RIT, CIT, NIT > &A) noexcept
Requests the column size of a given matrix.
Definition: io.hpp:339
RC vxm(Vector< IOType, backend, Coords > &u, const Vector< InputType3, backend, Coords > &u_mask, const Vector< InputType1, backend, Coords > &v, const Vector< InputType4, backend, Coords > &v_mask, const Matrix< InputType2, backend, RIT, CIT, NIT > &A, const Semiring &semiring=Semiring(), const Phase &phase=EXECUTE, typename std::enable_if< grb::is_semiring< Semiring >::value &&!grb::is_object< InputType1 >::value &&!grb::is_object< InputType2 >::value &&!grb::is_object< InputType3 >::value &&!grb::is_object< InputType4 >::value &&!grb::is_object< IOType >::value, void >::type *=nullptr)
Left-handed in-place doubly-masked sparse matrix times vector multiplication, .
Definition: blas2.hpp:307
RC foldl(IOType &x, const Vector< InputType, backend, Coords > &y, const Vector< MaskType, backend, Coords > &mask, const Monoid &monoid=Monoid(), const typename std::enable_if< !grb::is_object< IOType >::value &&!grb::is_object< InputType >::value &&!grb::is_object< MaskType >::value &&grb::is_monoid< Monoid >::value, void >::type *const =nullptr)
Reduces, or folds, a vector into a scalar.
Definition: blas1.hpp:3840
RC mxv(Vector< IOType, backend, Coords > &u, const Vector< InputType3, backend, Coords > &u_mask, const Matrix< InputType2, backend, RIT, CIT, NIT > &A, const Vector< InputType1, backend, Coords > &v, const Vector< InputType4, backend, Coords > &v_mask, const Semiring &semiring=Semiring(), const Phase &phase=EXECUTE, const typename std::enable_if< grb::is_semiring< Semiring >::value &&!grb::is_object< IOType >::value &&!grb::is_object< InputType1 >::value &&!grb::is_object< InputType2 >::value &&!grb::is_object< InputType3 >::value &&!grb::is_object< InputType4 >::value, void >::type *const =nullptr)
Right-handed in-place doubly-masked sparse matrix times vector multiplication, .
Definition: blas2.hpp:243
This operation returns the squared difference between two numbers.
Definition: ops.hpp:623
Indicates when one of the grb::algorithms has failed to achieve its intended result,...
Definition: rc.hpp:154
This operator assigns the right-hand input if the left-hand input evaluates true.
Definition: ops.hpp:141
This operator takes the minimum of the two input parameters and writes the result to the output varia...
Definition: ops.hpp:274
Standard identity for the logical AND operator.
Definition: identities.hpp:178
Reversed division of two numbers.
Definition: ops.hpp:353
This operator takes the sum of the two input parameters and writes it to the output variable.
Definition: ops.hpp:175
The ALP/GraphBLAS namespace.
Definition: graphblas.hpp:477
The main header to include in order to use the ALP/GraphBLAS API.
RC dot(OutputType &z, const Vector< InputType1, backend, Coords > &x, const Vector< InputType2, backend, Coords > &y, const AddMonoid &addMonoid=AddMonoid(), const AnyOp &anyOp=AnyOp(), const Phase &phase=EXECUTE, const typename std::enable_if< !grb::is_object< OutputType >::value &&!grb::is_object< InputType1 >::value &&!grb::is_object< InputType2 >::value &&grb::is_monoid< AddMonoid >::value &&grb::is_operator< AnyOp >::value, void >::type *const =nullptr)
Calculates the dot product, , under a given additive monoid and multiplicative operator.
Definition: blas1.hpp:4056
Speculatively assumes that the output container(s) of the requested operation lack the necessary capa...
Definition: phase.hpp:187
For backends that support multiple user processes this class defines some basic primitives to support...
Definition: spmd.hpp:51
Parallel mode IO.
Definition: iomode.hpp:92
size_t size(const Vector< DataType, backend, Coords > &x) noexcept
Request the size of a given vector.
Definition: io.hpp:235
The logical and.
Definition: ops.hpp:490
RC mxm(Matrix< OutputType, backend, CIT1, RIT1, NIT1 > &C, const Matrix< InputType1, backend, CIT2, RIT2, NIT2 > &A, const Matrix< InputType2, backend, CIT3, RIT3, NIT3 > &B, const Semiring &ring=Semiring(), const Phase &phase=EXECUTE)
Unmasked and in-place sparse matrix–sparse matrix multiplication (SpMSpM), .
Definition: blas3.hpp:96
Indicates the primitive has executed successfully.
Definition: rc.hpp:54
The argmin operator on key-value pairs.
Definition: ops.hpp:573
A generalised semiring.
Definition: semiring.hpp:190
RC setElement(Vector< DataType, backend, Coords > &x, const T val, const size_t i, const Phase &phase=EXECUTE, const typename std::enable_if< !grb::is_object< DataType >::value &&!grb::is_object< T >::value, void >::type *const =nullptr)
Sets the element of a given vector at a given position to a given value.
Definition: io.hpp:1129
RC clear(Vector< DataType, backend, Coords > &x) noexcept
Clears a given vector of all nonzeroes.
Definition: io.hpp:574
One or more of the ALP/GraphBLAS objects passed to the primitive that returned this error have mismat...
Definition: rc.hpp:90
RC resize(Vector< InputType, backend, Coords > &x, const size_t new_nz) noexcept
Resizes the nonzero capacity of this vector.
Definition: io.hpp:703
RC kpp_initialisation(Matrix< IOType > &K, const Matrix< IOType > &X, const Operator &dist_op=Operator())
a simple implementation of the k++ initialisation algorithm for kmeans
Definition: kmeans.hpp:60
RC kmeans_iteration(Matrix< IOType > &K, Vector< std::pair< size_t, IOType > > &clusters_and_distances, const Matrix< IOType > &X, const size_t max_iter=1000, const Operator &dist_op=Operator())
The kmeans iteration given an initialisation.
Definition: kmeans.hpp:224
A generalised monoid.
Definition: monoid.hpp:54