ALP User Documentation 0.7.0
Algebraic Programming User Documentation
kmeans.hpp
Go to the documentation of this file.
1
2/*
3 * Copyright 2021 Huawei Technologies Co., Ltd.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
27#ifndef _H_GRB_KMEANS
28#define _H_GRB_KMEANS
29
30#include <chrono>
31#include <random>
32
33#include <assert.h>
34
35#include <graphblas.hpp>
36
37
38namespace grb {
39
40 namespace algorithms {
41
55 template<
57 typename IOType = double,
58 class Operator = operators::square_diff< IOType, IOType, IOType >
59 >
62 const Matrix< IOType > &X,
63 const Operator &dist_op = Operator()
64 ) {
65 // declare monoids and semirings
67 Monoid<
70 > min_monoid;
75 > pattern_sum;
76
77 // runtime sanity checks: the row dimension of X should match the column
78 // dimension of K
79 if( ncols( K ) != nrows( X ) ) {
80 return MISMATCH;
81 }
82
83 // running error code
84 RC ret = SUCCESS;
85
86 // get problem dimensions
87 const size_t n = ncols( X );
88 const size_t m = nrows( X );
89 const size_t k = nrows( K );
90
91 // declare vector of indices of columns of X selected as the initial
92 // centroids
93 Vector< size_t > selected_indices( k );
94
95 // declare column selection vector
96 Vector< bool > col_select( n );
97
98 // declare selected point
99 Vector< IOType > selected( m );
100
101 // declare vector of distances from the selected point
102 Vector< IOType > selected_distances( n );
103
104 // declare vector of minimum distances to all points selected so far
105 Vector< IOType > min_distances( n );
106 ret = ret ? ret : grb::set( min_distances,
108 );
109
110 // generate first centroid by selecting a column of X uniformly at random
111
112 size_t i;
113 {
114 const size_t seed_uniform =
115 std::chrono::system_clock::now().time_since_epoch().count();
116 std::default_random_engine random_generator( seed_uniform );
117 std::uniform_int_distribution< size_t > uniform( 0, n - 1 );
118 i = uniform( random_generator );
119 }
120
121 for( size_t l = 0; ret == SUCCESS && l < k; ++l ) {
122
123 ret = grb::clear( col_select );
124 ret = ret ? ret : grb::clear( selected );
125 ret = ret ? ret : grb::clear( selected_distances );
126
127 ret = ret ? ret : grb::setElement( selected_indices, i, l );
128
129 ret = ret ? ret : grb::setElement( col_select, true, i );
130
131 ret = ret ? ret : grb::vxm< grb::descriptors::transpose_matrix >(
132 selected, col_select, X, pattern_sum );
133
134 ret = ret ? ret : grb::vxm( selected_distances, selected, X, add_monoid,
135 dist_op );
136
137 ret = ret ? ret : grb::foldl( min_distances, selected_distances,
138 min_monoid );
139
140 // TODO the remaining part of the loop should be replaced with the alias
141 // algorithm
142
143 IOType range = add_monoid.template getIdentity< IOType >();
144 ret = ret ? ret : grb::foldl( range, min_distances, add_monoid );
145
146 double sample = -1;
147 if( ret == SUCCESS ) {
148 {
149 const size_t seed =
150 std::chrono::system_clock::now().time_since_epoch().count();
151 std::default_random_engine generator( seed );
152 std::uniform_real_distribution< double > uniform( 0, 1 );
153 sample = uniform( generator );
154 }
155 ret = grb::collectives<>::broadcast( sample, 0 );
156 }
157 assert( sample >= 0 );
158
159 // The following is not standard ALP/GraphBLAS and does not work for P>1
160 // (TODO internal issue #320)
161 if( ret == SUCCESS ) {
162 assert( grb::spmd<>::nprocs() == 1 );
163 IOType * const raw = internal::getRaw( selected_distances );
164 IOType running_sum = 0;
165 i = 0;
166 do {
167 running_sum += static_cast< double >( raw[ i ] ) / range;
168 } while( running_sum < sample && ++i < n );
169 i = ( i == n ) ? n - 1 : i;
170 }
171 }
172
173 // create the matrix K by selecting the columns of X indexed by
174 // selected_indices
175
176 // declare pattern matrix
177 Matrix< void > M( k, n );
178 ret = ret ? ret : grb::resize( M, n );
179
180 if( ret == SUCCESS ) {
181 auto converter = grb::utils::makeVectorToMatrixConverter< void, size_t >(
182 selected_indices, [](const size_t &ind, const size_t &val ) {
183 return std::make_pair( ind, val );
184 }
185 );
186 ret = grb::buildMatrixUnique( M, converter.begin(), converter.end(),
187 PARALLEL );
188 }
189
190 ret = ret ? ret : grb::mxm< descriptors::transpose_right >( K, M, X,
191 pattern_sum, RESIZE );
192 ret = ret ? ret : grb::mxm< descriptors::transpose_right >( K, M, X,
193 pattern_sum );
194
195 if( ret != SUCCESS ) {
196 std::cout << "\tkpp finished with unexpected return code!" << std::endl;
197 }
198
199 return ret;
200 }
201
219 template<
221 typename IOType = double,
223 >
226 Vector< std::pair< size_t, IOType > > &clusters_and_distances,
227 const Matrix< IOType > &X,
228 const size_t max_iter = 1000,
229 const Operator &dist_op = Operator()
230 ) {
231 // declare monoids and semirings
232
233 typedef std::pair< size_t, IOType > indexIOType;
234
236
237 Monoid<
240 > argmin_monoid;
241
242 Monoid<
245 > comparison_monoid;
246
247 Semiring<
251 > pattern_sum;
252
253 Semiring<
257 > pattern_count;
258
259 // runtime sanity checks: the row dimension of X should match the column
260 // dimension of K
261 if( ncols( K ) != nrows( X ) ) {
262 return MISMATCH;
263 }
264 if( size( clusters_and_distances ) != ncols( X ) ) {
265 return MISMATCH;
266 }
267
268 // running error code
269 RC ret = SUCCESS;
270
271 // get problem dimensions
272 const size_t n = ncols( X );
273 const size_t m = nrows( X );
274 const size_t k = nrows( K );
275
276 // declare distance matrix
277 Matrix< IOType > Dist( k, n );
278
279 // declare and initialise labels vector and ones vectors
280 Vector< size_t > labels( k );
281 Vector< bool > n_ones( n ), m_ones( m );
282
283 ret = ret ? ret : grb::set< grb::descriptors::use_index >( labels, 0 );
284 ret = ret ? ret : grb::set( n_ones, true );
285 ret = ret ? ret : grb::set( m_ones, true );
286
287 // declare pattern matrix
288 Matrix< void > M( k, n );
289 ret = ret ? ret : grb::resize( M, n );
290
291 // declare the sizes vector
292 Vector< size_t > sizes( k );
293
294 // declare auxiliary vectors and matrices
295 Matrix< IOType > K_aux( k, m );
296 Matrix< size_t > V_aux( k, m );
297
298 // control variables
299 size_t iter = 0;
300 Vector< indexIOType > clusters_and_distances_prev( n );
301 bool converged;
302
303 do {
304 (void) ++iter;
305
306 ret = ret ? ret : grb::set( clusters_and_distances_prev,
307 clusters_and_distances );
308
309 ret = ret ? ret : mxm( Dist, K, X, add_monoid, dist_op, RESIZE );
310 ret = ret ? ret : mxm( Dist, K, X, add_monoid, dist_op );
311
312 ret = ret ? ret : vxm( clusters_and_distances, labels, Dist, argmin_monoid,
314
315 auto converter = grb::utils::makeVectorToMatrixConverter<
316 void, indexIOType
317 > (
318 clusters_and_distances,
319 []( const size_t &ind, const indexIOType &pair ) {
320 return std::make_pair( pair.first, ind );
321 }
322 );
323
324 ret = ret ? ret : grb::buildMatrixUnique( M, converter.begin(),
325 converter.end(), PARALLEL );
326
327 ret = ret ? ret : grb::mxm< descriptors::transpose_right >( K_aux, M, X,
328 pattern_sum, RESIZE );
329 ret = ret ? ret : grb::mxm< descriptors::transpose_right >( K_aux, M, X,
330 pattern_sum );
331
332 ret = ret ? ret : grb::mxv( sizes, M, n_ones, pattern_count );
333
334 ret = ret ? ret : grb::outer( V_aux, sizes, m_ones,
336 ret = ret ? ret : grb::outer( V_aux, sizes, m_ones,
338
339 ret = ret ? ret : eWiseApply( K, V_aux, K_aux,
341 ret = ret ? ret : eWiseApply( K, V_aux, K_aux,
343
344 converged = true;
345 ret = ret ? ret : grb::dot(
346 converged,
347 clusters_and_distances_prev, clusters_and_distances,
348 comparison_monoid,
350 );
351
352 } while( ret == SUCCESS && !converged && iter < max_iter );
353
354 if( iter == max_iter ) {
355 std::cout << "\tkmeans reached maximum number of iterations!" << std::endl;
356 return FAILED;
357 }
358 if( converged ) {
359 std::cout << "\tkmeans converged successfully after " << iter
360 << " iterations." << std::endl;
361 return SUCCESS;
362 }
363
364 std::cout << "\tkmeans finished with unexpected return code!" << std::endl;
365 return ret;
366 }
367
368 } // namespace algorithms
369
370} // namespace grb
371
372#endif // end _H_GRB_KMEANS
373
An ALP/GraphBLAS matrix.
Definition: matrix.hpp:71
const_iterator begin() const
Same as cbegin().
Definition: matrix.hpp:358
A generalised monoid.
Definition: monoid.hpp:54
A generalised semiring.
Definition: semiring.hpp:186
static RC broadcast(IOType &inout, const size_t root=0)
Schedules a broadcast operation of a single object of type IOType per process.
Definition: collectives.hpp:244
Standard identity for the minimum operator.
Definition: identities.hpp:101
Standard identity for the logical AND operator.
Definition: identities.hpp:178
Standard identity for numerical addition.
Definition: identities.hpp:57
This operator takes the sum of the two input parameters and writes it to the output variable.
Definition: ops.hpp:177
The argmin operator on key-value pairs.
Definition: ops.hpp:573
Reversed division of two numbers.
Definition: ops.hpp:355
Compares std::pair inputs taking the first entry in every pair as the comparison key,...
Definition: ops.hpp:680
This operator assigns the left-hand input if the right-hand input evaluates true.
Definition: ops.hpp:88
The logical and.
Definition: ops.hpp:492
This operator takes the minimum of the two input parameters and writes the result to the output varia...
Definition: ops.hpp:276
This operator assigns the right-hand input if the left-hand input evaluates true.
Definition: ops.hpp:143
This operation returns the squared difference between two numbers.
Definition: ops.hpp:625
The zip operator that operators on keys as a left-hand input and values as a right hand input,...
Definition: ops.hpp:651
For backends that support multiple user processes this class defines some basic primitives to support...
Definition: spmd.hpp:51
The main header to include in order to use the ALP/GraphBLAS API.
RC eWiseApply(Vector< OutputType, backend, Coords > &z, const InputType1 alpha, const InputType2 beta, const OP &op=OP(), const Phase &phase=EXECUTE, const typename std::enable_if< !grb::is_object< OutputType >::value &&!grb::is_object< InputType1 >::value &&!grb::is_object< InputType2 >::value &&grb::is_operator< OP >::value, void >::type *const =nullptr)
Computes , out of place, operator version.
Definition: blas1.hpp:208
RC foldl(IOType &x, const Vector< InputType, backend, Coords > &y, const Vector< MaskType, backend, Coords > &mask, const Monoid &monoid=Monoid(), const typename std::enable_if< !grb::is_object< IOType >::value &&!grb::is_object< InputType >::value &&!grb::is_object< MaskType >::value &&grb::is_monoid< Monoid >::value, void >::type *const =nullptr)
Reduces, or folds, a vector into a scalar.
Definition: blas1.hpp:3840
RC dot(OutputType &z, const Vector< InputType1, backend, Coords > &x, const Vector< InputType2, backend, Coords > &y, const AddMonoid &addMonoid=AddMonoid(), const AnyOp &anyOp=AnyOp(), const Phase &phase=EXECUTE, const typename std::enable_if< !grb::is_object< OutputType >::value &&!grb::is_object< InputType1 >::value &&!grb::is_object< InputType2 >::value &&grb::is_monoid< AddMonoid >::value &&grb::is_operator< AnyOp >::value, void >::type *const =nullptr)
Calculates the dot product, , under a given additive monoid and multiplicative operator.
Definition: blas1.hpp:4056
RC mxv(Vector< IOType, backend, Coords > &u, const Vector< InputType3, backend, Coords > &u_mask, const Matrix< InputType2, backend, RIT, CIT, NIT > &A, const Vector< InputType1, backend, Coords > &v, const Vector< InputType4, backend, Coords > &v_mask, const Semiring &semiring=Semiring(), const Phase &phase=EXECUTE, const typename std::enable_if< grb::is_semiring< Semiring >::value &&!grb::is_object< IOType >::value &&!grb::is_object< InputType1 >::value &&!grb::is_object< InputType2 >::value &&!grb::is_object< InputType3 >::value &&!grb::is_object< InputType4 >::value, void >::type *const =nullptr)
Right-handed in-place doubly-masked sparse matrix times vector multiplication, .
Definition: blas2.hpp:243
RC vxm(Vector< IOType, backend, Coords > &u, const Vector< InputType3, backend, Coords > &u_mask, const Vector< InputType1, backend, Coords > &v, const Vector< InputType4, backend, Coords > &v_mask, const Matrix< InputType2, backend, RIT, CIT, NIT > &A, const Semiring &semiring=Semiring(), const Phase &phase=EXECUTE, typename std::enable_if< grb::is_semiring< Semiring >::value &&!grb::is_object< InputType1 >::value &&!grb::is_object< InputType2 >::value &&!grb::is_object< InputType3 >::value &&!grb::is_object< InputType4 >::value &&!grb::is_object< IOType >::value, void >::type *=nullptr)
Left-handed in-place doubly-masked sparse matrix times vector multiplication, .
Definition: blas2.hpp:307
RC mxm(Matrix< OutputType, backend, CIT, RIT, NIT > &C, const Matrix< InputType1, backend, CIT, RIT, NIT > &A, const Matrix< InputType2, backend, CIT, RIT, NIT > &B, const Semiring &ring=Semiring(), const Phase &phase=EXECUTE)
Unmasked and in-place sparse matrix–sparse matrix multiplication (SpMSpM), .
Definition: blas3.hpp:94
RC set(Vector< DataType, backend, Coords > &x, const T val, const Phase &phase=EXECUTE, const typename std::enable_if< !grb::is_object< DataType >::value &&!grb::is_object< T >::value, void >::type *const =nullptr) noexcept
Sets all elements of a vector to the given value.
Definition: io.hpp:857
RC buildMatrixUnique(Matrix< InputType, implementation > &A, fwd_iterator1 I, const fwd_iterator1 I_end, fwd_iterator2 J, const fwd_iterator2 J_end, fwd_iterator3 V, const fwd_iterator3 V_end, const IOMode mode)
Assigns nonzeroes to the matrix from a coordinate format.
Definition: io.hpp:1336
size_t size(const Vector< DataType, backend, Coords > &x) noexcept
Request the size of a given vector.
Definition: io.hpp:235
size_t ncols(const Matrix< InputType, backend, RIT, CIT, NIT > &A) noexcept
Requests the column size of a given matrix.
Definition: io.hpp:339
RC setElement(Vector< DataType, backend, Coords > &x, const T val, const size_t i, const Phase &phase=EXECUTE, const typename std::enable_if< !grb::is_object< DataType >::value &&!grb::is_object< T >::value, void >::type *const =nullptr)
Sets the element of a given vector at a given position to a given value.
Definition: io.hpp:1128
size_t nrows(const Matrix< InputType, backend, RIT, CIT, NIT > &A) noexcept
Requests the row size of a given matrix.
Definition: io.hpp:286
RC resize(Vector< InputType, backend, Coords > &x, const size_t new_nz) noexcept
Resizes the nonzero capacity of this vector.
Definition: io.hpp:703
RC clear(Vector< DataType, backend, Coords > &x) noexcept
Clears a given vector of all nonzeroes.
Definition: io.hpp:574
RC kmeans_iteration(Matrix< IOType > &K, Vector< std::pair< size_t, IOType > > &clusters_and_distances, const Matrix< IOType > &X, const size_t max_iter=1000, const Operator &dist_op=Operator())
The kmeans iteration given an initialisation.
Definition: kmeans.hpp:224
RC kpp_initialisation(Matrix< IOType > &K, const Matrix< IOType > &X, const Operator &dist_op=Operator())
a simple implementation of the k++ initialisation algorithm for kmeans
Definition: kmeans.hpp:60
static constexpr Descriptor no_operation
Indicates no additional pre- or post-processing on any of the GraphBLAS function arguments.
Definition: descriptors.hpp:63
The ALP/GraphBLAS namespace.
Definition: graphblas.hpp:452
@ PARALLEL
Parallel mode IO.
Definition: iomode.hpp:92
RC
Return codes of ALP primitives.
Definition: rc.hpp:47
@ MISMATCH
One or more of the ALP/GraphBLAS objects passed to the primitive that returned this error have mismat...
Definition: rc.hpp:90
@ SUCCESS
Indicates the primitive has executed successfully.
Definition: rc.hpp:54
@ FAILED
Indicates when one of the grb::algorithms has failed to achieve its intended result,...
Definition: rc.hpp:154
unsigned int Descriptor
Descriptors indicate pre- or post-processing for some or all of the arguments to an ALP/GraphBLAS cal...
Definition: descriptors.hpp:54
@ RESIZE
Speculatively assumes that the output container(s) of the requested operation lack the necessary capa...
Definition: phase.hpp:187