SparseLibrary  Version 1.6.0
RDScheme.hpp
1 /*
2  * Copyright (c) 2007-2014, A. N. Yzelman, Utrecht University 2007-2011;
3  * KU Leuven 2011-2014.
4  * R. H. Bisseling, Utrecht University 2007-2014.
5  *
6  * This file is part of the Sparse Library.
7  *
8  * This library was developed under supervision of Prof. dr. Rob H. Bisseling at
9  * Utrecht University, from 2007 until 2011. From 2011-2014, development continued
10  * at KU Leuven, where Prof. dr. Dirk Roose contributed significantly to the ideas
11  * behind the newer parts of the library code.
12  *
13  * The Sparse Library is free software: you can redistribute it and/or modify
14  * it under the terms of the GNU General Public License as published by the
15  * Free Software Foundation, either version 3 of the License, or (at your
16  * option) any later version.
17  *
18  * The Sparse Library is distributed in the hope that it will be useful, but
19  * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20  * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
21  * for more details.
22  *
23  * You should have received a copy of the GNU General Public License along
24  * with the Sparse Library. If not, see <http://www.gnu.org/licenses/>.
25  */
26 
27 
28 /*
29  * File created by:
30  * A. N. Yzelman, Dept. of Computer Science, KU Leuven, 2011.
31  */
32 
33 
34 #include <iostream>
35 #include <vector>
36 #include <map>
37 #include <pthread.h>
38 
39 #ifndef _NO_LIBNUMA
40  #include <numa.h>
41 #endif
42 
43 #include "SparseMatrix.hpp"
44 #include "Hilbert.hpp"
45 
46 #ifndef _H_RDScheme
47 #define _H_RDScheme
48 
49 /*
50  * When defined, thread 0 will use the global y vector for its local
51  * computations. This introduces extra work in the form of one sync,
52  * and reduces the number of processors available for the parallel collect.
53  * The advantage is less data replication of the output vector.
54  *
55  * The synchronisation cannot be prevented. Using all processors in
56  * the collect code is possible, but has not yet been implemented.
57  */
58 //#define RDScheme_GLOBAL_Y
59 
60 #ifndef _TESTMODE
61  /*
62  * When defined, RDScheme will not collect output results in the
63  * global output vector passed to this library. Used for timing
64  * the true SpMV speeds only.
65  */
66  #define RDScheme_NO_COLLECT
67 #endif
68 
69 /** Shared data for RDScheme threads. */
70 template< typename T >
71 class RDScheme_shared_data {
72 
73  public:
74 
75  /** Thread ID. */
76  size_t id;
77 
78  /** Total number of processors. */
79  size_t P;
80 
81  /** Mode: 0 undef, 1 init, 2 zax, 3 zxa, 4 exit. */
82  unsigned char mode;
83 
84  /** How many times to repeat the operation set in `mode' (2 and 3 only). */
85  unsigned long int repeat;
86 
87  /** Pointer to the global input set of triplets. */
88  std::vector< Triplet< T > > *original;
89 
90  /** Will store rowsums. */
91  size_t *nzb;
92 
93  /** Will store local timing. */
94  double time;
95 
96  /** Will store memory use. */
97  size_t bytes;
98 
99  /** Mutex used for synchronisation. */
100  pthread_mutex_t* mutex;
101 
102  /** Condition used for synchronisation. */
103  pthread_cond_t* cond;
104 
105  /** Mutex used for end sync. */
106  pthread_mutex_t* end_mutex;
107 
108  /** Condition used for end sync. */
109  pthread_cond_t* end_cond;
110 
111  /** Counter used for synchronisation. */
112  size_t *sync;
113 
114  /** Counter used for end sync. */
115  size_t *end_sync;
116 
117  /** Length of the local output vector. */
118  size_t output_vector_size;
119 
120  /** Offset of the local output vector compared to global indices. */
121  size_t output_vector_offset;
122 
123  /** Pointer to the local output vector. */
124  T *local_y;
125 
126  /** Base constructor. */
127  RDScheme_shared_data(): id( -1 ), P( -1 ), mode( 0 ), repeat( 0 ), original( NULL ), nzb( NULL ), time( 0 ),
128  mutex( NULL ), cond( NULL ), end_mutex( NULL ), end_cond( NULL ),
129  sync( NULL ), end_sync( NULL ),
130  output_vector_size( -1 ), output_vector_offset( -1 ), local_y( NULL ) {}
131 
148  RDScheme_shared_data( size_t _id, size_t _P,
149  std::vector< Triplet< T > > *_original,
150  size_t *_nzb,
151  pthread_mutex_t *_mutex, pthread_cond_t *_cond, pthread_mutex_t *_end_mutex, pthread_cond_t *_end_cond,
152  size_t *_sync, size_t *_end_sync,
153  size_t _ovsize, size_t _ovoffset ):
154  id( _id ), P( _P ), mode( 1 ), repeat( 1 ), original( _original ), nzb( _nzb ), time( 0 ),
155  mutex( _mutex ), cond( _cond ), end_mutex( _end_mutex ), end_cond( _end_cond ),
156  sync( _sync ), end_sync( _end_sync ),
157  output_vector_size( _ovsize ), output_vector_offset( _ovoffset ), local_y( NULL ) {}
158 };
159 
160 /** Full parallel row-distributed SpMV, based on CSB (Morton curve + Cilk)
161  *  and PThreads. */
162 template< typename T, typename DS >
163 class RDScheme: public SparseMatrix< T, ULI > {
164 
165  private:
166 
167  protected:
168 
169  /** Number of threads to fire up. */
170  static size_t P;
171 
172  /** Input vector. */
173  static const T* input;
174 
175  /** Output vector. */
176  static T* output;
177 
178  /** The pthreads associated with this data structure. */
179  pthread_t *threads;
180 
181  /** Array of initial thread data. */
182  RDScheme_shared_data< T > *thread_data;
183 
184  /** Clock type used for thread-local timing. */
185  static clockid_t global_clock_id;
186 
187  /** Stop/continue mechanism: mutex. */
188  pthread_mutex_t mutex;
189 
190  /** Stop/continue mechanism: condition. */
191  pthread_cond_t cond;
192 
193  /** Wait for end mechanism: mutex. */
194  pthread_mutex_t end_mutex;
195 
196  /** Wait for end mechanism: condition. */
197  pthread_cond_t end_cond;
198 
199  /** Used for synchronising threads. */
200  size_t sync;
201 
202  /** Used for construction end signal. */
203  size_t end_sync;
204 
205  public:
206 
207  /** Base constructor; loads a matrix from a matrix-market input file. */
208  RDScheme( const std::string file, T zero ) {
209  this->loadFromFile( file, zero );
210  }
211 
212  /** Base constructor; loads a matrix from an input set of triplets. */
213  RDScheme( std::vector< Triplet< T > >& input, ULI m, ULI n, T zero ) {
214  load( input, m, n, zero );
215  }
216 
217  /** Base destructor. */
218  virtual ~RDScheme() {
219  //set all daemon threads to exit mode
220  for( size_t i=0; i<P; i++ )
221  thread_data[ i ].mode = 4;
222 
223  //wake up all daemon threads
224  pthread_mutex_lock( &mutex );
225  pthread_cond_broadcast( &cond );
226  pthread_mutex_unlock( &mutex );
227 
228  //allow threads to exit gracefully
229  for( size_t i=0; i<P; i++ )
230  pthread_join( threads[ i ], NULL );
231 
232  //destroy data
233  delete [] thread_data;
234  delete [] threads;
235  pthread_mutex_destroy( &mutex );
236  pthread_cond_destroy( &cond );
237  }
238 
239  /**
240   *  Lets the calling thread wait for the end of the SpMV multiply.
241   */
242  void wait() {
243  //wait for end signal
244  pthread_cond_wait( &end_cond, &end_mutex );
245  pthread_mutex_unlock( &end_mutex );
246  }
247 
256  virtual void load( std::vector< Triplet< T > >& input, const ULI m, const ULI n, const T zero ) {
257  //get number of cores available
258  P = MachineInfo::getInstance().cores();
259 
260 #ifndef _NO_LIBNUMA
261  //set kernel to local thread allocation if it wasn't already the case
262  numa_set_localalloc();
263 #endif
264 
265  //base settings
266  this->zero_element = zero;
267  this->nor = m;
268  this->noc = n;
269  this->nnz = input.size();
270 
271  size_t *nzb = new size_t [ this->m() ];
272  for( size_t i=0; i<m; i++ ) nzb[ i ] = 0;
273 
274  //create P threads :)
275  this->threads = new pthread_t[ P ];
276  //initialize local initialisation data
277  thread_data = new RDScheme_shared_data< T >[ P ];
278  //initialize mutexes and conditions and a synchronisation counter
279  pthread_mutex_init( &mutex, NULL );
280  pthread_cond_init ( &cond, NULL );
281  pthread_mutex_init( &end_mutex, NULL );
282  pthread_cond_init ( &end_cond, NULL );
283  sync = 0;
284  end_sync = 0;
285  //lock end mutex (disallow threads that are created to signal for end
286  //before this thread is done with spawning children)
287  pthread_mutex_lock( &end_mutex );
288  //go forth and multiply
289  for( size_t i=0; i<P; i++ ) {
290  //build thread-local init data
291  thread_data[ i ] = RDScheme_shared_data<T>( i, P, &input, nzb, &mutex, &cond, &end_mutex, &end_cond, &sync, &end_sync, -1, -1 );
292  //set fixed affinity for threads
293  cpu_set_t mask;
294  CPU_ZERO( &mask );
295  CPU_SET ( i, &mask );
296 
297  //TODO: use hwloc for better numa-aware pinning
298  /*hwloc_topology_t topology;
299  hwloc_topology_init ( &topology );
300  hwloc_topology_load( topology );
301  hwloc_bitmap_t cpuset;*/
302 
303  //prepare attributes
304  pthread_attr_t attr;
305  pthread_attr_init( &attr );
306  //set fixed affinity in the attributes, so that the thread starts pinned immediately
307  pthread_attr_setaffinity_np( &attr, sizeof( cpu_set_t ), &mask );
308  //fire up thread
309  pthread_create( &threads[i], &attr, &RDScheme::thread, (void*) &thread_data[i] );
310  //free attr
311  pthread_attr_destroy( &attr );
312  }
313 
314  //wait for threads to finish initialisation
315  wait();
316 
317  //delete temporary array
318  delete [] nzb;
319  }
320 
321  /** End synchronisation code. */
322  static void end( pthread_mutex_t* mutex, pthread_cond_t* cond, size_t *sync, const size_t P ) {
323  pthread_mutex_lock( mutex );
324  (*sync)++;
325  if( *sync == P ) {
326  //only one thread is waiting on this condition, use signal
327  pthread_cond_signal( cond );
328  *sync = 0;
329  }
330  pthread_mutex_unlock( mutex );
331  }
332 
333  /** Synchronises all threads. */
334  static void synchronise( pthread_mutex_t* mutex, pthread_cond_t* cond, size_t *sync, const size_t P ) {
335  pthread_mutex_lock( mutex );
336  (*sync)++;
337  if( *sync == P ) {
338  *sync = 0;
339  pthread_cond_broadcast( cond );
340  } else
341  pthread_cond_wait( cond, mutex );
342  pthread_mutex_unlock( mutex );
343  }
344 
345  /** SPMD code for each thread involved with parallel SpMV multiplication. */
346  static void* thread( void *data ) {
347  //get short-hand notation
348  RDScheme_shared_data< T > *shared = (RDScheme_shared_data< T >*) data;
349  const size_t id = shared->id;
350  const size_t P = shared->P;
351  const size_t nnz = shared->original->size();
352  pthread_mutex_t *mutex = shared->mutex;
353  pthread_cond_t *cond = shared->cond;
354 
355  cpu_set_t mask;
356  CPU_ZERO( &mask );
357  pthread_getaffinity_np( pthread_self(), sizeof( cpu_set_t ), &mask );
358 
359  //sanity checks
360  if( !CPU_ISSET( id, &mask ) ) {
361  std::cerr << "Incorrect pinning for thread " << id << "!" << std::endl;
362  exit( 1 );
363  }
364  for( size_t s=0; s<P; s++ ) {
365  if( s==id ) continue;
366  if( CPU_ISSET( s, &mask ) ) {
367  std::cerr << "Thread " << id << " mask is larger than one core" << " (" << s << " is set)!" << std::endl;
368  exit( 1 );
369  }
370  }
371 
372  //prepare to get global matrix dimensions
373  ULI m, n;
374  m = n = 0;
375  //put rowsums in nzb
376  const size_t blocksize = (nnz % P) > 0 ? nnz / P + 1 : nnz / P;
377  for( size_t i=0; i<nnz; i++ ) {
378  const unsigned long int currow = (*(shared->original))[ i ].i();
379  const unsigned long int curcol = (*(shared->original))[ i ].j();
380  if( currow >= id * blocksize && currow < (id + 1) * blocksize )
381  shared->nzb[ currow ]++;
382  if( currow > m ) m = currow;
383  if( curcol > n ) n = curcol;
384  }
385 
386  //dimensions are one higher than max indices
387  m++;
388  n++;
389 
390  //sync
391  RDScheme::synchronise( mutex, cond, shared->sync, shared->P );
392 
393  //determine distribution
394  const size_t nnz_target = nnz / P;
395  size_t cursum = 0;
396 
397  //first sanity check
398  for( unsigned long int i=0; i<m; i++ ) cursum += shared->nzb[ i ];
399  assert( cursum == nnz );
400 
401  //continue
402  cursum = 0;
403  size_t start, end, k = 0;
404  start = end = -1;
405  //get start position for s=0 correct
406  if( id == 0 ) start = 0;
407  //do greedy load balancing to get ranges for processors 0 to P-1
408  for( size_t i = 0; i < m; i++ ) {
409  cursum += shared->nzb[ i ];
410  if( cursum >= nnz_target ) {
411  if( k == id ) end = i + 1;
412  if(k+1== id ) start = i + 1;
413  k++;
414  cursum = 0;
415  }
416  }
417  //see if we missed out on any nonzeroes, and set an empty range if so
418  if( start == static_cast< size_t >(-1) ) start = m;
419  if( end == static_cast< size_t >(-1) ) end = m;
420  //get end position for s=P-1 correct
421  if( id == P-1 ) end = m;
422  //derive output vector sizes
423  shared->output_vector_size = end - start;
424  shared->output_vector_offset = start;
425  assert( shared->output_vector_size <= m );
426  assert( shared->output_vector_offset + shared->output_vector_size <= m );
427 
428  //copy to local first
429  std::vector< Triplet< T > > local;
430  for( size_t i = 0; i < static_cast< size_t >(nnz); i++ ) {
431  const size_t currow = (*(shared->original))[ i ].i();
432  if( currow >= start && currow < end )
433  local.push_back(
434  Triplet< T >( (*(shared->original))[ i ].i() - start,
435  (*(shared->original))[ i ].j(),
436  (*(shared->original))[ i ].value )
437  );
438  }
439  m = shared->output_vector_size; //new matrix size is new m times old n
440 
441  //load into datastructure
442  DS dss( local, m, n, 0 );
443 
444  //remember memory usage
445  shared->bytes = dss.bytesUsed();
446 
447  //create local shadow of y to avoid write-contention
448  T* y = NULL;
449 #ifdef RDScheme_GLOBAL_Y
450  if( id > 0 ) {
451 #endif
452  if( shared->output_vector_size > 0 ) {
453  y = new T[ shared->output_vector_size ];
454  for( size_t i=0; i<shared->output_vector_size; i++ )
455  y[ i ] = 0.0;
456  } else
457  y = NULL;
458 #ifdef RDScheme_GLOBAL_Y
459  }
460 #endif
461  shared->local_y = y;
462 
463  //exit construction mode
464  shared->mode = 0;
465 
466  //signal end of construction
467  pthread_mutex_lock( mutex );
468  RDScheme::end( shared->end_mutex, shared->end_cond, shared->end_sync, shared->P );
469 
470  //enter daemon mode
471  while( true ) {
472  struct timespec clk_start, clk_stop;
473  pthread_cond_wait( cond, mutex );
474  pthread_mutex_unlock( mutex );
475 
476  if( shared->mode == 4 ) break;
477 
478 #ifndef NDEBUG
479  const T * const p_input = RDScheme<T,DS>::input;
480  const T * const p_output = RDScheme<T,DS>::output;
481 #endif
482  switch( shared->mode ) {
483  case 3:
484  assert( p_input != NULL );
485  assert( p_output != NULL );
486 #ifdef RDScheme_GLOBAL_Y
487  if( id == 0 ) {
488  y = RDScheme::output;
489  shared->local_y = y;
490  }
491 #endif
492  assert( y != NULL );
493 
494  clock_gettime( global_clock_id, &clk_start);
495  shared->time = 0.0;
496  for( unsigned long int i=0; i<shared->repeat; ++i )
497  dss.zxa( RDScheme<T,DS>::input, y );
498  clock_gettime( global_clock_id, &clk_stop);
499  shared->time = (clk_stop.tv_sec-clk_start.tv_sec)*1000;
500  shared->time += (clk_stop.tv_nsec-clk_start.tv_nsec)/1000000.0;
501 
502 #ifndef RDScheme_NO_COLLECT
503  collectY( shared );
504 #endif
505  break;
506  case 2:
507  assert( p_input != NULL );
508  assert( p_output != NULL );
509 #ifdef RDScheme_GLOBAL_Y
510  if( id == 0 ) {
511  y = RDScheme::output;
512  shared->local_y = y;
513  }
514 #endif
515  assert( y != NULL );
516 
517  clock_gettime( global_clock_id, &clk_start);
518  shared->time = 0.0;
519  for( unsigned long int i=0; i<shared->repeat; ++i )
520  dss.zax( RDScheme<T,DS>::input, y );
521  clock_gettime( global_clock_id, &clk_stop);
522  shared->time = (clk_stop.tv_sec-clk_start.tv_sec)*1000;
523  shared->time += (clk_stop.tv_nsec-clk_start.tv_nsec)/1000000.0;
524 
525 #ifndef RDScheme_NO_COLLECT
526  collectY( shared );
527 #endif
528  break;
529  default:
530  std::cout << "Thread " << id << ": Error, undefined operation (" << shared->mode << ")!" << std::endl;
531  exit( -1 );
532  }
533  shared->mode = 0;
534 
535  //signal end of operation
536  pthread_mutex_lock( mutex );
537  RDScheme::end( shared->end_mutex, shared->end_cond, shared->sync, shared->P );
538  }
539 
540  //done
541 #ifdef RDScheme_GLOBAL_Y
542  if( id != 0 )
543 #endif
544  delete [] y;
545  return (NULL);
546  }
547 
554  static void collectY( RDScheme_shared_data<T> *shared ) {
555 
556 #ifdef RDScheme_GLOBAL_Y
557  //FIXME It could be possible to distribute work over all processors
558  //instead of p-1 processors, but this requires some extra balancing.
559  const size_t s = shared->id;
560  if( s == 0 ) return;
561 #endif
562 
563  //do collect items of own block
564  for( size_t i = 0; i < shared->output_vector_size; i++ ) {
565 #ifndef NDEBUG
566  const double * const p_output = RDScheme<T,DS>::output;
567  assert( p_output != NULL );
568  assert( shared->local_y != NULL );
569 #endif
570  RDScheme<T,DS>::output[ shared->output_vector_offset + i ] += shared->local_y[ i ];
571  }
572  }
573 
574 #ifndef _NO_LIBNUMA
575 
576  virtual T* mv( const T* x ) {
577  T* ret = (T*) numa_alloc_interleaved( this->nor * sizeof( T ) );
578  for( ULI i=0; i<this->nor; i++ ) ret[ i ] = this->zero_element;
579  zax( x, ret );
580  return ret;
581  }
582 #endif
583 
585  virtual void zxa( const T* x, T* z ) {
586  zxa( x, z, 1 );
587  }
588 
590  virtual void zxa( const T* x, T* z, const unsigned long int repeat ) {
591  //set all daemon threads to do zxa
592  for( size_t i=0; i<P; i++ ) {
593  thread_data[ i ].mode = 3;
594  thread_data[ i ].repeat = repeat;
595  }
596 
597  //set input vector
598  RDScheme<T,DS>::input = x;
599 
600  //set output vector
601  RDScheme<T,DS>::output = z;
602 
603  //wake up all daemon threads
604  pthread_mutex_lock( &end_mutex );
605  pthread_mutex_lock( &mutex );
606  pthread_cond_broadcast( &cond );
607  pthread_mutex_unlock( &mutex );
608 
609  //wait for end of operation
610  wait();
611 
612  //unset vectors
613  RDScheme<T,DS>::input = NULL;
614  RDScheme<T,DS>::output = NULL;
615  }
616 
617  /** See SparseMatrix::zax. */
618  virtual void zax( const T* x, T* z ) {
619  zax( x, z, 1, 0, NULL );
620  }
621 
622  /** See SparseMatrix::zax. */
623  virtual void zax( const T* x, T* z, const unsigned long int repeat, const clockid_t clock_id, double *elapsed_time ) {
624  //set all daemon threads to do zax
625  for( size_t i=0; i<P; i++ ) {
626  thread_data[ i ].mode = 2;
627  thread_data[ i ].repeat = repeat;
628  }
629 
630  //set global clock ID
631  global_clock_id = clock_id;
632 
633  //set input vector
634  RDScheme<T,DS>::input = x;
635 
636  //set output vector
637  RDScheme<T,DS>::output = z;
638 
639  //wake up all daemon threads
640  pthread_mutex_lock( &end_mutex );
641  pthread_mutex_lock( &mutex );
642  pthread_cond_broadcast( &cond );
643  pthread_mutex_unlock( &mutex );
644 
645  //wait for end of operation
646  wait();
647 
648  //get elapsed time
649  double maxtime = 0.0;
650  for( size_t i=0; i<P; i++ ) {
651  const double curtime = thread_data[ i ].time;
652  if( curtime > maxtime ) maxtime = curtime;
653  }
654  if( elapsed_time != NULL )
655  *elapsed_time += maxtime;
656 
657  //unset vectors
658  RDScheme<T,DS>::input = NULL;
659  RDScheme<T,DS>::output = NULL;
660  }
661 
663  virtual size_t bytesUsed() {
664  size_t ret = 0;
665  for( size_t s = 0; s < P; ++s )
666  ret += thread_data[ s ].bytes;
667  return ret;
668  }
669 
670  /**
671   *  Function disabled for parallel schemes!
672   *  A parallel multiplication has no unique first index pair.
673   */
674  virtual void getFirstIndexPair( ULI &i, ULI &j ) {
675  std::cerr << "Warning: RDScheme::getFirstIndexPair has no unique answer since it implements a parallel multiplication!\nIgnoring call..." << std::endl;
676  }
677 };
678 
679 template< typename T, typename DS > size_t RDScheme< T, DS >::P = 0;
680 
681 template< typename T, typename DS > const T* RDScheme< T, DS >::input = NULL;
682 
683 template< typename T, typename DS > T* RDScheme< T, DS >::output = NULL;
684 
685 template< typename T, typename DS > clockid_t RDScheme< T, DS >::global_clock_id = 0;
686 
687 #endif
688 
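The DS template parameter above must satisfy a small contract, fixed by what RDScheme::thread() calls on it: a constructor of the form DS( triplets, m, n, zero ), the kernels zxa() and zax(), and bytesUsed(). The sketch below is an editorial illustration of that contract and of basic use of RDScheme; it is not part of RDScheme.hpp itself. MinimalDS is a hypothetical stand-in for one of the library's own local data structures, and the build assumptions are mine: Linux with pthread affinity support, the library headers on the include path, and linking with -lpthread (and -lnuma unless _NO_LIBNUMA is defined). Note that, with the defaults set at the top of this file, RDScheme_NO_COLLECT is in effect unless _TESTMODE is defined, so the global output vector is only filled in when compiling with -D_TESTMODE.

#include <iostream>
#include <vector>

#include "RDScheme.hpp" //also pulls in SparseMatrix.hpp, Triplet.hpp and the ULI typedef

//Hypothetical minimal data structure satisfying the DS contract used by
//RDScheme::thread(): DS( triplets, m, n, zero ), zxa(), zax(), bytesUsed().
class MinimalDS {
	public:
		MinimalDS( std::vector< Triplet< double > > &triplets, ULI m, ULI n, double zero ): nz( triplets ) {
			(void)m; (void)n; (void)zero; //unused by this toy structure
		}

		//z += A * x
		void zax( const double *x, double *z ) {
			for( size_t k = 0; k < nz.size(); ++k )
				z[ nz[ k ].i() ] += nz[ k ].value * x[ nz[ k ].j() ];
		}

		//z += x * A (transposed multiplication)
		void zxa( const double *x, double *z ) {
			for( size_t k = 0; k < nz.size(); ++k )
				z[ nz[ k ].j() ] += nz[ k ].value * x[ nz[ k ].i() ];
		}

		size_t bytesUsed() {
			return nz.size() * sizeof( Triplet< double > );
		}

	private:
		std::vector< Triplet< double > > nz;
};

int main() {
	const ULI n = 1000;

	//diagonal test matrix with A(i,i) = i + 1
	std::vector< Triplet< double > > triplets;
	for( ULI i = 0; i < n; ++i )
		triplets.push_back( Triplet< double >( i, i, static_cast< double >( i + 1 ) ) );

	//spawns one pinned daemon thread per core and distributes rows greedily
	RDScheme< double, MinimalDS > A( triplets, n, n, 0.0 );

	std::vector< double > x( n, 1.0 ), y( n, 0.0 );

	//y += A * x; y is only filled in when compiled with -D_TESTMODE, otherwise
	//RDScheme_NO_COLLECT applies and the partial results stay thread-local
	A.zax( &x[ 0 ], &y[ 0 ] );

	std::cout << "y[ 0 ] = " << y[ 0 ] << ", y[ n-1 ] = " << y[ n - 1 ] << std::endl;
	return 0;
}

The destructor of A shuts the daemon threads down again; repeated zax or zxa calls reuse the same pinned threads, which is what makes the scheme suitable for timing many SpMV iterations.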
Definition: RDScheme.hpp:585