d7/d74/RDScheme_8hpp_source.html

 /*

  * Copyright (c) 2007-2014, A. N. Yzelman,   Utrecht University 2007-2011;

  *                                                    KU Leuven 2011-2014.

  *                          R. H. Bisseling, Utrecht University 2007-2014.

  *

  * This file is part of the Sparse Library.

  *

  * This library was developed under supervision of Prof. dr. Rob H. Bisseling at

  * Utrecht University, from 2007 until 2011. From 2011-2014, development continued

  * at KU Leuven, where Prof. dr. Dirk Roose contributed significantly to the ideas

  * behind the newer parts of the library code.

  *

  *     The Sparse Library is free software: you can redistribute it and/or modify

  *     it under the terms of the GNU General Public License as published by the

  *     Free Software Foundation, either version 3 of the License, or (at your

  *     option) any later version.

  *

  *     The Sparse Library is distributed in the hope that it will be useful, but

  *     WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY

  *     or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License

  *     for more details.

  *

  *     You should have received a copy of the GNU General Public License along

  *     with the Sparse Library. If not, see <http://www.gnu.org/licenses/>.

  */


 /*

  * File created by:

  *     A. N. Yzelman, Dept. of Computer Science, KU Leuven, 2011.

  */


 #include <iostream>

 #include <vector>

 #include <map>

 #include <pthread.h>


 #ifndef _NO_LIBNUMA

  #include <numa.h>

 #endif


 #include "SparseMatrix.hpp"

 #include "Hilbert.hpp"


 #ifndef _H_RDScheme

 #define _H_RDScheme


 /*

  * When defined, thread 0 will use the global y vector for its local

  * computations. This introduces extra work in the form of one sync,

  * and reduces the number of processors available for the parallel collect.

  * The advantage is less data replication of the output vector.

  *

  * The synchronisation cannot be prevented. Using all processors in

  * the collect code is possible but has not been programmed currently.

  */

 //#define RDScheme_GLOBAL_Y


 #ifndef _TESTMODE

  /*

   * When defined, RDScheme will not collect output results in the

   * global output vector passed to this library. Used for timing

   * the true SpMV speeds only.

   */

  #define RDScheme_NO_COLLECT

 #endif


 template< typename T >

 class RDScheme_shared_data {


         public:


                 size_t id;


                 size_t P;


                 unsigned char mode;


                 unsigned long int repeat;


                 std::vector< Triplet< T > > *original;


                 size_t *nzb;


                 double time;


                 size_t bytes;


                 pthread_mutex_t* mutex;


                 pthread_cond_t*  cond;


                 pthread_mutex_t* end_mutex;


                 pthread_cond_t*  end_cond;


                 size_t *sync;


                 size_t *end_sync;


                 size_t output_vector_size;


                 size_t output_vector_offset;


                 T *local_y;


                 RDScheme_shared_data(): id( -1 ), P( -1 ), mode( 0 ), repeat( 0 ), original( NULL ), nzb( NULL ), time( 0 ),

                                 mutex( NULL ), cond( NULL ), end_mutex( NULL ), end_cond( NULL ),

                                 sync( NULL ), end_sync( NULL ),

                                 output_vector_size( -1 ), output_vector_offset( -1 ) {}


                 RDScheme_shared_data( size_t _id, size_t _P,

                                 std::vector< Triplet< double > > *_original,

                                 size_t *_nzb,

                                 pthread_mutex_t *_mutex, pthread_cond_t *_cond, pthread_mutex_t *_end_mutex, pthread_cond_t *_end_cond,

                                 size_t *_sync, size_t *_end_sync,

                                 size_t _ovsize, size_t _ovoffset ):

                                 id( _id ),  P( _P ), mode( 1 ), repeat( 1 ), original( _original ), nzb( _nzb ), time( 0 ),

                                 mutex( _mutex ), cond( _cond ), end_mutex( _end_mutex ), end_cond( _end_cond ),

                                 sync( _sync ), end_sync( _end_sync ),

                                 output_vector_size( _ovsize ), output_vector_offset( _ovoffset ), local_y( NULL ) {}

 };


 template< typename T, typename DS >

 class RDScheme: public SparseMatrix< T, ULI > {


         private:


         protected:


                 /** Number of threads to fire up. */
                 static size_t P;


                 /** Input vector. */
                 static const T* input;


                 /** Output vector. */
                 static T* output;


                 /** pthreads associated with this data structure. */
                 pthread_t *threads;


                 /** Array of initial thread data. */
                 RDScheme_shared_data<T> *thread_data;


                 /** Clock type used for thread-local timing. */
                 static clockid_t global_clock_id;


                 /** Stop/continue mechanism: mutex. */
                 pthread_mutex_t mutex;


                 /** Stop/continue mechanism: condition. */
                 pthread_cond_t cond;


                 /** Wait for end mechanism: mutex. */
                 pthread_mutex_t end_mutex;


                 /** Wait for end mechanism: condition. */
                 pthread_cond_t end_cond;


                 /** Used for synchronising threads. */
                 size_t sync;


                 /** Used for construction end signal. */
                 size_t end_sync;


         public:


                 /**
                  * Base constructor. Delegates to loadFromFile.
                  *
                  * @param file Name of the file passed on to loadFromFile.
                  * @param zero The element considered to be zero.
                  */
                 RDScheme( const std::string file, T zero )
                 {
                         this->loadFromFile( file, zero );
                 }


                 /**
                  * Base constructor. Delegates to load.
                  *
                  * @param input The triplets that make up the sparse matrix.
                  * @param m     The number of matrix rows.
                  * @param n     The number of matrix columns.
                  * @param zero  The element considered to be zero.
                  */
                 RDScheme( std::vector< Triplet< T > >& input, ULI m, ULI n, T zero )
                 {
                         this->load( input, m, n, zero );
                 }


                 /**
                  * Base deconstructor.
                  *
                  * Puts all daemon threads in exit mode, wakes them, joins them, and
                  * then releases the thread bookkeeping arrays together with all four
                  * synchronisation primitives that load() initialised.
                  */
                 virtual ~RDScheme() {
                         //set all daemon threads to exit mode
                         for( size_t i=0; i<P; i++ )
                                 thread_data[ i ].mode = 4;

                         //wake up all daemon threads
                         pthread_mutex_lock( &mutex );
                         pthread_cond_broadcast( &cond );
                         pthread_mutex_unlock( &mutex );

                         //allow threads to exit gracefully
                         for( size_t i=0; i<P; i++ )
                                 pthread_join( threads[ i ], NULL );

                         //destroy data
                         delete [] thread_data;
                         delete [] threads;
                         pthread_mutex_destroy( &mutex );
                         pthread_cond_destroy(  &cond  );
                         //the end-of-operation primitives are initialised in load() as well,
                         //and hence must also be released here
                         pthread_mutex_destroy( &end_mutex );
                         pthread_cond_destroy(  &end_cond  );
                 }


                 /**
                  * Lets the calling thread wait for the end of the SpMV multiply.
                  *
                  * The caller is expected to already hold end_mutex; load, zxa and zax
                  * lock it before they call this function. The mutex is released
                  * again before this function returns.
                  *
                  * NOTE(review): no predicate is re-checked after pthread_cond_wait
                  * returns, so a spurious wakeup would presumably end the wait early;
                  * confirm whether this can occur in practice.
                  */
                 void wait() {

                         //block on the end condition; end_mutex is released while waiting

                         pthread_cond_wait( &end_cond, &end_mutex );

                         pthread_mutex_unlock( &end_mutex );

                 }


                 /**
                  * Loads a sparse matrix from an input set of triplets.
                  *
                  * Spawns one worker thread per available core, each pinned to the core
                  * with the same index as the thread, and blocks until all of them have
                  * signalled the end of their initialisation. Afterwards the threads
                  * stay alive in daemon mode until this object is destroyed.
                  *
                  * If a thread cannot be created the program is terminated. Otherwise
                  * the end-of-initialisation signal would never be sent and this
                  * function would wait forever.
                  *
                  * @param input The triplets that make up the sparse matrix.
                  * @param m     The number of matrix rows.
                  * @param n     The number of matrix columns.
                  * @param zero  The element considered to be zero.
                  */
                 virtual void load( std::vector< Triplet< T > >& input, const ULI m, const ULI n, const T zero ) {
                         //get number of cores available
                         P = MachineInfo::getInstance().cores();

 #ifndef _NO_LIBNUMA
                         //set kernel to local thread allocation if it wasn't already the case
                         numa_set_localalloc();
 #endif

                         //base settings
                         this->zero_element = zero;
                         this->nor = m;
                         this->noc = n;
                         this->nnz = input.size();

                         //temporary array in which the threads store the rowsums
                         size_t *nzb = new size_t [ this->m() ];
                         for( size_t i=0; i<m; i++ ) nzb[ i ] = 0;

                         //create P threads :)
                         this->threads = new pthread_t[ P ];
                         //initialize local initialisation data
                         thread_data = new RDScheme_shared_data<T>[ P ];
                         //initialize mutexes and conditions and a synchronisation counter
                         pthread_mutex_init( &mutex, NULL );
                         pthread_cond_init ( &cond,  NULL );
                         pthread_mutex_init( &end_mutex, NULL );
                         pthread_cond_init ( &end_cond,  NULL );
                         sync     = 0;
                         end_sync = 0;
                         //lock end mutex (disallow threads that are created to signal for end
                         //before this thread is done with spawning children)
                         pthread_mutex_lock( &end_mutex );
                         //go forth and multiply
                         for( size_t i=0; i<P; i++ ) {
                                 //build thread-local init data
                                 thread_data[ i ] = RDScheme_shared_data<T>( i, P, &input, nzb, &mutex, &cond, &end_mutex, &end_cond, &sync, &end_sync, -1, -1 );
                                 //set fixed affinity for threads
                                 cpu_set_t mask;
                                 CPU_ZERO( &mask );
                                 CPU_SET ( i, &mask );

                                 //TODO: use hwloc for better numa-aware pinning
                                 /*hwloc_topology_t topology;
                                 hwloc_topology_init ( &topology );
                                 hwloc_topology_load( topology );
                                 hwloc_bitmap_t cpuset;*/

                                 //prepare attributes
                                 pthread_attr_t attr;
                                 pthread_attr_init( &attr );
                                 //set fixed affinity in attribute, so that it starts binded immediately
                                 pthread_attr_setaffinity_np( &attr, sizeof( cpu_set_t ), &mask );
                                 //fire up thread
                                 const int rc = pthread_create( &threads[i], &attr, &RDScheme::thread, (void*) &thread_data[i] );
                                 //free attr
                                 pthread_attr_destroy( &attr );
                                 //a missing thread can never signal its end of initialisation,
                                 //which would make the wait below block forever
                                 if( rc != 0 ) {
                                         std::cerr << "Could not create thread " << i << " (error code " << rc << ")!" << std::endl;
                                         exit( 1 );
                                 }
                         }

                         //wait for threads to finish initialisation
                         wait();

                         //delete temporary array
                         delete [] nzb;
                 }


                 /**
                  * End synchronisation code.
                  *
                  * Increments the given counter while holding the given mutex. The
                  * caller that brings the counter to P signals the condition once and
                  * resets the counter to zero. Callers never block on the condition.
                  *
                  * @param mutex Mutex that protects the counter.
                  * @param cond  Condition that is signalled by the P-th caller.
                  * @param sync  Counter of the callers that arrived so far.
                  * @param P     The number of callers to wait for.
                  */
                 static void end( pthread_mutex_t* mutex, pthread_cond_t* cond, size_t *sync, const size_t P ) {

                         pthread_mutex_lock( mutex );

                         (*sync)++;

                         if( *sync == P ) {

                                 //only one thread is waiting on this condition, use signal

                                 pthread_cond_signal( cond );

                                 //reset the counter so that it can be reused for the next round

                                 *sync = 0;

                         }

                         pthread_mutex_unlock( mutex );

                 }


                 /**
                  * Synchronises all threads.
                  *
                  * Every caller increments the counter under the mutex. The first P-1
                  * callers block on the condition; the P-th caller resets the counter
                  * and wakes all others by a broadcast.
                  *
                  * NOTE(review): waiters do not re-check a predicate after
                  * pthread_cond_wait returns, so a spurious wakeup would presumably
                  * let a thread pass early; confirm whether this matters here.
                  *
                  * @param mutex Mutex that protects the counter.
                  * @param cond  Condition on which the early callers wait.
                  * @param sync  Counter of the callers that arrived so far.
                  * @param P     The number of threads taking part.
                  */
                 static void synchronise( pthread_mutex_t* mutex, pthread_cond_t* cond, size_t *sync, const size_t P ) {

                         pthread_mutex_lock( mutex );

                         (*sync)++;

                         if( *sync == P ) {

                                 //last thread to arrive: reset the counter and release all waiters

                                 *sync = 0;

                                 pthread_cond_broadcast( cond );

                         } else

                                 //not everyone has arrived yet; the mutex is released while waiting

                                 pthread_cond_wait( cond, mutex );

                         pthread_mutex_unlock( mutex );

                 }


                 static void* thread( void *data ) {

                         //get short-hand notation

                         RDScheme_shared_data<T>* shared  = (RDScheme_shared_data<T>*)data;

                         const size_t id  = shared->id;

                         const size_t P   = shared->P;

                         const size_t nnz = shared->original->size();

                         pthread_mutex_t *mutex      = shared->mutex;

                         pthread_cond_t  *cond       = shared->cond;


                         cpu_set_t mask;

                         CPU_ZERO( &mask );

                         pthread_getaffinity_np( pthread_self(), sizeof( cpu_set_t ), &mask );


                         //sanity checks

                         if( !CPU_ISSET( id, &mask ) ) {

                                 std::cerr << "Incorrect pinning for thread " << id << "!" << std::endl;

                                 exit( 1 );

                         }

                         for( size_t s=0; s<P; s++ ) {

                                 if( s==id ) continue;

                                 if( CPU_ISSET( s, &mask ) ) {

                                         std::cerr << "Thread " << id << " mask is larger than one core" << " (" << s << " is set)!" << std::endl;

                                         exit( 1 );

                                 }

                         }


                         //prepare to get global matrix dimensions

                         ULI m, n;

                         m = n = 0;

                         //put rowsums in nzb

                         const size_t blocksize = (nnz % P) > 0 ? nnz / P + 1 : nnz / P;

                         for( size_t i=0; i<nnz; i++ ) {

                                 const unsigned long int currow = (*(shared->original))[ i ].i();

                                 const unsigned long int curcol = (*(shared->original))[ i ].j();

                                 if( currow >= id * blocksize && currow < (id + 1) * blocksize )

                                         shared->nzb[ currow ]++;

                                 if( currow > m ) m = currow;

                                 if( curcol > n ) n = curcol;

                         }


                         //dimensions are one higher than max indices

                         m++;

                         n++;


                         //sync

                         RDScheme::synchronise( mutex, cond, shared->sync, shared->P );


                         //determine distribution

                         const size_t nnz_target = nnz / P;

                         size_t cursum = 0;


                         //first sanity check

                         for( unsigned long int i=0; i<m; i++ ) cursum += shared->nzb[ i ];

                         assert( cursum == nnz );


                         //continue

                         cursum = 0;

                         size_t start, end, k = 0;

                         start = end = -1;

                         //get start position for s=0 correct

                         if( id == 0 ) start = 0;

                         //do greedy load balancing to get ranges for prcoessors 0 to P-1

                         for( size_t i = 0; i < m; i++ ) {

                                 cursum += shared->nzb[ i ];

                                 if( cursum >= nnz_target ) {

                                         if( k == id ) end   = i + 1;

                                         if(k+1== id ) start = i + 1;

                                         k++;

                                         cursum = 0;

                                 }

                         }

                         //see if we missed out on any nonzeroes, and set an empty range if so

                         if( start == static_cast< size_t >(-1) ) start = m;

                         if(  end  == static_cast< size_t >(-1) ) end   = m;

                         //get end position for s=P-1 correct

                         if( id == P-1 ) end = m;

                         //derive output vector sizes

                         shared->output_vector_size   = end - start;

                         shared->output_vector_offset = start;

                         assert( shared->output_vector_size <= m );

                         assert( shared->output_vector_offset + shared->output_vector_size <= m );


                         //copy to local first

                         std::vector< Triplet< T > > local;

                         for( size_t i = 0; i < static_cast< size_t >(nnz); i++ ) {

                                 const size_t currow = (*(shared->original))[ i ].i();

                                 if( currow >= start && currow < end )

                                         local.push_back(

                                                 Triplet< T >( (*(shared->original))[ i ].i() - start,

                                                         (*(shared->original))[ i ].j(),

                                                         (*(shared->original))[ i ].value )

                                         );

                         }

                         m = shared->output_vector_size; //new matrix size is new m times old n


                         //load into datastructure

                         DS dss( local, m, n, 0 );


                         //remember memory usage

                         shared->bytes = dss.bytesUsed();


                         //create local shadow of y to avoid write-contention

                         T* y = NULL;

 #ifdef RDScheme_GLOBAL_Y

                         if( id > 0 ) {

 #endif

                                 if( shared->output_vector_size > 0 ) {

                                         y = new T[ shared->output_vector_size ];

                                         for( size_t i=0; i<shared->output_vector_size; i++ )

                                                 y[ i ] = 0.0;

                                 } else

                                         y = NULL;

 #ifdef RDScheme_GLOBAL_Y

                         }

 #endif

                         shared->local_y = y;


                         //exit construction mode

                         shared->mode = 0;


                         //signal end of construction

                         pthread_mutex_lock( mutex );

                         RDScheme::end( shared->end_mutex, shared->end_cond, shared->end_sync, shared->P );


                         //enter daemon mode

                         while( true ) {

                                 struct timespec clk_start, clk_stop;

                                 pthread_cond_wait(  cond, mutex );

                                 pthread_mutex_unlock( mutex );


                                 if( shared->mode == 4 ) break;


 #ifndef NDEBUG

                                 const double * const p_input  = RDScheme<T,DS>::input;

                                 const double * const p_output = RDScheme<T,DS>::output;

 #endif

                                 switch( shared->mode ) {

                                 case 3:

                                         assert( p_input  != NULL );

                                         assert( p_output != NULL );

 #ifdef RDScheme_GLOBAL_Y

                                         if( id == 0 ) {

                                                 y = RDScheme::output;

                                                 shared->local_y = y;

                                         }

 #endif

                                         assert( y != NULL );


                                         clock_gettime( global_clock_id, &clk_start);

                                         shared->time = 0.0;

                                         for( unsigned long int i=0; i<shared->repeat; ++i )

                                                 dss.zxa( RDScheme<T,DS>::input, y );

                                         clock_gettime( global_clock_id, &clk_stop);

                                         shared->time  = (clk_stop.tv_sec-clk_start.tv_sec)*1000;

                                         shared->time += (clk_stop.tv_nsec-clk_start.tv_nsec)/1000000.0;


 #ifndef RDScheme_NO_COLLECT

                                         collectY( shared );

 #endif

                                         break;

                                 case 2:

                                         assert( p_input  != NULL );

                                         assert( p_output != NULL );

 #ifdef RDScheme_GLOBAL_Y

                                         if( id == 0 ) {

                                                 y = RDScheme::output;

                                                 shared->local_y = y;

                                         }

 #endif

                                         assert( y != NULL );


                                         clock_gettime( global_clock_id, &clk_start);

                                         shared->time = 0.0;

                                         for( unsigned long int i=0; i<shared->repeat; ++i )

                                                 dss.zax( RDScheme<T,DS>::input, y );

                                         clock_gettime( global_clock_id, &clk_stop);

                                         shared->time  = (clk_stop.tv_sec-clk_start.tv_sec)*1000;

                                         shared->time += (clk_stop.tv_nsec-clk_start.tv_nsec)/1000000.0;


 #ifndef RDScheme_NO_COLLECT

                                         collectY( shared );

 #endif

                                         break;

                                 default:

                                         std::cout << "Thread " << id << ": Error, undefined operation (" << shared->mode << ")!" << std::endl;

                                         exit( -1 );

                                 }

                                 shared->mode = 0;


                                 //signal end of operation

                                 pthread_mutex_lock( mutex );

                                 RDScheme::end( shared->end_mutex, shared->end_cond, shared->sync, shared->P );

                         }


                         //done

 #ifdef RDScheme_GLOBAL_Y

                         if( id != 0 )

 #endif

                                 delete [] y;

                         return (NULL);

                 }


                 /**
                  * Adds the local output vector of one thread to its block of the
                  * global output vector.
                  *
                  * The block starts at shared->output_vector_offset and contains
                  * shared->output_vector_size elements. When RDScheme_GLOBAL_Y is
                  * defined, thread 0 has nothing to collect and returns immediately.
                  *
                  * @param shared The shared data of the calling thread.
                  */
                 static void collectY( RDScheme_shared_data<T> *shared ) {

 #ifdef RDScheme_GLOBAL_Y
                         //FIXME It could be possible to distribute work over all processors
                         //instead of p-1 processors, but this requires some extra balancing.
                         const size_t s = shared->id;
                         if( s == 0 ) return;
 #endif

 #ifndef NDEBUG
                         //the pointers only need to be valid when there is something to collect;
                         //note the use of T so that debug builds compile for any value type
                         if( shared->output_vector_size > 0 ) {
                                 const T * const p_output = RDScheme<T,DS>::output;
                                 assert( p_output != NULL );
                                 assert( shared->local_y != NULL );
                         }
 #endif

                         //do collect items of own block
                         for( size_t i = 0; i < shared->output_vector_size; i++ )
                                 RDScheme<T,DS>::output[ shared->output_vector_offset + i ] += shared->local_y[ i ];
                 }


 #ifndef _NO_LIBNUMA


                 /**
                  * Overloaded mv call; allocates output vector using numa_interleaved.
                  *
                  * The output vector has one element per matrix row, is initialised to
                  * the zero element, and is then passed to zax together with x.
                  *
                  * NOTE(review): the returned memory presumably has to be released
                  * with numa_free instead of delete [] -- confirm against the libnuma
                  * documentation.
                  *
                  * @param x The input vector.
                  * @return The newly allocated output vector, or NULL if the
                  *         allocation failed.
                  */
                 virtual T* mv( const T* x ) {
                         T* ret = (T*) numa_alloc_interleaved( this->nor * sizeof( T ) );
                         //do not write into a failed allocation
                         if( ret == NULL ) {
                                 std::cerr << "RDScheme::mv: could not allocate the output vector!" << std::endl;
                                 return NULL;
                         }
                         for( ULI i=0; i<this->nor; i++ ) ret[ i ] = this->zero_element;
                         zax( x, ret );
                         return ret;
                 }

 #endif


                 virtual void zxa( const T* x, T* z ) {

                         zxa( x, z, 1 );

                 }


                 virtual void zxa( const T* x, T* z, const unsigned long int repeat ) {

                         //set all daemon threads to do zxa

                         for( size_t i=0; i<P; i++ ) {

                                 thread_data[ i ].mode   = 3;

                                 thread_data[ i ].repeat = repeat;

                         }


                         //set input vector

                         RDScheme<T,DS>::input = x;


                         //set output vector

                         RDScheme<T,DS>::output = z;


                         //wake up all daemon threads

                         pthread_mutex_lock( &end_mutex );

                         pthread_mutex_lock( &mutex );

                         pthread_cond_broadcast( &cond );

                         pthread_mutex_unlock( &mutex );


                         //wait for end of operation

                         wait();


                         //unset vectors

                         RDScheme<T,DS>::input  = NULL;

                         RDScheme<T,DS>::output = NULL;

                 }


                 virtual void zax( const T* x, T* z ) {

                         zax( x, z, 1, 0, NULL );

                 }


                 virtual void zax( const T* x, T* z, const unsigned long int repeat, const clockid_t clock_id, double *elapsed_time ) {

                         //set all daemon threads to do zax

                         for( size_t i=0; i<P; i++ ) {

                                 thread_data[ i ].mode   = 2;

                                 thread_data[ i ].repeat = repeat;

                         }


                         //set global clock ID

                         global_clock_id = clock_id;


                         //set input vector

                         RDScheme<T,DS>::input = x;


                         //set output vector

                         RDScheme<T,DS>::output = z;


                         //wake up all daemon threads

                         pthread_mutex_lock( &end_mutex );

                         pthread_mutex_lock( &mutex );

                         pthread_cond_broadcast( &cond );

                         pthread_mutex_unlock( &mutex );


                         //wait for end of operation

                         wait();


                         //get elapsed time

                         double maxtime = 0.0;

                         for( size_t i=0; i<P; i++ ) {

                                 const double curtime = thread_data[ i ].time;

                                 if( curtime > maxtime ) maxtime = curtime;

                         }

                         if( elapsed_time != NULL )

                                 *elapsed_time += maxtime;


                         //unset vectors

                         RDScheme<T,DS>::input  = NULL;

                         RDScheme<T,DS>::output = NULL;

                 }


                 virtual size_t bytesUsed() {

                         size_t ret = 0;

                         for( size_t s = 0; s < P; ++s )

                                 ret += thread_data[ s ].bytes;

                         return ret;

                 }


                 /**
                  * Function disabled for parallel schemes!
                  *
                  * Only prints a warning to the standard error stream.
                  *
                  * @param i Left untouched.
                  * @param j Left untouched.
                  */
                 virtual void getFirstIndexPair( ULI &i, ULI &j ) {
                         //neither argument is read or written
                         (void) i;
                         (void) j;
                         std::cerr << "Warning: RDScheme::getFirstIndexPair has no unique answer since it implements a parallel multiplication!\nIgnoring call..." << std::endl;
                 }

 };


 /** Number of threads to fire up; zero until RDScheme::load assigns it. */
 template< typename T, typename DS > size_t RDScheme< T, DS >::P = 0;


 /** Input vector; NULL outside of a zax or zxa call. */
 template< typename T, typename DS > const T* RDScheme< T, DS >::input  = NULL;


 /** Output vector; NULL outside of a zax or zxa call. */
 template< typename T, typename DS > T* RDScheme< T, DS >::output = NULL;


 /** Clock type used for thread-local timing; assigned by the extended zax. */
 template< typename T, typename DS > clockid_t RDScheme< T, DS >::global_clock_id = 0;


 #endif


SparseMatrix< T, ULI >::nnz
ULI nnz
Number of non-zeros.
Definition: SparseMatrix.hpp:58

RDScheme::end_cond
pthread_cond_t end_cond
Wait for end mechanism: condition.
Definition: RDScheme.hpp:197

RDScheme::getFirstIndexPair
virtual void getFirstIndexPair(ULI &i, ULI &j)
Function disabled for parallel schemes!
Definition: RDScheme.hpp:674

RDScheme::input
static const T * input
Input vector.
Definition: RDScheme.hpp:173

RDScheme_shared_data::P
size_t P
Total number of processors.
Definition: RDScheme.hpp:79

RDScheme::~RDScheme
virtual ~RDScheme()
Base deconstructor.
Definition: RDScheme.hpp:218

RDScheme_shared_data::cond
pthread_cond_t * cond
Condition used for synchronisation.
Definition: RDScheme.hpp:103

RDScheme::mutex
pthread_mutex_t mutex
Stop/continue mechanism: mutex.
Definition: RDScheme.hpp:188

RDScheme_shared_data::output_vector_size
size_t output_vector_size
Length of the local output vector.
Definition: RDScheme.hpp:118

RDScheme_shared_data::local_y
T * local_y
Pointer to the local output vector.
Definition: RDScheme.hpp:124

RDScheme::threads
pthread_t * threads
pthreads associated with this data structure
Definition: RDScheme.hpp:179

RDScheme::P
static size_t P
Number of threads to fire up.
Definition: RDScheme.hpp:170

RDScheme::zxa
virtual void zxa(const T *x, T *z, const unsigned long int repeat)
Definition: RDScheme.hpp:590

RDScheme::thread_data
RDScheme_shared_data< T > * thread_data
array of initial thread data
Definition: RDScheme.hpp:182

MachineInfo::cores
unsigned long int cores() const
The number of available cores.
Definition: MachineInfo.cpp:77

RDScheme_shared_data::mutex
pthread_mutex_t * mutex
Mutex used for synchronisation.
Definition: RDScheme.hpp:100

RDScheme::sync
size_t sync
Used for synchronising threads.
Definition: RDScheme.hpp:200

SparseMatrix< T, ULI >::m
virtual unsigned long int m()
Queries the number of rows this matrix contains.
Definition: SparseMatrix.hpp:107

RDScheme
Full parallel row-distributed SpMV, based on CSB (Morton curve + Cilk) and PThreads.
Definition: RDScheme.hpp:163

RDScheme::end
static void end(pthread_mutex_t *mutex, pthread_cond_t *cond, size_t *sync, const size_t P)
End synchronisation code.
Definition: RDScheme.hpp:322

RDScheme::output
static T * output
Output vector.
Definition: RDScheme.hpp:176

RDScheme_shared_data::end_cond
pthread_cond_t * end_cond
Condition used for end sync.
Definition: RDScheme.hpp:109

RDScheme_shared_data::original
std::vector< Triplet< T > > * original
Array of vectors of thread-local nonzeroes.
Definition: RDScheme.hpp:88

MachineInfo::getInstance
static const MachineInfo & getInstance()
Gets a singleton instance.
Definition: MachineInfo.cpp:38

RDScheme::mv
virtual T * mv(const T *x)
Overloaded mv call; allocates output vector using numa_interleaved.
Definition: RDScheme.hpp:576

RDScheme_shared_data::nzb
size_t * nzb
Will store rowsums.
Definition: RDScheme.hpp:91

RDScheme_shared_data::RDScheme_shared_data
RDScheme_shared_data(size_t _id, size_t _P, std::vector< Triplet< double > > *_original, size_t *_nzb, pthread_mutex_t *_mutex, pthread_cond_t *_cond, pthread_mutex_t *_end_mutex, pthread_cond_t *_end_cond, size_t *_sync, size_t *_end_sync, size_t _ovsize, size_t _ovoffset)
Default constructor.
Definition: RDScheme.hpp:148

RDScheme::RDScheme
RDScheme(const std::string file, T zero)
Base constructor.
Definition: RDScheme.hpp:208

RDScheme_shared_data::end_mutex
pthread_mutex_t * end_mutex
Mutex used for end sync.
Definition: RDScheme.hpp:106

RDScheme::thread
static void * thread(void *data)
SPMD code for each thread involved with parallel SpMV multiplication.
Definition: RDScheme.hpp:346

RDScheme::collectY
static void collectY(RDScheme_shared_data< T > *shared)
Reduces a distributed output vector set into a single contiguous output vector at process 0...
Definition: RDScheme.hpp:554

RDScheme_shared_data::output_vector_offset
size_t output_vector_offset
Offset of the local output vector compared to global indices.
Definition: RDScheme.hpp:121

SparseMatrix< T, ULI >::loadFromFile
void loadFromFile(const std::string file, const T zero=0)
Function which loads a matrix from a matrix market file.
Definition: SparseMatrix.hpp:89

SparseMatrix
Interface common to all sparse matrix storage schemes.
Definition: SparseMatrix.hpp:46

RDScheme_shared_data::time
double time
Will store local timing.
Definition: RDScheme.hpp:94

SparseMatrix< T, ULI >::noc
ULI noc
Number of columns.
Definition: SparseMatrix.hpp:55

RDScheme_shared_data::bytes
size_t bytes
Will store memory use.
Definition: RDScheme.hpp:97

RDScheme_shared_data::repeat
unsigned long int repeat
how many times to repeat the operation set in `mode' (above, only for 2 and 3)
Definition: RDScheme.hpp:85

RDScheme_shared_data::end_sync
size_t * end_sync
Counter used for end sync.
Definition: RDScheme.hpp:115

RDScheme::load
virtual void load(std::vector< Triplet< T > > &input, const ULI m, const ULI n, const T zero)
Loads a sparse matrix from an input set of triplets.
Definition: RDScheme.hpp:256

RDScheme_shared_data::id
size_t id
Thread ID.
Definition: RDScheme.hpp:76

RDScheme_shared_data::sync
size_t * sync
Counter used for synchronisation.
Definition: RDScheme.hpp:112

RDScheme::synchronise
static void synchronise(pthread_mutex_t *mutex, pthread_cond_t *cond, size_t *sync, const size_t P)
Synchronises all threads.
Definition: RDScheme.hpp:334

RDScheme::zax
virtual void zax(const T *x, T *z, const unsigned long int repeat, const clockid_t clock_id, double *elapsed_time)
See SparseMatrix::zax.
Definition: RDScheme.hpp:623

RDScheme::cond
pthread_cond_t cond
Stop/continue mechanism: condition.
Definition: RDScheme.hpp:191

RDScheme::bytesUsed
virtual size_t bytesUsed()
Definition: RDScheme.hpp:663

SparseMatrix< T, ULI >::nor
ULI nor
Number of rows.
Definition: SparseMatrix.hpp:52

RDScheme::wait
void wait()
Lets the calling thread wait for the end of the SpMV multiply.
Definition: RDScheme.hpp:242

SparseMatrix< T, ULI >::zero_element
T zero_element
The element considered to be zero.
Definition: SparseMatrix.hpp:63

RDScheme::RDScheme
RDScheme(std::vector< Triplet< T > > &input, ULI m, ULI n, T zero)
Base constructor.
Definition: RDScheme.hpp:213

RDScheme_shared_data::mode
unsigned char mode
0 undef, 1 init, 2 zax, 3 zxa, 4 exit
Definition: RDScheme.hpp:82

RDScheme_shared_data::RDScheme_shared_data
RDScheme_shared_data()
Base constructor.
Definition: RDScheme.hpp:127

RDScheme::end_sync
size_t end_sync
Used for construction end signal.
Definition: RDScheme.hpp:203

SparseMatrix< T, ULI >::n
virtual unsigned long int n()
Queries the number of columns this matrix contains.
Definition: SparseMatrix.hpp:115

Triplet
A single triplet value.
Definition: Triplet.hpp:52

RDScheme::global_clock_id
static clockid_t global_clock_id
Clock type used for thread-local timing.
Definition: RDScheme.hpp:185

RDScheme::zax
virtual void zax(const T *x, T *z)
See SparseMatrix::zax.
Definition: RDScheme.hpp:618

RDScheme_shared_data
Shared data for RDScheme threads.
Definition: RDScheme.hpp:71

RDScheme::end_mutex
pthread_mutex_t end_mutex
Wait for end mechanism: mutex.
Definition: RDScheme.hpp:194

RDScheme::zxa
virtual void zxa(const T *x, T *z)
Definition: RDScheme.hpp:585