43 #define SIZE_MAX static_cast< size_t >( -1 )
46 #include "SparseMatrix.hpp"
47 #include "Matrix2HilbertCoordinates.hpp"
49 #include "MachineInfo.hpp"
51 #ifndef _H_BETAHILBERT
52 #define _H_BETAHILBERT
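// The two compile-time switches below control how the distributed output vector
// is handled: BH_REDUCE_BASE is the fan-in of the tree-wise reduction used in
// collectY, and BH_USE_GLOBAL_Y makes thread 0 use the global output vector
// directly as its local one (see the warning printed around line 856).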
72 #define BH_REDUCE_BASE 2
83 #define BH_USE_GLOBAL_Y 1
88 template< typename T >
178 pthread_mutex_t *_mutex, pthread_cond_t *_cond, pthread_mutex_t *_end_mutex, pthread_cond_t *_end_cond,
179 size_t *_sync, size_t *_end_sync, size_t _m,
180 const T **_in, T**_out ):
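// Comparator over cached Hilbert coordinates (size_t values), used by qsort
// in the quicksort-based variant of phase 1 (line 521).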
188 int beta_uli_compare( const void *a, const void *b ) {
189 return ( *(size_t*)a - *(size_t*)b );
194 template< typename T >
248 if( _p_translate == NULL ) {
251 for( unsigned short int i = 0; i < P; ++i ) p_translate.push_back( i );
267 BetaHilbert( const std::string file, T zero = 0, std::vector< unsigned short int > *_p_translate = NULL ): input( NULL ), output( NULL ) {
289 for( size_t i=0; i<P; i++ ) {
294 pthread_mutex_lock( &mutex );
295 pthread_cond_broadcast( &cond );
296 pthread_mutex_unlock( &mutex );
299 for( size_t i=0; i<P; i++ )
300 pthread_join( threads[ i ], NULL );
305 pthread_mutex_destroy( &mutex );
306 pthread_cond_destroy( &cond );
327 numa_set_localalloc();
349 size_t *nzb = new size_t [ this->nnz + P ];
350 size_t **nzc = new size_t*[ P ];
351 for( size_t i=0; i<P; i++ ) nzc[ i ] = NULL;
358 pthread_mutex_init( &mutex, NULL );
359 pthread_cond_init ( &cond, NULL );
361 pthread_cond_init ( &end_cond, NULL );
368 for( size_t i=0; i<P; i++ ) {
370 thread_data[ i ] = shared_data< T >( i, P, &input, nzb, nzc, &mutex, &cond, &end_mutex, &end_cond, &sync, &end_sync, this->m(), &(this->input), &output );
377 pthread_attr_init( &attr );
379 pthread_attr_setaffinity_np( &attr, sizeof( cpu_set_t ), &mask );
383 pthread_attr_destroy( &attr );
403 pthread_mutex_lock( mutex );
406 pthread_cond_signal( cond );
409 pthread_mutex_unlock( mutex );
421 pthread_mutex_lock( mutex );
425 pthread_cond_broadcast( cond );
427 pthread_cond_wait( cond, mutex );
428 pthread_mutex_unlock( mutex );
439 const size_t id = shared->id;
440 const size_t P = shared->P;
443 pthread_cond_t *cond = shared->cond;
448 pthread_getaffinity_np( pthread_self(), sizeof( cpu_set_t ), &mask );
449 for( size_t s=0; s<P; s++ ) {
450 if( s==id ) continue;
451 if( CPU_ISSET( s, &mask ) ) {
452 std::cerr << "Thread " << id << " mask is larger than one core" << " (" << s << " is set)!" << std::endl;
457 std::cout << "Phase 1 at thread " << shared->id << ": cache Hilbert values and get maximum one" << std::endl;
459 size_t h1, h2, max1, max2;
460 size_t *h2s = shared->nzb;
461 const size_t blocksize = (nnz % P) > 0 ? nnz / P + 1 : nnz / P;
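// blocksize = ceil( nnz / P ): each thread caches the Hilbert coordinates of
// its own contiguous chunk of nonzeroes into h2s (= shared->nzb).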
463 h2s[ id * blocksize ] = h2;
466 for( size_t i=id*blocksize+1; i<shared->original->size() && i < (id+1)*blocksize; ++i ) {
470 if( h1 > max1 ) { max1 = h1; max2 = h2; }
471 else if( h1 == max1 && h2 > max2 ) max2 = h2;
476 std::cerr << "Hilbert coordinate range is larger than 2^64-1. Current counting sort mechanism thus requires an array of 2^64 integers-->quitting." << std::endl;
483 h2s[ nnz + id ] = max2;
489 for( size_t i=0; i<P; ++i )
490 if( h2s[ shared->original->size() + i ] > max2 )
491 max2 = h2s[ shared->original->size() + i ];
495 unsigned char proceed;
498 std::cout << "Number of beta_m x beta_n blocks exceeds actual number of nonzeroes; going for (sequential) quicksort implementation." << std::endl;
503 std::cout << "Choosing counting-sort implementation." << std::endl;
509 size_t m, n, start, end;
510 std::vector< std::vector< Triplet< T > > > beta;
514 size_t *horig = NULL;
518 horig = new size_t[ shared->original->size() ];
519 for( size_t i=0; i<shared->original->size(); ++i )
520 horig[ i ] = h2s[ i ];
521 qsort( h2s, nnz, sizeof( size_t ), beta_uli_compare );
522 assert( h2s[ nnz-1 ]+1 == max2 );
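// Quicksort variant: h2s now holds the sorted Hilbert keys. Each thread takes
// a contiguous slice of roughly blocksize keys and shifts its cut points off
// runs of equal keys, so that no beta block is split over two threads; the
// slice boundaries are then translated back into Hilbert key values.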
555 start = id * blocksize;
556 end = start + blocksize;
557 if( end > max2 ) end = max2;
558 const size_t target_s = h2s[ start ];
559 while( start < max2 && h2s[ start ] == target_s ) start--;
561 const size_t target_e = h2s[ end ];
562 while( end < max2 && h2s[ end ] == target_e ) end--;
564 if( shared->id + 1 == shared->P ) end = nnz - 1;
565 start = h2s[ start ];
568 std::cout << "Processor " << id << " range is " << start << " to " << end << ", max = " << max2 << std::endl;
570 assert( start <= end );
571 assert( end <= max2 );
575 std::map< size_t, size_t > h2b;
587 for( size_t i=1; i<shared->original->size(); ++i ) {
594 size_t smin_m = SIZE_MAX;
595 size_t smin_n = SIZE_MAX;
596 size_t smax_m, smax_n;
597 smax_m = smax_n = m = n = 0;
598 size_t *ms = new size_t[ end - start ];
599 size_t *ns = new size_t[ end - start ];
600 size_t *minms = new size_t[ end - start ];
601 size_t *minns = new size_t[ end - start ];
602 for( size_t i=0; i<end-start; i++ ) {
605 minms[ i ] = SIZE_MAX;
606 minns[ i ] = SIZE_MAX;
610 for( size_t i=0; i<nnz; i++ )
611 h2s[ i ] = horig[ i ];
615 for( size_t i=0; i<shared->original->size(); i++ ) {
616 const ULI row = (*(shared->original))[ i ].i();
617 const ULI col = (*(shared->original))[ i ].j();
619 if( row > m ) m = row;
620 if( col > n ) n = col;
621 if( h2 >= start && h2 < end ) {
622 if( row > ms[ h2-start ] ) ms[ h2-start ] = row;
623 if( col > ns[ h2-start ] ) ns[ h2-start ] = col;
624 if( row < minms[ h2-start ] ) minms[ h2-start ] = row;
625 if( col < minns[ h2-start ] ) minns[ h2-start ] = col;
626 if( row < smin_m ) smin_m = row;
627 if( col < smin_n ) smin_n = col;
628 if( row > smax_m ) smax_m = row;
629 if( col > smax_n ) smax_n = col;
639 for( size_t i=0; i<shared->original->size(); i++ ) {
642 if( h2 >= start && h2 < end ) {
643 beta[ h2b[ h2 ] ].push_back( (*(shared->original))[ i ] );
648 for( size_t i=0; i<beta.size(); i++ )
649 cursum += beta[i].size();
651 std::cout << "Thread " << shared->id << ": " << smin_m << "," << smax_m << " times " << smin_n << "," << smax_n << " holding " << cursum << " nonzeroes." << std::endl;
663 } else if( proceed == 1 ) {
666 std::cout << "Phase 2: count nonzeroes in each beta_m x beta_n block (counting sort step 1), and derive distribution" << std::endl;
668 size_t *nzc = new size_t[ max2 ];
669 for( size_t i = 0; i < max2; ++i ) nzc[ i ] = 0;
670 for( size_t i=id*blocksize; i<nnz && i < (id+1)*blocksize; ++i ) nzc[ h2s[ i ] ]++;
671 shared->nzc[ id ] = nzc;
680 for( size_t i=0; i<max2; i++ )
681 for( size_t s=0; s<P; s++ )
682 sum += shared->nzc[ s ][ i ];
683 assert( sum == nnz );
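// Reduce the per-thread counts into shared->nzc[ 0 ]: every thread sums all P
// contributions over its own slice of the count array.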
690 for( size_t i=id*(max2/P+1); i<(id+1)*(max2/P+1)&&i<max2; ++i ) {
691 for( size_t s=1; s<P; ++s )
692 shared->nzc[ 0 ][ i ] += shared->nzc[ s ][ i ];
701 for( size_t i=0; i<max2; i++ ) sum += shared->nzc[ 0 ][ i ];
702 assert( sum == nnz );
707 const size_t nnz_target = nnz != 0 ? nnz / P + 1 : nnz / P;
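// nnz_target approximates nnz / P; below, consecutive Hilbert blocks are
// assigned greedily to threads until each owns roughly this many nonzeroes
// (the last thread takes whatever remains).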
710 size_t cursum = shared->nzc[ 0 ][ 0 ];
711 for( ; end<max2 && cursum < nnz_target; end++ )
712 cursum += shared->nzc[ 0 ][ end ];
715 for( size_t i=0; i<P; i++ ) {
716 if( shared->id == i ) {
717 std::cout << "Thread " << i << ": local nnz count is " << cursum << " (storing block indices " << start << ", " << end << ") out of " << max2 << " blocks present." << std::endl;
723 for( ; end < max2 && ( (i == P-1) || (cursum < nnz_target) ); end++ )
724 cursum += shared->nzc[ 0 ][ end ];
728 std::cout << "Phase 3 at processor " << shared->id << ": getting local nonzeroes (counting sort step 2)" << std::endl;
732 beta.resize( end - start );
733 for( size_t i=0; i<end-start; i++ )
734 beta[ i ] = std::vector< Triplet< T > >( shared->nzc[ 0 ][ i + start ] );
737 size_t smin_m, smin_n;
738 size_t smax_m, smax_n;
739 smin_m = smin_n = ULONG_MAX;
740 smax_m = smax_n = m = n = 0;
741 size_t *ms = new size_t[ end - start ];
742 size_t *ns = new size_t[ end - start ];
743 size_t *minms = new size_t[ end - start ];
744 size_t *minns = new size_t[ end - start ];
745 for( size_t i=0; i<end-start; i++ ) {
748 minms[ i ] = ULONG_MAX;
749 minns[ i ] = ULONG_MAX;
753 for( size_t i=0; i<end-start; i++ )
754 assert( beta[ i ].size() == shared->nzc[ 0 ][ i + start ] );
756 for( size_t i=0; i<nnz; ++i ) {
757 const ULI row = (*(shared->original))[ i ].i();
758 const ULI col = (*(shared->original))[ i ].j();
759 h2 = shared->nzb[ i ];
760 if( row > m ) m = row;
761 if( col > n ) n = col;
762 if( h2 >= start && h2 < end ) {
763 if( row > ms[ h2-start ] ) ms[ h2-start ] = row;
764 if( col > ns[ h2-start ] ) ns[ h2-start ] = col;
765 if( row < minms[ h2-start ] ) minms[ h2-start ] = row;
766 if( col < minns[ h2-start ] ) minns[ h2-start ] = col;
767 if( row < smin_m ) smin_m = row;
768 if( col < smin_n ) smin_n = col;
769 if( row > smax_m ) smax_m = row;
770 if( col > smax_n ) smax_n = col;
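// Counting-sort step 2, placement: the per-block counts in shared->nzc[ 0 ]
// now act as write offsets; each nonzero owned by this thread is stored at the
// pre-decremented offset within its beta block, so every count ends at zero
// (checked by the assert at line 815).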
781 for( size_t i=0; i<nnz; ++i ) {
783 const ULI row = (*(shared->original))[ i ].i();
784 const ULI col = (*(shared->original))[ i ].j();
786 h2 = shared->nzb[ i ];
788 if( h2 >= start && h2 < end ) {
791 if( beta[ h2-start ].size() > shared->nzc[ 0 ][ h2 ] ) {
794 beta[ h2-start ][ shared->nzc[ 0 ][ h2 ] ].j() / max_n,
797 assert( h2 == temp );
799 assert( row / max_m == beta[ h2-start ][ shared->nzc[ 0 ][ h2 ] ].i() / max_m );
800 assert( col / max_n == beta[ h2-start ][ shared->nzc[ 0 ][ h2 ] ].j() / max_n );
803 beta[ h2-start ][ --(shared->nzc[ 0 ][ h2 ]) ] = (*(shared->original))[ i ];
804 assert( row == beta[ h2-start ][ shared->nzc[ 0 ][ h2 ] ].i() );
805 assert( col == beta[ h2-start ][ shared->nzc[ 0 ][ h2 ] ].j() );
810 std::cout << "Thread " << shared->id << ": " << smin_m << "," << smax_m << " times " << smin_n << "," << smax_n << std::endl;
814 for( size_t i=0; i<end-start; i++ ) {
815 assert( shared->nzc[ 0 ][ i + start ] == 0 );
816 if( beta[ i ].size() == 0 ) continue;
817 if( ms[ i ] - minms[ i ] >= max_m ) {
818 std::cerr << "BetaHilbert thread construction: rowwise range (" << (ms[ i ]) << " to " << minms[ i ] << ") over maximum size! (h2=" << (i+start) << ")" << std::endl;
821 if( ns[ i ] - minns[ i ] >= max_n ) {
822 std::cerr << "BetaHilbert thread construction: columnwise range over maximum size!" << std::endl;
839 std::cerr << "Invalid value for proceed: " << proceed << std::endl;
845 std::cout << "Processor " << shared->id << ": loading into local FBICRS data structure..." << std::endl;
855 if( BH_USE_GLOBAL_Y && id == 0 )
856 std::cerr << "Warning: thread 0 will use global y as local y (see BH_USE_GLOBAL_Y)" << std::endl;
857 if( !BH_USE_GLOBAL_Y || shared->id > 0 ) {
870 pthread_mutex_lock( mutex );
875 std::cout << "Thread " << id << " construction done, entering daemon mode." << std::endl;
878 struct timespec clk_start, clk_stop;
879 pthread_cond_wait( cond, mutex );
880 pthread_mutex_unlock( mutex );
882 if( shared->mode == 4 ) break;
884 switch( shared->mode ) {
886 if( !(BH_USE_GLOBAL_Y && shared->id == 0) ) {
893 assert( *(shared->input) != NULL );
894 assert( *(shared->output) != NULL );
895 if( BH_USE_GLOBAL_Y && shared->id == 0 ) {
903 for( size_t i=0; i<shared->repeat; i++ ) {
905 if( BH_COLLECT == -1 )
907 else if( BH_COLLECT != 0 )
912 shared->time = (clk_stop.tv_sec-clk_start.tv_sec)*1000;
913 shared->time += (clk_stop.tv_nsec-clk_start.tv_nsec)/1000000.0;
916 assert( *(shared->input) != NULL );
917 assert( *(shared->output) != NULL );
918 if( BH_USE_GLOBAL_Y && shared->id == 0 ) {
925 for( size_t i=0; i<shared->repeat; i++ ) {
927 if( BH_COLLECT == -1 )
929 else if( BH_COLLECT != 0 )
934 shared->time = (clk_stop.tv_sec-clk_start.tv_sec)*1000;
935 shared->time += (clk_stop.tv_nsec-clk_start.tv_nsec)/1000000.0;
938 std::cout << "Thread " << id << ": Error, undefined operation (" << shared->mode << ")!" << std::endl;
944 pthread_mutex_lock( mutex );
949 if( !BH_USE_GLOBAL_Y || shared->id > 0 )
960 const ULI s = shared->id;
961 const ULI p = shared->P;
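// Combine the thread-local output vectors according to BH_COLLECT:
//  0: nothing to do here, the calling thread accumulates the local vectors
//     itself (see zax/zxa);
//  1: each thread adds one block of ceil( m / p ) rows from every local vector
//     into the global output;
//  2: as 1, but the local vectors are first pairwise combined in a tree with
//     fan-in BH_REDUCE_BASE, and only the reduced vector (datas[ 0 ]) is then
//     added to the global output;
//  3: tree reduction in which the owning thread combines full-length vectors.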
965 if( BH_COLLECT == 0 ) {
967 } else if( BH_COLLECT == 1 ) {
969 const ULI blocksize = (m % p == 0 ) ? m / p : m / p + 1;
970 const ULI m_start = s * blocksize;
971 const ULI m_end = m_start + blocksize > m ? m : m_start + blocksize;
975 for( size_t i = m_start; i<m_end; i++ ) {
976 for( size_t k = BH_USE_GLOBAL_Y; k < p; k++ ) {
980 } else if( BH_COLLECT == 2 ) {
982 size_t step_p = BH_REDUCE_BASE;
983 size_t prev_step_p = 1;
984 const ULI blocksize = (m % p == 0 ) ? m / p : m / p + 1;
985 const ULI m_start = s * blocksize;
986 const ULI m_end = m_start + blocksize > m ? m : m_start + blocksize;
987 while( prev_step_p < p ) {
988 for( size_t start_p = 0; start_p < p; start_p += step_p ) {
990 for( size_t k = start_p + prev_step_p; k < p && k < start_p + step_p; k += prev_step_p ) {
991 for( size_t i = m_start; i<m_end; i++ )
992 datas[ start_p ].local_y[ i ] += datas[ k ].local_y[ i ];
995 prev_step_p = step_p;
996 step_p *= BH_REDUCE_BASE;
999 if( !BH_USE_GLOBAL_Y )
1000 for( size_t i = m_start; i<m_end; i++ )
1001 (*(shared->output))[ i ] += datas[ 0 ].local_y[ i ];
1002 } else if( BH_COLLECT == 3 ) {
1004 size_t prev_step_p = 1;
1005 while( prev_step_p < p ) {
1006 for( size_t start_p = 0; start_p < p; start_p += step_p ) {
1007 if( prev_step_p > 1 )
1009 if( start_p != s ) continue;
1010 for( size_t k = start_p + prev_step_p; k < p && k < start_p + step_p; k += prev_step_p )
1011 for( size_t i = 0; i < m; i++ )
1012 datas[ start_p ].local_y[ i ] += datas[ k ].local_y[ i ];
1014 prev_step_p = step_p;
1018 if( !BH_USE_GLOBAL_Y && s == 0 )
1019 for( size_t i = 0; i < m; i++ )
1022 std::cerr << "Error: output vector collection strategy " << BH_COLLECT << " not implemented!" << std::endl;
1033 virtual void zxa( const T* x, T* z ) {
1044 virtual void zxa( const T* x, T* z, const size_t repeat ) {
1046 for( size_t i=0; i<P; i++ ) {
1059 pthread_mutex_lock( &mutex );
1060 pthread_cond_broadcast( &cond );
1061 pthread_mutex_unlock( &mutex );
1067 if( BH_COLLECT == 0 )
1068 for( size_t i=0; i<this->nor; i++ )
1069 for( size_t s=BH_USE_GLOBAL_Y; s<P; s++ )
1082 for( size_t i=0; i<P; i++ ) {
1088 pthread_mutex_lock( &mutex );
1089 pthread_cond_broadcast( &cond );
1090 pthread_mutex_unlock( &mutex );
1102 virtual T* mv( const T* x ) {
1105 size_t allocsize = (this->nor + 1) * sizeof( T );
1109 T* ret = (T*) numa_alloc_interleaved( allocsize );
1111 T* ret = (T*) _mm_malloc( allocsize, 64 );
1114 for( ULI i=0; i<this->nor; i++ ) {
1134 virtual void zax( const T* x, T* z ) {
1135 zax( x, z, 1, 0, NULL );
1147 virtual void zax( const T* x, T* z, const size_t repeat, const clockid_t clock_id, double *elapsed_time ) {
1149 for( size_t i=0; i<P; i++ ) {
1165 pthread_mutex_lock( &mutex );
1166 pthread_cond_broadcast( &cond );
1167 pthread_mutex_unlock( &mutex );
1173 if( BH_COLLECT == 0 )
1174 for( size_t i=0; i<this->nor; i++ )
1175 for( size_t s=BH_USE_GLOBAL_Y; s<P; s++ )
1179 double maxtime = 0.0;
1180 for( size_t i=0; i<P; i++ ) {
1182 if( curtime > maxtime ) maxtime = curtime;
1184 if( elapsed_time != NULL )
1185 *elapsed_time += maxtime;
1194 std::cerr << "Warning: BetaHilbert::getFirstIndexPair has no unique answer since it implements a parallel multiplication!\nIgnoring call..." << std::endl;
1200 for( size_t s = 0; s < P; ++s )
Hierarchical BICRS with fixed subblock size and distribution.
Definition: FBICRS.hpp:75
ULI nnz
Number of non-zeros.
Definition: SparseMatrix.hpp:58
size_t end_sync
Used for construction end signal.
Definition: BetaHilbert.hpp:238
virtual ~BetaHilbert()
Base destructor.
Definition: BetaHilbert.hpp:287
shared_data()
Base constructor.
Definition: BetaHilbert.hpp:154
double time
Will store local timing.
Definition: BetaHilbert.hpp:115
The Beta Hilbert triplet scheme.
Definition: BetaHilbert.hpp:195
virtual void zxa(const T *x, T *z, const size_t repeat)
Computes z=xA in place, a given number of times successively.
Definition: BetaHilbert.hpp:1044
void set_p_translate(std::vector< unsigned short int > *_p_translate)
Sets p_translate to 0..P-1 by default, or equal to the optionally supplied vector.
Definition: BetaHilbert.hpp:247
virtual void zxa_fb(const _t_value *__restrict__ x_p, _t_value *__restrict__ y_p)
Interleaved zxa kernel for use with the BetaHilbert scheme.
Definition: FBICRS.hpp:501
size_t P
Total number of processors.
Definition: BetaHilbert.hpp:97
pthread_mutex_t * end_mutex
Mutex used for end sync.
Definition: BetaHilbert.hpp:127
shared_data(size_t _id, size_t _P, std::vector< Triplet< double > > *_original, size_t *_nzb, size_t **_nzc, pthread_mutex_t *_mutex, pthread_cond_t *_cond, pthread_mutex_t *_end_mutex, pthread_cond_t *_end_cond, size_t *_sync, size_t *_end_sync, size_t _m, const T **_in, T **_out)
Full constructor.
Definition: BetaHilbert.hpp:177
static clockid_t global_clock_id
Clock type used for thread-local timing.
Definition: BetaHilbert.hpp:220
static size_t P
Number of threads to fire up.
Definition: BetaHilbert.hpp:202
size_t sync
Used for synchronising threads.
Definition: BetaHilbert.hpp:235
T * output
Output vector.
Definition: BetaHilbert.hpp:211
ULI i() const
Definition: Triplet.hpp:70
const T * input
Input vector.
Definition: BetaHilbert.hpp:208
shared_data< T > * thread_data
Array of initial thread data.
Definition: BetaHilbert.hpp:217
unsigned long int cores() const
The number of available cores.
Definition: MachineInfo.cpp:77
virtual unsigned long int m()
Queries the number of rows this matrix contains.
Definition: SparseMatrix.hpp:107
pthread_cond_t end_cond
Wait for end mechanism: condition.
Definition: BetaHilbert.hpp:232
static void collectY(shared_data< T > *shared)
Code that collects a distributed output vector.
Definition: BetaHilbert.hpp:959
virtual void getFirstIndexPair(ULI &i, ULI &j)
Definition: BetaHilbert.hpp:1193
static const MachineInfo & getInstance()
Gets a singleton instance.
Definition: MachineInfo.cpp:38
Shared data for BetaHilbert threads.
Definition: BetaHilbert.hpp:89
size_t ** nzc
Will contain the nonzero counts of separate blocks.
Definition: BetaHilbert.hpp:112
static void IntegerToHilbert(const size_t i, const size_t j, size_t &h1, size_t &h2)
New method, October 2010.
Definition: Matrix2HilbertCoordinates.cpp:48
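The listing above derives the per-nonzero block key h2 by first reducing a (row, column) pair to block coordinates and then calling this routine (compare the asserts around lines 794-800). A minimal sketch of that mapping follows, assuming the routine is a static member of a Matrix2HilbertCoordinates class as the file names suggest; the helper name is hypothetical.

#include <cstddef>
#include "Matrix2HilbertCoordinates.hpp"

// Hypothetical helper: Hilbert key of the beta block containing nonzero (row, col),
// with block_m x block_n playing the role of max_m x max_n above.
size_t hilbertBlockOf( const size_t row, const size_t col,
                       const size_t block_m, const size_t block_n ) {
	size_t h1, h2;
	// h1 receives the more significant part of the Hilbert coordinate, h2 the
	// lower part; construction only proceeds when the range fits in h2 (line 476).
	Matrix2HilbertCoordinates::IntegerToHilbert( row / block_m, col / block_n, h1, h2 );
	return h2;
}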
pthread_mutex_t end_mutex
Wait for end mechanism: mutex.
Definition: BetaHilbert.hpp:229
const T ** input
Array of all local input vectors of all SPMD processes.
Definition: BetaHilbert.hpp:148
size_t repeat
How many times to repeat the operation set in `mode' (only applies to modes 2 and 3).
Definition: BetaHilbert.hpp:103
pthread_cond_t * cond
Condition used for synchronisation.
Definition: BetaHilbert.hpp:124
size_t * nzb
Will cache block numbers of nonzeroes.
Definition: BetaHilbert.hpp:109
void loadFromFile(const std::string file, const T zero=0)
Function which loads a matrix from a Matrix Market file.
Definition: SparseMatrix.hpp:89
pthread_mutex_t mutex
Stop/continue mechanism: mutex.
Definition: BetaHilbert.hpp:223
Interface common to all sparse matrix storage schemes.
Definition: SparseMatrix.hpp:46
T * local_y
Pointer to the local output vector.
Definition: BetaHilbert.hpp:145
std::vector< unsigned short int > p_translate
Which processors to pin threads to.
Definition: BetaHilbert.hpp:205
size_t output_vector_offset
Offset of the local output vector compared to global indices.
Definition: BetaHilbert.hpp:142
virtual void zxa(const T *x, T *z)
Computes z=xA in place.
Definition: BetaHilbert.hpp:1033
static const ULI max_n
Given FBICRS, the maximum value for columnwise matrix size.
Definition: BetaHilbert.hpp:241
unsigned char mode
0 undef, 1 init, 2 zax, 3 zxa, 4 exit, 5 reset
Definition: BetaHilbert.hpp:100
ULI noc
Number of columns.
Definition: SparseMatrix.hpp:55
size_t * end_sync
Counter used for end sync.
Definition: BetaHilbert.hpp:136
virtual void zax(const T *x, T *z, const size_t repeat, const clockid_t clock_id, double *elapsed_time)
Computes z=Ax in place, a given number of times in succession, and measures the time taken...
Definition: BetaHilbert.hpp:1147
static void synchronise(pthread_mutex_t *mutex, pthread_cond_t *cond, size_t *sync, const size_t P)
Thread synchronisation function.
Definition: BetaHilbert.hpp:420
pthread_cond_t cond
Stop/continue mechanism: condition.
Definition: BetaHilbert.hpp:226
static const ULI max_m
Given FBICRS, the maximum value for the rowwise matrix size, assuming short ints on ICRS at the lower...
Definition: BetaHilbert.hpp:244
size_t id
Thread ID.
Definition: BetaHilbert.hpp:94
virtual size_t bytesUsed()
Function to query the amount of storage required by this sparse matrix.
Definition: FBICRS.hpp:664
static void end(pthread_mutex_t *mutex, pthread_cond_t *cond, size_t *sync, const size_t P)
End synchronisation function.
Definition: BetaHilbert.hpp:402
pthread_t * threads
pthreads associated with this data structure.
Definition: BetaHilbert.hpp:214
virtual size_t bytesUsed()
Definition: BetaHilbert.hpp:1198
pthread_mutex_t * mutex
Mutex used for synchronisation.
Definition: BetaHilbert.hpp:121
ULI nor
Number of rows.
Definition: SparseMatrix.hpp:52
void reset()
Reset all local output vectors to zero.
Definition: BetaHilbert.hpp:1080
T zero_element
The element considered to be zero.
Definition: SparseMatrix.hpp:63
size_t bytes
Local memory use.
Definition: BetaHilbert.hpp:118
void wait()
Callee will wait for the end of the SpMV.
Definition: BetaHilbert.hpp:310
std::vector< Triplet< T > > * original
Array of local sparse blocks.
Definition: BetaHilbert.hpp:106
virtual void zax_fb(const _t_value *__restrict__ x_p, _t_value *__restrict__ y_p)
Interleaved zax kernel for use with the BetaHilbert scheme.
Definition: FBICRS.hpp:617
pthread_cond_t * end_cond
Condition used for end sync.
Definition: BetaHilbert.hpp:130
size_t * sync
Counter used for synchronisation.
Definition: BetaHilbert.hpp:133
virtual T * mv(const T *x)
Overloaded mv call; allocates output vector using numa_interleaved.
Definition: BetaHilbert.hpp:1102
virtual unsigned long int n()
Queries the number of columns this matrix contains.
Definition: SparseMatrix.hpp:115
A single triplet value.
Definition: Triplet.hpp:52
size_t output_vector_size
Length of the local output vector.
Definition: BetaHilbert.hpp:139
T ** output
Array of all output vectors local to all SPMD processes.
Definition: BetaHilbert.hpp:151
BetaHilbert(const std::string file, T zero=0, std::vector< unsigned short int > *_p_translate=NULL)
File-based constructor.
Definition: BetaHilbert.hpp:267
virtual void load(std::vector< Triplet< T > > &input, const ULI m, const ULI n, const T zero)
Loads from input COO matrix.
Definition: BetaHilbert.hpp:324
BetaHilbert(std::vector< Triplet< T > > &input, ULI m, ULI n, T zero=0, std::vector< unsigned short int > *_p_translate=NULL)
COO-based constructor.
Definition: BetaHilbert.hpp:281
virtual void zax(const T *x, T *z)
Computes z=Ax in place.
Definition: BetaHilbert.hpp:1134
static void * thread(void *data)
SPMD code for each thread doing an SpMV.
Definition: BetaHilbert.hpp:437
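Taken together, the interface above is used roughly as follows. This is a minimal usage sketch, not part of the header: the file path is a placeholder, CLOCK_MONOTONIC is only an assumed choice for the clock_id parameter, error handling is omitted, and the vector returned by mv() is owned by the library's allocator (numa_alloc_interleaved or _mm_malloc, depending on the build).

#include <iostream>
#include <time.h>
#include "BetaHilbert.hpp"

int main() {
	// Build the parallel data structure from a Matrix Market file;
	// worker threads are spawned and pinned during construction.
	BetaHilbert< double > A( "path/to/matrix.mtx" );

	// Input vector of length n (columns), output vector of length m (rows).
	double *x = new double[ A.n() ];
	double *z = new double[ A.m() ];
	for( unsigned long int i = 0; i < A.n(); ++i ) x[ i ] = 1.0;
	for( unsigned long int i = 0; i < A.m(); ++i ) z[ i ] = 0.0;

	// z = Ax, repeated 5 times, reporting the slowest thread's time (in ms).
	double elapsed = 0.0;
	A.zax( x, z, 5, CLOCK_MONOTONIC, &elapsed );
	std::cout << "5 SpMVs took " << elapsed << " ms." << std::endl;

	// Alternatively, let the library allocate the output vector itself: y = Ax.
	// y is not freed here since it was not allocated with new[].
	double *y = A.mv( x );
	std::cout << "y[ 0 ] = " << y[ 0 ] << std::endl;

	delete [] x;
	delete [] z;
	return 0;
}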