34 #include "alignment.hpp"
35 #include "Triplet.hpp"
36 #include "SparseMatrix.hpp"
45 #if defined __INTEL_COMPILER && defined __KNC__
47 #include <immintrin.h>
48 #include <smmintrin.h>
49 #include <zmmintrin.h>
64 template< typename T, size_t block_length_row=1, size_t block_length_column=8, typename _i_value=LI >
72 static const size_t block_length = block_length_row * block_length_column;
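A minimal standalone sketch (not part of this header) of how the blocking parameters combine; the 1-by-8 default matches one 512-bit vector register of doubles:

    #include <cstddef>
    // Hypothetical values mirroring the template defaults above.
    constexpr std::size_t block_length_row    = 1;
    constexpr std::size_t block_length_column = 8;
    constexpr std::size_t block_length = block_length_row * block_length_column;
    static_assert( block_length == 8, "1x8 blocking fills one 8-lane double register" );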
99 #ifdef __INTEL_COMPILER
100 ds = (T*) _mm_malloc( this->allocsize * sizeof( T ), _SL_ALIGN_DOUBLE );
101 c_ind = (_i_value*) _mm_malloc( this->allocsize * sizeof( _i_value ), _SL_ALIGN_INT16 );
102 r_ind = (_i_value*) _mm_malloc( (this->jumpsize + 1) * sizeof( _i_value ), _SL_ALIGN_INT16 );
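The non-Intel branch of allocate() is elided above; a minimal portable sketch of what such a fallback could look like, assuming POSIX posix_memalign is available (the header itself may simply use new[]):

    #include <cstddef>
    #include <cstdlib>
    template< typename S >
    static S * aligned_alloc_fallback( const std::size_t n, const std::size_t align ) {
        void * p = NULL;
        // posix_memalign returns nonzero on failure and leaves p unspecified
        if( posix_memalign( &p, align, n * sizeof( S ) ) != 0 ) return NULL;
        return static_cast< S * >( p );
    }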
115 #ifdef __INTEL_COMPILER
120 if( ds != NULL ) delete [] ds;
135 static void addPadding( std::vector< _i_value > &row_increments, const std::map< size_t, size_t > &row2local, const size_t blockingSize, const size_t prev_row, const size_t m ) {
136 if( row_increments.size() % blockingSize != 0 ) {
138 size_t base = row2local.rbegin()->first;
140 std::set< size_t > current;
142 size_t index = row_increments.size() - 1;
148 current.insert( base );
150 base -= row_increments[ index ];
155 size_t padding_index = *(current.begin());
156 if( padding_index > 0 ) {
159 const size_t firstguess = padding_index;
163 while( current.find( padding_index ) != current.end() ) {
167 if( padding_index >= m ) {
169 padding_index = firstguess;
171 while( current.find( padding_index ) != current.end() && padding_index < m ) {
175 if( padding_index >= m ) {
176 std::cerr << "Matrix too small to be vectorised! Using this matrix in computations may crash due to illegal reads (by one data element after the output vector end). Overallocating the output vector will ensure proper, error-free execution." << std::endl;
182 row_increments.push_back( padding_index - prev_row );
186 while( row_increments.size() % blockingSize != 0 ) {
188 row_increments.push_back( 0 );
192 assert( row2local.find( padding_index ) == row2local.end() );
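A worked call sketch with hypothetical values: three increments visiting rows {0, 3, 5} of an 8-row matrix, padded to a blockingSize of 4. The appended jump may be negative, which assumes a signed _i_value:

    std::vector< int16_t > inc = { 0, 3, 2 };                  // visits rows 0, 3, 5
    std::map< size_t, size_t > r2l = { {0,0}, {3,1}, {5,2} };
    addPadding( inc, r2l, 4, 5, 8 );   // called as the class's static member
    // inc.size() is now 4; the extra jump targets an untouched row
    // (for example row 1, stored as the increment 1 - 5 = -4).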
222 std::vector< size_t > * const row_indices,
223 std::vector< _i_value > &row_increments,
229 std::map< size_t, size_t > row2local;
230 size_t k_walk = k + 1;
231 row2local[ input[ k ].i() ] = 0;
232 row_indices[ 0 ].push_back( k );
234 while( k_walk < input.size() ) {
236 const size_t currow = input[ k_walk ].i();
238 std::map< size_t, size_t >::const_iterator it = row2local.find( currow );
239 if( it == row2local.end() ) {
241 const size_t location = row2local.size();
243 if( location == block_length_row ) {
247 row_indices[ location ].push_back( k_walk );
249 row2local[ currow ] = location;
252 row_indices[ it->second ].push_back( k_walk );
258 std::map< size_t, size_t >::const_iterator cit = row2local.begin();
260 assert( cit != row2local.end() );
262 for( ; cit != row2local.end(); ++cit ) {
266 row_increments.push_back( cit->first - prev_row );
270 prev_row = cit->first;
274 addPadding( row_increments, row2local, block_length_row, prev_row, m );
277 if( k_walk == input.size() ) {
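In outline, prepareBlockIteration buckets triplet positions by row until block_length_row distinct rows have been gathered, so that row_indices[ l ] holds the positions of the l-th distinct row. A sketch of the resulting invariant (a reading aid, not normative):

    // Every bucket holds positions of one and the same matrix row.
    for( size_t l = 0; l < block_length_row; ++l )
        for( size_t p = 1; p < row_indices[ l ].size(); ++p )
            assert( input[ row_indices[ l ][ p ] ].i()
                 == input[ row_indices[ l ][ 0 ] ].i() );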
303 #if defined __INTEL_COMPILER && defined __KNC__
305 if( block_length_row == 1 ) {
309 if( block_length_row == 2 ) {
319 size_t base = r_ind[ 0 ];
320 size_t index = r_ind[ 0 ];
321 for( size_t i = 0; i < r_ind.size(); ) {
324 r_ind[ i ] = index - base;
329 index += r_ind[ i + j ];
332 r_ind[ i + j ] = index - base;
334 #if defined __INTEL_COMPILER && defined __KNC__
336 if( block_length_row > 1 && block_length_row < block_length ) {
341 temp[ j ] = r_ind[ i + j ];
344 for( size_t s = 0; s < block_length_row; ++s ) {
345 for( size_t t = s; t < block_length; t += block_length_row ) {
346 r_ind[ i++ ] = temp[ t ];
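A worked example of this interleave, assuming a hypothetical 2-by-4 blocking (block_length_row = 2, block_length = 8): the per-block increments are regrouped so that each row's entries become contiguous, in the order the multiply kernels consume them:

    // temp = { a0, b0, a1, b1, a2, b2, a3, b3 }   (rows a and b interleaved)
    // after the two loops, the r_ind block reads:
    //        { a0, a1, a2, a3, b0, b1, b2, b3 }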
367 if( one.i() < two.i() )
369 if( one.i() > two.i() )
371 if( one.j() < two.j() )
373 if( one.j() > two.j() )
382 if( one->j() < two->j() )
384 if( one->j() > two->j() )
390 static int pRowSort( const void * left, const void * right ) {
393 if( one->j() < two->j() )
395 if( one->j() > two->j() )
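These comparators follow the qsort convention (negative, zero, positive). A usage sketch for compareTriplets, assuming a std::vector< Triplet< double > > named input:

    qsort( &input[ 0 ], input.size(), sizeof( Triplet< double > ),
           &compareTriplets );   // row-major order, columns ascending within rows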
421 vecBICRS( const ULI number_of_nonzeros, const ULI number_of_rows, const ULI number_of_cols, T zero ):
422 SparseMatrix< T, ULI >( number_of_nonzeros, number_of_cols, number_of_rows, zero ) {
437 this->r_start = toCopy.r_start;
438 this->c_start = toCopy.c_start;
442 for( size_t i = 0; i < this->allocsize; ++i ) {
443 ds[ i ] = toCopy.ds[ i ];
446 for( size_t i = 0; i < this->jumpsize; ++i ) {
463 load( input, m, n, zero );
474 if( log2(this->noc) > sizeof( _i_value ) * 8 ) {
475 std::cerr << "Warning: the matrix with " << this->noc << " columns cannot be represented within " << (sizeof( _i_value )*8) << "-bit index values this vecBICRS instance uses! Attempting to continue anyway..." << std::endl;
477 if( log2(this->nor) > sizeof( _i_value ) * 8 ) {
478 std::cerr << "Warning: the matrix with " << this->nor << " rows cannot be represented within " << (sizeof( _i_value )*8) << "-bit index values this vecBICRS instance uses! Attempting to continue anyway..." << std::endl;
480 if( m==0 || n==0 || input.size() == 0 ) {
503 std::vector< size_t > indices[ block_length_row ];
509 std::vector< _i_value > c_ind;
510 std::vector< _i_value > r_ind;
517 bool first_row = false;
520 _i_value prev_row = 0;
523 _i_value prev_col = 0;
533 size_t cit[ block_length_row ];
534 for( size_t i = 0; i < block_length_row; ++i ) {
544 const size_t top_left = c_ind.size();
546 for( size_t i = 0; i < block_length_row; ++i ) {
548 for( size_t j = 0; j < block_length_column; ++j ) {
550 if( cit[ i ] == indices[ i ].size() ) {
552 c_ind.push_back( 0 );
558 const size_t k = indices[ i ][ cit[ i ] ];
562 c_ind.push_back( input[ k ].j() - prev_col );
563 ds .push_back( input[ k ].value );
566 prev_col = input[ k ].j();
571 c_ind[ c_ind.size() - 1 ] += this->noc;
577 std::swap( c_ind[ c_ind.size() - 1 ], c_ind[ top_left ]);
586 for( size_t i = 0; i < block_length_row; ++i ) {
587 if( cit[ i ] < indices[ i ].size() ) {
592 } while( !depleted );
594 for( size_t i = 0; i < block_length_row; ++i ) {
595 indices[ i ].clear();
599 } while( k < input.size() );
602 assert( c_ind.size() == ds.size() );
611 c_ind.push_back( this->noc );
616 c_ind.push_back( 0 );
621 assert( c_ind.size() == ds.size() );
624 assert( c_ind.size() % block_length == 0 );
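The overflow encoding applied at lines 571 and 611 deserves a scalar model: adding noc to a column increment pushes the column position past the vector end, which the multiply kernels detect as a row change. A sketch with a hypothetical noc of 100:

    const int noc = 100;
    const int inc[ 3 ] = { 5, 2, 103 };
    int col = 0, row = 0;
    for( int i = 0; i < 3; ++i ) {
        col += inc[ i ];
        if( col >= noc ) { col -= noc; ++row; }   // overflow signals a row jump
    }
    // ends with row == 1 and col == 10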
630 this->nnz = input.size();
633 const size_t wasted_space = allocsize - this->nnz;
646 for( size_t i = 0; i < r_ind.size(); ++i ) {
648 this->r_ind[ i ] = r_ind[ i ];
652 for( size_t i = r_ind.size(); i < r_ind.size() + block_length; ++i ) {
653 this->r_ind[ i ] = 0;
657 for( size_t i = 0; i < c_ind.size(); ++i ) {
658 this->c_ind[ i ] = c_ind[ i ];
659 this->ds [ i ] = ds [ i ];
667 row = static_cast< ULI >( *( this->r_ind ) );
668 col = static_cast< ULI >( *( this->c_ind ) );
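A usage sketch, where A is a hypothetical, already-loaded vecBICRS instance:

    ULI row, col;
    A.getFirstIndexPair( row, col );   // start offsets of the first nonzero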
696 virtual void zxa( const T*__restrict__ pDataX, T*__restrict__ pDataZ ) {
697 if( this->nor == 0 || this->noc == 0 || this->nnz == 0 ) return;
698 T *__restrict__ pDataA = ds;
701 const T * const pDataZend = pDataZ + this->noc;
704 _i_value *__restrict__ pIncRow = r_ind;
705 _i_value *__restrict__ pIncCol = c_ind;
708 #ifdef __INTEL_COMPILER
709 __declspec(align(64)) _i_value c_ind_buffer[ block_length ];
713 _i_value c_ind_buffer[ block_length ] __attribute__((aligned));
714 T input_buffer[ block_length ] __attribute__((aligned));
715 T outputbuffer[ block_length ] __attribute__((aligned));
720 c_ind_buffer[ i ] = *pIncCol++;
722 pDataZ += c_ind_buffer[ 0 ];
724 c_ind_buffer[ 0 ] = 0;
726 pDataX += *pIncRow++;
728 while( pDataA < pDataAend ) {
730 while( pDataZ < pDataZend ) {
733 input_buffer[ i ] = pDataX[ c_ind_buffer[ i ] ];
736 outputbuffer[ i ] = pDataA[ i ] * input_buffer[ i ];
739 *pDataZ += outputbuffer[ i ];
744 c_ind_buffer[ i ] = *pIncCol++;
746 pDataZ += c_ind_buffer[ 0 ];
748 c_ind_buffer[ 0 ] = 0;
753 pDataX += *pIncRow++;
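A calling sketch for z = xA with hypothetical dense vectors; the output is overallocated by one element, following the advice printed by addPadding for matrices too small to vectorise cleanly:

    std::vector< double > x( A.m(), 1.0 );
    std::vector< double > z( A.n() + 1, 0.0 );   // +1 guards the padded read
    A.zxa( &x[ 0 ], &z[ 0 ] );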
758 #if defined __INTEL_COMPILER && defined __KNC__
760 void zax1by8( const T*__restrict__ pDataX, T*__restrict__ pDataZ ) {
762 if( this->nor == 0 || this->noc == 0 || this->nnz == 0 ) return;
763 T *__restrict__ pDataA = ds;
767 const T * const pDataXend = pDataX + this->noc;
769 const T * const pDataZst = pDataZ;
770 const T * const pDataXst = pDataX;
771 const T * const pDataZend = pDataZ + this->nor;
773 _i_value *__restrict__ pIncRow = r_ind;
774 _i_value *__restrict__ pIncCol = c_ind;
777 assert( static_cast< ULI >( *r_ind ) < this->nor );
778 assert( static_cast< ULI >( *c_ind ) < this->noc );
781 #ifdef __INTEL_COMPILER
782 __declspec(align(32)) _i_value c_ind_buffer[ block_length ];
783 __declspec(align(32)) _i_value r_ind_buffer[ block_length ];
784 __declspec(align(32)) T input_buffer[ block_length ];
785 __declspec(align(32)) T outputbuffer[ block_length ];
787 _i_value c_ind_buffer[ block_length ] __attribute__((aligned));
788 _i_value r_ind_buffer[ block_length ] __attribute__((aligned));
789 T input_buffer[ block_length ] __attribute__((aligned));
790 T outputbuffer[ block_length ] __attribute__((aligned));
795 c_ind_buffer[ i ] = *pIncCol++;
796 if( block_length_row > 1 ) {
797 r_ind_buffer[ i ] = *pIncRow++;
799 outputbuffer[ i ] = 0;
802 pDataX += c_ind_buffer[ 0 ];
803 pDataZ += *pIncRow++;
805 c_ind_buffer[ 0 ] = 0;
806 r_ind_buffer[ 0 ] = 0;
810 while( pDataA < pDataAend ) {
812 while( pDataX < pDataXend ) {
814 assert( pDataA < pDataAend );
815 assert( pDataX >= pDataXst );
816 assert( pDataX < pDataXend );
817 assert( pDataZ >= pDataZst );
818 assert( pDataZ < pDataZend );
819 assert( pDataX + this->noc >= pDataXend );
820 assert( pDataA + this->allocsize >= pDataAend );
821 assert( pDataZ + this->nor >= pDataZend );
823 std::cout << "Input vector at " << ((pDataXend-pDataX)<0 ? (pDataX-pDataXend) : (this->noc+pDataX-pDataXend)) << std::endl;
824 std::cout << "Output vector at " << (this->nor-(pDataZend-pDataZ)) << std::endl;
825 std::cout << "Now at block nr. " << (this->allocsize-block_length-(pDataAend-pDataA))/block_length << std::endl;
836 input_buffer[ i ] = pDataX[ c_ind_buffer[ i ] ];
847 outputbuffer[ i ] += pDataA[ i ] * input_buffer[ i ];
853 c_ind_buffer[ i ] = *pIncCol++;
856 pDataX += c_ind_buffer[ 0 ];
858 c_ind_buffer[ 0 ] = 0;
865 *pDataZ += outputbuffer[ i ];
868 pDataZ += *pIncRow++;
871 outputbuffer[ i ] = 0;
887 virtual void zax( const T*__restrict__ pDataX, T*__restrict__ pDataZ ) {
889 if( block_length_row == 1 ) {
890 zax1by8( pDataX, pDataZ );
894 if( this->nor == 0 || this->noc == 0 || this->nnz == 0 ) return;
895 T *__restrict__ pDataA = ds;
899 const T * const pDataXend = pDataX + this->noc;
901 const T * const pDataZst = pDataZ;
902 const T * const pDataXst = pDataX;
903 const T * const pDataZend = pDataZ + this->nor;
905 _i_value *__restrict__ pIncRow = r_ind;
906 _i_value *__restrict__ pIncCol = c_ind;
909 assert( static_cast< ULI >( *r_ind ) < this->nor );
910 assert( static_cast< ULI >( *c_ind ) < this->noc );
913 #ifdef __INTEL_COMPILER
914 __declspec(align(32)) _i_value c_ind_buffer[ block_length ];
915 __declspec(align(32)) _i_value r_ind_buffer[ block_length ];
916 __declspec(align(32)) T input_buffer[ block_length ];
917 __declspec(align(32)) T outputbuffer[ block_length ];
918 __declspec(align(32)) T z_shadow [ block_length ];
920 _i_value c_ind_buffer[ block_length ] __attribute__((aligned));
921 _i_value r_ind_buffer[ block_length ] __attribute__((aligned));
922 T input_buffer[ block_length ] __attribute__((aligned));
923 T outputbuffer[ block_length ] __attribute__((aligned));
929 c_ind_buffer[ i ] = *pIncCol++;
930 if( block_length_row > 1 ) {
931 r_ind_buffer[ i ] = *pIncRow++;
933 outputbuffer[ i ] = 0;
936 pDataX += c_ind_buffer[ 0 ];
937 pDataZ += r_ind_buffer[ 0 ];
939 c_ind_buffer[ 0 ] = 0;
940 r_ind_buffer[ 0 ] = 0;
945 z_shadow[ i ] = pDataZ[ r_ind_buffer[ i ] ];
948 while( pDataA < pDataAend ) {
950 while( pDataX < pDataXend ) {
952 assert( pDataA < pDataAend );
953 assert( pDataX >= pDataXst );
954 assert( pDataX < pDataXend );
955 assert( pDataZ >= pDataZst );
956 assert( pDataZ < pDataZend );
957 assert( pDataX + this->noc >= pDataXend );
958 assert( pDataA + this->allocsize >= pDataAend );
959 assert( pDataZ + this->nor >= pDataZend );
961 std::cout << "Input vector at " << ((pDataXend-pDataX)<0 ? (pDataX-pDataXend) : (this->noc+pDataX-pDataXend)) << std::endl;
962 std::cout << "Output vector at " << (this->nor-(pDataZend-pDataZ)) << std::endl;
963 std::cout << "Now at block nr. " << (this->allocsize-block_length-(pDataAend-pDataA))/block_length << std::endl;
974 input_buffer[ i ] = pDataX[ c_ind_buffer[ i ] ];
985 outputbuffer[ i ] += pDataA[ i ] * input_buffer[ i ];
991 c_ind_buffer[ i ] = *pIncCol++;
994 pDataX += c_ind_buffer[ 0 ];
996 c_ind_buffer[ 0 ] = 0;
998 switch( block_length_row ) {
1008 z_shadow[ i ] += outputbuffer[ i ];
1013 assert( block_length_row > 1 );
1015 for( size_t b = 0; b < block_length_row; ++b ) {
1017 const size_t z_shadow_index = b * block_length_column + blockrow;
1019 for( size_t i = 0; i < block_length_column; ++i ) {
1021 const size_t o_buffer_index = b * block_length_column + i;
1023 z_shadow[ z_shadow_index ] += outputbuffer[ o_buffer_index ];
1032 outputbuffer[ i ] = 0;
1035 pDataX -= this->noc;
1038 if( blockrow == block_length_column ) {
1041 pDataZ[ r_ind_buffer[ i ] ] = z_shadow[ i ];
1049 r_ind_buffer[ i ] = *pIncRow++;
1052 pDataZ += r_ind_buffer[ 0 ];
1054 r_ind_buffer[ 0 ] = 0;
1063 z_shadow[ i ] = pDataZ[ r_ind_buffer[ i ] ];
1072 pDataZ[ r_ind_buffer[ i ] ] = z_shadow[ i ];
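The z_shadow mechanism in a scalar model (a reading aid, with hypothetical surrounding declarations): block_length output entries are gathered once, accumulated across a whole sweep of block columns, and scattered back only when the block row is exhausted, so each output element is touched once per block row rather than once per block:

    double z_shadow[ block_length ];
    for( size_t i = 0; i < block_length; ++i )
        z_shadow[ i ] = pDataZ[ r_ind_buffer[ i ] ];   // gather once
    /* ... all blocks of this block row accumulate into z_shadow ... */
    for( size_t i = 0; i < block_length; ++i )
        pDataZ[ r_ind_buffer[ i ] ] = z_shadow[ i ];   // scatter once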
1089 virtual void zax( const T*__restrict__ pDataX, T*__restrict__ pDataZ ) {
1091 if( this->nor == 0 || this->noc == 0 || this->nnz == 0 ) return;
1092 T *__restrict__ pDataA = ds;
1096 const T * const pDataXend = pDataX + this->noc;
1098 const T * const pDataZst = pDataZ;
1099 const T * const pDataXst = pDataX;
1100 const T * const pDataZend = pDataZ + this->nor;
1102 _i_value *__restrict__ pIncRow = r_ind;
1103 _i_value *__restrict__ pIncCol = c_ind;
1106 assert( static_cast< ULI >( *r_ind ) < this->nor );
1107 assert( static_cast< ULI >( *c_ind ) < this->noc );
1110 #ifdef __INTEL_COMPILER
1111 __declspec(align(32)) _i_value c_ind_buffer[ block_length ];
1112 __declspec(align(32)) _i_value r_ind_buffer[ block_length ];
1117 _i_value c_ind_buffer[ block_length ] __attribute__((aligned));
1118 _i_value r_ind_buffer[ block_length ] __attribute__((aligned));
1119 T input_buffer[ block_length ] __attribute__((aligned));
1120 T outputbuffer[ block_length ] __attribute__((aligned));
1126 c_ind_buffer[ i ] = *pIncCol++;
1127 r_ind_buffer[ i ] = *pIncRow++;
1128 outputbuffer[ i ] = 0;
1131 pDataX += c_ind_buffer[ 0 ];
1132 pDataZ += r_ind_buffer[ 0 ];
1134 c_ind_buffer[ 0 ] = 0;
1135 r_ind_buffer[ 0 ] = 0;
1137 size_t blockrow = 0;
1140 z_shadow[ i ] = pDataZ[ r_ind_buffer[ i ] ];
1143 while( pDataA < pDataAend ) {
1145 while( pDataX < pDataXend ) {
1147 assert( pDataA < pDataAend );
1148 assert( pDataX >= pDataXst );
1149 assert( pDataX < pDataXend );
1150 assert( pDataZ >= pDataZst );
1151 assert( pDataZ < pDataZend );
1152 assert( pDataX + this->noc >= pDataXend );
1153 assert( pDataA + this->allocsize >= pDataAend );
1154 assert( pDataZ + this->nor >= pDataZend );
1156 std::cout << "Input vector at " << ((pDataXend-pDataX)<0 ? (pDataX-pDataXend) : (this->noc+pDataX-pDataXend)) << std::endl;
1157 std::cout << "Output vector at " << (this->nor-(pDataZend-pDataZ)) << std::endl;
1158 std::cout << "Now at block nr. " << (this->allocsize-block_length-(pDataAend-pDataA))/block_length << std::endl;
1169 input_buffer[ i ] = pDataX[ c_ind_buffer[ i ] ];
1180 outputbuffer[ i ] += pDataA[ i ] * input_buffer[ i ];
1186 c_ind_buffer[ i ] = *pIncCol++;
1189 pDataX += c_ind_buffer[ 0 ];
1191 c_ind_buffer[ 0 ] = 0;
1194 for( size_t b = 0; b < block_length_row; ++b ) {
1196 for( size_t i = 0; i < block_length_column; ++i ) {
1197 z_shadow[ b + block_length_row * blockrow ] += outputbuffer[ b * block_length_column + i ];
1214 outputbuffer[ i ] = 0;
1217 pDataX -= this->noc;
1220 if( blockrow == block_length_column ) {
1223 pDataZ[ r_ind_buffer[ i ] ] = z_shadow[ i ];
1231 r_ind_buffer[ i ] = *pIncRow++;
1234 pDataZ += r_ind_buffer[ 0 ];
1236 r_ind_buffer[ 0 ] = 0;
1245 z_shadow[ i ] = pDataZ[ r_ind_buffer[ i ] ];
1254 pDataZ[ r_ind_buffer[ i ] ] = z_shadow[ i ];
1277 #if defined __INTEL_COMPILER && defined __KNC__
1280 #define _mm512_prefetch_i32gather_pd_0hint_init() \
1281 register __mmask8 fullMask asm("sil") = 0xff;
1283 #define _mm512_prefetch_i32gather_pd_0hint( index, mv ) \
1285 "vgatherpf0hintdpd (%1, %0, 8) {{%2}}\n" \
1287 : "x" (index), "g" (mv), "r" (fullMask) \
1305 union __m512d_union {
1311 union __m512i_union {
1313 int32_t array[ 16 ];
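The unions give lane-wise access to SIMD registers; a sketch of the readout idiom used below, assuming the double-precision union pairs a __m512d member named internal with an 8-element double array view:

    __m512d_union acc;
    acc.internal = _mm512_setzero_pd();
    /* ... fused multiply-adds accumulate into acc.internal ... */
    const double row_sum = acc.array[ 0 ];   // scalar view of lane 0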
1318 ds = (double*) _mm_malloc( this->allocsize * sizeof( double ), _SL_ALIGN_DOUBLE );
1319 c_ind = (int16_t*) _mm_malloc( this->allocsize * sizeof( int16_t ), _SL_ALIGN_INT16 );
1320 r_ind = (int16_t*) _mm_malloc( (this->jumpsize + 1) * sizeof( int16_t ), _SL_ALIGN_INT16 );
1321 bytes = sizeof( int16_t ) * (this->jumpsize + this->allocsize + 1) + sizeof( double ) * this->allocsize;
1326 ds = (double*) _mm_malloc( this->allocsize * sizeof( double ), _SL_ALIGN_DOUBLE );
1327 c_ind = (int16_t*) _mm_malloc( this->allocsize * sizeof( int16_t ), _SL_ALIGN_INT16 );
1328 r_ind = (int16_t*) _mm_malloc( (this->jumpsize + 1) * sizeof( int16_t ), _SL_ALIGN_INT16 );
1329 bytes = sizeof( int16_t ) * (this->jumpsize + this->allocsize + 1) + sizeof( double ) * this->allocsize;
1334 ds = (double*) _mm_malloc( this->allocsize * sizeof( double ), _SL_ALIGN_DOUBLE );
1335 c_ind = (int16_t*) _mm_malloc( this->allocsize * sizeof( int16_t ), _SL_ALIGN_INT16 );
1336 r_ind = (int16_t*) _mm_malloc( (this->jumpsize + 1) * sizeof( int16_t ), _SL_ALIGN_INT16 );
1337 bytes = sizeof( int16_t ) * (this->jumpsize + this->allocsize + 1) + sizeof( double ) * this->allocsize;
1342 ds = (double*) _mm_malloc( this->allocsize * sizeof( double ), _SL_ALIGN_DOUBLE );
1343 c_ind = (int16_t*) _mm_malloc( this->allocsize * sizeof( int16_t ), _SL_ALIGN_INT16 );
1344 r_ind = (int16_t*) _mm_malloc( (this->jumpsize + 1) * sizeof( int16_t ), _SL_ALIGN_INT16 );
1345 bytes = sizeof( int16_t ) * (this->jumpsize + this->allocsize + 1) + sizeof( double ) * this->allocsize;
1350 ds = (double*) _mm_malloc( this->allocsize * sizeof( double ), _SL_ALIGN_DOUBLE );
1351 c_ind = (int32_t*) _mm_malloc( this->allocsize * sizeof( int32_t ), _SL_ALIGN_INT32 );
1352 r_ind = (int32_t*) _mm_malloc( (this->jumpsize + 1) * sizeof( int32_t ), _SL_ALIGN_INT32 );
1353 bytes = sizeof( int32_t ) * (this->jumpsize + this->allocsize + 1) + sizeof( double ) * this->allocsize;
1360 if( this->nor == 0 || this->noc == 0 || this->nnz == 0 ) {
1365 _mm512_prefetch_i32gather_pd_0hint_init();
1368 const double * const pDataAend = ds + this->allocsize - block_length;
1369 const double * const pDataXend = pDataX + this->noc;
1370 double *__restrict__ pDataA = ds;
1371 int16_t *__restrict__ pIncRow = r_ind;
1372 int16_t *__restrict__ pIncCol = c_ind;
1375 __m512i c_ind_buffer;
1376 __m512d input_buffer;
1377 __m512d value_buffer;
1378 __m512d outputbuffer;
1379 __m512i zeroF = _mm512_set_epi32(
1380 1, 1, 1, 1, 1, 1, 1, 0,
1381 1, 1, 1, 1, 1, 1, 1, 0
1385 outputbuffer = _mm512_setzero_pd();
1388 c_ind_buffer = _mm512_extload_epi32( pIncCol, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
1394 pIncCol += block_length;
1397 c_ind_buffer = _mm512_mullo_epi32( c_ind_buffer, zeroF );
1400 _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
1403 pDataZ += *pIncRow++;
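The zeroF multiply at line 1397 is the vector form of the scalar idiom at lines 722 to 724: the first increment of each index block is folded into the vector pointer itself, so the corresponding lane must be zeroed before the gather (zeroF also zeroes lane 8, which appears to cover the permuted-down second half used later). A scalar model:

    pDataX += c_ind_buffer[ 0 ];   // pointer absorbs the first increment
    c_ind_buffer[ 0 ] = 0;         // lane 0 then gathers pDataX[ 0 ]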
1406 while( pDataA < pDataAend ) {
1409 while( pDataX < pDataXend ) {
1412 input_buffer = _mm512_i32loextgather_pd( c_ind_buffer, pDataX, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
1415 value_buffer = _mm512_load_pd( pDataA );
1418 outputbuffer = _mm512_fmadd_pd( value_buffer, input_buffer, outputbuffer );
1421 pDataA += block_length;
1424 c_ind_buffer = _mm512_permute4f128_epi32( c_ind_buffer, _MM_PERM_DCDC );
1430 _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
1433 pIncCol += block_length;
1436 if( pDataX >= pDataXend ) {
1439 *pDataZ += _mm512_reduce_add_pd( outputbuffer );
1442 outputbuffer = _mm512_setzero_pd();
1445 pDataX -= this->noc;
1448 pDataZ += *pIncRow++;
1451 if( pDataA >= pDataAend ) {
1457 input_buffer = _mm512_i32loextgather_pd( c_ind_buffer, pDataX, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
1460 value_buffer = _mm512_load_pd( pDataA );
1463 outputbuffer = _mm512_fmadd_pd( value_buffer, input_buffer, outputbuffer );
1466 pDataA += block_length;
1469 c_ind_buffer = _mm512_extload_epi32( pIncCol, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
1475 pIncCol += block_length;
1478 c_ind_buffer = _mm512_mullo_epi32( c_ind_buffer, zeroF );
1481 _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
1486 *pDataZ += _mm512_reduce_add_pd( outputbuffer );
1489 outputbuffer = _mm512_setzero_pd();
1492 pDataX -= this->noc;
1495 pDataZ += *pIncRow++;
1503 if( this->nor == 0 || this->noc == 0 || this->nnz == 0 ) {
1508 _mm512_prefetch_i32gather_pd_0hint_init();
1515 const double * const pDataAend = ds + this->allocsize - block_length;
1516 const double * const pDataXend = pDataX + this->noc;
1517 double *__restrict__ pDataA = ds;
1518 int16_t *__restrict__ pIncRow = r_ind;
1519 int16_t *__restrict__ pIncCol = c_ind;
1525 __m512i c_ind_buffer;
1526 __m512d input_buffer;
1527 __m512d value_buffer;
1528 __m512d_union outputbuffer;
1529 __m512i zeroF = _mm512_set_epi32(
1530 1, 1, 1, 1, 1, 1, 1, 0,
1531 1, 1, 1, 1, 1, 1, 1, 0
1535 outputbuffer.internal = _mm512_setzero_pd();
1538 c_ind_buffer = _mm512_extload_epi32( pIncCol, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
1557 pIncCol += block_length;
1560 c_ind_buffer = _mm512_mullo_epi32( c_ind_buffer, zeroF );
1563 _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
1566 pDataZ += *pIncRow++;
1572 while( pDataA < pDataAend ) {
1575 while( pDataX < pDataXend ) {
1593 input_buffer = _mm512_i32loextgather_pd( c_ind_buffer, pDataX, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
1599 value_buffer = _mm512_load_pd( pDataA );
1605 outputbuffer.internal = _mm512_fmadd_pd( value_buffer, input_buffer, outputbuffer.internal );
1611 pDataA += block_length;
1617 c_ind_buffer = _mm512_permute4f128_epi32( c_ind_buffer, _MM_PERM_DCDC );
1626 _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
1629 pIncCol += block_length;
1635 if( pDataX >= pDataXend ) {
1638 const __m512d reduce_buffer1 = _mm512_swizzle_pd( outputbuffer.internal, _MM_SWIZ_REG_CDAB );
1639 outputbuffer.internal = _mm512_add_pd( outputbuffer.internal, reduce_buffer1 );
1641 const __m512d reduce_buffer2 = _mm512_swizzle_pd( outputbuffer.internal, _MM_SWIZ_REG_BADC );
1642 outputbuffer.internal = _mm512_add_pd( outputbuffer.internal, reduce_buffer2 );
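The two swizzle-add steps form an in-register butterfly; a worked model showing why lane 0 and lane 4 afterwards hold the two per-row sums read at lines 1649 and 1661:

    // lanes:        {  1,  2,  3,  4,   5,  6,  7,  8 }
    // + CDAB pairs: {  3,  3,  7,  7,  11, 11, 15, 15 }
    // + BADC pairs: { 10, 10, 10, 10,  26, 26, 26, 26 }
    // array[ 0 ] == 10 and array[ 4 ] == 26: one sum per 4-lane group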
1649 *pDataZ += outputbuffer.array[ 0 ];
1652 pDataX -= this->noc;
1655 pDataZ += *pIncRow++;
1661 *pDataZ += outputbuffer.array[ 4 ];
1664 outputbuffer.internal = _mm512_setzero_pd();
1667 pDataZ += *pIncRow++;
1670 if( pDataA >= pDataAend ) {
1690 input_buffer = _mm512_i32loextgather_pd( c_ind_buffer, pDataX, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
1696 value_buffer = _mm512_load_pd( pDataA );
1702 outputbuffer.internal = _mm512_fmadd_pd( value_buffer, input_buffer, outputbuffer.internal );
1708 pDataA += block_length;
1714 c_ind_buffer = _mm512_extload_epi32( pIncCol, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
1723 pIncCol += block_length;
1729 c_ind_buffer = _mm512_mullo_epi32( c_ind_buffer, zeroF );
1732 _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
1740 const __m512d reduce_buffer1 = _mm512_swizzle_pd( outputbuffer.internal, _MM_SWIZ_REG_CDAB );
1741 outputbuffer.internal = _mm512_add_pd( outputbuffer.internal, reduce_buffer1 );
1743 const __m512d reduce_buffer2 = _mm512_swizzle_pd( outputbuffer.internal, _MM_SWIZ_REG_BADC );
1744 outputbuffer.internal = _mm512_add_pd( outputbuffer.internal, reduce_buffer2 );
1751 *pDataZ += outputbuffer.array[ 0 ];
1754 pDataX -= this->noc;
1757 pDataZ += *pIncRow++;
1763 *pDataZ += outputbuffer.array[ 4 ];
1766 outputbuffer.internal = _mm512_setzero_pd();
1769 pDataZ += *pIncRow++;
1781 if( this->nor == 0 || this->noc == 0 || this->nnz == 0 ) {
1786 _mm512_prefetch_i32gather_pd_0hint_init();
1793 const double * const pDataAend = ds + this->allocsize - block_length;
1794 const double * const pDataXend = pDataX + this->noc;
1795 double *__restrict__ pDataA = ds;
1796 int16_t *__restrict__ pIncRow = r_ind;
1797 int16_t *__restrict__ pIncCol = c_ind;
1804 __m512i c_ind_buffer;
1805 __m512i r_ind_buffer;
1806 __m512d input_buffer;
1807 __m512d value_buffer;
1808 __m512d_union outputbuffer;
1809 __m512i zeroF = _mm512_set_epi32(
1810 1, 1, 1, 1, 1, 1, 1, 0,
1811 1, 1, 1, 1, 1, 1, 1, 0
1813 __mmask8 mask1 = 0x55;
1814 __mmask8 mask2 = 0xAA;
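mask1 and mask2 select the even lanes (0x55: lanes 0, 2, 4, 6) and odd lanes (0xAA: lanes 1, 3, 5, 7) respectively, so one z_shadow register can accumulate two interleaved block rows; maskshift below alternates between them. A sketch of the masked accumulate:

    // even lanes take the current block row's sums, odd lanes the next one's:
    z_shadow = _mm512_mask_add_pd( z_shadow, mask1, z_shadow, outputbuffer.internal );
    /* ... after the next reduction completes ... */
    z_shadow = _mm512_mask_add_pd( z_shadow, mask2, z_shadow, outputbuffer.internal );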
1817 outputbuffer.internal = _mm512_setzero_pd();
1820 c_ind_buffer = _mm512_extload_epi32( pIncCol, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
1839 pIncCol += block_length;
1842 c_ind_buffer = _mm512_mullo_epi32( c_ind_buffer, zeroF );
1845 r_ind_buffer = _mm512_extload_epi32( pIncRow, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
1848 r_ind_buffer = _mm512_mullo_epi32( r_ind_buffer, zeroF );
1851 _mm512_prefetch_i32gather_pd_0hint( r_ind_buffer, pDataZ );
1852 _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
1858 pIncRow += block_length;
1861 z_shadow = _mm512_i32loextgather_pd( r_ind_buffer, pDataZ, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
1864 bool colshift, rowshift, maskshift;
colshift = rowshift = maskshift = true;
1871 while( pDataA < pDataAend ) {
1874 while( pDataX < pDataXend ) {
1892 input_buffer = _mm512_i32loextgather_pd( c_ind_buffer, pDataX, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
1898 value_buffer = _mm512_load_pd( pDataA );
1904 outputbuffer.internal = _mm512_fmadd_pd( value_buffer, input_buffer, outputbuffer.internal );
1910 pDataA += block_length;
1920 c_ind_buffer = _mm512_permute4f128_epi32( c_ind_buffer, _MM_PERM_DCDC );
1929 c_ind_buffer = _mm512_extload_epi32( pIncCol, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
1935 c_ind_buffer = _mm512_mullo_epi32( c_ind_buffer, zeroF );
1940 colshift = !colshift;
1952 _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
1955 pIncCol += block_length;
1961 if( pDataX >= pDataXend ) {
1966 const __m512d reduce_buffer1 = _mm512_swizzle_pd( outputbuffer.internal, _MM_SWIZ_REG_CDAB );
1967 outputbuffer.internal = _mm512_add_pd( outputbuffer.internal, reduce_buffer1 );
1984 z_shadow = _mm512_mask_add_pd( z_shadow, mask1, z_shadow, outputbuffer.internal );
1989 z_shadow = _mm512_mask_add_pd( z_shadow, mask2, z_shadow, outputbuffer.internal );
2018 _mm512_i32loextscatter_pd( pDataZ, r_ind_buffer, z_shadow, _MM_DOWNCONV_PD_NONE, 8, _MM_HINT_NONE );
2029 r_ind_buffer = _mm512_permute4f128_epi32( r_ind_buffer, _MM_PERM_DCDC );
2032 r_ind_buffer = _mm512_extload_epi32( pIncRow, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
2035 r_ind_buffer = _mm512_mullo_epi32( r_ind_buffer, zeroF );
2037 rowshift = !rowshift;
2040 _mm512_prefetch_i32gather_pd_0hint( r_ind_buffer, pDataZ );
2043 pIncRow += block_length;
2049 z_shadow = _mm512_i32loextgather_pd( r_ind_buffer, pDataZ, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
2051 maskshift = !maskshift;
2054 pDataX -= this->noc;
2057 outputbuffer.internal = _mm512_setzero_pd();
2060 if( pDataA >= pDataAend ) {
2084 input_buffer = _mm512_i32loextgather_pd( c_ind_buffer, pDataX, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
2090 value_buffer = _mm512_load_pd( pDataA );
2096 outputbuffer.internal = _mm512_fmadd_pd( value_buffer, input_buffer, outputbuffer.internal );
2102 pDataA += block_length;
2112 c_ind_buffer = _mm512_permute4f128_epi32( c_ind_buffer, _MM_PERM_DCDC );
2121 c_ind_buffer = _mm512_extload_epi32( pIncCol, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
2127 c_ind_buffer = _mm512_mullo_epi32( c_ind_buffer, zeroF );
2132 colshift = !colshift;
2144 pIncCol += block_length;
2147 _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
2154 const __m512d reduce_buffer1 = _mm512_swizzle_pd( outputbuffer.internal, _MM_SWIZ_REG_CDAB );
2155 outputbuffer.internal = _mm512_add_pd( outputbuffer.internal, reduce_buffer1 );
2172 z_shadow = _mm512_mask_add_pd( z_shadow, mask1, z_shadow, outputbuffer.internal );
2177 z_shadow = _mm512_mask_add_pd( z_shadow, mask2, z_shadow, outputbuffer.internal );
2206 _mm512_i32loextscatter_pd( pDataZ, r_ind_buffer, z_shadow, _MM_DOWNCONV_PD_NONE, 8, _MM_HINT_NONE );
2217 r_ind_buffer = _mm512_permute4f128_epi32( r_ind_buffer, _MM_PERM_DCDC );
2220 r_ind_buffer = _mm512_extload_epi32( pIncRow, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
2223 r_ind_buffer = _mm512_mullo_epi32( r_ind_buffer, zeroF );
2225 rowshift = !rowshift;
2228 _mm512_prefetch_i32gather_pd_0hint( r_ind_buffer, pDataZ );
2231 pIncRow += block_length;
2237 z_shadow = _mm512_i32loextgather_pd( r_ind_buffer, pDataZ, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
2239 maskshift = !maskshift;
2242 outputbuffer.internal = _mm512_setzero_pd();
2245 pDataX -= this->noc;
2252 _mm512_i32loextscatter_pd( pDataZ, r_ind_buffer, z_shadow, _MM_DOWNCONV_PD_NONE, 8, _MM_HINT_NONE );
2260 if( this->nor == 0 || this->noc == 0 || this->nnz == 0 ) {
2265 _mm512_prefetch_i32gather_pd_0hint_init();
2272 double *__restrict__ pDataA = ds;
2273 const double * const pDataAend = ds + this->allocsize - block_length;
2274 const double * const pDataXend = pDataX + this->noc;
2275 int16_t *__restrict__ pIncRow = r_ind;
2276 int16_t *__restrict__ pIncCol = c_ind;
2280 __m512i c_ind_buffer;
2281 __m512i r_ind_buffer;
2282 __m512d input_buffer;
2283 __m512d value_buffer;
2284 __m512i zeroF = _mm512_set_epi32(
2285 1, 1, 1, 1, 1, 1, 1, 0,
2286 1, 1, 1, 1, 1, 1, 1, 0
2290 c_ind_buffer = _mm512_extload_epi32( pIncCol, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
2303 pIncCol += block_length;
2306 c_ind_buffer = _mm512_mullo_epi32( c_ind_buffer, zeroF );
2309 r_ind_buffer = _mm512_extload_epi32( pIncRow, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
2312 r_ind_buffer = _mm512_mullo_epi32( r_ind_buffer, zeroF );
2315 _mm512_prefetch_i32gather_pd_0hint( r_ind_buffer, pDataZ );
2316 _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
2322 pIncRow += block_length;
2325 z_shadow = _mm512_i32loextgather_pd( r_ind_buffer, pDataZ, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
2328 bool colshift, rowshift;
2329 colshift = rowshift = true;
2335 while( pDataA < pDataAend ) {
2337 while( pDataX < pDataXend ) {
2357 input_buffer = _mm512_i32loextgather_pd( c_ind_buffer, pDataX, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
2360 value_buffer = _mm512_load_pd( pDataA );
2363 z_shadow = _mm512_fmadd_pd( value_buffer, input_buffer, z_shadow );
2366 pDataA += block_length;
2374 c_ind_buffer = _mm512_permute4f128_epi32( c_ind_buffer, _MM_PERM_DCDC );
2377 c_ind_buffer = _mm512_extload_epi32( pIncCol, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
2380 c_ind_buffer = _mm512_mullo_epi32( c_ind_buffer, zeroF );
2382 colshift = !colshift;
2388 _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
2391 pIncCol += block_length;
2398 _mm512_i32loextscatter_pd( pDataZ, r_ind_buffer, z_shadow, _MM_DOWNCONV_PD_NONE, 8, _MM_HINT_NONE );
2403 r_ind_buffer = _mm512_permute4f128_epi32( r_ind_buffer, _MM_PERM_DCDC );
2406 r_ind_buffer = _mm512_extload_epi32( pIncRow, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
2409 r_ind_buffer = _mm512_mullo_epi32( r_ind_buffer, zeroF );
2411 rowshift = !rowshift;
2417 _mm512_prefetch_i32gather_pd_0hint( r_ind_buffer, pDataZ );
2420 pIncRow += block_length;
2423 z_shadow = _mm512_i32loextgather_pd( r_ind_buffer, pDataZ, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
2426 pDataX -= this->noc;
ULI nnz
Number of non-zeros.
Definition: SparseMatrix.hpp:58
_i_value * r_ind
Array containing the row jumps.
Definition: vecBICRS.hpp:81
static int compareTriplets(const void *left, const void *right)
Comparison function used for sorting input data.
Definition: vecBICRS.hpp:364
void postProcessRowIncrements(std::vector< _i_value > &r_ind)
Makes the row increment array suitable for vectorisation.
Definition: vecBICRS.hpp:302
ULI i() const
Definition: Triplet.hpp:70
virtual unsigned long int m()
Queries the number of rows this matrix contains.
Definition: SparseMatrix.hpp:107
virtual size_t bytesUsed()
Function to query the amount of storage required by this sparse matrix.
Definition: vecBICRS.hpp:1270
virtual void zax(const T *__restrict__ pDataX, T *__restrict__ pDataZ)
In-place z=Ax function.
Definition: vecBICRS.hpp:1089
static void addPadding(std::vector< _i_value > &row_increments, const std::map< size_t, size_t > &row2local, const size_t blockingSize, const size_t prev_row, const size_t m)
Helper function for vecBICRS construction.
Definition: vecBICRS.hpp:135
static size_t prepareBlockIteration(std::vector< size_t > *const row_indices, std::vector< _i_value > &row_increments, _i_value &prev_row, const std::vector< Triplet< double > > &input, const size_t k, const size_t m)
Helper method for the vecBICRS constructor.
Definition: vecBICRS.hpp:221
void deallocate()
Utility function: free all memory areas.
Definition: vecBICRS.hpp:114
virtual void zxa(const T *__restrict__ pDataX, T *__restrict__ pDataZ)
In-place z=xA function.
Definition: vecBICRS.hpp:696
size_t allocsize
The number of nonzeroes allocated (may differ from the actual number of nonzeroes).
Definition: vecBICRS.hpp:87
virtual void getFirstIndexPair(ULI &row, ULI &col)
Returns the first nonzero index, per reference.
Definition: vecBICRS.hpp:666
size_t bytes
Remembers the number of bytes allocated.
Definition: vecBICRS.hpp:84
vecBICRS(vecBICRS< T > &toCopy)
Copy constructor.
Definition: vecBICRS.hpp:431
void loadFromFile(const std::string file, const T zero=0)
Function which loads a matrix from a matrix market file.
Definition: SparseMatrix.hpp:89
Interface common to all sparse matrix storage schemes.
Definition: SparseMatrix.hpp:46
static int pColumnSort(const void *left, const void *right)
Comparison function used for sorting vectorised blocks; in-row column ordering.
Definition: vecBICRS.hpp:379
ULI noc
Number of columns.
Definition: SparseMatrix.hpp:55
~vecBICRS()
Base deconstructor.
Definition: vecBICRS.hpp:1266
virtual void load(std::vector< Triplet< T > > &input, const ULI m, const ULI n, const T zero)
Definition: vecBICRS.hpp:467
vecBICRS()
Base constructor.
Definition: vecBICRS.hpp:401
ULI nor
Number of rows.
Definition: SparseMatrix.hpp:52
The vectorised bidirectional incremental compressed row storage (vecBICRS) sparse matrix data structure.
Definition: vecBICRS.hpp:65
T zero_element
The element considered to be zero.
Definition: SparseMatrix.hpp:63
vecBICRS(std::vector< Triplet< T > > &input, const ULI m, const ULI n, const T zero=0)
Constructor which transforms a collection of input triplets to CRS format.
Definition: vecBICRS.hpp:462
ULI j() const
Definition: Triplet.hpp:73
void allocate()
Utility function: allocate memory areas using the allocsize and jumpsize fields.
Definition: vecBICRS.hpp:98
static const size_t block_length
Combined blocking size.
Definition: vecBICRS.hpp:72
size_t jumpsize
The number of row jumps plus one; i.e., the length of r_ind.
Definition: vecBICRS.hpp:90
vecBICRS(std::string file, T zero=0)
Base constructor.
Definition: vecBICRS.hpp:407
T * ds
Array containing the actual nnz non-zeros.
Definition: vecBICRS.hpp:75
size_t fillIn
Fill-in (explicitly added nonzeroes to enable vectorisation).
Definition: vecBICRS.hpp:361
virtual unsigned long int n()
Queries the number of columns this matrix contains.
Definition: SparseMatrix.hpp:115
A single triplet value.
Definition: Triplet.hpp:52
vecBICRS(const ULI number_of_nonzeros, const ULI number_of_rows, const ULI number_of_cols, T zero)
Base constructor which only initialises the internal arrays.
Definition: vecBICRS.hpp:421
static int pRowSort(const void *left, const void *right)
Comparison function used for sorting vectorised blocks; row ordering.
Definition: vecBICRS.hpp:390
_i_value * c_ind
Array containing the column jumps.
Definition: vecBICRS.hpp:78