SparseLibrary  Version 1.6.0
vecBICRS.hpp
/*
 * Copyright (c) 2007-2014, A. N. Yzelman, Utrecht University 2007-2011;
 *                          KU Leuven 2011-2014.
 *                          R. H. Bisseling, Utrecht University 2007-2014.
 *
 * This file is part of the Sparse Library.
 *
 * This library was developed under supervision of Prof. dr. Rob H. Bisseling at
 * Utrecht University, from 2007 until 2011. From 2011-2014, development continued
 * at KU Leuven, where Prof. dr. Dirk Roose contributed significantly to the ideas
 * behind the newer parts of the library code.
 *
 * The Sparse Library is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by the
 * Free Software Foundation, either version 3 of the License, or (at your
 * option) any later version.
 *
 * The Sparse Library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with the Sparse Library. If not, see <http://www.gnu.org/licenses/>.
 */


/*
 * File created by:
 * A. N. Yzelman, Dept. of Mathematics, Utrecht University, 2008.
 */


#include "alignment.hpp"
#include "Triplet.hpp"
#include "SparseMatrix.hpp"

#include <map>
#include <set>
#include <cmath>
#include <vector>
#include <assert.h>
#include <algorithm>

#if defined __INTEL_COMPILER && defined __KNC__
 #include <stdint.h>
 #include <immintrin.h>
 #include <smmintrin.h>
 #include <zmmintrin.h>
#endif

//#define _DEBUG

#ifndef _H_vecBICRS
#define _H_vecBICRS

//iostream is needed unconditionally: load() and addPadding() write warnings to std::cerr
#include <iostream>

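/**
 * Vectorised Bi-directional Incremental Compressed Row Storage (vecBICRS)
 * scheme. Nonzeroes are stored in fixed-size blocks of block_length_row by
 * block_length_column elements, with explicit zeroes added as padding where
 * blocks cannot be filled completely, so that the multiplication kernels
 * below can be fully vectorised.
 *
 * @tparam T                   The type of the nonzero values.
 * @tparam block_length_row    The number of matrix rows covered by one block.
 * @tparam block_length_column The number of block entries per block row.
 * @tparam _i_value            The integer type used to store index increments.
 */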
template< typename T, size_t block_length_row=1, size_t block_length_column=8, typename _i_value=LI >
class vecBICRS: public SparseMatrix< T, ULI > {

  private:

  protected:

    /** The combined block length; the number of nonzeroes per vectorised block. */
    static const size_t block_length = block_length_row * block_length_column;

    /** The array of nonzero values, including the explicit padding zeroes. */
    T* ds;

    /** The array of column increments. */
    _i_value* c_ind;

    /** The array of row increments. */
    _i_value* r_ind;

    /** Remembers the number of bytes used by this data structure. */
    size_t bytes;

    /** The allocated size of the ds and c_ind arrays. */
    size_t allocsize;

    /** The allocated size of the r_ind array, minus one. */
    size_t jumpsize;

    /**
     * Allocates the ds, c_ind, and r_ind arrays according to the current
     * values of allocsize and jumpsize, and updates the bytes counter.
     */
    void allocate() {
#ifdef __INTEL_COMPILER
      ds    = (T*) _mm_malloc( this->allocsize * sizeof( T ), _SL_ALIGN_DOUBLE );
      c_ind = (_i_value*) _mm_malloc( this->allocsize * sizeof( _i_value ), _SL_ALIGN_INT16 );
      r_ind = (_i_value*) _mm_malloc( (this->jumpsize + 1) * sizeof( _i_value ), _SL_ALIGN_INT16 );
#else
      ds    = new T[ this->allocsize ];
      c_ind = new _i_value[ this->allocsize ];
      r_ind = new _i_value[ this->jumpsize + 1 ];
#endif
      bytes = sizeof( _i_value ) * (this->jumpsize + this->allocsize + 1) + sizeof( T ) * this->allocsize;
    }

    /** Frees all memory allocated by allocate(). */
    void deallocate() {
#ifdef __INTEL_COMPILER
      _mm_free( ds );
      _mm_free( c_ind );
      _mm_free( r_ind );
#else
      if( ds != NULL ) delete [] ds;
      if( c_ind != NULL ) delete [] c_ind;
      if( r_ind != NULL ) delete [] r_ind;
#endif
    }

    /**
     * Pads the row increment vector so that its size becomes a multiple of
     * the given blocking size. The padding row is chosen such that it does
     * not occur in the current block of rows, so that the explicit zeroes
     * stored there cannot affect true output values.
     *
     * @param row_increments The row increments recorded so far.
     * @param row2local      Maps global row indices to block-local row indices.
     * @param blockingSize   The multiple to pad towards.
     * @param prev_row       The currently active row (the row the last increment moved to).
     * @param m              The number of matrix rows.
     */
    static void addPadding( std::vector< _i_value > &row_increments, const std::map< size_t, size_t > &row2local, const size_t blockingSize, const size_t prev_row, const size_t m ) {
      if( row_increments.size() % blockingSize != 0 ) {
        //derive the current row index, to prepare for reverse iteration
        size_t base = row2local.rbegin()->first;
        //build a set of row indices in the full current block_length rows
        std::set< size_t > current;
        //get current index in row increments vector
        size_t index = row_increments.size() - 1;
        //walk through current block, in reverse
        while( index % block_length != block_length - 1 ) {
          //DBG
          //std::cout << "Increment at index " << index << " is " << row_increments[ index ] << " and current base equals " << base << std::endl;
          //add this row index
          current.insert( base );
          //derive previous index base
          base -= row_increments[ index ];
          //decrement index
          --index;
        }
        //derive padding index; closest unused row index not in current set
        size_t padding_index = *(current.begin());
        if( padding_index > 0 ) {
          --padding_index;
        }
        const size_t firstguess = padding_index;
        //DBG
        //std::cout << "First guess for padding index: " << firstguess << std::endl;
        //keep incrementing padding index until it is no longer in current row set
        while( current.find( padding_index ) != current.end() ) {
          ++padding_index;
        }
        //check for overflow
        if( padding_index >= m ) {
          //reset index
          padding_index = firstguess;
          //search in the other direction
          while( current.find( padding_index ) != current.end() && padding_index < m ) {
            --padding_index;
          }
          //sanity check
          if( padding_index >= m ) {
            std::cerr << "Matrix too small to be vectorised! Using this matrix in computations may crash due to illegal reads (by one data element after the output vector end). Overallocating the output vector will ensure proper, error-free execution." << std::endl;
          }
        }
        //DBG
        //std::cout << "Final guess for padding index: " << padding_index << ", blocking on boundaries of " << blockingSize << std::endl;
        //add the padding
        row_increments.push_back( padding_index - prev_row );
        //DBG
        //std::cout << "Pushed back: " << row_increments[ row_increments.size() - 1 ] << std::endl;
        //pad the remainder
        while( row_increments.size() % blockingSize != 0 ) {
          //stay on that row
          row_increments.push_back( 0 );
          //DBG
          //std::cout << "Pushed back: " << row_increments[ row_increments.size() - 1 ] << std::endl;
          //sanity check
          assert( row2local.find( padding_index ) == row2local.end() );
        }
      }
    }
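
    /*
     * Worked example for addPadding, assuming blockingSize == block_length == 4:
     * the increments {5, 1, 0} address rows 5, 6, 6 and leave the block one
     * entry short. Rows 5 and 6 are occupied, so row 4 is selected as padding
     * row and the increment 4 - 6 == -2 is appended, after which the size is
     * a multiple of the blocking size and no zero-increments are needed.
     */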

    /**
     * Gathers the nonzeroes of the next block of rows, and appends the
     * corresponding row increments.
     *
     * @param row_indices    Per block-local row, the indices of the nonzeroes on that row (output).
     * @param row_increments The row increments recorded so far (output).
     * @param prev_row       The previously active row; updated on exit.
     * @param input          The sorted input nonzeroes.
     * @param k              The index of the first nonzero of this block.
     * @param m              The number of matrix rows.
     *
     * @return The index of the first nonzero that was not included in this block.
     */
    static size_t prepareBlockIteration(
      std::vector< size_t > * const row_indices,
      std::vector< _i_value > &row_increments,
      _i_value &prev_row,
      const std::vector< Triplet< T > > &input,
      const size_t k,
      const size_t m
    ) {
      std::map< size_t, size_t > row2local; //used to map global row indices to the block row indices
      size_t k_walk = k + 1; //the nonzero iterator
      row2local[ input[ k ].i() ] = 0; //initially, only the row of the first nonzero of this block is in the map.
      row_indices[ 0 ].push_back( k ); //initially, only the first nonzero index is in this block.
      //walk through input as long as we can add nonzeroes to the current block.
      while( k_walk < input.size() ) {
        //current row
        const size_t currow = input[ k_walk ].i();
        //check if the new row is already mapped
        std::map< size_t, size_t >::const_iterator it = row2local.find( currow );
        if( it == row2local.end() ) {
          //new row will be at this block-local location
          const size_t location = row2local.size();
          //if there is no more room for a new row, exit the loop
          if( location == block_length_row ) {
            break;
          }
          //record next nonzero index on this row
          row_indices[ location ].push_back( k_walk );
          //add to map
          row2local[ currow ] = location;
        } else {
          //add this nonzero to local row set
          row_indices[ it->second ].push_back( k_walk );
        }
        //go to next nonzero
        ++k_walk;
      }
      //iterate through all local rows.
      std::map< size_t, size_t >::const_iterator cit = row2local.begin();
      //sanity check; there should be at least one row
      assert( cit != row2local.end() );
      //add increments for each local row
      for( ; cit != row2local.end(); ++cit ) {
        //DBG
        //std::cout << "Current row is " << cit->first << " previous row was " << prev_row << std::endl;
        //record the row increment, relative to the previous row
        row_increments.push_back( cit->first - prev_row );
        //DBG
        //std::cout << "Pushed back " << row_increments[ row_increments.size() - 1 ] << std::endl;
        //update prev_row
        prev_row = cit->first;
      }

      //add padding increments
      addPadding( row_increments, row2local, block_length_row, prev_row, m );

      //check if we are at the end
      if( k_walk == input.size() ) {
        //add final padding to row_increments
        addPadding( row_increments, row2local, block_length, prev_row, m );
      }

      //DBG
      /*std::cout << "blk_indices = {" << std::endl;
      for( size_t blockrow = 0; blockrow < block_length_row; ++blockrow ) {
        std::cout << " ";
        std::vector< size_t >::const_iterator it = row_indices[ blockrow ].begin();
        for( ; it != row_indices[ blockrow ].end(); ++it ) {
          std::cout << *it << " ";
        }
        std::cout << std::endl;
      }
      std::cout << " }" << std::endl;*/
      //return the index of the first nonzero that did not get included into this block
      return k_walk;
    }
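
    /*
     * Example for prepareBlockIteration, assuming block_length_row == 2: for
     * sorted nonzeroes on rows 7, 7, 9, 9, 12, the first block gathers the
     * nonzeroes on rows {7, 9}; the nonzero on row 12 would introduce a third
     * distinct row and therefore starts the next block.
     */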

    /**
     * Rewrites the row increments such that, per block of block_length
     * entries, the first entry holds the jump to the block's base row while
     * the remaining entries hold offsets relative to that base row; this lets
     * the kernels address the output vector via a single base pointer plus
     * the r_ind_buffer offsets. On Xeon Phi targets the entries within each
     * block are additionally permuted into a cyclic distribution.
     */
    void postProcessRowIncrements( std::vector< _i_value > &r_ind ) {
#if defined __INTEL_COMPILER && defined __KNC__
      //check if we vectorise in row direction
      if( block_length_row == 1 ) {
        //no, so no post-processing is required
        return;
      }
      if( block_length_row == 2 ) {
        //assume regular, non-blocked in z-direction, is faster
        return;
      }
#endif

      //sanity check
      assert( r_ind.size() % block_length == 0 );

      //make each block of block_length increments cumulative
      size_t base  = r_ind[ 0 ];
      size_t index = r_ind[ 0 ];
      for( size_t i = 0; i < r_ind.size(); ) {
        //make suitable for vectorisation on output vector
        index += r_ind[ i ];
        r_ind[ i ] = index - base;
        base = index;
        //DBG
        //std::cout << "At row base " << base << ", relative increments are:" << std::endl;
        for( size_t j = 1; j < block_length; ++j ) {
          index += r_ind[ i + j ];
          //DBG
          //std::cout << "\t" << r_ind[ i + j ] << "-->" << (index - base) << " (index presumed to be " << index << ")" << std::endl;
          r_ind[ i + j ] = index - base;
        }
#if defined __INTEL_COMPILER && defined __KNC__
        //permute block for Xeon Phi friendliness, in case of block_length_row > 1
        if( block_length_row > 1 && block_length_row < block_length ) {
          //commit to cyclic distribution with p = block_length_row
          _i_value temp[ block_length ];
          //copy to temporary array
          for( size_t j = 0; j < block_length; ++j ) {
            temp[ j ] = r_ind[ i + j ];
          }
          //commit
          for( size_t s = 0; s < block_length_row; ++s ) {
            for( size_t t = s; t < block_length; t += block_length_row ) {
              r_ind[ i++ ] = temp[ t ];
            }
          }
        } else {
          i += block_length;
        }
#else
        i += block_length;
#endif
      }
    }
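
    /*
     * Worked example for postProcessRowIncrements (non-KNC path), assuming
     * block_length == 4: the increments {2, 1, 0, 1} address rows 2, 3, 3, 4;
     * after the transformation the block reads {2, 1, 1, 2}, i.e., a jump of
     * 2 to base row 2, followed by the offsets of rows 3, 3, 4 relative to
     * that base row.
     */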

  public:

    /** The number of explicitly stored zeroes added as padding to enable vectorisation. */
    size_t fillIn;

    /** Compares two triplets lexicographically, row-major (qsort-style comparator). */
    static int compareTriplets( const void * left, const void * right ) {
      const Triplet< T > one = *( (Triplet< T > *)left );
      const Triplet< T > two = *( (Triplet< T > *)right );
      if( one.i() < two.i() )
        return -1;
      if( one.i() > two.i() )
        return 1;
      if( one.j() < two.j() )
        return -1;
      if( one.j() > two.j() )
        return 1;
      return 0;
    }

    /** Compares two triplet pointers on their column index (qsort-style comparator). */
    static int pColumnSort( const void * left, const void * right ) {
      const Triplet< T >* one = *( (Triplet< T >**)left );
      const Triplet< T >* two = *( (Triplet< T >**)right );
      if( one->j() < two->j() )
        return -1;
      if( one->j() > two->j() )
        return 1;
      return 0;
    }

    /** Compares two rows on the column index of their first triplets (qsort-style comparator). */
    static int pRowSort( const void * left, const void * right ) {
      const Triplet< T >* one = *( (Triplet< T >**)left ); //takes the first triplet of the row
      const Triplet< T >* two = *( (Triplet< T >**)right );
      if( one->j() < two->j() )
        return -1;
      if( one->j() > two->j() )
        return 1;
      return 0;
    }

    /** Base constructor; creates an empty instance. */
    vecBICRS() : ds( NULL ), c_ind( NULL ), r_ind( NULL ) {}

    /**
     * Base constructor which loads a matrix from file.
     * @param file The file to load the matrix from.
     * @param zero Which element is considered to be zero.
     */
    vecBICRS( std::string file, T zero = 0 ) {
      this->loadFromFile( file, zero );
    }

    /**
     * Base constructor which only allocates the internal arrays.
     * @param number_of_nonzeros The number of nonzeroes to allocate for.
     * @param number_of_rows     The number of matrix rows.
     * @param number_of_cols     The number of matrix columns.
     * @param zero               Which element is considered to be zero.
     */
    vecBICRS( const ULI number_of_nonzeros, const ULI number_of_rows, const ULI number_of_cols, T zero ):
      SparseMatrix< T, ULI >( number_of_nonzeros, number_of_cols, number_of_rows, zero ) {
      this->allocsize = this->jumpsize = this->nnz;
      allocate();
    }

    /**
     * Copy constructor.
     * @param toCopy The vecBICRS instance to copy.
     */
    vecBICRS( vecBICRS< T >& toCopy ) {
      //rather straightforward implementation
      this->zero_element = toCopy.zero_element;
      this->nnz = toCopy.nnz;
      this->noc = toCopy.noc;
      this->nor = toCopy.nor;
      this->r_start = toCopy.r_start;
      this->c_start = toCopy.c_start;
      this->allocsize = toCopy.allocsize;
      this->jumpsize = toCopy.jumpsize;
      allocate();
      for( size_t i = 0; i < this->allocsize; ++i ) {
        ds[ i ]    = toCopy.ds[ i ];
        c_ind[ i ] = toCopy.c_ind[ i ];
      }
      for( size_t i = 0; i < this->jumpsize; ++i ) {
        r_ind[ i ] = toCopy.r_ind[ i ];
      }
    }

    /**
     * Constructor which transforms a collection of input triplets to
     * vecBICRS format.
     * @param input The input collection of nonzeroes.
     * @param m     The number of matrix rows.
     * @param n     The number of matrix columns.
     * @param zero  Which element is considered to be zero.
     */
    vecBICRS( std::vector< Triplet< T > >& input, const ULI m, const ULI n, const T zero = 0 ) {
      load( input, m, n, zero );
    }

    /** @see SparseMatrix::load */
    virtual void load( std::vector< Triplet< T > >& input, const ULI m, const ULI n, const T zero ) {
      //set superclass fields
      this->zero_element = zero;
      this->nor = m;
      this->noc = n;

      //sanity checks
      if( log2(this->noc) > sizeof( _i_value ) * 8 ) {
        std::cerr << "Warning: a matrix with " << this->noc << " columns cannot be represented using the " << (sizeof( _i_value )*8) << "-bit index values this vecBICRS instance uses! Attempting to continue anyway..." << std::endl;
      }
      if( log2(this->nor) > sizeof( _i_value ) * 8 ) {
        std::cerr << "Warning: a matrix with " << this->nor << " rows cannot be represented using the " << (sizeof( _i_value )*8) << "-bit index values this vecBICRS instance uses! Attempting to continue anyway..." << std::endl;
      }
      if( m == 0 || n == 0 || input.size() == 0 ) { //empty matrix
        this->nor = this->noc = this->nnz = 0;
        ds    = NULL;
        r_ind = NULL;
        c_ind = NULL;
        return;
      }

      //WARNING: noc*nor typically overflows on 32 bits!
      //         immediately start recording differences
      //         instead of trying to directly calculate
      //         the index as i*noc+j.
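      //(Illustration of the increment idea: with noc == 4, moving from a
      // nonzero at (0,1) to one at (1,2) stores the increment 2 - 1 + noc == 5;
      // any increment that pushes the column pointer past the vector end
      // signals a row change, so the full index i*noc+j is never formed.)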

      //TODO: this sort is what makes this an ICRS implementation; removing the sort will result in BICRS.
      //Complexity depends on the qsort implementation. Probably O(nnz log(nnz)) average, O(nnz^2) worst-case.
      //for standard C++ sort:
      //sort( input.begin(), input.end(), compareTriplets );
      //for C quicksort:
      qsort( &input[ 0 ], input.size(), sizeof( Triplet< T > ), &compareTriplets );
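      //(Note: the std::sort alternative above would require a boolean
      // comparator rather than the qsort-style one; a minimal sketch:
      //   std::sort( input.begin(), input.end(),
      //     []( const Triplet< T > &x, const Triplet< T > &y ) {
      //       return x.i() < y.i() || (x.i() == y.i() && x.j() < y.j());
      //     } );
      // )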

      //filtering out zeros is skipped for now (TODO).

      //per block-local row, the indices of the nonzeroes on that row
      std::vector< size_t > indices[ block_length_row ];

      //vector aliases of the internal flat arrays.
      //The flat arrays will be allocated taking alignment into
      //account; this is why no C++ vectors are used internally.
      std::vector< T > ds;
      std::vector< _i_value > c_ind;
      std::vector< _i_value > r_ind;
      //std::vector< _i_value > r_ind2;

      //nonzero iterator
      size_t k = 0;

      //first block row tracker
      bool first_row = false; //set to false, since the very first row of blocks should not trigger an overflow

      //row tracker
      _i_value prev_row = 0;

      //column tracker
      _i_value prev_col = 0;

      //indicates when there are no more nonzeroes to add
      bool depleted;

      //start getting block rows
      do {
        //get next block indices
        k = prepareBlockIteration( indices, r_ind, prev_row, input, k, this->nor );
        //get iterators
        size_t cit[ block_length_row ];
        for( size_t i = 0; i < block_length_row; ++i ) {
          cit[ i ] = 0;
        }
        //fill up c_ind and ds; loop until there are no more nonzeroes to add
        do {
          //default value
          depleted = true;
          //indicates when a nonzero has been added to the current block
          bool first = true;
          //remember top-left block index
          const size_t top_left = c_ind.size();
          //for each block row
          for( size_t i = 0; i < block_length_row; ++i ) {
            //add a maximum of block_length_column indices
            for( size_t j = 0; j < block_length_column; ++j ) {
              //check if this row is depleted
              if( cit[ i ] == indices[ i ].size() ) {
                //yes; add dummy zeroes
                c_ind.push_back( 0 ); //no column increment (so as not to incur unnecessary cache misses)
                ds.push_back( 0 ); //zero value
                //DBG
                //std::cout << "Adding dummy zeros at block index " << i << ", " << j << std::endl;
              } else {
                //current nonzero index
                const size_t k = indices[ i ][ cit[ i ] ];
                //DBG
                //std::cout << "Column of nonzero " << k << " is " << input[ k ].j() << ", storing increment " << static_cast< _i_value >( input[ k ].j() - prev_col ) << " which is relative to " << prev_col << std::endl;
                //add this nonzero
                c_ind.push_back( input[ k ].j() - prev_col );
                ds   .push_back( input[ k ].value );
                //update prev_col if this was the first nonzero
                if( first ) {
                  prev_col = input[ k ].j();
                  first = false;
                  //if this was not the very first block row, we need to signal an overflow to handle row changes
                  if( first_row ) {
                    //record overflow
                    c_ind[ c_ind.size() - 1 ] += this->noc;
                    //subsequent blocks will not trigger a row change
                    first_row = false;
                  }
                  //if the first nonzero was not on the first row, we need to swap this increment with that of block index 0, 0
                  if( i != 0 ) {
                    std::swap( c_ind[ c_ind.size() - 1 ], c_ind[ top_left ] );
                  }
                }
                //increment this row iterator
                ++cit[ i ];
              }
            }
          }
          //check if depleted
          for( size_t i = 0; i < block_length_row; ++i ) {
            if( cit[ i ] < indices[ i ].size() ) {
              depleted = false;
              break;
            }
          }
        } while( !depleted );
        //clear all vectors
        for( size_t i = 0; i < block_length_row; ++i ) {
          indices[ i ].clear();
        }
        //the first nonzero that follows will be the first nonzero in this row of blocks
        first_row = true;
      } while( k < input.size() );

      //sanity check
      assert( c_ind.size() == ds.size() );

      //check that the block data size is a multiple of block_length
      assert( c_ind.size() % block_length == 0 );

      //postprocess r_ind array
      postProcessRowIncrements( r_ind );

      //coda
      c_ind.push_back( this->noc ); //force overflow
      ds   .push_back( 0 ); //dummy nonzero

      //round up to block size
      for( size_t i = 1; i < block_length; ++i ) {
        c_ind.push_back( 0 ); //dummy increment
        ds   .push_back( 0 ); //dummy nonzero
      }

      //sanity check
      assert( c_ind.size() == ds.size() );

      //check that allocsize will be a multiple of block_length
      assert( c_ind.size() % block_length == 0 );

      //update allocsize
      allocsize = c_ind.size();

      //set the number of nonzeroes
      this->nnz = input.size();

      //calculate wasted space
      const size_t wasted_space = allocsize - this->nnz;
      fillIn = wasted_space;

      //DBG
      //std::cout << "Info: added " << wasted_space << " explicit nonzeroes (" << (100.0*((double)wasted_space)/((double)(this->nnz))) << "%) to enable full vectorisation.\n";

      //update jump size; add one extra block that might be read during zax, but is never used.
      jumpsize = r_ind.size() + block_length;

      //allocate arrays
      allocate();

      //copy row-jump vector
      for( size_t i = 0; i < r_ind.size(); ++i ) {
        //actual copy
        this->r_ind[ i ] = r_ind[ i ];
      }

      //pad row jump vector with zeroes
      for( size_t i = r_ind.size(); i < r_ind.size() + block_length; ++i ) {
        this->r_ind[ i ] = 0;
      }

      //copy column increments vector and nonzero values vector
      for( size_t i = 0; i < c_ind.size(); ++i ) {
        this->c_ind[ i ] = c_ind[ i ];
        this->ds   [ i ] = ds   [ i ];
      }

      //done
    }

    /** @see SparseMatrix::getFirstIndexPair */
    virtual void getFirstIndexPair( ULI &row, ULI &col ) {
      row = static_cast< ULI >( *( this->r_ind ) );
      col = static_cast< ULI >( *( this->c_ind ) );
    }

    /* Gets starting position (first nonzero location)
    void getStartingPos( ULI &row_start, ULI &column_start ) {
      row_start    = static_cast< ULI >( *( this->r_ind ) );
      column_start = static_cast< ULI >( *( this->c_ind ) );
    }*/

    /*
     * Sets the starting position of matrix multiplication.
     * (Useful, for example, when the input/output vectors are
     * shifted before being passed to this class' zax method.)
     *
     * @param row_start New row start location
     * @param column_start New column start location
     *
    void setStartingPos( const ULI row_start, const ULI column_start ) {
      *( this->r_ind ) = row_start;
      *( this->c_ind ) = column_start;
    }*/

    /**
     * Computes z = x * A; x must be of length nor, and z of length noc.
     * @param pDataX The input vector x.
     * @param pDataZ The output vector z.
     */
    virtual void zxa( const T*__restrict__ pDataX, T*__restrict__ pDataZ ) {
      if( this->nor == 0 || this->noc == 0 || this->nnz == 0 ) return;
      T *__restrict__ pDataA = ds;
      const T *__restrict__ pDataAend = ds + this->allocsize - block_length;
      //const T *__restrict__ const pDataXend = pDataX + this->nor;
      const T * const pDataZend = pDataZ + this->noc;
      //const T *pDataZend = z + nor; //unused

      _i_value *__restrict__ pIncRow = r_ind;
      _i_value *__restrict__ pIncCol = c_ind;

      //define buffers
#ifdef __INTEL_COMPILER
      __declspec(align(64)) _i_value c_ind_buffer[ block_length ];
      __declspec(align(64)) T input_buffer[ block_length ];
      __declspec(align(64)) T outputbuffer[ block_length ];
#else
      _i_value c_ind_buffer[ block_length ] __attribute__((aligned));
      T input_buffer[ block_length ] __attribute__((aligned));
      T outputbuffer[ block_length ] __attribute__((aligned));
#endif

      //initialise kernel; fill c_ind_buffer
      for( size_t i = 0; i < block_length; ++i )
        c_ind_buffer[ i ] = *pIncCol++;
      //shift output vector (for zxa, the column increments walk over z)
      pDataZ += c_ind_buffer[ 0 ];
      //reset column increment
      c_ind_buffer[ 0 ] = 0;
      //shift input vector (for zxa, the row increments walk over x)
      pDataX += *pIncRow++;
      //start kernel
      while( pDataA < pDataAend ) {
        //while the row is not empty
        while( pDataZ < pDataZend ) {
          //fill input buffer
          for( size_t i = 0; i < block_length; ++i )
            input_buffer[ i ] = pDataX[ c_ind_buffer[ i ] ];
          //do vectorised multiplication
          for( size_t i = 0; i < block_length; ++i )
            outputbuffer[ i ] = pDataA[ i ] * input_buffer[ i ];
          //reduce into actual output vector
          for( size_t i = 0; i < block_length; ++i )
            *pDataZ += outputbuffer[ i ];
          //shift nonzero data
          pDataA += block_length;
          //fill c_ind_buffer
          for( size_t i = 0; i < block_length; ++i )
            c_ind_buffer[ i ] = *pIncCol++;
          //shift output vector
          pDataZ += c_ind_buffer[ 0 ];
          //reset column increment
          c_ind_buffer[ 0 ] = 0;
        }
        //row jump; shift back output vector
        pDataZ -= this->noc;
        //shift input vector
        pDataX += *pIncRow++;
      }
    }

//KNC version of the generic z=Ax function
#if defined __INTEL_COMPILER && defined __KNC__

    /** Hardcoded z=Ax kernel for 1x8 blocks on the MIC (KNC) architecture. */
    void zax1by8( const T*__restrict__ pDataX, T*__restrict__ pDataZ ) {
      //boundary checks
      if( this->nor == 0 || this->noc == 0 || this->nnz == 0 ) return;
      T *__restrict__ pDataA = ds;

      //pointer aliases
      const T * const pDataAend = ds + this->allocsize - block_length;
      const T * const pDataXend = pDataX + this->noc;
#ifndef NDEBUG
      const T * const pDataZst = pDataZ;
      const T * const pDataXst = pDataX;
      const T * const pDataZend = pDataZ + this->nor;
#endif
      _i_value *__restrict__ pIncRow = r_ind;
      _i_value *__restrict__ pIncCol = c_ind;

      //sanity checks on start position
      assert( static_cast< ULI >( *r_ind ) < this->nor );
      assert( static_cast< ULI >( *c_ind ) < this->noc );

      //define buffers
#ifdef __INTEL_COMPILER
      __declspec(align(32)) _i_value c_ind_buffer[ block_length ];
      __declspec(align(32)) _i_value r_ind_buffer[ block_length ];
      __declspec(align(32)) T input_buffer[ block_length ];
      __declspec(align(32)) T outputbuffer[ block_length ];
#else
      _i_value c_ind_buffer[ block_length ] __attribute__((aligned));
      _i_value r_ind_buffer[ block_length ] __attribute__((aligned));
      T input_buffer[ block_length ] __attribute__((aligned));
      T outputbuffer[ block_length ] __attribute__((aligned));
#endif

      //initialise kernel; fill c_ind_buffer, fill r_ind_buffer, zero output buffer
      for( size_t i = 0; i < block_length; ++i ) {
        c_ind_buffer[ i ] = *pIncCol++;
        if( block_length_row > 1 ) {
          r_ind_buffer[ i ] = *pIncRow++;
        }
        outputbuffer[ i ] = 0; //note: not this->zero_element, since those correspond to matrix `nonzeroes' only
      }
      //shift input vector, output vector
      pDataX += c_ind_buffer[ 0 ];
      pDataZ += *pIncRow++;
      //reset start column index, start row index
      c_ind_buffer[ 0 ] = 0;
      r_ind_buffer[ 0 ] = 0;
      //start kernel
      while( pDataA < pDataAend ) {
        //process row
        while( pDataX < pDataXend ) {
          //sanity checks
          assert( pDataA < pDataAend );
          assert( pDataX >= pDataXst );
          assert( pDataX < pDataXend );
          assert( pDataZ >= pDataZst );
          assert( pDataZ < pDataZend );
          assert( pDataX + this->noc >= pDataXend ); //otherwise pDataX is before the start of x!
          assert( pDataA + this->allocsize >= pDataAend );
          assert( pDataZ + this->nor >= pDataZend );
#ifdef _DEBUG
          std::cout << "Input vector at " << ((pDataXend-pDataX)<0 ? (pDataX-pDataXend) : (this->noc+pDataX-pDataXend)) << std::endl;
          std::cout << "Output vector at " << (this->nor-(pDataZend-pDataZ)) << std::endl;
          std::cout << "Now at block nr. " << (this->allocsize-block_length-(pDataAend-pDataA))/block_length << std::endl;
#endif
          //DBG
          /*std::cout << "Input buffer { ";
          for( size_t j = 0; pIncCol != (c_ind+block_length) && j < block_length; ++j ) {
            std::cout << input_buffer[ j ] << " ";
          }
          std::cout << "} --> { ";*/

          //fill input buffer
          for( size_t i = 0; i < block_length; ++i ) {
            input_buffer[ i ] = pDataX[ c_ind_buffer[ i ] ];
          }

          //DBG
          /*for( size_t j = 0; j < block_length; ++j ) {
            std::cout << input_buffer[ j ] << " ";
          }
          std::cout << "}" << std::endl;*/

          //do vectorised multiplication
          for( size_t i = 0; i < block_length; ++i ) {
            outputbuffer[ i ] += pDataA[ i ] * input_buffer[ i ];
          }
          //shift nonzero data
          pDataA += block_length;
          //fill c_ind_buffer
          for( size_t i = 0; i < block_length; ++i ) {
            c_ind_buffer[ i ] = *pIncCol++;
          }
          //shift input vector
          pDataX += c_ind_buffer[ 0 ];
          //reset start column index
          c_ind_buffer[ 0 ] = 0;
        }
        //reduce all elements of outputbuffer into the output vector
        for( size_t i = 0; i < block_length; ++i ) {
          //for the 1x8 MIC case, z_shadow is not used:
          //z_shadow[ blockrow ] += outputbuffer[ i ];
          //instead do direct write-back:
          *pDataZ += outputbuffer[ i ];
        }
        //in the MIC 1x8 case, no row vectorisation is employed; shift output vector
        pDataZ += *pIncRow++;
        //zero out output buffer
        for( size_t i = 0; i < block_length; ++i ) {
          outputbuffer[ i ] = 0;
        }
        //shift back input vector
        pDataX -= this->noc;
      }
      //DBG
      //std::cout << "Goodbye from compiler-optimised hardcoded 1x8 SpMV kernel (generic MIC version)." << std::endl;
    }


    /**
     * Computes z = z + A*x; x must be of length noc, and z of length nor.
     * @param pDataX The input vector x.
     * @param pDataZ The output vector z.
     */
    virtual void zax( const T*__restrict__ pDataX, T*__restrict__ pDataZ ) {
      //there is a template specialisation for the MIC 1x8 code:
      if( block_length_row == 1 ) {
        zax1by8( pDataX, pDataZ );
        return;
      }
      //boundary checks
      if( this->nor == 0 || this->noc == 0 || this->nnz == 0 ) return;
      T *__restrict__ pDataA = ds;

      //pointer aliases
      const T * const pDataAend = ds + this->allocsize - block_length;
      const T * const pDataXend = pDataX + this->noc;
#ifndef NDEBUG
      const T * const pDataZst = pDataZ;
      const T * const pDataXst = pDataX;
      const T * const pDataZend = pDataZ + this->nor;
#endif
      _i_value *__restrict__ pIncRow = r_ind;
      _i_value *__restrict__ pIncCol = c_ind;

      //sanity checks on start position
      assert( static_cast< ULI >( *r_ind ) < this->nor );
      assert( static_cast< ULI >( *c_ind ) < this->noc );

      //define buffers
#ifdef __INTEL_COMPILER
      __declspec(align(32)) _i_value c_ind_buffer[ block_length ];
      __declspec(align(32)) _i_value r_ind_buffer[ block_length ];
      __declspec(align(32)) T input_buffer[ block_length ];
      __declspec(align(32)) T outputbuffer[ block_length ];
      __declspec(align(32)) T z_shadow   [ block_length ];
#else
      _i_value c_ind_buffer[ block_length ] __attribute__((aligned));
      _i_value r_ind_buffer[ block_length ] __attribute__((aligned));
      T input_buffer[ block_length ] __attribute__((aligned));
      T outputbuffer[ block_length ] __attribute__((aligned));
      T z_shadow   [ block_length ] __attribute__((aligned));
#endif

      //initialise kernel; fill c_ind_buffer, fill r_ind_buffer, zero output buffer
      for( size_t i = 0; i < block_length; ++i ) {
        c_ind_buffer[ i ] = *pIncCol++;
        if( block_length_row > 1 ) {
          r_ind_buffer[ i ] = *pIncRow++;
        }
        outputbuffer[ i ] = 0; //note: not this->zero_element, since those correspond to matrix `nonzeroes' only
      }
      //shift input vector, output vector
      pDataX += c_ind_buffer[ 0 ];
      pDataZ += r_ind_buffer[ 0 ];
      //reset start column index, start row index
      c_ind_buffer[ 0 ] = 0;
      r_ind_buffer[ 0 ] = 0;
      //keep track of z_shadow usage
      size_t blockrow = 0;
      //fill z_shadow
      for( size_t i = 0; i < block_length; ++i ) {
        z_shadow[ i ] = pDataZ[ r_ind_buffer[ i ] ];
      }
      //start kernel
      while( pDataA < pDataAend ) {
        //process row
        while( pDataX < pDataXend ) {
          //sanity checks
          assert( pDataA < pDataAend );
          assert( pDataX >= pDataXst );
          assert( pDataX < pDataXend );
          assert( pDataZ >= pDataZst );
          assert( pDataZ < pDataZend );
          assert( pDataX + this->noc >= pDataXend ); //otherwise pDataX is before the start of x!
          assert( pDataA + this->allocsize >= pDataAend );
          assert( pDataZ + this->nor >= pDataZend );
#ifdef _DEBUG
          std::cout << "Input vector at " << ((pDataXend-pDataX)<0 ? (pDataX-pDataXend) : (this->noc+pDataX-pDataXend)) << std::endl;
          std::cout << "Output vector at " << (this->nor-(pDataZend-pDataZ)) << std::endl;
          std::cout << "Now at block nr. " << (this->allocsize-block_length-(pDataAend-pDataA))/block_length << std::endl;
#endif
          //DBG
          /*std::cout << "Input buffer { ";
          for( size_t j = 0; pIncCol != (c_ind+block_length) && j < block_length; ++j ) {
            std::cout << input_buffer[ j ] << " ";
          }
          std::cout << "} --> { ";*/

          //fill input buffer
          for( size_t i = 0; i < block_length; ++i ) {
            input_buffer[ i ] = pDataX[ c_ind_buffer[ i ] ];
          }

          //DBG
          /*for( size_t j = 0; j < block_length; ++j ) {
            std::cout << input_buffer[ j ] << " ";
          }
          std::cout << "}" << std::endl;*/

          //do vectorised multiplication
          for( size_t i = 0; i < block_length; ++i ) {
            outputbuffer[ i ] += pDataA[ i ] * input_buffer[ i ];
          }
          //shift nonzero data
          pDataA += block_length;
          //fill c_ind_buffer
          for( size_t i = 0; i < block_length; ++i ) {
            c_ind_buffer[ i ] = *pIncCol++;
          }
          //shift input vector
          pDataX += c_ind_buffer[ 0 ];
          //reset start column index
          c_ind_buffer[ 0 ] = 0;
        }
        switch( block_length_row ) {
          //check if we are in an all-reduce situation
          case 1:
            //in this case we should be in a template specialisation
            assert( false );
            break;
          //check if we are in a no-reduce situation
          case block_length:
            //yes, just add all elements into z_shadow
            for( size_t i = 0; i < block_length; ++i ) {
              z_shadow[ i ] += outputbuffer[ i ];
            }
            break;
          //default case, partial reductions
          default:
            assert( block_length_row > 1 );
            //for each row in this vectorised block
            for( size_t b = 0; b < block_length_row; ++b ) {
              //get relative output vector index
              const size_t z_shadow_index = b * block_length_column + blockrow;
              //reduce into z_shadow
              for( size_t i = 0; i < block_length_column; ++i ) {
                //get outputbuffer index
                const size_t o_buffer_index = b * block_length_column + i;
                //reduce
                z_shadow[ z_shadow_index ] += outputbuffer[ o_buffer_index ];
              }
            }
        }
        //we filled another block of rows of z_shadow
        ++blockrow;

        //zero out output buffer
        for( size_t i = 0; i < block_length; ++i ) {
          outputbuffer[ i ] = 0;
        }
        //shift back input vector
        pDataX -= this->noc;

        //check if z_shadow is full
        if( blockrow == block_length_column ) {
          //write back z_shadow
          for( size_t i = 0; i < block_length; ++i ) {
            pDataZ[ r_ind_buffer[ i ] ] = z_shadow[ i ];
            //DBG
            /*std::cout << "Write back z_shadow[ " << i << " ] = "
              << z_shadow[ i ] << " to output vector element at position "
              << (this->nor-(pDataZend-pDataZ)+r_ind_buffer[i]) << std::endl;*/
          }
          //load new r_ind_buffer
          for( size_t i = 0; i < block_length; ++i ) {
            r_ind_buffer[ i ] = *pIncRow++;
          }
          //shift output vector
          pDataZ += r_ind_buffer[ 0 ];
          //reset start row index
          r_ind_buffer[ 0 ] = 0;
          //DBG
          /*std::cout << "r_ind_buffer = { ";
          for( size_t i = 0; i < block_length; ++i ) {
            std::cout << r_ind_buffer[ i ] << " ";
          }
          std::cout << "}" << std::endl;*/
          //read new z_shadow
          for( size_t i = 0; i < block_length; ++i ) {
            z_shadow[ i ] = pDataZ[ r_ind_buffer[ i ] ];
          }
          //reset block row
          blockrow = 0;
        }
      }
      //coda: write back z_shadow in case there is unwritten data there
      for( size_t i = 0; i < block_length; ++i ) {
        //write back
        pDataZ[ r_ind_buffer[ i ] ] = z_shadow[ i ];
        //DBG
        //std::cout << "Write back z_shadow[ " << i << " ] = "
        //	<< z_shadow[ i ] << " to output vector element at position "
        //	<< (this->nor-(pDataZend-pDataZ)+r_ind_buffer[i]) << " (coda)" << std::endl;
      }
      //DBG
      //std::cout << "Goodbye from compiler-optimised " << block_length_row << "-by-" << block_length_column << " SpMV kernel." << std::endl;
    }
//generic version for non-Xeon Phi targets
#else

    /**
     * Computes z = z + A*x; x must be of length noc, and z of length nor.
     * @param pDataX The input vector x.
     * @param pDataZ The output vector z.
     */
    virtual void zax( const T*__restrict__ pDataX, T*__restrict__ pDataZ ) {
      //boundary checks
      if( this->nor == 0 || this->noc == 0 || this->nnz == 0 ) return;
      T *__restrict__ pDataA = ds;

      //pointer aliases
      const T * const pDataAend = ds + this->allocsize - block_length;
      const T * const pDataXend = pDataX + this->noc;
#ifndef NDEBUG
      const T * const pDataZst = pDataZ;
      const T * const pDataXst = pDataX;
      const T * const pDataZend = pDataZ + this->nor;
#endif
      _i_value *__restrict__ pIncRow = r_ind;
      _i_value *__restrict__ pIncCol = c_ind;

      //sanity checks on start position
      assert( static_cast< ULI >( *r_ind ) < this->nor );
      assert( static_cast< ULI >( *c_ind ) < this->noc );

      //define buffers
#ifdef __INTEL_COMPILER
      __declspec(align(32)) _i_value c_ind_buffer[ block_length ];
      __declspec(align(32)) _i_value r_ind_buffer[ block_length ];
      __declspec(align(32)) T input_buffer[ block_length ];
      __declspec(align(32)) T outputbuffer[ block_length ];
      __declspec(align(32)) T z_shadow   [ block_length ];
#else
      _i_value c_ind_buffer[ block_length ] __attribute__((aligned));
      _i_value r_ind_buffer[ block_length ] __attribute__((aligned));
      T input_buffer[ block_length ] __attribute__((aligned));
      T outputbuffer[ block_length ] __attribute__((aligned));
      T z_shadow   [ block_length ] __attribute__((aligned));
#endif

      //initialise kernel; fill c_ind_buffer, fill r_ind_buffer, zero output buffer
      for( size_t i = 0; i < block_length; ++i ) {
        c_ind_buffer[ i ] = *pIncCol++;
        r_ind_buffer[ i ] = *pIncRow++;
        outputbuffer[ i ] = 0; //note: not this->zero_element, since those correspond to matrix `nonzeroes' only
      }
      //shift input vector, output vector
      pDataX += c_ind_buffer[ 0 ];
      pDataZ += r_ind_buffer[ 0 ];
      //reset start column index, start row index
      c_ind_buffer[ 0 ] = 0;
      r_ind_buffer[ 0 ] = 0;
      //keep track of z_shadow usage
      size_t blockrow = 0;
      //fill z_shadow
      for( size_t i = 0; i < block_length; ++i ) {
        z_shadow[ i ] = pDataZ[ r_ind_buffer[ i ] ];
      }
      //start kernel
      while( pDataA < pDataAend ) {
        //process row
        while( pDataX < pDataXend ) {
          //sanity checks
          assert( pDataA < pDataAend );
          assert( pDataX >= pDataXst );
          assert( pDataX < pDataXend );
          assert( pDataZ >= pDataZst );
          assert( pDataZ < pDataZend );
          assert( pDataX + this->noc >= pDataXend ); //otherwise pDataX is before the start of x!
          assert( pDataA + this->allocsize >= pDataAend );
          assert( pDataZ + this->nor >= pDataZend );
#ifdef _DEBUG
          std::cout << "Input vector at " << ((pDataXend-pDataX)<0 ? (pDataX-pDataXend) : (this->noc+pDataX-pDataXend)) << std::endl;
          std::cout << "Output vector at " << (this->nor-(pDataZend-pDataZ)) << std::endl;
          std::cout << "Now at block nr. " << (this->allocsize-block_length-(pDataAend-pDataA))/block_length << std::endl;
#endif
          //DBG
          /*std::cout << "Input buffer { ";
          for( size_t j = 0; pIncCol != (c_ind+block_length) && j < block_length; ++j ) {
            std::cout << input_buffer[ j ] << " ";
          }
          std::cout << "} --> { ";*/

          //fill input buffer
          for( size_t i = 0; i < block_length; ++i ) {
            input_buffer[ i ] = pDataX[ c_ind_buffer[ i ] ];
          }

          //DBG
          /*for( size_t j = 0; j < block_length; ++j ) {
            std::cout << input_buffer[ j ] << " ";
          }
          std::cout << "}" << std::endl;*/

          //do vectorised multiplication
          for( size_t i = 0; i < block_length; ++i ) {
            outputbuffer[ i ] += pDataA[ i ] * input_buffer[ i ];
          }
          //shift nonzero data
          pDataA += block_length;
          //fill c_ind_buffer
          for( size_t i = 0; i < block_length; ++i ) {
            c_ind_buffer[ i ] = *pIncCol++;
          }
          //shift input vector
          pDataX += c_ind_buffer[ 0 ];
          //reset start column index
          c_ind_buffer[ 0 ] = 0;
        }
        //for each row in this vectorised block
        for( size_t b = 0; b < block_length_row; ++b ) {
          //reduce block rows into z_shadow
          for( size_t i = 0; i < block_length_column; ++i ) {
            z_shadow[ b + block_length_row * blockrow ] += outputbuffer[ b * block_length_column + i ];
            //DBG
            /*std::cout << "z_shadow at " << (b + block_length_row * blockrow) <<
              " is added with outputbuffer at " << (b * block_length_column + i) <<
              " = " << outputbuffer[ b * block_length_column + i ] << std::endl;
            std::cout << "z_shadow = { ";
            for( size_t j = 0; j < block_length; ++j ) {
              std::cout << z_shadow[ j ] << " ";
            }
            std::cout << "}" << std::endl;*/
          }
        }
        //we filled another block of rows of z_shadow
        ++blockrow;

        //zero out output buffer
        for( size_t i = 0; i < block_length; ++i ) {
          outputbuffer[ i ] = 0;
        }
        //shift back input vector
        pDataX -= this->noc;

        //check if z_shadow is full
        if( blockrow == block_length_column ) {
          //write back z_shadow
          for( size_t i = 0; i < block_length; ++i ) {
            pDataZ[ r_ind_buffer[ i ] ] = z_shadow[ i ];
            //DBG
            /*std::cout << "Write back z_shadow[ " << i << " ] = "
              << z_shadow[ i ] << " to output vector element at position "
              << (this->nor-(pDataZend-pDataZ)+r_ind_buffer[i]) << std::endl;*/
          }
          //load new r_ind_buffer
          for( size_t i = 0; i < block_length; ++i ) {
            r_ind_buffer[ i ] = *pIncRow++;
          }
          //shift output vector
          pDataZ += r_ind_buffer[ 0 ];
          //reset start row index
          r_ind_buffer[ 0 ] = 0;
          //DBG
          /*std::cout << "r_ind_buffer = { ";
          for( size_t i = 0; i < block_length; ++i ) {
            std::cout << r_ind_buffer[ i ] << " ";
          }
          std::cout << "}" << std::endl;*/
          //read new z_shadow
          for( size_t i = 0; i < block_length; ++i ) {
            z_shadow[ i ] = pDataZ[ r_ind_buffer[ i ] ];
          }
          //reset block row
          blockrow = 0;
        }
      }
      //coda: write back z_shadow in case there is unwritten data there
      for( size_t i = 0; i < block_length; ++i ) {
        //write back
        pDataZ[ r_ind_buffer[ i ] ] = z_shadow[ i ];
        //DBG
        //std::cout << "Write back z_shadow[ " << i << " ] = "
        //	<< z_shadow[ i ] << " to output vector element at position "
        //	<< (this->nor-(pDataZend-pDataZ)+r_ind_buffer[i]) << " (coda)" << std::endl;
      }
      //DBG
      //std::cout << "Goodbye from compiler-optimised " << block_length_row << "-by-" << block_length_column << " SpMV kernel." << std::endl;
    }
#endif

    /** Base destructor; frees all memory allocated by this instance. */
    virtual ~vecBICRS() {
      deallocate();
    }

    /** @return The number of bytes used by this storage scheme. */
    virtual size_t bytesUsed() {
      return bytes;
    }

};
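
/*
 * A minimal usage sketch (illustrative only; it assumes the three-argument
 * Triplet( i, j, value ) constructor from Triplet.hpp). The output vector is
 * overallocated by block_length, as recommended by the warning in addPadding.
 */
inline void vecBICRS_usage_example() {
  std::vector< Triplet< double > > triplets;
  triplets.push_back( Triplet< double >( 0, 0, 1.0 ) );
  triplets.push_back( Triplet< double >( 1, 2, 2.0 ) );
  triplets.push_back( Triplet< double >( 3, 1, 3.0 ) );
  //build a 4x4 matrix using the default 1x8 blocking
  vecBICRS< double > A( triplets, 4, 4, 0.0 );
  std::vector< double > x( 4, 1.0 );
  std::vector< double > z( 4 + 8, 0.0 ); //overallocated output vector
  A.zax( &x[ 0 ], &z[ 0 ] );             //z = z + A * x
}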

//derived from vecBICRS_MIC_template
#if defined __INTEL_COMPILER && defined __KNC__

//*** New implementation for gather-prefetch: ***//
#define _mm512_prefetch_i32gather_pd_0hint_init() \
	register __mmask8 fullMask asm("sil") = 0xff;

#define _mm512_prefetch_i32gather_pd_0hint( index, mv ) \
__asm__ ( \
	"vgatherpf0hintdpd (%1, %0, 8) {{%2}}\n" \
	: /* nothing */ \
	: "x" (index), "g" (mv), "r" (fullMask) \
	: /* nothing */ \
)
//*** end implementation for gather-prefetch ***//

//*** Old implementation for gather-prefetch: ***//
//__mmask8 fullMask = 0xff;

/*#define _mm512_prefetch_i32gather_pd_0hint( index, mv ) \
__asm__ ( \
	"vgatherpf0hintdpd (%1, %0, 8) {{%2}}\n" \
	: \
	: "x" (index), "g" (mv), "" (fullMask) \
	: \
	)*/
//*** end implementation for gather-prefetch ***//

/** Union to directly access the contents of an __m512d register. */
union __m512d_union {
  __m512d internal;
  double array[ 8 ];
};

/** Union to directly access the contents of an __m512i register. */
union __m512i_union {
  __m512i internal;
  int32_t array[ 16 ];
};

//16-bit integer indices specialisation for double data types, 8x1 blocking:
template<>
inline void vecBICRS< double, 8, 1, int16_t >::allocate() {
  ds    = (double*) _mm_malloc( this->allocsize * sizeof( double ), _SL_ALIGN_DOUBLE );
  c_ind = (int16_t*) _mm_malloc( this->allocsize * sizeof( int16_t ), _SL_ALIGN_INT16 );
  r_ind = (int16_t*) _mm_malloc( (this->jumpsize + 1) * sizeof( int16_t ), _SL_ALIGN_INT16 );
  bytes = sizeof( int16_t ) * (this->jumpsize + this->allocsize + 1) + sizeof( double ) * this->allocsize;
}

//16-bit integer indices specialisation for double data types, 2x4 blocking:
template<>
inline void vecBICRS< double, 2, 4, int16_t >::allocate() {
  ds    = (double*) _mm_malloc( this->allocsize * sizeof( double ), _SL_ALIGN_DOUBLE );
  c_ind = (int16_t*) _mm_malloc( this->allocsize * sizeof( int16_t ), _SL_ALIGN_INT16 );
  r_ind = (int16_t*) _mm_malloc( (this->jumpsize + 1) * sizeof( int16_t ), _SL_ALIGN_INT16 );
  bytes = sizeof( int16_t ) * (this->jumpsize + this->allocsize + 1) + sizeof( double ) * this->allocsize;
}

//16-bit integer indices specialisation for double data types, 4x2 blocking:
template<>
inline void vecBICRS< double, 4, 2, int16_t >::allocate() {
  ds    = (double*) _mm_malloc( this->allocsize * sizeof( double ), _SL_ALIGN_DOUBLE );
  c_ind = (int16_t*) _mm_malloc( this->allocsize * sizeof( int16_t ), _SL_ALIGN_INT16 );
  r_ind = (int16_t*) _mm_malloc( (this->jumpsize + 1) * sizeof( int16_t ), _SL_ALIGN_INT16 );
  bytes = sizeof( int16_t ) * (this->jumpsize + this->allocsize + 1) + sizeof( double ) * this->allocsize;
}

//16-bit integer indices specialisation for double data types (the blocking
//parameters of this specialisation were lost; the 1x8 default is presumed):
template<>
inline void vecBICRS< double, 1, 8, int16_t >::allocate() {
  ds    = (double*) _mm_malloc( this->allocsize * sizeof( double ), _SL_ALIGN_DOUBLE );
  c_ind = (int16_t*) _mm_malloc( this->allocsize * sizeof( int16_t ), _SL_ALIGN_INT16 );
  r_ind = (int16_t*) _mm_malloc( (this->jumpsize + 1) * sizeof( int16_t ), _SL_ALIGN_INT16 );
  bytes = sizeof( int16_t ) * (this->jumpsize + this->allocsize + 1) + sizeof( double ) * this->allocsize;
}

//32-bit integer indices specialisation for double data types (the blocking
//parameters of this specialisation were lost; the 1x8 default is presumed):
template<>
inline void vecBICRS< double, 1, 8, int32_t >::allocate() {
  ds    = (double*) _mm_malloc( this->allocsize * sizeof( double ), _SL_ALIGN_DOUBLE );
  c_ind = (int32_t*) _mm_malloc( this->allocsize * sizeof( int32_t ), _SL_ALIGN_INT32 );
  r_ind = (int32_t*) _mm_malloc( (this->jumpsize + 1) * sizeof( int32_t ), _SL_ALIGN_INT32 );
  bytes = sizeof( int32_t ) * (this->jumpsize + this->allocsize + 1) + sizeof( double ) * this->allocsize;
}

//start 16-bit integer indices specialisation for double data types, 1x8 blocks.
template<>
inline void vecBICRS< double, 1, 8, int16_t >::zax( const double *__restrict__ pDataX, double *__restrict__ pDataZ ) {

  //boundary checks
  if( this->nor == 0 || this->noc == 0 || this->nnz == 0 ) {
    return;
  }

  //initialise prefetch macro
  _mm512_prefetch_i32gather_pd_0hint_init();

  //pointer aliases
  const double * const pDataAend = ds + this->allocsize - block_length;
  const double * const pDataXend = pDataX + this->noc;
  double *__restrict__ pDataA = ds;
  int16_t *__restrict__ pIncRow = r_ind;
  int16_t *__restrict__ pIncCol = c_ind;

  //define buffers
  __m512i c_ind_buffer;
  __m512d input_buffer;
  __m512d value_buffer;
  __m512d outputbuffer;
  __m512i zeroF = _mm512_set_epi32(
    1, 1, 1, 1, 1, 1, 1, 0,
    1, 1, 1, 1, 1, 1, 1, 0
  );

  //initialise kernel
  outputbuffer = _mm512_setzero_pd();

  //fill c_ind_buffer; NOTE: one load reads 16 increments, i.e., 2 blocks!
  c_ind_buffer = _mm512_extload_epi32( pIncCol, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );

  //shift input vector
  pDataX += *pIncCol;

  //move pIncCol up one block
  pIncCol += block_length;

  //reset start column index; alternatively, do this on the 16-bit data earlier
  c_ind_buffer = _mm512_mullo_epi32( c_ind_buffer, zeroF ); //i.e., c_ind_buffer[ 0 ] = 0;

  //prefetch input
  _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );

  //shift output vector
  pDataZ += *pIncRow++;

  //start kernel
  while( pDataA < pDataAend ) {

    //process row
    while( pDataX < pDataXend ) {

      //manual unroll, stage 1: fill input buffer
      input_buffer = _mm512_i32loextgather_pd( c_ind_buffer, pDataX, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );

      //fill nonzero buffer
      value_buffer = _mm512_load_pd( pDataA );

      //do vectorised multiply-add
      outputbuffer = _mm512_fmadd_pd( value_buffer, input_buffer, outputbuffer );

      //shift nonzero data
      pDataA += block_length;

      //shift c_ind_buffer
      c_ind_buffer = _mm512_permute4f128_epi32( c_ind_buffer, _MM_PERM_DCDC );

      //shift input vector
      pDataX += *pIncCol;

      //prefetch input
      _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );

      //move pIncCol up
      pIncCol += block_length;

      //check for break
      if( pDataX >= pDataXend ) {

        //reduce row contribution
        *pDataZ += _mm512_reduce_add_pd( outputbuffer );

        //reset sums buffer
        outputbuffer = _mm512_setzero_pd();

        //row jump; shift back input vector
        pDataX -= this->noc;

        //shift output vector
        pDataZ += *pIncRow++;

        //check for end of matrix
        if( pDataA >= pDataAend ) {
          break;
        }
      }

      //manual unroll, stage 2: fill input buffer
      input_buffer = _mm512_i32loextgather_pd( c_ind_buffer, pDataX, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );

      //fill value buffer
      value_buffer = _mm512_load_pd( pDataA );

      //do vectorised multiply-add
      outputbuffer = _mm512_fmadd_pd( value_buffer, input_buffer, outputbuffer );

      //shift nonzero data
      pDataA += block_length;

      //fill c_ind_buffer; NOTE: one load reads 16 increments, i.e., 2 blocks!
      c_ind_buffer = _mm512_extload_epi32( pIncCol, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );

      //shift input vector
      pDataX += *pIncCol;

      //move pIncCol up
      pIncCol += block_length;

      //reset start column index; alternatively, do this on the 16-bit data earlier
      c_ind_buffer = _mm512_mullo_epi32( c_ind_buffer, zeroF ); //i.e., c_ind_buffer[ 0 ] = 0;

      //prefetch input
      _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );

    }

    //reduce row contribution
    *pDataZ += _mm512_reduce_add_pd( outputbuffer );

    //reset sums buffer
    outputbuffer = _mm512_setzero_pd();

    //row jump; shift back input vector
    pDataX -= this->noc;

    //shift output vector
    pDataZ += *pIncRow++;
  }
}
//end 16-bit integer indices specialisation for double data types, 1x8 blocks.

1500  //start 16-byte unsigned integer indices specialisation for double data types, 2x4 blocks.
1501  void vecBICRS< double, 2, 4, int16_t >::zax( const double *__restrict__ pDataX, double *__restrict__ pDataZ ) {
1502  //boundary checks
1503  if( this->nor == 0 || this->noc == 0 || this->nnz == 0 ) {
1504  return;
1505  }
1506 
1507  //initialise prefetch macro
1508  _mm512_prefetch_i32gather_pd_0hint_init();
1509 
1510  //DBG
1511  //const double * const pDataXst = pDataX;
1512  //const double * const pDataZst = pDataZ;
1513 
1514  //pointer aliases
1515  const double * const pDataAend = ds + this->allocsize - block_length;
1516  const double * const pDataXend = pDataX + this->noc;
1517  double *__restrict__ pDataA = ds;
1518  int16_t *__restrict__ pIncRow = r_ind;
1519  int16_t *__restrict__ pIncCol = c_ind;
1520 
1521  //DBG
1522  //std::cout << "x start: " << pDataX << ", x end: " << pDataXend << std::endl;
1523 
1524  //define buffers
1525  __m512i c_ind_buffer;
1526  __m512d input_buffer;
1527  __m512d value_buffer;
1528  __m512d_union outputbuffer;
1529  __m512i zeroF = _mm512_set_epi32(
1530  1, 1, 1, 1, 1, 1, 1, 0,
1531  1, 1, 1, 1, 1, 1, 1, 0
1532  );
1533 
1534  //initialise kernel
1535  outputbuffer.internal = _mm512_setzero_pd();
1536 
1537  //fill c_ind_buffer NOTE: one load reads 16 increments; i.e., 2 blocks!
1538  c_ind_buffer = _mm512_extload_epi32( pIncCol, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
1539 
1540  //DBG
1541  /*int16_t *dbgd = (int16_t*)pIncCol;
1542  std::cout << "c_ind_buffer= " << pDataX << ": "
1543  << dbgd[ 0 ] << " "
1544  << dbgd[ 1 ] << " "
1545  << dbgd[ 2 ] << " "
1546  << dbgd[ 3 ] << " "
1547  << dbgd[ 4 ] << " "
1548  << dbgd[ 5 ] << " "
1549  << dbgd[ 6 ] << " "
1550  << dbgd[ 7 ] << " "
1551  << std::endl;*/
1552 
1553  //shift input vector
1554  pDataX += *pIncCol;
1555 
1556  //move pIncCol up one block
1557  pIncCol += block_length;
1558 
1559  //reset start column index; alternatively, do this on 16-bit data earlier
1560  c_ind_buffer = _mm512_mullo_epi32( c_ind_buffer, zeroF ); //i.e., c_ind_buffer[ 0 ] = 0;
1561 
1562  //prefetch input
1563  _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
1564 
1565  //shift output vector
1566  pDataZ += *pIncRow++;
1567 
1568  //DBG
1569  //std::cout << "Entering loop..." << std::endl;
1570 
1571  //start kernel
1572  while( pDataA < pDataAend ) {
1573 
1574  //process row
1575  while( pDataX < pDataXend ) {
1576 
1577  //DBG
1578  /*std::cout << "In loop, stage 1, A-E: " << std::endl;
1579  __declspec(align(64)) int32_t dbg[ 16 ];
1580  _mm512_store_epi32( dbg, c_ind_buffer );
1581  std::cout << "Grabbing from " << pDataX << " ( " << (pDataX-pDataXst) << " ): "
1582  << dbg[ 0 ] << " "
1583  << dbg[ 1 ] << " "
1584  << dbg[ 2 ] << " "
1585  << dbg[ 3 ] << " "
1586  << dbg[ 4 ] << " "
1587  << dbg[ 5 ] << " "
1588  << dbg[ 6 ] << " "
1589  << dbg[ 7 ] << " "
1590  << std::endl;*/
1591 
1592  //manual unroll, stage 1: fill input buffers
1593  input_buffer = _mm512_i32loextgather_pd( c_ind_buffer, pDataX, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
1594 
1595  //DBG
1596  //std::cout << "A" << std::endl;
1597 
1598  //fill nonzero buffer
1599  value_buffer = _mm512_load_pd( pDataA );
1600 
1601  //DBG
1602  //std::cout << "B" << std::endl;
1603 
1604  //do vectorised multiply-add
1605  outputbuffer.internal = _mm512_fmadd_pd( value_buffer, input_buffer, outputbuffer.internal );
1606 
1607  //DBG
1608  //std::cout << "C" << std::endl;
1609 
1610  //shift input data
1611  pDataA += block_length;
1612 
1613  //DBG
1614  //std::cout << "D" << std::endl;
1615 
1616  //shift c_ind_buffer
1617  c_ind_buffer = _mm512_permute4f128_epi32( c_ind_buffer, _MM_PERM_DCDC );
1618 
1619  //DBG
1620  //std::cout << "E" << std::endl;
1621 
1622  //shift input vector
1623  pDataX += *pIncCol;
1624 
1625  //prefetch input
1626  _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
1627 
1628  //move pIncCol up
1629  pIncCol += block_length;
1630 
1631  //DBG
1632  //std::cout << "Unroll I done" << std::endl;
1633 
1634  //check for break
1635  if( pDataX >= pDataXend ) {
1636 
1637  //reduce half of row contribution
1638  const __m512d reduce_buffer1 = _mm512_swizzle_pd( outputbuffer.internal, _MM_SWIZ_REG_CDAB );
1639  outputbuffer.internal = _mm512_add_pd( outputbuffer.internal, reduce_buffer1 );
1640  //(output buffer now contains HG GH FE EF DC CD BA AB)
1641  const __m512d reduce_buffer2 = _mm512_swizzle_pd( outputbuffer.internal, _MM_SWIZ_REG_BADC );
1642  outputbuffer.internal = _mm512_add_pd( outputbuffer.internal, reduce_buffer2 );
1643  //(output buffer now contains HGFE GHEF FEHG EFGH DCBA CDAB BADC ABCD)
1644 
1645  //DBG
1646  //std::cout << "Writing out " << outputbuffer.array[ 0 ] << " to index " << (static_cast< size_t >(pDataZ - pDataZst)) << std::endl;
1647 
1648  //reduce part 1
1649  *pDataZ += outputbuffer.array[ 0 ];
1650 
1651  //row jump, shift back input vector
1652  pDataX -= this->noc;
1653 
1654  //shift output vector
1655  pDataZ += *pIncRow++;
1656 
1657  //DBG
1658  //std::cout << "Writing out " << outputbuffer.array[ 4 ] << " to index " << (static_cast< size_t >(pDataZ - pDataZst)) << std::endl;
1659 
1660  //reduce part 2
1661  *pDataZ += outputbuffer.array[ 4 ];
1662 
1663  //reset sums buffer
1664  outputbuffer.internal = _mm512_setzero_pd();
1665 
1666  //shift output vector
1667  pDataZ += *pIncRow++;
1668 
1669  //check for end of matrix
1670  if( pDataA >= pDataAend ) {
1671  break;
1672  }
1673  }
1674 
1675  //DBG
1676  /*std::cout << "In loop, stage 2, A-E: " << std::endl;
1677  _mm512_store_epi32( dbg, c_ind_buffer );
1678  std::cout << "Grabbing from " << pDataX << " ( " << (pDataX-pDataXst) << " ): "
1679  << dbg[ 0 ] << " "
1680  << dbg[ 1 ] << " "
1681  << dbg[ 2 ] << " "
1682  << dbg[ 3 ] << " "
1683  << dbg[ 4 ] << " "
1684  << dbg[ 5 ] << " "
1685  << dbg[ 6 ] << " "
1686  << dbg[ 7 ] << " "
1687  << std::endl;*/
1688 
1689  //manual unroll, stage 2: fill input buffers
1690  input_buffer = _mm512_i32loextgather_pd( c_ind_buffer, pDataX, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
1691 
1692  //DBG
1693  //std::cout << "A" << std::endl;
1694 
1695  //fill value buffer
1696  value_buffer = _mm512_load_pd( pDataA );
1697 
1698  //DBG
1699  //std::cout << "B" << std::endl;
1700 
1701  //do vectorised multiply-add
1702  outputbuffer.internal = _mm512_fmadd_pd( value_buffer, input_buffer, outputbuffer.internal );
1703 
1704  //DBG
1705  //std::cout << "C" << std::endl;
1706 
1707  //shift input data
1708  pDataA += block_length;
1709 
1710  //DBG
1711  //std::cout << "D" << std::endl;
1712 
1713  //fill c_ind_buffer NOTE: one load reads 16 increments; i.e., 2 blocks!
1714  c_ind_buffer = _mm512_extload_epi32( pIncCol, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
1715 
1716  //DBG
1717  //std::cout << "E" << std::endl;
1718 
1719  //shift input vector
1720  pDataX += *pIncCol;
1721 
1722  //move pIncCol up
1723  pIncCol += block_length;
1724 
1725  //DBG
1726  //std::cout << "F" << std::endl;
1727 
1728  //reset start column index; alternatively, do this on 16-bit data earlier
1729  c_ind_buffer = _mm512_mullo_epi32( c_ind_buffer, zeroF ); //i.e., c_ind_buffer[ 0 ] = 0;
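    //(Note on the zeroF trick: within each block, lanes 1-7 of c_ind store
    // column offsets relative to the block's first nonzero, while lane 0
    // stores the jump from the previous block. That jump is applied to
    // pDataX directly via *pIncCol, so lane 0 (and lane 8, for the second
    // block carried by the same 16-increment load) must be zeroed to avoid
    // applying it a second time through the gather offsets.)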
1730 
1731  //prefetch input
1732  _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
1733 
1734  //DBG
1735  //std::cout << "Unroll II done" << std::endl;
1736 
1737  }
1738 
1739  //reduce half of row contribution
1740  const __m512d reduce_buffer1 = _mm512_swizzle_pd( outputbuffer.internal, _MM_SWIZ_REG_CDAB );
1741  outputbuffer.internal = _mm512_add_pd( outputbuffer.internal, reduce_buffer1 );
1742  //(output buffer now contains HG GH FE EF DC CD BA AB)
1743  const __m512d reduce_buffer2 = _mm512_swizzle_pd( outputbuffer.internal, _MM_SWIZ_REG_BADC );
1744  outputbuffer.internal = _mm512_add_pd( outputbuffer.internal, reduce_buffer2 );
1745  //(output buffer now contains HGFE GHEF FEHG EFGH DCBA CDAB BADC ABCD)
1746 
1747  //DBG
1748  //std::cout << "Writing out " << outputbuffer.array[ 0 ] << " to index " << (static_cast< size_t >(pDataZ - pDataZst)) << std::endl;
1749 
1750  //reduce part 1
1751  *pDataZ += outputbuffer.array[ 0 ];
1752 
1753  //row jump, shift back input vector
1754  pDataX -= this->noc;
1755 
1756  //shift output vector
1757  pDataZ += *pIncRow++;
1758 
1759  //DBG
1760  //std::cout << "Writing out " << outputbuffer.array[ 4 ] << " to index " << (static_cast< size_t >(pDataZ - pDataZst)) << std::endl;
1761 
1762  //reduce part 2
1763  *pDataZ += outputbuffer.array[ 4 ];
1764 
1765  //reset sums buffer
1766  outputbuffer.internal = _mm512_setzero_pd();
1767 
1768  //shift output vector
1769  pDataZ += *pIncRow++;
1770  }
1771  }
1772  //end 16-bit signed integer indices specialisation for double data types, 2x4 blocks.
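  //(For reference: a scalar sketch of what each of these zax specialisations
  // computes, ignoring blocking, fill-in, and vectorisation; pA_end and
  // x_end are hypothetical aliases for the bounds used in the kernels:
  //
  //   const double *pA = ds;
  //   const int16_t *pc = c_ind, *pr = r_ind;
  //   pDataX += *pc++;                       //jump to first column
  //   pDataZ += *pr++;                       //jump to first row
  //   while( pA < pA_end ) {
  //       while( pDataX < x_end ) {          //process current row
  //           *pDataZ += *pA++ * *pDataX;    //z_i += a_ij * x_j
  //           pDataX += *pc++;               //column increment
  //       }
  //       pDataX -= noc;                     //overflow signalled a row jump
  //       pDataZ += *pr++;                   //row increment
  //   }
  //
  // The vectorised kernels do this same walk block_length nonzeroes at a
  // time, replacing the scalar x-loads with gathers driven by the same
  // increment arrays.)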
1773 
1774  //start 16-bit signed integer indices specialisation for double data types, 4x2 blocks.
1775  //NOTE: this code uses prefetching in both the input and output gathers. Enabling or disabling
1776  //      gather-prefetch on the output vector seems to have no discernible effect on performance.
1777  //      Presumably ICC filters out those prefetches based on internal heuristics. This is fine;
1778  //      the code is retained here for completeness.
1779  template<> void vecBICRS< double, 4, 2, int16_t >::zax( const double *__restrict__ pDataX, double *__restrict__ pDataZ ) {
1780  //boundary checks
1781  if( this->nor == 0 || this->noc == 0 || this->nnz == 0 ) {
1782  return;
1783  }
1784 
1785  //initialise prefetch macro
1786  _mm512_prefetch_i32gather_pd_0hint_init();
1787 
1788  //DBG
1789  //const double * const pDataXst = pDataX;
1790  //const double * const pDataZst = pDataZ;
1791 
1792  //pointer aliases
1793  const double * const pDataAend = ds + this->allocsize - block_length;
1794  const double * const pDataXend = pDataX + this->noc;
1795  double *__restrict__ pDataA = ds;
1796  int16_t *__restrict__ pIncRow = r_ind;
1797  int16_t *__restrict__ pIncCol = c_ind;
1798 
1799  //DBG
1800  //std::cout << "x start: " << pDataX << ", x end: " << pDataXend << std::endl;
1801 
1802  //define buffers
1803  __m512d z_shadow;
1804  __m512i c_ind_buffer;
1805  __m512i r_ind_buffer;
1806  __m512d input_buffer;
1807  __m512d value_buffer;
1808  __m512d_union outputbuffer;
1809  __m512i zeroF = _mm512_set_epi32(
1810  1, 1, 1, 1, 1, 1, 1, 0,
1811  1, 1, 1, 1, 1, 1, 1, 0
1812  );
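    //(Note: _mm512_set_epi32 lists its arguments from lane 15 down to lane 0,
    // so the two zeroes above land in int32 lanes 8 and 0 -- the positions of
    // the leading increments of the two blocks carried by one index load.)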
1813  __mmask8 mask1 = 0x55;
1814  __mmask8 mask2 = 0xAA;
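    //(Mask sketch: an __mmask8 holds one bit per double lane, so mask1 = 0x55 =
    // 0b01010101 selects the even lanes and mask2 = 0xAA = 0b10101010 the odd
    // lanes. After the single swizzle/add reduction in the kernel, each
    // adjacent lane pair of outputbuffer holds one row sum of the 4x2 block in
    // both lanes; mask1 banks one strip of four row sums into the even lanes
    // of z_shadow, mask2 banks the next strip into the odd lanes, and only
    // then is z_shadow scattered back to the output vector.)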
1815 
1816  //initialise kernel
1817  outputbuffer.internal = _mm512_setzero_pd();
1818 
1819  //fill c_ind_buffer NOTE: one load reads 16 increments; i.e., 2 blocks!
1820  c_ind_buffer = _mm512_extload_epi32( pIncCol, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
1821 
1822  //DBG
1823  /*int16_t *dbgd = (int16_t*)pIncCol;
1824  std::cout << "c_ind_buffer= " << pDataX << ": "
1825  << dbgd[ 0 ] << " "
1826  << dbgd[ 1 ] << " "
1827  << dbgd[ 2 ] << " "
1828  << dbgd[ 3 ] << " "
1829  << dbgd[ 4 ] << " "
1830  << dbgd[ 5 ] << " "
1831  << dbgd[ 6 ] << " "
1832  << dbgd[ 7 ] << " "
1833  << std::endl;*/
1834 
1835  //shift input vector
1836  pDataX += *pIncCol;
1837 
1838  //move pIncCol up one block
1839  pIncCol += block_length;
1840 
1841  //reset start column index; alternatively, do this on 16-bit data earlier
1842  c_ind_buffer = _mm512_mullo_epi32( c_ind_buffer, zeroF ); //i.e., c_ind_buffer[ 0 ] = 0;
1843 
1844  //fill r_ind_buffer
1845  r_ind_buffer = _mm512_extload_epi32( pIncRow, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
1846 
1847  //nullify r_ind_buffer
1848  r_ind_buffer = _mm512_mullo_epi32( r_ind_buffer, zeroF );
1849 
1850  //prefetch index buffers
1851  _mm512_prefetch_i32gather_pd_0hint( r_ind_buffer, pDataZ );
1852  _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
1853 
1854  //shift output vector
1855  pDataZ += *pIncRow;
1856 
1857  //move pIncRow up one block
1858  pIncRow += block_length;
1859 
1861  //fill z_shadow; this loads 8 output vector elements
1861  z_shadow = _mm512_i32loextgather_pd( r_ind_buffer, pDataZ, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
1862 
1863  //keep track of row/column switches (load vs register shuffle), plus mask shift for z_shadow
1864  bool colshift, rowshift, maskshift;
1865  colshift = rowshift = maskshift = true;
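    //(Double-buffering sketch: each _mm512_extload_epi32 above reads 16
    // int16_t increments -- two blocks' worth -- while pIncCol and pIncRow
    // advance by only block_length = 8 per block. colshift and rowshift
    // therefore alternate between reusing the upper half of the previous load
    // via _mm512_permute4f128_epi32( ..., _MM_PERM_DCDC ) and issuing a fresh
    // load; maskshift analogously alternates mask1/mask2 on z_shadow.)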
1866 
1867  //DBG
1868  //std::cout << "Entering loop..." << std::endl;
1869 
1870  //start kernel
1871  while( pDataA < pDataAend ) {
1872 
1873  //process row
1874  while( pDataX < pDataXend ) {
1875 
1876  //DBG
1877  /*std::cout << "In loop, stage 1, A-E: " << std::endl;
1878  __declspec(align(64)) int32_t dbg[ 16 ];
1879  _mm512_store_epi32( dbg, c_ind_buffer );
1880  std::cout << "Grabbing from " << pDataX << " ( " << (pDataX-pDataXst) << " ): "
1881  << dbg[ 0 ] << " "
1882  << dbg[ 1 ] << " "
1883  << dbg[ 2 ] << " "
1884  << dbg[ 3 ] << " "
1885  << dbg[ 4 ] << " "
1886  << dbg[ 5 ] << " "
1887  << dbg[ 6 ] << " "
1888  << dbg[ 7 ] << " "
1889  << std::endl;*/
1890 
1891  //manual unroll, stage 1: fill input buffers
1892  input_buffer = _mm512_i32loextgather_pd( c_ind_buffer, pDataX, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
1893 
1894  //DBG
1895  //std::cout << "A" << std::endl;
1896 
1897  //fill nonzero buffer
1898  value_buffer = _mm512_load_pd( pDataA );
1899 
1900  //DBG
1901  //std::cout << "B" << std::endl;
1902 
1903  //do vectorised multiply-add
1904  outputbuffer.internal = _mm512_fmadd_pd( value_buffer, input_buffer, outputbuffer.internal );
1905 
1906  //DBG
1907  //std::cout << "C" << std::endl;
1908 
1909  //shift input data
1910  pDataA += block_length;
1911 
1912  //DBG
1913  //std::cout << "D" << std::endl;
1914 
1915  if( colshift ) {
1916  //DBG
1917  //std::cout << "D colshift A" << std::endl;
1918 
1919  //shuffle c_ind_buffer
1920  c_ind_buffer = _mm512_permute4f128_epi32( c_ind_buffer, _MM_PERM_DCDC );
1921 
1922  //DBG
1923  //std::cout << "D colshift B" << std::endl;
1924  } else {
1925  //DBG
1926  //std::cout << "D !colshift A" << std::endl;
1927 
1928  //fill c_ind_buffer NOTE: one load reads 16 increments; i.e., 2 blocks!
1929  c_ind_buffer = _mm512_extload_epi32( pIncCol, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
1930 
1931  //DBG
1932  //std::cout << "D !colshift B" << std::endl;
1933 
1934  //reset main increment to 0
1935  c_ind_buffer = _mm512_mullo_epi32( c_ind_buffer, zeroF ); //i.e., c_ind_buffer[ 0 ] = 0;
1936 
1937  //DBG
1938  //std::cout << "D !colshift C" << std::endl;
1939  }
1940  colshift = !colshift;
1941 
1942  //DBG
1943  //std::cout << "E" << std::endl;
1944 
1945  //shift input vector
1946  pDataX += *pIncCol;
1947 
1948  //DBG
1949  //std::cout << "F" << std::endl;
1950 
1951  //prefetch input
1952  _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
1953 
1954  //move pIncCol up
1955  pIncCol += block_length;
1956 
1957  //DBG
1958  //std::cout << "end unroll I" << std::endl;
1959 
1960  //check for break
1961  if( pDataX >= pDataXend ) {
1962  //DBG
1963  //std::cout << "Row break in-between unroll I and unroll II" << std::endl;
1964 
1965  //reduce half of row contribution
1966  const __m512d reduce_buffer1 = _mm512_swizzle_pd( outputbuffer.internal, _MM_SWIZ_REG_CDAB );
1967  outputbuffer.internal = _mm512_add_pd( outputbuffer.internal, reduce_buffer1 );
1968  //(output buffer now contains HG GH FE EF DC CD BA AB)
1969 
1970  //DBG
1971  /*std::cout << "Writing out "
1972  << outputbuffer.array[ 0 ] << ", "
1973  << outputbuffer.array[ 2 ] << ", "
1974  << outputbuffer.array[ 4 ] << ", "
1975  << outputbuffer.array[ 6 ] << " to shadow of index "
1976  << static_cast< size_t >(pDataZ - pDataZst)
1977  << std::endl;*/
1978 
1979  //add to z_shadow
1980  if( maskshift ) {
1981  //DBG
1982  //std::cout << "Updating z_shadow..." << std::endl;
1983 
1984  z_shadow = _mm512_mask_add_pd( z_shadow, mask1, z_shadow, outputbuffer.internal );
1985  } else {
1986  //DBG
1987  //std::cout << "Updating z_shadow using mask2" << std::endl;
1988 
1989  z_shadow = _mm512_mask_add_pd( z_shadow, mask2, z_shadow, outputbuffer.internal );
1990 
1991  //DBG
1992  /*std::cout << "Writing out z_shadow { ";
1993  __declspec(align(64)) double dbgd[ 8 ];
1994  _mm512_store_pd( dbgd, z_shadow );
1995  std::cout << dbgd[ 0 ] << " "
1996  << dbgd[ 1 ] << " "
1997  << dbgd[ 2 ] << " "
1998  << dbgd[ 3 ] << " "
1999  << dbgd[ 4 ] << " "
2000  << dbgd[ 5 ] << " "
2001  << dbgd[ 6 ] << " "
2002  << dbgd[ 7 ] << " "
2003  << "}";
2004  std::cout << " to positions " << (pDataZ - pDataZst) << " + { ";
2005  __declspec(align(64)) int32_t dbg[ 16 ];
2006  _mm512_store_epi32( dbg, r_ind_buffer );
2007  std::cout << dbg[ 0 ] << " "
2008  << dbg[ 1 ] << " "
2009  << dbg[ 2 ] << " "
2010  << dbg[ 3 ] << " "
2011  << dbg[ 4 ] << " "
2012  << dbg[ 5 ] << " "
2013  << dbg[ 6 ] << " "
2014  << dbg[ 7 ] << " "
2015  << "}" << std::endl;*/
2016 
2017  //we are done with this z_shadow; first write out
2018  _mm512_i32loextscatter_pd( pDataZ, r_ind_buffer, z_shadow, _MM_DOWNCONV_PD_NONE, 8, _MM_HINT_NONE );
2019 
2020  //shift output vector
2021  pDataZ += *pIncRow;
2022 
2023  //DBG
2024  //std::cout << "Getting new r_ind_buffer" << std::endl;
2025 
2026  //get next r_ind_buffer
2027  if( rowshift ) {
2028  //shift r_ind_buffer
2029  r_ind_buffer = _mm512_permute4f128_epi32( r_ind_buffer, _MM_PERM_DCDC );
2030  } else {
2031  //fill r_ind_buffer NOTE: one load reads 16 increments; i.e., 2 blocks!
2032  r_ind_buffer = _mm512_extload_epi32( pIncRow, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
2033 
2034  //reset main increment to 0
2035  r_ind_buffer = _mm512_mullo_epi32( r_ind_buffer, zeroF ); //i.e., r_ind_buffer[ 0 ] = 0;
2036  }
2037  rowshift = !rowshift;
2038 
2039  //prefetch next z_shadow
2040  _mm512_prefetch_i32gather_pd_0hint( r_ind_buffer, pDataZ );
2041 
2042  //shift pIncRow
2043  pIncRow += block_length;
2044 
2045  //DBG
2046  //std::cout << "Getting new z_shadow..." << std::endl;
2047 
2048  //load next z_shadow (this loads 8 output vector elements)
2049  z_shadow = _mm512_i32loextgather_pd( r_ind_buffer, pDataZ, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
2050  }
2051  maskshift = !maskshift;
2052 
2053  //row jump, shift back input vector
2054  pDataX -= this->noc;
2055 
2056  //reset sums buffer
2057  outputbuffer.internal = _mm512_setzero_pd();
2058 
2059  //check for end of matrix
2060  if( pDataA >= pDataAend ) {
2061  //end of matrix; break to the epilogue below, which handles the pending z_shadow write-out (unroll II will not execute again)
2062  break;
2063  }
2064 
2065  //DBG
2066  //std::cout << "Exit row switch" << std::endl;
2067  }
2068 
2069  //DBG
2070  /*std::cout << "In loop, unroll II: " << std::endl;
2071  _mm512_store_epi32( dbg, c_ind_buffer );
2072  std::cout << "Grabbing from " << pDataX << " ( " << (pDataX-pDataXst) << " ): "
2073  << dbg[ 0 ] << " "
2074  << dbg[ 1 ] << " "
2075  << dbg[ 2 ] << " "
2076  << dbg[ 3 ] << " "
2077  << dbg[ 4 ] << " "
2078  << dbg[ 5 ] << " "
2079  << dbg[ 6 ] << " "
2080  << dbg[ 7 ] << " "
2081  << std::endl;*/
2082 
2083  //manual unroll, stage 2: fill input buffers
2084  input_buffer = _mm512_i32loextgather_pd( c_ind_buffer, pDataX, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
2085 
2086  //DBG
2087  //std::cout << "A" << std::endl;
2088 
2089  //fill value buffer
2090  value_buffer = _mm512_load_pd( pDataA );
2091 
2092  //DBG
2093  //std::cout << "B" << std::endl;
2094 
2095  //do vectorised multiply-add
2096  outputbuffer.internal = _mm512_fmadd_pd( value_buffer, input_buffer, outputbuffer.internal );
2097 
2098  //DBG
2099  //std::cout << "C" << std::endl;
2100 
2101  //shift input data
2102  pDataA += block_length;
2103 
2104  //DBG
2105  //std::cout << "D" << std::endl;
2106 
2107  if( colshift ) {
2108  //DBG
2109  //std::cout << "D colshift A" << std::endl;
2110 
2111  //shuffle c_ind_buffer
2112  c_ind_buffer = _mm512_permute4f128_epi32( c_ind_buffer, _MM_PERM_DCDC );
2113 
2114  //DBG
2115  //std::cout << "D colshift B" << std::endl;
2116  } else {
2117  //DBG
2118  //std::cout << "D !colshift A" << std::endl;
2119 
2120  //fill c_ind_buffer NOTE: one load reads 16 increments; i.e., 2 blocks!
2121  c_ind_buffer = _mm512_extload_epi32( pIncCol, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
2122 
2123  //DBG
2124  //std::cout << "D !colshift B" << std::endl;
2125 
2126  //reset main increment to 0
2127  c_ind_buffer = _mm512_mullo_epi32( c_ind_buffer, zeroF ); //i.e., c_ind_buffer[ 0 ] = 0;
2128 
2129  //DBG
2130  //std::cout << "D !colshift C" << std::endl;
2131  }
2132  colshift = !colshift;
2133 
2134  //DBG
2135  //std::cout << "E" << std::endl;
2136 
2137  //shift input vector
2138  pDataX += *pIncCol;
2139 
2140  //DBG
2141  //std::cout << "F" << std::endl;
2142 
2143  //move pIncCol up
2144  pIncCol += block_length;
2145 
2146  //prefetch input
2147  _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
2148 
2149  //DBG
2150  //std::cout << "Unroll II done" << std::endl;
2151  }
2152 
2153  //reduce half of row contribution
2154  const __m512d reduce_buffer1 = _mm512_swizzle_pd( outputbuffer.internal, _MM_SWIZ_REG_CDAB );
2155  outputbuffer.internal = _mm512_add_pd( outputbuffer.internal, reduce_buffer1 );
2156  //(output buffer now contains HG GH FE EF DC CD BA AB)
2157 
2158  //DBG
2159  /*std::cout << "Writing out "
2160  << outputbuffer.array[ 0 ] << ", "
2161  << outputbuffer.array[ 2 ] << ", "
2162  << outputbuffer.array[ 4 ] << ", "
2163  << outputbuffer.array[ 6 ] << " to shadow of index "
2164  << (static_cast< size_t >(pDataZ - pDataZst))
2165  << std::endl;*/
2166 
2167  //add to z_shadow
2168  if( maskshift ) {
2169  //DBG
2170  //std::cout << "Updating z_shadow..." << std::endl;
2171 
2172  z_shadow = _mm512_mask_add_pd( z_shadow, mask1, z_shadow, outputbuffer.internal );
2173  } else {
2174  //DBG
2175  //std::cout << "Updating z_shadow using mask2" << std::endl;
2176 
2177  z_shadow = _mm512_mask_add_pd( z_shadow, mask2, z_shadow, outputbuffer.internal );
2178 
2179  //DBG
2180  /*std::cout << "Writing out z_shadow { ";
2181  __declspec(align(64)) double dbgd[ 8 ];
2182  _mm512_store_pd( dbgd, z_shadow );
2183  std::cout << dbgd[ 0 ] << " "
2184  << dbgd[ 1 ] << " "
2185  << dbgd[ 2 ] << " "
2186  << dbgd[ 3 ] << " "
2187  << dbgd[ 4 ] << " "
2188  << dbgd[ 5 ] << " "
2189  << dbgd[ 6 ] << " "
2190  << dbgd[ 7 ] << " "
2191  << "}";
2192  std::cout << " to positions " << (pDataZ - pDataZst) << " + { ";
2193  __declspec(align(64)) int32_t dbg[ 16 ];
2194  _mm512_store_epi32( dbg, r_ind_buffer );
2195  std::cout << dbg[ 0 ] << " "
2196  << dbg[ 1 ] << " "
2197  << dbg[ 2 ] << " "
2198  << dbg[ 3 ] << " "
2199  << dbg[ 4 ] << " "
2200  << dbg[ 5 ] << " "
2201  << dbg[ 6 ] << " "
2202  << dbg[ 7 ] << " "
2203  << "}" << std::endl;*/
2204 
2205  //we are done with this z_shadow; first write out
2206  _mm512_i32loextscatter_pd( pDataZ, r_ind_buffer, z_shadow, _MM_DOWNCONV_PD_NONE, 8, _MM_HINT_NONE );
2207 
2208  //shift output vector
2209  pDataZ += *pIncRow;
2210 
2211  //DBG
2212  //std::cout << "Getting new r_ind_buffer" << std::endl;
2213 
2214  //get next r_ind_buffer
2215  if( rowshift ) {
2216  //shift r_ind_buffer
2217  r_ind_buffer = _mm512_permute4f128_epi32( r_ind_buffer, _MM_PERM_DCDC );
2218  } else {
2219  //fill r_ind_buffer NOTE: one load reads 16 increments; i.e., 2 blocks!
2220  r_ind_buffer = _mm512_extload_epi32( pIncRow, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
2221 
2222  //reset main increment to 0
2223  r_ind_buffer = _mm512_mullo_epi32( r_ind_buffer, zeroF ); //i.e., r_ind_buffer[ 0 ] = 0;
2224  }
2225  rowshift = !rowshift;
2226 
2227  //prefetch next z_shadow
2228  _mm512_prefetch_i32gather_pd_0hint( r_ind_buffer, pDataZ );
2229 
2230  //shift pIncRow
2231  pIncRow += block_length;
2232 
2233  //DBG
2234  //std::cout << "Getting new z_shadow..." << std::endl;
2235 
2236  //load next z_shadow (this loads 8 output vector elements)
2237  z_shadow = _mm512_i32loextgather_pd( r_ind_buffer, pDataZ, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
2238  }
2239  maskshift = !maskshift;
2240 
2241  //reset sums buffer
2242  outputbuffer.internal = _mm512_setzero_pd();
2243 
2244  //row jump, shift back input vector
2245  pDataX -= this->noc;
2246 
2247  //DBG
2248  //std::cout << "End outer loop" << std::endl;
2249  }
2250  if( !maskshift ) {
2251  //there is a write-out of z_shadow still pending; do it
2252  _mm512_i32loextscatter_pd( pDataZ, r_ind_buffer, z_shadow, _MM_DOWNCONV_PD_NONE, 8, _MM_HINT_NONE );
2253  }
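  //(Flush rationale: maskshift toggles after every row strip, so !maskshift
  // on exit means the final masked add only filled one half of z_shadow's
  // lanes and no scatter was issued for it inside the loop; the write-out
  // above completes that pending half-filled shadow register.)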
2254  }
2255  //end 16-bit signed integer indices specialisation for double data types, 4x2 blocks.
2256 
2257  //start 16-bit signed integer indices specialisation for double data types, 8x1 blocks.
2258  template<> void vecBICRS< double, 8, 1, int16_t >::zax( const double *__restrict__ pDataX, double *__restrict__ pDataZ ) {
2259  //boundary checks
2260  if( this->nor == 0 || this->noc == 0 || this->nnz == 0 ) {
2261  return;
2262  }
2263 
2264  //initialise prefetch macro
2265  _mm512_prefetch_i32gather_pd_0hint_init();
2266 
2267  //DBG
2268  //const double * const pDataXst = pDataX;
2269  //const double * const pDataZst = pDataZ;
2270 
2271  //pointer aliases
2272  double *__restrict__ pDataA = ds;
2273  const double * const pDataAend = ds + this->allocsize - block_length;
2274  const double * const pDataXend = pDataX + this->noc;
2275  int16_t *__restrict__ pIncRow = r_ind;
2276  int16_t *__restrict__ pIncCol = c_ind;
2277 
2278  //define buffers
2279  __m512d z_shadow;
2280  __m512i c_ind_buffer;
2281  __m512i r_ind_buffer;
2282  __m512d input_buffer;
2283  __m512d value_buffer;
2284  __m512i zeroF = _mm512_set_epi32(
2285  1, 1, 1, 1, 1, 1, 1, 0,
2286  1, 1, 1, 1, 1, 1, 1, 0
2287  );
2288 
2289  //fill c_ind_buffer NOTE: one load reads 16 increments; i.e., 2 blocks!
2290  c_ind_buffer = _mm512_extload_epi32( pIncCol, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
2291 
2292  //DBG
2293  /*__declspec(align(64)) int32_t dbg[ 16 ];
2294  _mm512_store_epi32( dbg, c_ind_buffer );
2295  std::cout << "c_ind_buffer equals " << dbg[ 0 ] << ", " << dbg[ 1 ] << ", " <<
2296  dbg[ 2 ] << ", " << dbg[ 3 ] << ", " << dbg[ 4 ] << ", " <<
2297  dbg[ 5 ] << ", " << dbg[ 6 ] << ", " << dbg[ 7 ] << "." << std::endl;*/
2298 
2299  //shift input vector
2300  pDataX += *pIncCol;
2301 
2302  //move pIncCol up one block
2303  pIncCol += block_length;
2304 
2305  //reset start column index; alternatively, do this on 16-bit data earlier
2306  c_ind_buffer = _mm512_mullo_epi32( c_ind_buffer, zeroF ); //i.e., c_ind_buffer[ 0 ] = 0;
2307 
2308  //fill r_ind_buffer
2309  r_ind_buffer = _mm512_extload_epi32( pIncRow, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
2310 
2311  //nullify r_ind_buffer
2312  r_ind_buffer = _mm512_mullo_epi32( r_ind_buffer, zeroF );
2313 
2314  //prefetch index buffers
2315  _mm512_prefetch_i32gather_pd_0hint( r_ind_buffer, pDataZ );
2316  _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
2317 
2318  //shift output vector
2319  pDataZ += *pIncRow;
2320 
2321  //move pIncRow up one block
2322  pIncRow += block_length;
2323 
2324  //fill z_shadow; this loads 8 output vector elements
2325  z_shadow = _mm512_i32loextgather_pd( r_ind_buffer, pDataZ, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
2326 
2327  //keep track of row/column switches (load vs register shuffle)
2328  bool colshift, rowshift;
2329  colshift = rowshift = true;
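    //(Design note: an 8x1 block covers 8 distinct rows, so the gathered
    // z_shadow lanes correspond one-to-one to the 8 row sums in flight. The
    // fmadd in the loop below can hence accumulate straight into z_shadow --
    // no separate output buffer, swizzle reduction, or mask1/mask2
    // interleaving is needed, unlike in the 2x4 and 4x2 kernels above.)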
2330 
2331  //DBG
2332  //std::cout << "Entering loop..." << std::endl;
2333 
2334  //start kernel
2335  while( pDataA < pDataAend ) {
2336  //for each block row
2337  while( pDataX < pDataXend ) {
2338  //DBG
2339  /*std::cout << "Looping with x @ " << (pDataX-pDataXst) << " and z @ " << (pDataZ-pDataZst) << std::endl;
2340  __declspec(align(64)) int32_t dbg[ 16 ];
2341  __declspec(align(64)) double dbgd[ 16 ];
2342  _mm512_store_epi32( dbg, c_ind_buffer );
2343  std::cout << "c_ind_buffer equals " << dbg[ 0 ] << ", " << dbg[ 1 ] << ", " <<
2344  dbg[ 2 ] << ", " << dbg[ 3 ] << ", " << dbg[ 4 ] << ", " <<
2345  dbg[ 5 ] << ", " << dbg[ 6 ] << ", " << dbg[ 7 ] << "." << std::endl;
2346  _mm512_store_epi32( dbg, r_ind_buffer );
2347  std::cout << "r_ind_buffer equals " << dbg[ 0 ] << ", " << dbg[ 1 ] << ", " <<
2348  dbg[ 2 ] << ", " << dbg[ 3 ] << ", " << dbg[ 4 ] << ", " <<
2349  dbg[ 5 ] << ", " << dbg[ 6 ] << ", " << dbg[ 7 ] << "." << std::endl;
2350  _mm512_store_pd( dbgd, z_shadow );
2351  std::cout << "z_shadow equals " << dbgd[ 0 ] << " " << dbgd[ 1 ] << " "
2352  << dbgd[ 2 ] << " " << dbgd[ 3 ] << " " << dbgd[ 4 ]<< " "
2353  << dbgd[ 5 ] << " " << dbgd[ 6 ] << " " << dbgd[ 7 ]
2354  << std::endl;*/
2355 
2356  //fill input buffers
2357  input_buffer = _mm512_i32loextgather_pd( c_ind_buffer, pDataX, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
2358 
2359  //fill value buffer
2360  value_buffer = _mm512_load_pd( pDataA );
2361 
2362  //do vectorised multiply-add
2363  z_shadow = _mm512_fmadd_pd( value_buffer, input_buffer, z_shadow );
2364 
2365  //shift input data
2366  pDataA += block_length;
2367 
2368  //DBG
2369  //std::cout << "preshift" << std::endl;
2370 
2371  //check how to move to the next block
2372  if( colshift ) {
2373  //shift c_ind_buffer
2374  c_ind_buffer = _mm512_permute4f128_epi32( c_ind_buffer, _MM_PERM_DCDC );
2375  } else {
2376  //fill c_ind_buffer NOTE: one load reads 16 increments; i.e., 2 blocks!
2377  c_ind_buffer = _mm512_extload_epi32( pIncCol, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
2378 
2379  //reset main increment to 0
2380  c_ind_buffer = _mm512_mullo_epi32( c_ind_buffer, zeroF ); //i.e., c_ind_buffer[ 0 ] = 0;
2381  }
2382  colshift = !colshift;
2383 
2384  //shift input vector
2385  pDataX += *pIncCol;
2386 
2387  //prefetch input
2388  _mm512_prefetch_i32gather_pd_0hint( c_ind_buffer, pDataX );
2389 
2390  //move pIncCol up
2391  pIncCol += block_length;
2392 
2393  //DBG
2394  //std::cout << "/inner loop" << std::endl;
2395  }
2396 
2397  //row jump: z_shadow is fully updated; write it out and load the next one. Scatter:
2398  _mm512_i32loextscatter_pd( pDataZ, r_ind_buffer, z_shadow, _MM_DOWNCONV_PD_NONE, 8, _MM_HINT_NONE );
2399 
2400  //get next r_ind_buffer
2401  if( rowshift ) {
2402  //shift r_ind_buffer
2403  r_ind_buffer = _mm512_permute4f128_epi32( r_ind_buffer, _MM_PERM_DCDC );
2404  } else {
2405  //fill r_ind_buffer NOTE: one load reads 16 increments; i.e., 2 blocks!
2406  r_ind_buffer = _mm512_extload_epi32( pIncRow, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NT );
2407 
2408  //reset main increment to 0
2409  r_ind_buffer = _mm512_mullo_epi32( r_ind_buffer, zeroF ); //i.e., r_ind_buffer[ 0 ] = 0;
2410  }
2411  rowshift = !rowshift;
2412 
2413  //shift output vector
2414  pDataZ += *pIncRow;
2415 
2416  //prefetch next z_shadow
2417  _mm512_prefetch_i32gather_pd_0hint( r_ind_buffer, pDataZ );
2418 
2419  //shift pIncRow
2420  pIncRow += block_length;
2421 
2422  //load next z_shadow (this loads 8 output vector elements)
2423  z_shadow = _mm512_i32loextgather_pd( r_ind_buffer, pDataZ, _MM_UPCONV_PD_NONE, 8, _MM_HINT_NONE );
2424 
2425  //row jump, shift back input vector
2426  pDataX -= this->noc;
2427  }
2428  }
2429  //end 16-bit signed integer indices specialisation for double data types, 8x1 blocks.
2430 
2431 #endif
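//(Usage sketch, assuming the triplet-based constructor and the zax interface
// declared earlier in this file; m, n, x, and z are hypothetical caller-side
// variables:
//
//   std::vector< Triplet< double > > triplets; //filled with COO input
//   vecBICRS< double, 8, 1, int16_t > A( triplets, m, n, 0.0 );
//   A.zax( x, z ); //accumulates A*x into z; x has length n, z length m
//
// Note that zax adds into z, so z should be zero-initialised to obtain a
// plain z = A*x.)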
2432 
2433 #endif
2434 