
@inproceedings{yzelman15a,
 author = "Yzelman, A. N.",
 title = "Generalised vectorisation for sparse matrix--vector multiplication",
 year = "2015",
 isbn = {9781450340014},
 publisher = {Association for Computing Machinery},
 address = {New York, NY, USA},
 doi = {10.1145/2833179.2833185},
 articleno = {6},
 numpages = {8},
 booktitle = {Proceedings of the 5th Workshop on Irregular Applications: Architectures and Algorithms},
 series = {IA3 '15},
 keywords = {cache-oblivious algorithms, segmented reduce, CRS, shared-memory parallel, CSR},
 keywords = "Sparse matrix--vector multiplication",
 keywords = "shared-memory parallelism",
 keywords = "cache-oblivious",
 keywords = "vectorisation",
 keywords = "{SIMD}",
 keywords = "high-performance computing",
 keywords = "{Intel Xeon Phi}",
 keywords ="Compressed Row Storage",
 keywords = "{ELLPACK}",
 keywords = "Compressed Sparse Rows",
 keywords = "Blocked {CRS}",
 keywords = "segmented scan",
 abstract = "This work generalises the various ways in which a sparse matrix--vector (SpMV) multiplication can be vectorised. It arrives at a novel data structure that generalises three earlier well-known data structures for sparse computations: the Blocked CRS format, the (sliced) ELLPACK format, and segmented scan based formats.
The new data structure is relevant for sparse computations on modern architectures, since efficient use of new hardware requires the use of increasingly wide vector registers. Normally, the use of vectorisation for sparse computations is limited due to bandwidth constraints. In cases where computations are limited by memory latencies instead of memory bandwidth, however, vectorisation can still help performance. The Intel Xeon Phi, appearing as a component in several top-500 supercomputers, displays exactly this behaviour for SpMV multiplication.
On this architecture, the use of the new generalised vectorisation scheme increases performance up to 178 percent."
}

