Last active
May 4, 2021 11:47
-
-
Save neoblizz/f04ebdf572073b41736825e94ba7a6be to your computer and use it in GitHub Desktop.
CUDA-based implementation to introduce sparsity.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <ctime> | |
#include <random> | |
#include <thrust/device_vector.h> | |
#include <thrust/host_vector.h> | |
#include <thrust/transform.h> | |
#include <thrust/iterator/counting_iterator.h> | |
#include <thrust/iterator/discard_iterator.h> | |
float get_random() { | |
static std::default_random_engine e; | |
static std::uniform_real_distribution<> dis(0, 1); // rage 0 - 1 | |
return dis(e); | |
} | |
int main(int argc, char** argv) { | |
using weight_t = float; | |
// Weight Matrix (m x n) | |
constexpr std::size_t m = 32; | |
constexpr std::size_t n = 32; | |
thrust::host_vector<weight_t> h_weights(n * m); | |
// Generate random weights | |
srand((unsigned)time(0)); | |
for (auto& weight : h_weights) | |
weight = get_random(); | |
// Move the data to GPU | |
thrust::device_vector<weight_t> d_weights = h_weights; | |
auto weights = d_weights.data().get(); // pointer to data | |
// Block configuration (2 x 2) | |
constexpr std::size_t blk_m = 2; | |
constexpr std::size_t blk_n = 2; | |
constexpr std::size_t blk_size = blk_m * blk_n; | |
// Tile configuration (Number of blocks per m, n) | |
constexpr std::size_t tile_m = m / blk_m; | |
constexpr std::size_t tile_n = n / blk_n; | |
// Sparsify lambda (50%) | |
float sparsity_factor = 0.5; | |
std::size_t number_of_zeros_per_block = floor(blk_size * sparsity_factor); | |
auto sparsify = [=] __device__(std::size_t const& blk_idx) { | |
// Global idx strided by blk_idx | |
auto global_idx = blk_idx * blk_size; | |
std::size_t sparsified = 0; | |
// Block idx as (m, n) | |
// auto blk_m_idx = blk_idx % blk_n; | |
// auto blk_n_idx = blk_idx / blk_n; | |
// Loop over the (2 x 2) block | |
for (std::size_t h = 0; h < blk_m; ++h) { | |
for (std::size_t w = 0; w < blk_n; ++w) { | |
if (sparsified == number_of_zeros_per_block) | |
break; | |
// <todo> need a good condition to determine | |
// if a value should be sparsified. | |
auto idx = global_idx + h + (w * blk_n); | |
weights[idx] = (weight_t)0; | |
sparsified++; | |
} | |
} | |
return 0; | |
}; | |
// Kernel launch using transform | |
cudaStream_t stream = 0; | |
thrust::transform( | |
thrust::cuda::par.on(stream), // CUDA stream | |
thrust::make_counting_iterator<std::size_t>(0), // Begin iterator: 0 | |
thrust::make_counting_iterator<std::size_t>( | |
tile_m * tile_n), // End iterator: tile_m * tile_n | |
thrust::make_discard_iterator(), // Discard output | |
sparsify // Unary Operator | |
); | |
// Log and output | |
std::cout << "Matrix Size (m, n) = (" << m << ", " << n << ")" << std::endl; | |
std::cout << "Sparsity Factor = " << sparsity_factor * 100 << "%" | |
<< std::endl; | |
std::cout << "Number of Nonzeros Per Block = " << number_of_zeros_per_block | |
<< std::endl; | |
std::cout << "Block Size blk_(m, n) = (" << blk_m << ", " << blk_n << ")" | |
<< std::endl; | |
std::cout << "Number of Blocks = " << blk_size << std::endl; | |
std::cout << "Weights (sparsified) = " << std::endl; | |
thrust::copy(d_weights.begin(), d_weights.end(), | |
std::ostream_iterator<weight_t>(std::cout, " ")); | |
std::cout << std::endl; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Compile and run;