Skip to content

Instantly share code, notes, and snippets.

@al42and
Created April 5, 2023 14:49
Show Gist options
  • Save al42and/1c9e166abd02c76772214d50b76150a0 to your computer and use it in GitHub Desktop.
Save al42and/1c9e166abd02c76772214d50b76150a0 to your computer and use it in GitHub Desktop.
Scheduling a lot of small kernels, CUDA vs oneAPI
#include "nvToolsExt.h"
#include <iostream>
#include <sycl/sycl.hpp>
#include <vector>
// Kernel name type. It is only forward-declared and is parameterised on N so
// that each run_kernel<N> instantiation names its kernel Kernel<N>.
template <int N> class Kernel;
// Number of int elements in each buffer; also used as the 1-D launch range.
constexpr int size = 8;
// Submits one command group to `queue` whose kernel adds the compile-time
// constant N to each of the first `size` ints starting at `ptr`.
//
// The event returned by submit() is discarded and nothing is waited on here,
// so ordering relative to other work is whatever `queue` provides.
template <int N> void run_kernel(sycl::queue &queue, int *ptr) {
  const sycl::range<1> launch_range{size};
  queue.submit([&](sycl::handler &cgh) {
    // The kernel lambda captures `ptr` by value; one work-item per element.
    cgh.parallel_for<Kernel<N>>(launch_range, [=](sycl::id<1> idx) {
      const auto element = idx[0];
      ptr[element] += N;
    });
  });
}
// Runs the small-kernel submission benchmark on every GPU device that SYCL
// reports.
//
// For each device: create an in-order queue, allocate a device buffer and a
// host buffer of `size` ints, zero the device buffer, then submit
// run_kernel<1> and run_kernel<2> back to back for a fixed number of
// iterations. The last iteration also enqueues a device-to-host copy. After
// waiting on the queue, every host element must equal iterations * (1 + 2).
//
// NVTX ranges bracket the allocate / submit / readback phases so they show up
// in a profiler timeline.
//
// Returns 0 on success and -1 if the queue is not in-order, an allocation
// fails, or the result check fails. USM allocations are released and NVTX
// ranges are closed on every path that reaches them.
int main() {
  constexpr int iterations = 5;
  // Each iteration adds 1 and then 2 to every element.
  constexpr int expected = iterations * (1 + 2);

  for (const auto &dev :
       sycl::device::get_devices(sycl::info::device_type::gpu)) {
    std::cout << dev.get_info<sycl::info::device::name>() << std::endl;
    sycl::property_list qpl{sycl::property::queue::in_order()};
    sycl::queue q(dev, qpl);
    // The kernels below rely on running one after another.
    if (!q.is_in_order())
      return -1;

    nvtxRangePush("Allocate and initialize");
    int *buffer = sycl::malloc_device<int>(size, q);
    int *hostBuffer = sycl::malloc_host<int>(size, q);
    if (buffer == nullptr || hostBuffer == nullptr) {
      // USM allocation functions signal failure by returning nullptr.
      std::cerr << "USM allocation failed" << std::endl;
      if (buffer != nullptr)
        sycl::free(buffer, q);
      if (hostBuffer != nullptr)
        sycl::free(hostBuffer, q);
      nvtxRangePop();
      return -1;
    }
    q.fill<int>(buffer, 0, size);
    q.wait();
    nvtxRangePop();

    for (int iter = 0; iter < iterations; iter++) {
      nvtxRangePush("Submit GPU work");
      run_kernel<1>(q, buffer);
      run_kernel<2>(q, buffer);
      if (iter == iterations - 1) {
        // copy(src, dest, count): read the final values back to the host.
        q.copy<int>(buffer, hostBuffer, size);
      }
      nvtxRangePop();
    }

    nvtxRangePush("Get GPU results");
    q.wait();
    bool results_ok = true;
    for (int i = 0; i < size; i++) {
      if (hostBuffer[i] != expected) {
        results_ok = false;
        break;
      }
    }
    nvtxRangePop();

    // Release the allocations before deciding the exit status so the failure
    // path does not leak them.
    sycl::free(buffer, q);
    sycl::free(hostBuffer, q);
    if (!results_ok)
      return -1;
    std::cout << "Success!" << std::endl;
  }
  return 0;
}
// CUDA version of the benchmark, converted from SYCL with the help of ChatGPT :)
#include "nvToolsExt.h"
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
// Number of int elements in the device and host buffers; also the bound
// checked inside Kernel.
constexpr int size = 8;
// Adds the compile-time constant N to ptr[i] for every global thread index
// i < size. Threads whose index falls at or beyond `size` do nothing.
//
// Indexing is 1-D: it uses only the .x components of the block and thread
// indices.
template <int N> __global__ void Kernel(int *ptr) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard: the launch may contain more threads than there are elements.
  if (idx >= size)
    return;
  ptr[idx] += N;
}
// Launches Kernel<N> on `stream` over the ints at `ptr`.
//
// Uses 256 threads per block and just enough blocks to cover `size` elements,
// with no dynamic shared memory. The launch is not checked for errors and the
// function does not synchronize.
template <int N> void run_kernel(cudaStream_t stream, int *ptr) {
  constexpr int threads_per_block = 256;
  // Ceiling division so a partial final block is still launched.
  constexpr int num_blocks =
      (size + threads_per_block - 1) / threads_per_block;
  Kernel<N><<<num_blocks, threads_per_block, 0, stream>>>(ptr);
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess and false otherwise.
static bool cuda_ok(cudaError_t err, const char *what) {
  if (err == cudaSuccess)
    return true;
  std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
  return false;
}

// Runs the small-kernel submission benchmark on device `dev_id`.
//
// Selects the device, prints its name, creates a stream, allocates a device
// buffer and a pinned host buffer of `size` ints, zeroes the device buffer on
// the stream, then submits run_kernel<1> and run_kernel<2> back to back for a
// fixed number of iterations. The last iteration also enqueues a
// device-to-host copy. After synchronizing the stream, every host element must
// equal iterations * (1 + 2).
//
// Returns true on success and false if any CUDA call fails or the results do
// not match. Every resource that was created is released before returning,
// and each NVTX range that is pushed is popped.
static bool run_on_device(int dev_id) {
  constexpr int iterations = 5;
  // Each iteration adds 1 and then 2 to every element.
  constexpr int expected = iterations * (1 + 2);
  constexpr auto bytes = size * sizeof(int);

  // Without this, all work would land on the current device regardless of
  // which device's name is printed.
  if (!cuda_ok(cudaSetDevice(dev_id), "cudaSetDevice"))
    return false;
  cudaDeviceProp dev_prop;
  if (!cuda_ok(cudaGetDeviceProperties(&dev_prop, dev_id),
               "cudaGetDeviceProperties"))
    return false;
  std::cout << dev_prop.name << std::endl;

  cudaStream_t stream = nullptr;
  int *buffer = nullptr;
  int *hostBuffer = nullptr;

  // `ok` short-circuits the remaining steps after the first failure while
  // still falling through to the single cleanup section at the bottom.
  bool ok = cuda_ok(cudaStreamCreate(&stream), "cudaStreamCreate");

  nvtxRangePush("Allocate and initialize");
  ok = ok && cuda_ok(cudaMalloc(&buffer, bytes), "cudaMalloc");
  ok = ok && cuda_ok(cudaMallocHost(&hostBuffer, bytes), "cudaMallocHost");
  // Issue the memset on `stream` so the synchronize below actually covers it.
  ok = ok &&
       cuda_ok(cudaMemsetAsync(buffer, 0, bytes, stream), "cudaMemsetAsync");
  ok = ok && cuda_ok(cudaStreamSynchronize(stream), "cudaStreamSynchronize");
  nvtxRangePop();

  for (int iter = 0; ok && iter < iterations; iter++) {
    nvtxRangePush("Submit GPU work");
    run_kernel<1>(stream, buffer);
    run_kernel<2>(stream, buffer);
    if (iter == iterations - 1) {
      ok = cuda_ok(cudaMemcpyAsync(hostBuffer, buffer, bytes,
                                   cudaMemcpyDeviceToHost, stream),
                   "cudaMemcpyAsync");
    }
    nvtxRangePop();
  }
  // Kernel launches do not return an error code. Query it once here, outside
  // the profiled submission ranges, so the check does not perturb them.
  ok = ok && cuda_ok(cudaGetLastError(), "kernel launch");

  nvtxRangePush("Get GPU results");
  ok = ok && cuda_ok(cudaStreamSynchronize(stream), "cudaStreamSynchronize");
  if (ok) {
    for (int i = 0; i < size; i++) {
      if (hostBuffer[i] != expected) {
        ok = false;
        break;
      }
    }
  }
  nvtxRangePop();

  // Single cleanup path; only release what was actually created.
  if (buffer != nullptr)
    cudaFree(buffer);
  if (hostBuffer != nullptr)
    cudaFreeHost(hostBuffer);
  if (stream != nullptr)
    cudaStreamDestroy(stream);
  return ok;
}

// Runs the benchmark on every CUDA device in turn.
//
// Prints "Success!" after each device whose results verify. Returns 0 when all
// devices pass and -1 on the first CUDA error or result mismatch.
int main() {
  int num_devices = 0;
  if (!cuda_ok(cudaGetDeviceCount(&num_devices), "cudaGetDeviceCount"))
    return -1;
  for (int dev_id = 0; dev_id < num_devices; ++dev_id) {
    if (!run_on_device(dev_id))
      return -1;
    std::cout << "Success!" << std::endl;
  }
  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment