Last active
April 18, 2024 16:30
-
-
Save al42and/90a0e410f7d140b851eb4d2b9c14d9cd to your computer and use it in GitHub Desktop.
Simple standalone test to see how different versions of a small scan kernel behave
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <sycl/sycl.hpp> | |
#if BUILD_ONEDPL | |
#define ONEDPL_USE_DPCPP_BACKEND 1 | |
#include <oneapi/dpl/algorithm> | |
#include <oneapi/dpl/async> | |
#include <oneapi/dpl/execution> | |
#endif | |
template <int workGroupSize, int nElements_> struct ExclusivePrefixSumGlobal { | |
constexpr static int nElements = nElements_; | |
static auto kernel(sycl::handler &cgh, const int *__restrict__ gm_input, | |
int *__restrict__ gm_output) { | |
static_assert(nElements % workGroupSize == 0, | |
"This simple scan kernel does not handle padding"); | |
return [=](sycl::nd_item<1> itemIdx) { | |
const int tid = itemIdx.get_local_id(0); | |
const sycl::group<1> workGroup = itemIdx.get_group(); | |
sycl::joint_exclusive_scan(workGroup, gm_input, gm_input + nElements, | |
gm_output, 0, sycl::plus<int>{}); | |
}; | |
} | |
static void submit(sycl::queue q, const int *dataIn, int *dataOut) { | |
q.submit([&](sycl::handler &cgh) { | |
sycl::nd_range<1> range{workGroupSize, workGroupSize}; | |
cgh.parallel_for(range, kernel(cgh, dataIn, dataOut)); | |
}); | |
} | |
}; | |
template <int workGroupSize, int nElements_> struct ExclusivePrefixSumLocal { | |
constexpr static int nElements = nElements_; | |
static auto kernel(sycl::handler &cgh, const int *__restrict__ gm_input, | |
int *__restrict__ gm_output) { | |
static_assert(nElements % workGroupSize == 0, | |
"This simple scan kernel does not handle padding"); | |
sycl::local_accessor<int, 1> sm_localBuf(nElements, cgh); | |
return [=](sycl::nd_item<1> itemIdx) { | |
const int tid = itemIdx.get_local_id(0); | |
const sycl::group<1> workGroup = itemIdx.get_group(); | |
int *sm_localBufPtr = sm_localBuf.get_pointer(); | |
for (int elem = tid; elem < nElements; elem += workGroupSize) { | |
sm_localBufPtr[elem] = gm_input[elem]; | |
} | |
sycl::joint_exclusive_scan(workGroup, sm_localBufPtr, | |
sm_localBufPtr + nElements, sm_localBufPtr, 0, | |
sycl::plus<int>{}); | |
sycl::group_barrier(workGroup); | |
for (int elem = tid; elem < nElements; elem += workGroupSize) { | |
gm_output[elem] = sm_localBufPtr[elem]; | |
} | |
}; | |
} | |
static void submit(sycl::queue q, const int *dataIn, int *dataOut) { | |
q.submit([&](sycl::handler &cgh) { | |
sycl::nd_range<1> range{workGroupSize, workGroupSize}; | |
cgh.parallel_for(range, kernel(cgh, dataIn, dataOut)); | |
}); | |
} | |
}; | |
template <int nElements_> struct ExclusivePrefixSumOneDPL { | |
constexpr static int nElements = nElements_; | |
static void submit(sycl::queue q, const int *dataIn, int *dataOut) { | |
#if BUILD_ONEDPL | |
static const auto policy = oneapi::dpl::execution::make_device_policy(q); | |
oneapi::dpl::experimental::exclusive_scan_async( | |
policy, dataIn, dataIn + nElements, dataOut, 0, sycl::plus<int>{}); | |
#endif | |
} | |
}; | |
template <typename F> | |
void benchmark(sycl::device device, F &&kernel, const char *name) { | |
sycl::queue q(device, {sycl::property::queue::in_order()}); | |
std::cout << "Running on " << device.get_info<sycl::info::device::name>() | |
<< ", version " << name << "\n"; | |
constexpr int size = F::nElements; | |
int *dataIn = sycl::malloc_device<int>(size, q); | |
int *dataOut = sycl::malloc_device<int>(size, q); | |
std::vector<int> dataH(size); | |
// Warm-up and sanity check | |
q.fill<int>(dataIn, 1, size); | |
F::submit(q, dataIn, dataOut); | |
q.copy<int>(dataOut, dataH.data(), size).wait(); | |
int numFail = 0; | |
for (size_t i = 0; i < size; i++) { | |
int reference = i; | |
numFail += (dataH[i] != reference); | |
} | |
if (numFail > 0) | |
return; | |
for (int iter = 0; iter < 1000; iter++) { | |
F::submit(q, dataIn, dataOut); | |
q.wait(); | |
} | |
sycl::free(dataIn, q); | |
sycl::free(dataOut, q); | |
} | |
int main() { | |
for (const auto &device : sycl::device::get_devices()) { | |
// Creating SYCL queue | |
sycl::queue q(device, {sycl::property::queue::in_order()}); | |
constexpr size_t size = 8192; | |
benchmark(device, ExclusivePrefixSumGlobal<256, size>{}, "global (256)"); | |
benchmark(device, ExclusivePrefixSumGlobal<512, size>{}, "global (512)"); | |
benchmark(device, ExclusivePrefixSumGlobal<1024, size>{}, "global (1024)"); | |
benchmark(device, ExclusivePrefixSumLocal<256, size>{}, "local (256)"); | |
benchmark(device, ExclusivePrefixSumLocal<512, size>{}, "local (512)"); | |
benchmark(device, ExclusivePrefixSumLocal<1024, size>{}, "local (1024)"); | |
benchmark(device, ExclusivePrefixSumOneDPL<size>{}, "oneDPL"); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment