Last active
March 1, 2016 05:49
-
-
Save scraimer/0e8965a38cb235e0565a to your computer and use it in GitHub Desktop.
A StackOverflow question was complaining about copying an array of uint8 to an array of uint16 as a "bottleneck". That sounded like a latency problem, and while I was looking into it, I finally got to write down the magic anti-optimizer functions that Chandler Carruth mentioned in his CppCon talk, for preventing code/data from being optimized away.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <chrono>
#include <cstdint>
#include <iostream>
/* The following two functions are from a talk by Chandler Carruth:
 *
 * CppCon 2015: Chandler Carruth "Tuning C++: Benchmarks, and CPUs, and Compilers! Oh My!"
 *
 * [https://www.youtube.com/watch?v=nXaxk27zwlk]
 */
/// Magical escape function: prevents the optimizer from assuming that the
/// object behind `p` can be optimized away.
///
/// The asm template is empty, but it takes `p` as an input operand and
/// declares a "memory" clobber, so the compiler must assume the statement may
/// read or write memory, including the storage `p` points to.
///
/// @param p Address of the data that must be treated as observable.
static void escape(void *p)
{
    asm volatile("" : : "g"(p) : "memory");
}
/// Magical function to convince the compiler that ALL memory may have been
/// read or written at this point.
///
/// The asm template is empty and has no operands; only the "memory" clobber
/// matters. It stops the compiler from keeping memory values cached in
/// registers across the statement or from dropping stores that precede it.
static void clobber()
{
    asm volatile("" : : : "memory");
}
/* By combining escape() and clobber() we convince the compiler not to optimize
 * away references to data that we never use. Because escape() says "hey, I might
 * be using this somewhere", and clobber() says "hey, I just touched ALL the
 * memory". So the compiler can't assume we didn't touch the bit of memory we
 * marked as escaped, and thus can't get rid of it. */
struct original | |
{ | |
int8_t a[128]; | |
int16_t b[128]; | |
void go() | |
{ | |
escape(a); | |
escape(b); | |
for (int i=0;i<128;i++) | |
{ | |
b[i] = a[i]; | |
} | |
clobber(); | |
} | |
}; | |
struct aligned64 | |
{ | |
struct alignas(64) { int8_t v; } a[128]; | |
struct alignas(64) { int16_t v; } b[128]; | |
void go() | |
{ | |
escape(a); | |
escape(b); | |
for (int i=0;i<128;i++) | |
{ | |
b[i].v = a[i].v; | |
} | |
clobber(); | |
} | |
}; | |
struct unrolled_8s | |
{ | |
int8_t a[128]; | |
int16_t b[128]; | |
void go() | |
{ | |
escape(a); | |
escape(b); | |
for (int i=0;i<128;i+=8) | |
{ | |
b[i] = a[i]; | |
b[i+1] = a[i+1]; | |
b[i+2] = a[i+2]; | |
b[i+3] = a[i+3]; | |
b[i+4] = a[i+4]; | |
b[i+5] = a[i+5]; | |
b[i+6] = a[i+6]; | |
b[i+7] = a[i+7]; | |
} | |
clobber(); | |
} | |
}; | |
struct unrolled_64s | |
{ | |
int8_t a[128]; | |
int16_t b[128]; | |
void go() | |
{ | |
escape(a); | |
escape(b); | |
for (int i=0;i<128;i+=64) | |
{ | |
b[i] = a[i]; | |
b[i+1] = a[i+1]; | |
b[i+2] = a[i+2]; | |
b[i+3] = a[i+3]; | |
b[i+4] = a[i+4]; | |
b[i+5] = a[i+5]; | |
b[i+6] = a[i+6]; | |
b[i+7] = a[i+7]; | |
b[i+8] = a[i+8]; | |
b[i+9] = a[i+9]; | |
b[i+10] = a[i+10]; | |
b[i+11] = a[i+11]; | |
b[i+12] = a[i+12]; | |
b[i+13] = a[i+13]; | |
b[i+14] = a[i+14]; | |
b[i+15] = a[i+15]; | |
b[i+16] = a[i+16]; | |
b[i+17] = a[i+17]; | |
b[i+18] = a[i+18]; | |
b[i+19] = a[i+19]; | |
b[i+20] = a[i+20]; | |
b[i+21] = a[i+21]; | |
b[i+22] = a[i+22]; | |
b[i+23] = a[i+23]; | |
b[i+24] = a[i+24]; | |
b[i+25] = a[i+25]; | |
b[i+26] = a[i+26]; | |
b[i+27] = a[i+27]; | |
b[i+28] = a[i+28]; | |
b[i+29] = a[i+29]; | |
b[i+30] = a[i+30]; | |
b[i+31] = a[i+31]; | |
b[i+32] = a[i+32]; | |
b[i+33] = a[i+33]; | |
b[i+34] = a[i+34]; | |
b[i+35] = a[i+35]; | |
b[i+36] = a[i+36]; | |
b[i+37] = a[i+37]; | |
b[i+38] = a[i+38]; | |
b[i+39] = a[i+39]; | |
b[i+40] = a[i+40]; | |
b[i+41] = a[i+41]; | |
b[i+42] = a[i+42]; | |
b[i+43] = a[i+43]; | |
b[i+44] = a[i+44]; | |
b[i+45] = a[i+45]; | |
b[i+46] = a[i+46]; | |
b[i+47] = a[i+47]; | |
b[i+48] = a[i+48]; | |
b[i+49] = a[i+49]; | |
b[i+50] = a[i+50]; | |
b[i+51] = a[i+51]; | |
b[i+52] = a[i+52]; | |
b[i+53] = a[i+53]; | |
b[i+54] = a[i+54]; | |
b[i+55] = a[i+55]; | |
b[i+56] = a[i+56]; | |
b[i+57] = a[i+57]; | |
b[i+58] = a[i+58]; | |
b[i+59] = a[i+59]; | |
b[i+60] = a[i+60]; | |
b[i+61] = a[i+61]; | |
b[i+62] = a[i+62]; | |
b[i+63] = a[i+63]; | |
} | |
clobber(); | |
} | |
}; | |
int main(int const argc, char const * const * const argv) | |
{ | |
uint64_t const iter_num = 10000000; | |
original orig; | |
auto start = std::chrono::steady_clock::now(); | |
for (uint64_t i=0; i<iter_num; ++i) | |
{ | |
orig.go(); | |
} | |
auto end = std::chrono::steady_clock::now(); | |
auto delta = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count(); | |
std::cout << "original: " << (delta / (double)iter_num) << "usec" << std::endl; | |
///////////////////////////////////////////////////// | |
aligned64 al64; | |
start = std::chrono::steady_clock::now(); | |
for (uint64_t i=0; i<iter_num; ++i) | |
{ | |
al64.go(); | |
} | |
end = std::chrono::steady_clock::now(); | |
delta = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count(); | |
std::cout << "aligned64: " << (delta / (double)iter_num) << "usec" << std::endl; | |
///////////////////////////////////////////////////// | |
unrolled_8s u8; | |
start = std::chrono::steady_clock::now(); | |
for (uint64_t i=0; i<iter_num; ++i) | |
{ | |
u8.go(); | |
} | |
end = std::chrono::steady_clock::now(); | |
delta = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count(); | |
std::cout << "unrolled_8s: " << (delta / (double)iter_num) << "usec" << std::endl; | |
///////////////////////////////////////////////////// | |
unrolled_64s u64; | |
start = std::chrono::steady_clock::now(); | |
for (uint64_t i=0; i<iter_num; ++i) | |
{ | |
u64.go(); | |
} | |
end = std::chrono::steady_clock::now(); | |
delta = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count(); | |
std::cout << "unrolled_64s: " << (delta / (double)iter_num) << "usec" << std::endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Output on my 3.1 GHz CPU: