This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
00007FF7837A1240 vmovups xmm0,xmmword ptr [rdx] | |
00007FF7837A1244 vinsertf128 ymm9,ymm0,xmmword ptr [rdx+10h],1 | |
00007FF7837A124B vmovups xmm1,xmmword ptr [rdx+20h] | |
00007FF7837A1250 vinsertf128 ymm10,ymm1,xmmword ptr [rdx+30h],1 | |
00007FF7837A1257 vmovups xmm0,xmmword ptr [rdx+40h] | |
00007FF7837A125C vinsertf128 ymm3,ymm0,xmmword ptr [rdx+50h],1 | |
00007FF7837A1263 vmovups xmm1,xmmword ptr [rdx+60h] | |
00007FF7837A1268 vinsertf128 ymm5,ymm1,xmmword ptr [rdx+70h],1 | |
00007FF7837A126F vperm2f128 ymm2,ymm3,ymm3,0 | |
00007FF7837A1275 vperm2f128 ymm7,ymm3,ymm3,11h |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Compiled with VS 2022: Release AMD64, AVX2 ISA, LTCG | |
// RDTSC time on Ryzen 7 8700G for 1024 matrices: 15834 Eigen, 7224 DirectXMath | |
constexpr bool useEigen = true; | |
// Eigen 3.4.0 | |
#include <Eigen/Eigen> | |
__forceinline void multiplyWithEigen( float* rdi, const float* rsi ) | |
{ | |
using Mat = Eigen::Matrix<float, 4, 4, Eigen::RowMajor>; | |
static_assert( sizeof( Mat ) == 4 * 4 * sizeof( float ) ); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
static const char* const sourceDataPath = R"(C:\Temp\2remove\vectors.csv)"; | |
#define _CRT_SECURE_NO_WARNINGS | |
#include <iostream> | |
#include <chrono> | |
#include <immintrin.h> | |
#include <assert.h> | |
using namespace std; | |
using namespace std::chrono; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
static const char* const sourceDataPath = R"(C:\Temp\2remove\vectors.csv)"; | |
#define _CRT_SECURE_NO_WARNINGS | |
#include <iostream> | |
#include <chrono> | |
#include <immintrin.h> | |
using namespace std; | |
using namespace std::chrono; | |
constexpr int SIZE = 640000; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h> | |
#include <emmintrin.h> // SSE 2 | |
#include <tmmintrin.h> // SSSE 3 | |
#include <smmintrin.h> // SSE 4.1 | |
// Vector constants for dot4Sse function | |
struct ConstantVectorsSse | |
{ | |
__m128i abcd; | |
__m128i lowNibbleMask; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <vector> | |
#include <set> | |
static bool s_log = false; | |
void message( const char* what ) | |
{ | |
if( s_log ) | |
{ | |
printf( "%s\n", what ); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
std::vector<std::string> someFunction( const Invocation& invocation ) | |
{ | |
// Define hash and comparison for string pointers, by value | |
struct StringPtrTraits | |
{ | |
size_t operator()( const std::string* rsi ) const | |
{ | |
return std::hash<std::string>()( *rsi ); | |
} | |
bool operator()( const std::string* a, const std::string* b ) const |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <immintrin.h> | |
#include <stdint.h> | |
// 1 = use `vpgatherdq` to load 4 numbers with 1 instruction, 0 = load them with scalar loads | |
// It seems on AMD CPUs scalar loads are slightly faster | |
#define USE_GATHER_INSTUCTIONS 0 | |
// Inclusive prefix sum of unsigned bytes, i.e. the offsets of the ends of the numbers | |
// When the sum of all bytes exceeds 0xFF, the output is garbage; | |
// that is fine here because our bytes are in the [0..8] interval |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h> | |
#include <immintrin.h> | |
#include <intrin.h> | |
#include <stdio.h> | |
// Count of set bits in `plus` minus count of set bits in `minus` | |
// The result is in the [ -32 .. +32 ] interval | |
inline int popCntDiff( uint32_t plus, uint32_t minus ) | |
{ | |
plus = __popcnt( plus ); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Transform 4 inputs with 4 lookup tables, making 4 outputs | |
// The 4 inputs are packed into a uint32_t value; each byte is expected to be in the [ 0 .. 15 ] interval | |
// The 4 tables are in a single AVX2 vector | |
uint32_t applyLookup4( uint32_t i4, __m256i tables4 ) | |
{ | |
// Move 4 bytes into SSE vector | |
__m128i bytes = _mm_cvtsi32_si128( (int)i4 ); | |
// Expand bytes into uint64_t lanes | |
__m256i v = _mm256_cvtepu8_epi64( bytes ); | |
// Multiply them by 4 to get shift amounts in bits |
NewerOlder