Skip to content

Instantly share code, notes, and snippets.

00007FF7837A1240 vmovups xmm0,xmmword ptr [rdx]
00007FF7837A1244 vinsertf128 ymm9,ymm0,xmmword ptr [rdx+10h],1
00007FF7837A124B vmovups xmm1,xmmword ptr [rdx+20h]
00007FF7837A1250 vinsertf128 ymm10,ymm1,xmmword ptr [rdx+30h],1
00007FF7837A1257 vmovups xmm0,xmmword ptr [rdx+40h]
00007FF7837A125C vinsertf128 ymm3,ymm0,xmmword ptr [rdx+50h],1
00007FF7837A1263 vmovups xmm1,xmmword ptr [rdx+60h]
00007FF7837A1268 vinsertf128 ymm5,ymm1,xmmword ptr [rdx+70h],1
00007FF7837A126F vperm2f128 ymm2,ymm3,ymm3,0
00007FF7837A1275 vperm2f128 ymm7,ymm3,ymm3,11h
// Compiled with VS 2022: Release AMD64, AVX2 ISA, LTCG
// RDTSC time on Ryzen 7 8700G for 1024 matrices: 15834 Eigen, 7224 DirectXMath
constexpr bool useEigen = true;
// Eigen 3.4.0
#include <Eigen/Eigen>
__forceinline void multiplyWithEigen( float* rdi, const float* rsi )
{
using Mat = Eigen::Matrix<float, 4, 4, Eigen::RowMajor>;
static_assert( sizeof( Mat ) == 4 * 4 * sizeof( float ) );
static const char* const sourceDataPath = R"(C:\Temp\2remove\vectors.csv)";
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <chrono>
#include <immintrin.h>
#include <assert.h>
using namespace std;
using namespace std::chrono;
static const char* const sourceDataPath = R"(C:\Temp\2remove\vectors.csv)";
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <chrono>
#include <immintrin.h>
using namespace std;
using namespace std::chrono;
constexpr int SIZE = 640000;
#include <stdint.h>
#include <emmintrin.h> // SSE 2
#include <tmmintrin.h> // SSSE 3
#include <smmintrin.h> // SSE 4.1
// Vector constants for dot4Sse function
struct ConstantVectorsSse
{
__m128i abcd;
__m128i lowNibbleMask;
#include <stdio.h>
#include <vector>
#include <set>
// Logging switch: message() prints its argument only when this flag is true
static bool s_log = false;
void message( const char* what )
{
if( s_log )
{
printf( "%s\n", what );
std::vector<std::string> someFunction( const Invocation& invocation )
{
// Define hash and comparison for string pointers, by value
struct StringPtrTraits
{
size_t operator()( const std::string* rsi ) const
{
return std::hash<std::string>()( *rsi );
}
bool operator()( const std::string* a, const std::string* b ) const
#include <immintrin.h>
#include <stdint.h>
// 1 = use `vpgatherdq` to load 4 numbers with 1 instruction, 0 = load them with scalar loads
// It seems on AMD CPUs scalar loads are slightly faster
#define USE_GATHER_INSTUCTIONS 0
// Inclusive prefix sum of unsigned bytes = offsets of the end of the numbers
// When the sum of all bytes exceeds 0xFF, the output is garbage
// Which is fine here because our bytes are in [0..8] interval
#include <stdint.h>
#include <immintrin.h>
#include <intrin.h>
#include <stdio.h>
// Count of set bits in `plus` minus count of set bits in `minus`
// The result is in the [ -32 .. +32 ] interval
inline int popCntDiff( uint32_t plus, uint32_t minus )
{
plus = __popcnt( plus );
// Transform 4 inputs with 4 lookup tables, making 4 outputs
// The 4 inputs are packed into a single uint32_t value; each byte is expected to be in the [ 0 .. 15 ] interval
// The 4 tables are in a single AVX2 vector
uint32_t applyLookup4( uint32_t i4, __m256i tables4 )
{
// Move 4 bytes into SSE vector
__m128i bytes = _mm_cvtsi32_si128( (int)i4 );
// Expand bytes into uint64_t lanes
__m256i v = _mm256_cvtepu8_epi64( bytes );
// Multiply them by 4 to get shift amounts in bits