-
-
Save AndrewJDR/37a97cf7b4f50db189c89850d51805dc to your computer and use it in GitHub Desktop.
Meow v0.5 in C and ARMv8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#define MEOW_HASH_VERSION 5 | |
#define MEOW_HASH_VERSION_NAME "0.5/calico" | |
// Meow hash v0.5 with ARMv8 Crypto Extension instructions | |
// Ported from https://github.com/cmuratori/meow_hash | |
// Performance on Pine A64 (Cortex-A53, 1.2GHz) | |
// (compiled with clang v10.0 with -O3 -mcpu=cortex-a53) | |
// C code = ~0.34 bytes/cycle | |
// this code = ~1.75 bytes/cycle | |
// Performance on Jetson Nano (Cortex-A57, 1.43GHz) | |
// (compiled with clang v10.0 with -O3 -mcpu=cortex-a57) | |
// C code = ~0.5 bytes/cycle | |
// this code = ~4.0 bytes/cycle (for ~1MB or less data) | |
// ~2.5 bytes/cycle (for >1MB) | |
#include <stddef.h> | |
#include <stdint.h> | |
#include <arm_neon.h> | |
#if !defined MEOW_PAGESIZE | |
#define MEOW_PAGESIZE 4096 | |
#endif | |
#if !defined(meow_u8) | |
#define meow_u8 uint8_t | |
#define meow_u32 uint32_t | |
#define meow_u64 uint64_t | |
#define meow_umm size_t | |
#define meow_u128 uint8x16_t | |
#endif | |
#define MeowU32From(A, I) vgetq_lane_u32(vreinterpretq_u32_u8(A), (I)) | |
#define MeowU64From(A, I) vgetq_lane_u64(vreinterpretq_u64_u8(A), (I)) | |
#define MeowHashesAreEqual(A, B) (MeowU64From(A, 0) == MeowU64From(B, 0) && MeowU64From(A, 1) == MeowU64From(B, 1)) | |
#define movdqu(A, B) A = vld1q_u8(B) | |
#define movdqu_mem(A, B) vst1q_u8((A), (B)) | |
#define movq(A, B) A = vreinterpretq_u8_u64((uint64x2_t){ (B), 0 }) | |
#define paddq(A, B) A = vreinterpretq_u8_u64(vaddq_u64(vreinterpretq_u64_u8(A), vreinterpretq_u64_u8(B))) | |
#define pxor_clear(A) A = (uint8x16_t){} | |
#define pxor(A, B) A = veorq_u8(A, B) | |
#define pand(A, B) A = vandq_u8(A, B) | |
#define palignr1(A, B) A = \ | |
vorrq_u8( \ | |
vqtbl1q_u8(A, (uint8x16_t){0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0}), \ | |
vqtbl1q_u8(B, (uint8x16_t){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0xff}) \ | |
) | |
#define palignr15(A, B) A = \ | |
vorrq_u8( \ | |
vqtbl1q_u8(A, (uint8x16_t){0xff,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14}), \ | |
vqtbl1q_u8(B, (uint8x16_t){15,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff}) \ | |
) | |
#define pshufb(A, B) A = vqtbl1q_u8(A, B) | |
#define aesdec(A, B) A = veorq_u8(vaesimcq_u8(vaesdq_u8((A), (uint8x16_t){})), (B)) | |
#define aesdec_xor(A, X, B) A = veorq_u8(vaesimcq_u8(vaesdq_u8((A), (X))), (B)) | |
#define MEOW_MIX_REG(r1, r2, r3, r4, r5, i1, i2, i3, i4) do { \ | |
aesdec(r1, r2); \ | |
paddq(r3, i1); \ | |
aesdec_xor(r2, i2, r4); \ | |
paddq(r5, i3); \ | |
pxor(r4, i4); \ | |
} while (0) | |
#define MEOW_MIX(r1, r2, r3, r4, r5, ptr) do { \ | |
meow_u128 i1 = vld1q_u8(ptr + 15); \ | |
meow_u128 i2 = vld1q_u8(ptr + 0); \ | |
meow_u128 i3 = vld1q_u8(ptr + 1); \ | |
meow_u128 i4 = vld1q_u8(ptr + 16); \ | |
MEOW_MIX_REG(r1, r2, r3, r4, r5, i1, i2, i3, i4); \ | |
} while (0) | |
#define MEOW_SHUFFLE(r1, r2, r3, r4, r5, r6) do { \ | |
aesdec(r1, r4); \ | |
paddq(r2, r5); \ | |
aesdec_xor(r4, r6, r2); \ | |
paddq(r5, r6); \ | |
pxor(r2, r3); \ | |
} while (0) | |
static meow_u8 MeowShiftAdjust[32] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; | |
static meow_u8 MeowMaskLen[32] = {255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; | |
// NOTE(casey): The default seed is now a "nothing-up-our-sleeves" number for good measure. You may verify that it is just an encoding of Pi. | |
static meow_u8 MeowDefaultSeed[128] = | |
{ | |
0x32, 0x43, 0xF6, 0xA8, 0x88, 0x5A, 0x30, 0x8D, | |
0x31, 0x31, 0x98, 0xA2, 0xE0, 0x37, 0x07, 0x34, | |
0x4A, 0x40, 0x93, 0x82, 0x22, 0x99, 0xF3, 0x1D, | |
0x00, 0x82, 0xEF, 0xA9, 0x8E, 0xC4, 0xE6, 0xC8, | |
0x94, 0x52, 0x82, 0x1E, 0x63, 0x8D, 0x01, 0x37, | |
0x7B, 0xE5, 0x46, 0x6C, 0xF3, 0x4E, 0x90, 0xC6, | |
0xCC, 0x0A, 0xC2, 0x9B, 0x7C, 0x97, 0xC5, 0x0D, | |
0xD3, 0xF8, 0x4D, 0x5B, 0x5B, 0x54, 0x70, 0x91, | |
0x79, 0x21, 0x6D, 0x5D, 0x98, 0x97, 0x9F, 0xB1, | |
0xBD, 0x13, 0x10, 0xBA, 0x69, 0x8D, 0xFB, 0x5A, | |
0xC2, 0xFF, 0xD7, 0x2D, 0xBD, 0x01, 0xAD, 0xFB, | |
0x7B, 0x8E, 0x1A, 0xFE, 0xD6, 0xA2, 0x67, 0xE9, | |
0x6B, 0xA7, 0xC9, 0x04, 0x5F, 0x12, 0xC7, 0xF9, | |
0x92, 0x4A, 0x19, 0x94, 0x7B, 0x39, 0x16, 0xCF, | |
0x70, 0x80, 0x1F, 0x2E, 0x28, 0x58, 0xEF, 0xC1, | |
0x66, 0x36, 0x92, 0x0D, 0x87, 0x15, 0x74, 0xE6, | |
}; | |
// | |
// NOTE(casey): Single block version | |
// | |
static meow_u128 | |
MeowHash(void* Seed128Init, meow_umm Len, void* SourceInit) | |
{ | |
meow_u128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; // NOTE(casey): xmm0-xmm7 are the hash accumulation lanes | |
meow_u128 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; // NOTE(casey): xmm8-xmm15 hold values to be appended (residual, length) | |
meow_u8* rax = (meow_u8*)SourceInit; | |
meow_u8* rcx = (meow_u8*)Seed128Init; | |
// | |
// NOTE(casey): Seed the eight hash registers | |
// | |
movdqu(xmm0, rcx + 0x00); | |
movdqu(xmm1, rcx + 0x10); | |
movdqu(xmm2, rcx + 0x20); | |
movdqu(xmm3, rcx + 0x30); | |
movdqu(xmm4, rcx + 0x40); | |
movdqu(xmm5, rcx + 0x50); | |
movdqu(xmm6, rcx + 0x60); | |
movdqu(xmm7, rcx + 0x70); | |
// | |
// NOTE(casey): Hash all full 256-byte blocks | |
// | |
meow_umm BlockCount = (Len >> 8); | |
while (BlockCount--) | |
{ | |
MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00); | |
MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20); | |
MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40); | |
MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60); | |
MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80); | |
MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0); | |
MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0); | |
MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0); | |
rax += 0x100; | |
} | |
// | |
// NOTE(casey): Load any less-than-32-byte residual | |
// | |
pxor_clear(xmm9); | |
pxor_clear(xmm11); | |
// | |
// TODO(casey): I need to put more thought into how the end-of-buffer stuff is actually working out here, | |
// because I _think_ it may be possible to remove the first branch (on Len8) and let the mask zero out the | |
// result, but it would take a little thought to make sure it couldn't read off the end of the buffer due | |
// to the & 0xf on the align computation. | |
// | |
// NOTE(casey): First, we have to load the part that is _not_ 16-byte aligned | |
meow_u8 *Last = (meow_u8*)SourceInit + (Len & ~0xf); | |
meow_u32 Len8 = (Len & 0xf); | |
if (Len8) | |
{ | |
// NOTE(casey): Load the mask early | |
movdqu(xmm8, &MeowMaskLen[0x10 - Len8]); | |
meow_u8 *LastOk = (meow_u8*)((((meow_umm)(((meow_u8 *)SourceInit)+Len - 1)) | (MEOW_PAGESIZE - 1)) - 16); | |
int Align = (Last > LastOk) ? ((int)(meow_umm)Last) & 0xf : 0; | |
movdqu(xmm10, &MeowShiftAdjust[Align]); | |
movdqu(xmm9, Last - Align); | |
pshufb(xmm9, xmm10); | |
// NOTE(jeffr): and off the extra bytes | |
pand(xmm9, xmm8); | |
} | |
// NOTE(casey): Next, we have to load the part that _is_ 16-byte aligned | |
if (Len & 0x10) | |
{ | |
xmm11 = xmm9; | |
movdqu(xmm9, Last - 0x10); | |
} | |
// | |
// NOTE(casey): Construct the residual and length injests | |
// | |
xmm8 = xmm9; | |
xmm10 = xmm9; | |
palignr15(xmm8, xmm11); | |
palignr1(xmm10, xmm11); | |
// NOTE(casey): We have room for a 128-bit nonce and a 64-bit none here, but | |
// the decision was made to leave them zero'd so as not to confuse people | |
// about hwo to use them or what security implications they had. | |
pxor_clear(xmm12); | |
pxor_clear(xmm13); | |
pxor_clear(xmm14); | |
movq(xmm15, Len); | |
palignr15(xmm12, xmm15); | |
palignr1(xmm14, xmm15); | |
// NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty | |
MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11); | |
// NOTE(casey): Append the length, to avoid problems with our 32-byte padding | |
MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3, xmm12, xmm13, xmm14, xmm15); | |
// | |
// NOTE(casey): Hash all full 32-byte blocks | |
// | |
meow_u32 LaneCount = (Len >> 5) & 0x7; | |
if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x00); --LaneCount; | |
if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x20); --LaneCount; | |
if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x40); --LaneCount; | |
if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0x60); --LaneCount; | |
if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0x80); --LaneCount; | |
if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xa0); --LaneCount; | |
if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0xc0); --LaneCount; | |
// | |
// NOTE(casey): Mix the eight lanes down to one 128-bit hash | |
// | |
MixDown: | |
MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6); | |
MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); | |
MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0); | |
MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1); | |
MEOW_SHUFFLE(xmm4, xmm5, xmm6, xmm0, xmm1, xmm2); | |
MEOW_SHUFFLE(xmm5, xmm6, xmm7, xmm1, xmm2, xmm3); | |
MEOW_SHUFFLE(xmm6, xmm7, xmm0, xmm2, xmm3, xmm4); | |
MEOW_SHUFFLE(xmm7, xmm0, xmm1, xmm3, xmm4, xmm5); | |
MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6); | |
MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); | |
MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0); | |
MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1); | |
paddq(xmm0, xmm2); | |
paddq(xmm1, xmm3); | |
paddq(xmm4, xmm6); | |
paddq(xmm5, xmm7); | |
pxor(xmm0, xmm1); | |
pxor(xmm4, xmm5); | |
paddq(xmm0, xmm4); | |
return xmm0; | |
} | |
// | |
// NOTE(casey): Streaming construction | |
// | |
typedef struct | |
{ | |
meow_u128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; | |
meow_u64 TotalLengthInBytes; | |
meow_u32 BufferLen; | |
meow_u8 Buffer[256]; | |
meow_u128 Pad[2]; // NOTE(casey): So we know we can over-read Buffer as necessary | |
} meow_state; | |
static void | |
MeowBegin(meow_state* State, void* Seed128) | |
{ | |
meow_u8* rcx = (meow_u8*)Seed128; | |
movdqu(State->xmm0, rcx + 0x00); | |
movdqu(State->xmm1, rcx + 0x10); | |
movdqu(State->xmm2, rcx + 0x20); | |
movdqu(State->xmm3, rcx + 0x30); | |
movdqu(State->xmm4, rcx + 0x40); | |
movdqu(State->xmm5, rcx + 0x50); | |
movdqu(State->xmm6, rcx + 0x60); | |
movdqu(State->xmm7, rcx + 0x70); | |
State->BufferLen = 0; | |
State->TotalLengthInBytes = 0; | |
} | |
static void | |
MeowAbsorbBlocks(meow_state* State, meow_umm BlockCount, meow_u8* rax) | |
{ | |
meow_u128 xmm0 = State->xmm0; | |
meow_u128 xmm1 = State->xmm1; | |
meow_u128 xmm2 = State->xmm2; | |
meow_u128 xmm3 = State->xmm3; | |
meow_u128 xmm4 = State->xmm4; | |
meow_u128 xmm5 = State->xmm5; | |
meow_u128 xmm6 = State->xmm6; | |
meow_u128 xmm7 = State->xmm7; | |
while (BlockCount--) | |
{ | |
MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00); | |
MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20); | |
MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40); | |
MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60); | |
MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80); | |
MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0); | |
MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0); | |
MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0); | |
rax += 0x100; | |
} | |
State->xmm0 = xmm0; | |
State->xmm1 = xmm1; | |
State->xmm2 = xmm2; | |
State->xmm3 = xmm3; | |
State->xmm4 = xmm4; | |
State->xmm5 = xmm5; | |
State->xmm6 = xmm6; | |
State->xmm7 = xmm7; | |
} | |
static void | |
MeowAbsorb(meow_state* State, meow_umm Len, void* SourceInit) | |
{ | |
State->TotalLengthInBytes += Len; | |
meow_u8* Source = (meow_u8*)SourceInit; | |
// NOTE(casey): Handle any buffered residual | |
if (State->BufferLen) | |
{ | |
meow_u32 Fill = (sizeof(State->Buffer) - State->BufferLen); | |
if (Fill > Len) | |
{ | |
Fill = (meow_u32)Len; | |
} | |
Len -= Fill; | |
while (Fill--) | |
{ | |
State->Buffer[State->BufferLen++] = *Source++; | |
} | |
if (State->BufferLen == sizeof(State->Buffer)) | |
{ | |
MeowAbsorbBlocks(State, 1, State->Buffer); | |
State->BufferLen = 0; | |
} | |
} | |
// NOTE(casey): Handle any full blocks | |
meow_u64 BlockCount = (Len >> 8); | |
meow_u64 Advance = (BlockCount << 8); | |
MeowAbsorbBlocks(State, BlockCount, Source); | |
Len -= Advance; | |
Source += Advance; | |
// NOTE(casey): Store residual | |
while (Len--) | |
{ | |
State->Buffer[State->BufferLen++] = *Source++; | |
} | |
} | |
static meow_u128 | |
MeowEnd(meow_state* State, meow_u8* Store128) | |
{ | |
meow_u64 Len = State->TotalLengthInBytes; | |
meow_u128 xmm0 = State->xmm0; | |
meow_u128 xmm1 = State->xmm1; | |
meow_u128 xmm2 = State->xmm2; | |
meow_u128 xmm3 = State->xmm3; | |
meow_u128 xmm4 = State->xmm4; | |
meow_u128 xmm5 = State->xmm5; | |
meow_u128 xmm6 = State->xmm6; | |
meow_u128 xmm7 = State->xmm7; | |
meow_u128 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; | |
meow_u8* rax = State->Buffer; | |
pxor_clear(xmm9); | |
pxor_clear(xmm11); | |
meow_u8* Last = (uint8_t*)rax + (Len & 0xf0); | |
meow_u32 Len8 = (Len & 0xf); | |
if (Len8) | |
{ | |
movdqu(xmm8, &MeowMaskLen[0x10 - Len8]); | |
movdqu(xmm9, Last); | |
pand(xmm9, xmm8); | |
} | |
if (Len & 0x10) | |
{ | |
xmm11 = xmm9; | |
movdqu(xmm9, Last - 0x10); | |
} | |
xmm8 = xmm9; | |
xmm10 = xmm9; | |
palignr15(xmm8, xmm11); | |
palignr1(xmm10, xmm11); | |
pxor_clear(xmm12); | |
pxor_clear(xmm13); | |
pxor_clear(xmm14); | |
movq(xmm15, Len); | |
palignr15(xmm12, xmm15); | |
palignr1(xmm14, xmm15); | |
// NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty | |
MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11); | |
// NOTE(casey): Append the length, to avoid problems with our 32-byte padding | |
MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3, xmm12, xmm13, xmm14, xmm15); | |
// | |
// NOTE(casey): Hash all full 32-byte blocks | |
// | |
meow_u32 LaneCount = (Len >> 5) & 0x7; | |
if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x00); --LaneCount; | |
if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x20); --LaneCount; | |
if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x40); --LaneCount; | |
if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0x60); --LaneCount; | |
if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0x80); --LaneCount; | |
if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xa0); --LaneCount; | |
if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0xc0); --LaneCount; | |
// | |
// NOTE(casey): Mix the eight lanes down to one 128-bit hash | |
// | |
MixDown: | |
MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6); | |
MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); | |
MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0); | |
MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1); | |
MEOW_SHUFFLE(xmm4, xmm5, xmm6, xmm0, xmm1, xmm2); | |
MEOW_SHUFFLE(xmm5, xmm6, xmm7, xmm1, xmm2, xmm3); | |
MEOW_SHUFFLE(xmm6, xmm7, xmm0, xmm2, xmm3, xmm4); | |
MEOW_SHUFFLE(xmm7, xmm0, xmm1, xmm3, xmm4, xmm5); | |
MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6); | |
MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); | |
MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0); | |
MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1); | |
if (Store128) | |
{ | |
movdqu_mem(Store128 + 0x00, xmm0); | |
movdqu_mem(Store128 + 0x10, xmm1); | |
movdqu_mem(Store128 + 0x20, xmm2); | |
movdqu_mem(Store128 + 0x30, xmm3); | |
movdqu_mem(Store128 + 0x40, xmm4); | |
movdqu_mem(Store128 + 0x50, xmm5); | |
movdqu_mem(Store128 + 0x60, xmm6); | |
movdqu_mem(Store128 + 0x70, xmm7); | |
} | |
paddq(xmm0, xmm2); | |
paddq(xmm1, xmm3); | |
paddq(xmm4, xmm6); | |
paddq(xmm5, xmm7); | |
pxor(xmm0, xmm1); | |
pxor(xmm4, xmm5); | |
paddq(xmm0, xmm4); | |
return(xmm0); | |
} | |
#undef movdqu | |
#undef movdqu_mem | |
#undef movq | |
#undef aesdec | |
#undef pxor | |
#undef paddq | |
#undef pand | |
#undef palignr1 | |
#undef palignr15 | |
#undef pxor_clear | |
#undef MEOW_MIX | |
#undef MEOW_MIX_REG | |
#undef MEOW_SHUFFLE | |
// | |
// NOTE(casey): If you need to create your own seed from non-random data, you can use MeowExpandSeed | |
// to create a seed which you then store for repeated use. It is _expensive_ to generate the seed, | |
// so you do not want to do this every time you hash. You _only_ want to do it when you actually | |
// need to create a new seed. | |
// | |
static void | |
MeowExpandSeed(meow_umm InputLen, void* Input, meow_u8* SeedResult) | |
{ | |
meow_state State; | |
meow_u64 LengthTab = (meow_u64)InputLen; // NOTE(casey): We need to always injest 8-byte lengths exactly, even on 32-bit builds, to ensure identical results | |
meow_umm InjestCount = (256 / InputLen) + 2; | |
MeowBegin(&State, MeowDefaultSeed); | |
MeowAbsorb(&State, sizeof(LengthTab), &LengthTab); | |
while (InjestCount--) | |
{ | |
MeowAbsorb(&State, InputLen, Input); | |
} | |
MeowEnd(&State, SeedResult); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#define MEOW_HASH_VERSION 5 | |
#define MEOW_HASH_VERSION_NAME "0.5/calico" | |
// Meow hash v0.5 in C without dependency on special CPU instructions | |
// Ported from https://github.com/cmuratori/meow_hash | |
// Performance on Ryzen 9 3950X | |
// AESNI code = ~16 bytes/cycle | |
// this code = ~0.78 bytes/cycle | |
#include <stddef.h> | |
#include <stdint.h> | |
#include <string.h> | |
#if !defined MEOW_PAGESIZE | |
#define MEOW_PAGESIZE 4096 | |
#endif | |
#if !defined(meow_u8) | |
#define meow_u8 uint8_t | |
#define meow_u32 uint32_t | |
#define meow_u64 uint64_t | |
#define meow_umm size_t | |
typedef union | |
{ | |
meow_u8 u8[16]; | |
meow_u32 u32[4]; | |
meow_u64 u64[2]; | |
} meow_u128; | |
#endif | |
#define MeowU32From(A, I) ((A).u32[I]) | |
#define MeowU64From(A, I) ((A).u64[I]) | |
#define MeowHashesAreEqual(A, B) ((A).u64[0] == (B).u64[0] && (A).u64[1] == (B).u64[1]) | |
static const meow_u32 MeowAesBox0[256] = { | |
0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a, 0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b, | |
0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5, 0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5, | |
0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d, 0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b, | |
0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295, 0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e, | |
0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927, 0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d, | |
0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362, 0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9, | |
0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52, 0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566, | |
0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3, 0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed, | |
0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e, 0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4, | |
0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4, 0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd, | |
0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d, 0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060, | |
0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967, 0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879, | |
0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000, 0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c, | |
0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36, 0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624, | |
0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b, 0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c, | |
0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12, 0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14, | |
0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3, 0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b, | |
0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8, 0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684, | |
0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7, 0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177, | |
0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947, 0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322, | |
0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498, 0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f, | |
0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54, 0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382, | |
0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf, 0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb, | |
0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83, 0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef, | |
0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029, 0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235, | |
0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733, 0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117, | |
0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4, 0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546, | |
0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb, 0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d, | |
0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb, 0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a, | |
0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773, 0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478, | |
0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2, 0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff, | |
0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664, 0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0, | |
}; | |
static const meow_u32 MeowAesBox1[256] = { | |
0x5150a7f4, 0x7e536541, 0x1ac3a417, 0x3a965e27, 0x3bcb6bab, 0x1ff1459d, 0xacab58fa, 0x4b9303e3, | |
0x2055fa30, 0xadf66d76, 0x889176cc, 0xf5254c02, 0x4ffcd7e5, 0xc5d7cb2a, 0x26804435, 0xb58fa362, | |
0xde495ab1, 0x25671bba, 0x45980eea, 0x5de1c0fe, 0xc302752f, 0x8112f04c, 0x8da39746, 0x6bc6f9d3, | |
0x03e75f8f, 0x15959c92, 0xbfeb7a6d, 0x95da5952, 0xd42d83be, 0x58d32174, 0x492969e0, 0x8e44c8c9, | |
0x756a89c2, 0xf478798e, 0x996b3e58, 0x27dd71b9, 0xbeb64fe1, 0xf017ad88, 0xc966ac20, 0x7db43ace, | |
0x63184adf, 0xe582311a, 0x97603351, 0x62457f53, 0xb1e07764, 0xbb84ae6b, 0xfe1ca081, 0xf9942b08, | |
0x70586848, 0x8f19fd45, 0x94876cde, 0x52b7f87b, 0xab23d373, 0x72e2024b, 0xe3578f1f, 0x662aab55, | |
0xb20728eb, 0x2f03c2b5, 0x869a7bc5, 0xd3a50837, 0x30f28728, 0x23b2a5bf, 0x02ba6a03, 0xed5c8216, | |
0x8a2b1ccf, 0xa792b479, 0xf3f0f207, 0x4ea1e269, 0x65cdf4da, 0x06d5be05, 0xd11f6234, 0xc48afea6, | |
0x349d532e, 0xa2a055f3, 0x0532e18a, 0xa475ebf6, 0x0b39ec83, 0x40aaef60, 0x5e069f71, 0xbd51106e, | |
0x3ef98a21, 0x963d06dd, 0xddae053e, 0x4d46bde6, 0x91b58d54, 0x71055dc4, 0x046fd406, 0x60ff1550, | |
0x1924fb98, 0xd697e9bd, 0x89cc4340, 0x67779ed9, 0xb0bd42e8, 0x07888b89, 0xe7385b19, 0x79dbeec8, | |
0xa1470a7c, 0x7ce90f42, 0xf8c91e84, 0x00000000, 0x09838680, 0x3248ed2b, 0x1eac7011, 0x6c4e725a, | |
0xfdfbff0e, 0x0f563885, 0x3d1ed5ae, 0x3627392d, 0x0a64d90f, 0x6821a65c, 0x9bd1545b, 0x243a2e36, | |
0x0cb1670a, 0x930fe757, 0xb4d296ee, 0x1b9e919b, 0x804fc5c0, 0x61a220dc, 0x5a694b77, 0x1c161a12, | |
0xe20aba93, 0xc0e52aa0, 0x3c43e022, 0x121d171b, 0x0e0b0d09, 0xf2adc78b, 0x2db9a8b6, 0x14c8a91e, | |
0x578519f1, 0xaf4c0775, 0xeebbdd99, 0xa3fd607f, 0xf79f2601, 0x5cbcf572, 0x44c53b66, 0x5b347efb, | |
0x8b762943, 0xcbdcc623, 0xb668fced, 0xb863f1e4, 0xd7cadc31, 0x42108563, 0x13402297, 0x842011c6, | |
0x857d244a, 0xd2f83dbb, 0xae1132f9, 0xc76da129, 0x1d4b2f9e, 0xdcf330b2, 0x0dec5286, 0x77d0e3c1, | |
0x2b6c16b3, 0xa999b970, 0x11fa4894, 0x472264e9, 0xa8c48cfc, 0xa01a3ff0, 0x56d82c7d, 0x22ef9033, | |
0x87c74e49, 0xd9c1d138, 0x8cfea2ca, 0x98360bd4, 0xa6cf81f5, 0xa528de7a, 0xda268eb7, 0x3fa4bfad, | |
0x2ce49d3a, 0x500d9278, 0x6a9bcc5f, 0x5462467e, 0xf6c2138d, 0x90e8b8d8, 0x2e5ef739, 0x82f5afc3, | |
0x9fbe805d, 0x697c93d0, 0x6fa92dd5, 0xcfb31225, 0xc83b99ac, 0x10a77d18, 0xe86e639c, 0xdb7bbb3b, | |
0xcd097826, 0x6ef41859, 0xec01b79a, 0x83a89a4f, 0xe6656e95, 0xaa7ee6ff, 0x2108cfbc, 0xefe6e815, | |
0xbad99be7, 0x4ace366f, 0xead4099f, 0x29d67cb0, 0x31afb2a4, 0x2a31233f, 0xc63094a5, 0x35c066a2, | |
0x7437bc4e, 0xfca6ca82, 0xe0b0d090, 0x3315d8a7, 0xf14a9804, 0x41f7daec, 0x7f0e50cd, 0x172ff691, | |
0x768dd64d, 0x434db0ef, 0xcc544daa, 0xe4df0496, 0x9ee3b5d1, 0x4c1b886a, 0xc1b81f2c, 0x467f5165, | |
0x9d04ea5e, 0x015d358c, 0xfa737487, 0xfb2e410b, 0xb35a1d67, 0x9252d2db, 0xe9335610, 0x6d1347d6, | |
0x9a8c61d7, 0x377a0ca1, 0x598e14f8, 0xeb893c13, 0xceee27a9, 0xb735c961, 0xe1ede51c, 0x7a3cb147, | |
0x9c59dfd2, 0x553f73f2, 0x1879ce14, 0x73bf37c7, 0x53eacdf7, 0x5f5baafd, 0xdf146f3d, 0x7886db44, | |
0xca81f3af, 0xb93ec468, 0x382c3424, 0xc25f40a3, 0x1672c31d, 0xbc0c25e2, 0x288b493c, 0xff41950d, | |
0x397101a8, 0x08deb30c, 0xd89ce4b4, 0x6490c156, 0x7b6184cb, 0xd570b632, 0x48745c6c, 0xd04257b8, | |
}; | |
static const meow_u32 MeowAesBox2[256] = { | |
0xf45150a7, 0x417e5365, 0x171ac3a4, 0x273a965e, 0xab3bcb6b, 0x9d1ff145, 0xfaacab58, 0xe34b9303, | |
0x302055fa, 0x76adf66d, 0xcc889176, 0x02f5254c, 0xe54ffcd7, 0x2ac5d7cb, 0x35268044, 0x62b58fa3, | |
0xb1de495a, 0xba25671b, 0xea45980e, 0xfe5de1c0, 0x2fc30275, 0x4c8112f0, 0x468da397, 0xd36bc6f9, | |
0x8f03e75f, 0x9215959c, 0x6dbfeb7a, 0x5295da59, 0xbed42d83, 0x7458d321, 0xe0492969, 0xc98e44c8, | |
0xc2756a89, 0x8ef47879, 0x58996b3e, 0xb927dd71, 0xe1beb64f, 0x88f017ad, 0x20c966ac, 0xce7db43a, | |
0xdf63184a, 0x1ae58231, 0x51976033, 0x5362457f, 0x64b1e077, 0x6bbb84ae, 0x81fe1ca0, 0x08f9942b, | |
0x48705868, 0x458f19fd, 0xde94876c, 0x7b52b7f8, 0x73ab23d3, 0x4b72e202, 0x1fe3578f, 0x55662aab, | |
0xebb20728, 0xb52f03c2, 0xc5869a7b, 0x37d3a508, 0x2830f287, 0xbf23b2a5, 0x0302ba6a, 0x16ed5c82, | |
0xcf8a2b1c, 0x79a792b4, 0x07f3f0f2, 0x694ea1e2, 0xda65cdf4, 0x0506d5be, 0x34d11f62, 0xa6c48afe, | |
0x2e349d53, 0xf3a2a055, 0x8a0532e1, 0xf6a475eb, 0x830b39ec, 0x6040aaef, 0x715e069f, 0x6ebd5110, | |
0x213ef98a, 0xdd963d06, 0x3eddae05, 0xe64d46bd, 0x5491b58d, 0xc471055d, 0x06046fd4, 0x5060ff15, | |
0x981924fb, 0xbdd697e9, 0x4089cc43, 0xd967779e, 0xe8b0bd42, 0x8907888b, 0x19e7385b, 0xc879dbee, | |
0x7ca1470a, 0x427ce90f, 0x84f8c91e, 0x00000000, 0x80098386, 0x2b3248ed, 0x111eac70, 0x5a6c4e72, | |
0x0efdfbff, 0x850f5638, 0xae3d1ed5, 0x2d362739, 0x0f0a64d9, 0x5c6821a6, 0x5b9bd154, 0x36243a2e, | |
0x0a0cb167, 0x57930fe7, 0xeeb4d296, 0x9b1b9e91, 0xc0804fc5, 0xdc61a220, 0x775a694b, 0x121c161a, | |
0x93e20aba, 0xa0c0e52a, 0x223c43e0, 0x1b121d17, 0x090e0b0d, 0x8bf2adc7, 0xb62db9a8, 0x1e14c8a9, | |
0xf1578519, 0x75af4c07, 0x99eebbdd, 0x7fa3fd60, 0x01f79f26, 0x725cbcf5, 0x6644c53b, 0xfb5b347e, | |
0x438b7629, 0x23cbdcc6, 0xedb668fc, 0xe4b863f1, 0x31d7cadc, 0x63421085, 0x97134022, 0xc6842011, | |
0x4a857d24, 0xbbd2f83d, 0xf9ae1132, 0x29c76da1, 0x9e1d4b2f, 0xb2dcf330, 0x860dec52, 0xc177d0e3, | |
0xb32b6c16, 0x70a999b9, 0x9411fa48, 0xe9472264, 0xfca8c48c, 0xf0a01a3f, 0x7d56d82c, 0x3322ef90, | |
0x4987c74e, 0x38d9c1d1, 0xca8cfea2, 0xd498360b, 0xf5a6cf81, 0x7aa528de, 0xb7da268e, 0xad3fa4bf, | |
0x3a2ce49d, 0x78500d92, 0x5f6a9bcc, 0x7e546246, 0x8df6c213, 0xd890e8b8, 0x392e5ef7, 0xc382f5af, | |
0x5d9fbe80, 0xd0697c93, 0xd56fa92d, 0x25cfb312, 0xacc83b99, 0x1810a77d, 0x9ce86e63, 0x3bdb7bbb, | |
0x26cd0978, 0x596ef418, 0x9aec01b7, 0x4f83a89a, 0x95e6656e, 0xffaa7ee6, 0xbc2108cf, 0x15efe6e8, | |
0xe7bad99b, 0x6f4ace36, 0x9fead409, 0xb029d67c, 0xa431afb2, 0x3f2a3123, 0xa5c63094, 0xa235c066, | |
0x4e7437bc, 0x82fca6ca, 0x90e0b0d0, 0xa73315d8, 0x04f14a98, 0xec41f7da, 0xcd7f0e50, 0x91172ff6, | |
0x4d768dd6, 0xef434db0, 0xaacc544d, 0x96e4df04, 0xd19ee3b5, 0x6a4c1b88, 0x2cc1b81f, 0x65467f51, | |
0x5e9d04ea, 0x8c015d35, 0x87fa7374, 0x0bfb2e41, 0x67b35a1d, 0xdb9252d2, 0x10e93356, 0xd66d1347, | |
0xd79a8c61, 0xa1377a0c, 0xf8598e14, 0x13eb893c, 0xa9ceee27, 0x61b735c9, 0x1ce1ede5, 0x477a3cb1, | |
0xd29c59df, 0xf2553f73, 0x141879ce, 0xc773bf37, 0xf753eacd, 0xfd5f5baa, 0x3ddf146f, 0x447886db, | |
0xafca81f3, 0x68b93ec4, 0x24382c34, 0xa3c25f40, 0x1d1672c3, 0xe2bc0c25, 0x3c288b49, 0x0dff4195, | |
0xa8397101, 0x0c08deb3, 0xb4d89ce4, 0x566490c1, 0xcb7b6184, 0x32d570b6, 0x6c48745c, 0xb8d04257, | |
}; | |
static const meow_u32 MeowAesBox3[256] = { | |
0xa7f45150, 0x65417e53, 0xa4171ac3, 0x5e273a96, 0x6bab3bcb, 0x459d1ff1, 0x58faacab, 0x03e34b93, | |
0xfa302055, 0x6d76adf6, 0x76cc8891, 0x4c02f525, 0xd7e54ffc, 0xcb2ac5d7, 0x44352680, 0xa362b58f, | |
0x5ab1de49, 0x1bba2567, 0x0eea4598, 0xc0fe5de1, 0x752fc302, 0xf04c8112, 0x97468da3, 0xf9d36bc6, | |
0x5f8f03e7, 0x9c921595, 0x7a6dbfeb, 0x595295da, 0x83bed42d, 0x217458d3, 0x69e04929, 0xc8c98e44, | |
0x89c2756a, 0x798ef478, 0x3e58996b, 0x71b927dd, 0x4fe1beb6, 0xad88f017, 0xac20c966, 0x3ace7db4, | |
0x4adf6318, 0x311ae582, 0x33519760, 0x7f536245, 0x7764b1e0, 0xae6bbb84, 0xa081fe1c, 0x2b08f994, | |
0x68487058, 0xfd458f19, 0x6cde9487, 0xf87b52b7, 0xd373ab23, 0x024b72e2, 0x8f1fe357, 0xab55662a, | |
0x28ebb207, 0xc2b52f03, 0x7bc5869a, 0x0837d3a5, 0x872830f2, 0xa5bf23b2, 0x6a0302ba, 0x8216ed5c, | |
0x1ccf8a2b, 0xb479a792, 0xf207f3f0, 0xe2694ea1, 0xf4da65cd, 0xbe0506d5, 0x6234d11f, 0xfea6c48a, | |
0x532e349d, 0x55f3a2a0, 0xe18a0532, 0xebf6a475, 0xec830b39, 0xef6040aa, 0x9f715e06, 0x106ebd51, | |
0x8a213ef9, 0x06dd963d, 0x053eddae, 0xbde64d46, 0x8d5491b5, 0x5dc47105, 0xd406046f, 0x155060ff, | |
0xfb981924, 0xe9bdd697, 0x434089cc, 0x9ed96777, 0x42e8b0bd, 0x8b890788, 0x5b19e738, 0xeec879db, | |
0x0a7ca147, 0x0f427ce9, 0x1e84f8c9, 0x00000000, 0x86800983, 0xed2b3248, 0x70111eac, 0x725a6c4e, | |
0xff0efdfb, 0x38850f56, 0xd5ae3d1e, 0x392d3627, 0xd90f0a64, 0xa65c6821, 0x545b9bd1, 0x2e36243a, | |
0x670a0cb1, 0xe757930f, 0x96eeb4d2, 0x919b1b9e, 0xc5c0804f, 0x20dc61a2, 0x4b775a69, 0x1a121c16, | |
0xba93e20a, 0x2aa0c0e5, 0xe0223c43, 0x171b121d, 0x0d090e0b, 0xc78bf2ad, 0xa8b62db9, 0xa91e14c8, | |
0x19f15785, 0x0775af4c, 0xdd99eebb, 0x607fa3fd, 0x2601f79f, 0xf5725cbc, 0x3b6644c5, 0x7efb5b34, | |
0x29438b76, 0xc623cbdc, 0xfcedb668, 0xf1e4b863, 0xdc31d7ca, 0x85634210, 0x22971340, 0x11c68420, | |
0x244a857d, 0x3dbbd2f8, 0x32f9ae11, 0xa129c76d, 0x2f9e1d4b, 0x30b2dcf3, 0x52860dec, 0xe3c177d0, | |
0x16b32b6c, 0xb970a999, 0x489411fa, 0x64e94722, 0x8cfca8c4, 0x3ff0a01a, 0x2c7d56d8, 0x903322ef, | |
0x4e4987c7, 0xd138d9c1, 0xa2ca8cfe, 0x0bd49836, 0x81f5a6cf, 0xde7aa528, 0x8eb7da26, 0xbfad3fa4, | |
0x9d3a2ce4, 0x9278500d, 0xcc5f6a9b, 0x467e5462, 0x138df6c2, 0xb8d890e8, 0xf7392e5e, 0xafc382f5, | |
0x805d9fbe, 0x93d0697c, 0x2dd56fa9, 0x1225cfb3, 0x99acc83b, 0x7d1810a7, 0x639ce86e, 0xbb3bdb7b, | |
0x7826cd09, 0x18596ef4, 0xb79aec01, 0x9a4f83a8, 0x6e95e665, 0xe6ffaa7e, 0xcfbc2108, 0xe815efe6, | |
0x9be7bad9, 0x366f4ace, 0x099fead4, 0x7cb029d6, 0xb2a431af, 0x233f2a31, 0x94a5c630, 0x66a235c0, | |
0xbc4e7437, 0xca82fca6, 0xd090e0b0, 0xd8a73315, 0x9804f14a, 0xdaec41f7, 0x50cd7f0e, 0xf691172f, | |
0xd64d768d, 0xb0ef434d, 0x4daacc54, 0x0496e4df, 0xb5d19ee3, 0x886a4c1b, 0x1f2cc1b8, 0x5165467f, | |
0xea5e9d04, 0x358c015d, 0x7487fa73, 0x410bfb2e, 0x1d67b35a, 0xd2db9252, 0x5610e933, 0x47d66d13, | |
0x61d79a8c, 0x0ca1377a, 0x14f8598e, 0x3c13eb89, 0x27a9ceee, 0xc961b735, 0xe51ce1ed, 0xb1477a3c, | |
0xdfd29c59, 0x73f2553f, 0xce141879, 0x37c773bf, 0xcdf753ea, 0xaafd5f5b, 0x6f3ddf14, 0xdb447886, | |
0xf3afca81, 0xc468b93e, 0x3424382c, 0x40a3c25f, 0xc31d1672, 0x25e2bc0c, 0x493c288b, 0x950dff41, | |
0x01a83971, 0xb30c08de, 0xe4b4d89c, 0xc1566490, 0x84cb7b61, 0xb632d570, 0x5c6c4874, 0x57b8d042, | |
}; | |
#define movdqu(A, B) memcpy(&(A), (B), 16) | |
#define movdqu_mem(A, B) memcpy((A), &(B), 16) | |
#define movq(A, B) do { (A).u64[0] = B; (A).u64[1] = 0; } while (0) | |
#define paddq(A, B) do { (A).u64[0] += (B).u64[0]; (A).u64[1] += (B).u64[1]; } while (0) | |
#define pxor_clear(A) do { (A).u64[0] = (A).u64[1] = 0; } while (0) | |
#define pxor(A, B) do { (A).u64[0] ^= (B).u64[0]; (A).u64[1] ^= (B).u64[1]; } while (0) | |
#define pand(A, B) do { (A).u64[0] &= (B).u64[0]; (A).u64[1] &= (B).u64[1]; } while (0) | |
#define palignr1(A, B) do { \ | |
(A).u64[1] = ((B).u64[1] >> 8) | ((A).u64[0] << 56); \ | |
(A).u64[0] = ((B).u64[0] >> 8) | ((B).u64[1] << 56); \ | |
} while (0) | |
#define palignr15(A, B) do { \ | |
(A).u64[1] = ((A).u64[0] >> 56) | ((A).u64[1] << 8); \ | |
(A).u64[0] = ((B).u64[1] >> 56) | ((A).u64[0] << 8); \ | |
} while (0) | |
#define aesdec(A, B) do { \ | |
meow_u32 s0 = MeowAesBox0[(A).u8[0]] ^ MeowAesBox1[(A).u8[7]] ^ MeowAesBox2[(A).u8[10]] ^ MeowAesBox3[(A).u8[13]]; \ | |
meow_u32 s1 = MeowAesBox0[(A).u8[4]] ^ MeowAesBox1[(A).u8[11]] ^ MeowAesBox2[(A).u8[14]] ^ MeowAesBox3[(A).u8[1]]; \ | |
meow_u32 s2 = MeowAesBox0[(A).u8[8]] ^ MeowAesBox1[(A).u8[15]] ^ MeowAesBox2[(A).u8[2]] ^ MeowAesBox3[(A).u8[5]]; \ | |
meow_u32 s3 = MeowAesBox0[(A).u8[12]] ^ MeowAesBox1[(A).u8[3]] ^ MeowAesBox2[(A).u8[6]] ^ MeowAesBox3[(A).u8[9]]; \ | |
(A).u32[0] = s0; \ | |
(A).u32[1] = s1; \ | |
(A).u32[2] = s2; \ | |
(A).u32[3] = s3; \ | |
pxor(A, B); \ | |
} while (0) | |
#define MEOW_MIX_REG(r1, r2, r3, r4, r5, i1, i2, i3, i4) \ | |
aesdec(r1, r2); \ | |
paddq(r3, i1); \ | |
pxor(r2, i2); \ | |
aesdec(r2, r4); \ | |
paddq(r5, i3); \ | |
pxor(r4, i4); | |
#define MEOW_MIX(r1, r2, r3, r4, r5, ptr) do { \ | |
meow_u128 i1, i2, i3, i4; \ | |
memcpy(&i1, ptr + 15, sizeof(i1)); \ | |
memcpy(&i2, ptr + 0, sizeof(i2)); \ | |
memcpy(&i3, ptr + 1, sizeof(i3)); \ | |
memcpy(&i4, ptr + 16, sizeof(i4)); \ | |
MEOW_MIX_REG(r1, r2, r3, r4, r5, i1, i2, i3, i4); \ | |
} while (0) | |
#define MEOW_SHUFFLE(r1, r2, r3, r4, r5, r6) do { \ | |
aesdec(r1, r4); \ | |
paddq(r2, r5); \ | |
pxor(r4, r6); \ | |
aesdec(r4, r2); \ | |
paddq(r5, r6); \ | |
pxor(r2, r3); \ | |
} while (0) | |
// NOTE(casey): The default seed is now a "nothing-up-our-sleeves" number for good measure. You may verify that it is just an encoding of Pi. | |
static meow_u8 MeowDefaultSeed[128] = | |
{ | |
0x32, 0x43, 0xF6, 0xA8, 0x88, 0x5A, 0x30, 0x8D, | |
0x31, 0x31, 0x98, 0xA2, 0xE0, 0x37, 0x07, 0x34, | |
0x4A, 0x40, 0x93, 0x82, 0x22, 0x99, 0xF3, 0x1D, | |
0x00, 0x82, 0xEF, 0xA9, 0x8E, 0xC4, 0xE6, 0xC8, | |
0x94, 0x52, 0x82, 0x1E, 0x63, 0x8D, 0x01, 0x37, | |
0x7B, 0xE5, 0x46, 0x6C, 0xF3, 0x4E, 0x90, 0xC6, | |
0xCC, 0x0A, 0xC2, 0x9B, 0x7C, 0x97, 0xC5, 0x0D, | |
0xD3, 0xF8, 0x4D, 0x5B, 0x5B, 0x54, 0x70, 0x91, | |
0x79, 0x21, 0x6D, 0x5D, 0x98, 0x97, 0x9F, 0xB1, | |
0xBD, 0x13, 0x10, 0xBA, 0x69, 0x8D, 0xFB, 0x5A, | |
0xC2, 0xFF, 0xD7, 0x2D, 0xBD, 0x01, 0xAD, 0xFB, | |
0x7B, 0x8E, 0x1A, 0xFE, 0xD6, 0xA2, 0x67, 0xE9, | |
0x6B, 0xA7, 0xC9, 0x04, 0x5F, 0x12, 0xC7, 0xF9, | |
0x92, 0x4A, 0x19, 0x94, 0x7B, 0x39, 0x16, 0xCF, | |
0x70, 0x80, 0x1F, 0x2E, 0x28, 0x58, 0xEF, 0xC1, | |
0x66, 0x36, 0x92, 0x0D, 0x87, 0x15, 0x74, 0xE6, | |
}; | |
// | |
// NOTE(casey): Single block version | |
// | |
static meow_u128 | |
MeowHash(void* Seed128Init, meow_umm Len, void* SourceInit) | |
{ | |
meow_u128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; // NOTE(casey): xmm0-xmm7 are the hash accumulation lanes | |
meow_u128 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; // NOTE(casey): xmm8-xmm15 hold values to be appended (residual, length) | |
meow_u8* rax = (meow_u8*)SourceInit; | |
meow_u8* rcx = (meow_u8*)Seed128Init; | |
// | |
// NOTE(casey): Seed the eight hash registers | |
// | |
movdqu(xmm0, rcx + 0x00); | |
movdqu(xmm1, rcx + 0x10); | |
movdqu(xmm2, rcx + 0x20); | |
movdqu(xmm3, rcx + 0x30); | |
movdqu(xmm4, rcx + 0x40); | |
movdqu(xmm5, rcx + 0x50); | |
movdqu(xmm6, rcx + 0x60); | |
movdqu(xmm7, rcx + 0x70); | |
// | |
// NOTE(casey): Hash all full 256-byte blocks | |
// | |
meow_umm BlockCount = (Len >> 8); | |
while (BlockCount--) | |
{ | |
MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00); | |
MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20); | |
MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40); | |
MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60); | |
MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80); | |
MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0); | |
MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0); | |
MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0); | |
rax += 0x100; | |
} | |
// | |
// NOTE(casey): Load any less-than-32-byte residual | |
// | |
pxor_clear(xmm9); | |
pxor_clear(xmm11); | |
// | |
// TODO(casey): I need to put more thought into how the end-of-buffer stuff is actually working out here, | |
// because I _think_ it may be possible to remove the first branch (on Len8) and let the mask zero out the | |
// result, but it would take a little thought to make sure it couldn't read off the end of the buffer due | |
// to the & 0xf on the align computation. | |
// | |
// NOTE(casey): First, we have to load the part that is _not_ 16-byte aligned | |
meow_u8 *Last = (meow_u8*)SourceInit + (Len & ~0xf); | |
meow_u32 Len8 = (Len & 0xf); | |
if (Len8) | |
{ | |
for (meow_umm i=0; i<Len8; i++) | |
{ | |
xmm9.u8[i] = Last[i]; | |
} | |
for (meow_umm i=Len8; i<0x10; i++) | |
{ | |
xmm9.u8[i] = 0; | |
} | |
} | |
// NOTE(casey): Next, we have to load the part that _is_ 16-byte aligned | |
if (Len & 0x10) | |
{ | |
xmm11 = xmm9; | |
movdqu(xmm9, Last - 0x10); | |
} | |
// | |
// NOTE(casey): Construct the residual and length injests | |
// | |
xmm8 = xmm9; | |
xmm10 = xmm9; | |
palignr15(xmm8, xmm11); | |
palignr1(xmm10, xmm11); | |
// NOTE(casey): We have room for a 128-bit nonce and a 64-bit none here, but | |
// the decision was made to leave them zero'd so as not to confuse people | |
// about hwo to use them or what security implications they had. | |
pxor_clear(xmm12); | |
pxor_clear(xmm13); | |
pxor_clear(xmm14); | |
movq(xmm15, Len); | |
palignr15(xmm12, xmm15); | |
palignr1(xmm14, xmm15); | |
// NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty | |
MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11); | |
// NOTE(casey): Append the length, to avoid problems with our 32-byte padding | |
MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3, xmm12, xmm13, xmm14, xmm15); | |
// | |
// NOTE(casey): Hash all full 32-byte blocks | |
// | |
meow_u32 LaneCount = (Len >> 5) & 0x7; | |
if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x00); --LaneCount; | |
if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x20); --LaneCount; | |
if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x40); --LaneCount; | |
if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0x60); --LaneCount; | |
if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0x80); --LaneCount; | |
if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xa0); --LaneCount; | |
if (LaneCount == 0) goto MixDown; MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0xc0); --LaneCount; | |
// | |
// NOTE(casey): Mix the eight lanes down to one 128-bit hash | |
// | |
MixDown: | |
MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6); | |
MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); | |
MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0); | |
MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1); | |
MEOW_SHUFFLE(xmm4, xmm5, xmm6, xmm0, xmm1, xmm2); | |
MEOW_SHUFFLE(xmm5, xmm6, xmm7, xmm1, xmm2, xmm3); | |
MEOW_SHUFFLE(xmm6, xmm7, xmm0, xmm2, xmm3, xmm4); | |
MEOW_SHUFFLE(xmm7, xmm0, xmm1, xmm3, xmm4, xmm5); | |
MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6); | |
MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); | |
MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0); | |
MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1); | |
paddq(xmm0, xmm2); | |
paddq(xmm1, xmm3); | |
paddq(xmm4, xmm6); | |
paddq(xmm5, xmm7); | |
pxor(xmm0, xmm1); | |
pxor(xmm4, xmm5); | |
paddq(xmm0, xmm4); | |
return xmm0; | |
} | |
// | |
// NOTE(casey): Streaming construction | |
// | |
typedef struct | |
{ | |
meow_u128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; | |
meow_u64 TotalLengthInBytes; | |
meow_u32 BufferLen; | |
meow_u8 Buffer[256]; | |
meow_u128 Pad[2]; // NOTE(casey): So we know we can over-read Buffer as necessary | |
} meow_state; | |
static void | |
MeowBegin(meow_state* State, void* Seed128) | |
{ | |
meow_u8* rcx = (meow_u8*)Seed128; | |
movdqu(State->xmm0, rcx + 0x00); | |
movdqu(State->xmm1, rcx + 0x10); | |
movdqu(State->xmm2, rcx + 0x20); | |
movdqu(State->xmm3, rcx + 0x30); | |
movdqu(State->xmm4, rcx + 0x40); | |
movdqu(State->xmm5, rcx + 0x50); | |
movdqu(State->xmm6, rcx + 0x60); | |
movdqu(State->xmm7, rcx + 0x70); | |
State->BufferLen = 0; | |
State->TotalLengthInBytes = 0; | |
} | |
static void | |
MeowAbsorbBlocks(meow_state* State, meow_umm BlockCount, meow_u8* rax) | |
{ | |
meow_u128 xmm0 = State->xmm0; | |
meow_u128 xmm1 = State->xmm1; | |
meow_u128 xmm2 = State->xmm2; | |
meow_u128 xmm3 = State->xmm3; | |
meow_u128 xmm4 = State->xmm4; | |
meow_u128 xmm5 = State->xmm5; | |
meow_u128 xmm6 = State->xmm6; | |
meow_u128 xmm7 = State->xmm7; | |
while (BlockCount--) | |
{ | |
MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00); | |
MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20); | |
MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40); | |
MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60); | |
MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80); | |
MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0); | |
MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0); | |
MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0); | |
rax += 0x100; | |
} | |
State->xmm0 = xmm0; | |
State->xmm1 = xmm1; | |
State->xmm2 = xmm2; | |
State->xmm3 = xmm3; | |
State->xmm4 = xmm4; | |
State->xmm5 = xmm5; | |
State->xmm6 = xmm6; | |
State->xmm7 = xmm7; | |
} | |
static void | |
MeowAbsorb(meow_state* State, meow_umm Len, void* SourceInit) | |
{ | |
State->TotalLengthInBytes += Len; | |
meow_u8* Source = (meow_u8*)SourceInit; | |
// NOTE(casey): Handle any buffered residual | |
if (State->BufferLen) | |
{ | |
meow_u32 Fill = (sizeof(State->Buffer) - State->BufferLen); | |
if (Fill > Len) | |
{ | |
Fill = (meow_u32)Len; | |
} | |
Len -= Fill; | |
while (Fill--) | |
{ | |
State->Buffer[State->BufferLen++] = *Source++; | |
} | |
if (State->BufferLen == sizeof(State->Buffer)) | |
{ | |
MeowAbsorbBlocks(State, 1, State->Buffer); | |
State->BufferLen = 0; | |
} | |
} | |
// NOTE(casey): Handle any full blocks | |
meow_u64 BlockCount = (Len >> 8); | |
meow_u64 Advance = (BlockCount << 8); | |
MeowAbsorbBlocks(State, BlockCount, Source); | |
Len -= Advance; | |
Source += Advance; | |
// NOTE(casey): Store residual | |
while (Len--) | |
{ | |
State->Buffer[State->BufferLen++] = *Source++; | |
} | |
} | |
static meow_u128 | |
MeowEnd(meow_state* State, meow_u8* Store128) | |
{ | |
meow_u64 Len = State->TotalLengthInBytes; | |
meow_u128 xmm0 = State->xmm0; | |
meow_u128 xmm1 = State->xmm1; | |
meow_u128 xmm2 = State->xmm2; | |
meow_u128 xmm3 = State->xmm3; | |
meow_u128 xmm4 = State->xmm4; | |
meow_u128 xmm5 = State->xmm5; | |
meow_u128 xmm6 = State->xmm6; | |
meow_u128 xmm7 = State->xmm7; | |
meow_u128 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; | |
meow_u8* rax = State->Buffer; | |
pxor_clear(xmm9); | |
pxor_clear(xmm11); | |
meow_u8* Last = (uint8_t*)rax + (Len & 0xf0); | |
meow_u32 Len8 = (Len & 0xf); | |
if (Len8) | |
{ | |
for (meow_umm i=0; i<Len8; i++) | |
{ | |
xmm9.u8[i] = Last[i]; | |
} | |
for (meow_umm i=Len8; i<0x10; i++) | |
{ | |
xmm9.u8[i] = 0; | |
} | |
} | |
if (Len & 0x10) | |
{ | |
xmm11 = xmm9; | |
movdqu(xmm9, Last - 0x10); | |
} | |
xmm8 = xmm9; | |
xmm10 = xmm9; | |
palignr15(xmm8, xmm11); | |
palignr1(xmm10, xmm11); | |
pxor_clear(xmm12); | |
pxor_clear(xmm13); | |
pxor_clear(xmm14); | |
movq(xmm15, Len); | |
palignr15(xmm12, xmm15); | |
palignr1(xmm14, xmm15); | |
// NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty | |
MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11); | |
// NOTE(casey): Append the length, to avoid problems with our 32-byte padding | |
MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3, xmm12, xmm13, xmm14, xmm15); | |
// | |
// NOTE(casey): Hash all full 32-byte blocks | |
// | |
meow_u32 LaneCount = (Len >> 5) & 0x7; | |
if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x00); --LaneCount; | |
if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x20); --LaneCount; | |
if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x40); --LaneCount; | |
if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0x60); --LaneCount; | |
if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0x80); --LaneCount; | |
if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xa0); --LaneCount; | |
if( LaneCount == 0) goto MixDown; MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0xc0); --LaneCount; | |
// | |
// NOTE(casey): Mix the eight lanes down to one 128-bit hash | |
// | |
MixDown: | |
MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6); | |
MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); | |
MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0); | |
MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1); | |
MEOW_SHUFFLE(xmm4, xmm5, xmm6, xmm0, xmm1, xmm2); | |
MEOW_SHUFFLE(xmm5, xmm6, xmm7, xmm1, xmm2, xmm3); | |
MEOW_SHUFFLE(xmm6, xmm7, xmm0, xmm2, xmm3, xmm4); | |
MEOW_SHUFFLE(xmm7, xmm0, xmm1, xmm3, xmm4, xmm5); | |
MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6); | |
MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); | |
MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0); | |
MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1); | |
if (Store128) | |
{ | |
movdqu_mem(Store128 + 0x00, xmm0); | |
movdqu_mem(Store128 + 0x10, xmm1); | |
movdqu_mem(Store128 + 0x20, xmm2); | |
movdqu_mem(Store128 + 0x30, xmm3); | |
movdqu_mem(Store128 + 0x40, xmm4); | |
movdqu_mem(Store128 + 0x50, xmm5); | |
movdqu_mem(Store128 + 0x60, xmm6); | |
movdqu_mem(Store128 + 0x70, xmm7); | |
} | |
paddq(xmm0, xmm2); | |
paddq(xmm1, xmm3); | |
paddq(xmm4, xmm6); | |
paddq(xmm5, xmm7); | |
pxor(xmm0, xmm1); | |
pxor(xmm4, xmm5); | |
paddq(xmm0, xmm4); | |
return(xmm0); | |
} | |
#undef movdqu | |
#undef movdqu_mem | |
#undef movq | |
#undef aesdec | |
#undef pxor | |
#undef paddq | |
#undef pand | |
#undef palignr1 | |
#undef palignr15 | |
#undef pxor_clear | |
#undef MEOW_MIX | |
#undef MEOW_MIX_REG | |
#undef MEOW_SHUFFLE | |
// | |
// NOTE(casey): If you need to create your own seed from non-random data, you can use MeowExpandSeed | |
// to create a seed which you then store for repeated use. It is _expensive_ to generate the seed, | |
// so you do not want to do this every time you hash. You _only_ want to do it when you actually | |
// need to create a new seed. | |
// | |
static void | |
MeowExpandSeed(meow_umm InputLen, void* Input, meow_u8* SeedResult) | |
{ | |
meow_state State; | |
meow_u64 LengthTab = (meow_u64)InputLen; // NOTE(casey): We need to always injest 8-byte lengths exactly, even on 32-bit builds, to ensure identical results | |
meow_umm InjestCount = (256 / InputLen) + 2; | |
MeowBegin(&State, MeowDefaultSeed); | |
MeowAbsorb(&State, sizeof(LengthTab), &LengthTab); | |
while (InjestCount--) | |
{ | |
MeowAbsorb(&State, InputLen, Input); | |
} | |
MeowEnd(&State, SeedResult); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment