Last active
June 14, 2023 16:56
-
-
Save pervognsen/1a3d6a42655023ec41dc18698b6e5911 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
// Compiler/CPU helper macros.
//
// The double-underscore spelling `__asm__` is used instead of `asm` so the
// file also compiles in strict ISO modes (-std=c11 etc.), where `asm` is not
// a keyword.

// BREAK: execute a breakpoint instruction (traps into an attached debugger).
#if defined(__x86_64__)
#define BREAK __asm__("int3")
#else
#error Implement macros for your CPU.
#endif

// R(x): empty asm statement that consumes x as a register input, so the
// compiler has to materialize the value of x at this point.
#define R(x) __asm__("" : : "r"(x))
// RW(x): empty asm statement with x as a read-write register operand, so the
// compiler must assume x was modified here and cannot fold across it.
#define RW(x) __asm__("" : "+r"(x))

// Function attributes.
#define INLINE __attribute__((always_inline))
#define NOINLINE __attribute__((noinline))
// TAIL is placed in front of a `return` statement to require a tail call.
#define TAIL __attribute__((musttail))
// Calling convention used by every VM block.
#define TAILABI __attribute__((sysv_abi))
// Short aliases for the fixed-width integer types from <stdint.h>.
// Signed variants.
typedef int8_t i8;
typedef int16_t i16;
typedef int32_t i32;
typedef int64_t i64;
// Unsigned variants.
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
//#define SYNCHRONOUS 1 | |
#define PREFETCHED 1 | |
#if SYNCHRONOUS // 5900HX (Zen 3): 6.23 secs/1000000000 ins, 160 MIPS, 28.12 cycles/ins | |
#define BLOCK(f, ...) TAILABI const u32 *(f)(const u32 *ip, u32 ins, __VA_ARGS__) | |
#define GOTO(f, ...) TAIL return (f)(ip, ins, __VA_ARGS__) | |
BLOCK(*vm_ops[256]); | |
INLINE | |
BLOCK(vm_dispatch) { | |
ins = *ip; | |
u8 op = ins; | |
BLOCK(*blk) = vm_ops[op]; | |
GOTO(blk); | |
} | |
const u32 *vm_exec(const u32 *ip) { | |
return vm_dispatch(ip, 0); | |
} | |
#elif PREFETCHED // 5900HX (Zen 3): 4.26 secs/1000000000 ins, 235 MIPS, 19.24 cycles/ins | |
#define BLOCK(f, ...) TAILABI const u32 *(f)(const u32 *ip, u32 ins, u32 next_ins, void *next_blk, __VA_ARGS__) | |
#define GOTO(f, ...) TAIL return (f)(ip, ins, next_ins, next_blk, __VA_ARGS__) | |
BLOCK(*vm_ops[256]); | |
INLINE | |
BLOCK(vm_dispatch) { | |
ins = next_ins; | |
BLOCK(*blk) = next_blk; | |
next_ins = ip[1]; | |
u8 next_op = next_ins; | |
next_blk = vm_ops[next_op]; | |
GOTO(blk); | |
} | |
const u32 *vm_exec(const u32 *ip) { | |
u32 next_ins = *ip; | |
u8 next_op = next_ins; | |
void *next_blk = vm_ops[next_op]; | |
return vm_dispatch(ip, 0, next_ins, next_blk); | |
} | |
#else | |
#error Select an implementation. | |
#endif | |
// Opcode values (stored in the low byte of each instruction word).
// For the ALU operations the even value is the register-register form and
// the following odd value is the register-immediate form.
enum {
    // Arithmetic
    ADD  = 0x00,
    ADDI = 0x01,
    SUB  = 0x02,
    SUBI = 0x03,

    // Bitwise logic
    AND  = 0x04,
    ANDI = 0x05,
    OR   = 0x06,
    ORI  = 0x07,
    XOR  = 0x08,
    XORI = 0x09,

    // Shifts: logical left, logical right, arithmetic right
    LSL  = 0x0A,
    LSLI = 0x0B,
    LSR  = 0x0C,
    LSRI = 0x0D,
    ASR  = 0x0E,
    ASRI = 0x0F,

    // Control
    BRK  = 0xFD,
    HLT  = 0xFE,
    ERR  = 0xFF,
};
// Register file: 256 64-bit registers, addressed by an 8-bit operand field.
u64 vm_regs[256];

// Operand decoding helpers. They expand in terms of the block parameter `ins`
// and are only meant for use inside instruction handlers (they are #undef'd
// after the handlers).
//   a / b / c : 8-bit operand fields taken from bits 8-15 / 16-23 / 24-31
//   ra/rb/rc  : the register selected by the corresponding field
//   ic        : field c reinterpreted as a signed 8-bit immediate, widened to 64 bits
#define a ((u8)(ins >> 8))
#define ra (vm_regs[a])
#define b ((u8)(ins >> 16))
#define rb (vm_regs[b])
#define c ((u8)(ins >> 24))
#define rc (vm_regs[c])
#define ic ((u64)(i8)c)
INLINE | |
BLOCK(vm_next) { | |
ip++; | |
GOTO(vm_dispatch); | |
} | |
BLOCK(vm_add) { | |
ra = rb + rc; | |
GOTO(vm_next); | |
} | |
BLOCK(vm_addi) { | |
ra = rb + ic; | |
GOTO(vm_next); | |
} | |
BLOCK(vm_sub) { | |
ra = rb - rc; | |
GOTO(vm_next); | |
} | |
BLOCK(vm_subi) { | |
ra = rb - ic; | |
GOTO(vm_next); | |
} | |
BLOCK(vm_and) { | |
ra = rb & rc; | |
GOTO(vm_next); | |
} | |
BLOCK(vm_andi) { | |
ra = rb & ic; | |
GOTO(vm_next); | |
} | |
BLOCK(vm_or) { | |
ra = rb | rc; | |
GOTO(vm_next); | |
} | |
BLOCK(vm_ori) { | |
ra = rb | ic; | |
GOTO(vm_next); | |
} | |
BLOCK(vm_xor) { | |
ra = rb ^ rc; | |
GOTO(vm_next); | |
} | |
BLOCK(vm_xori) { | |
ra = rb ^ ic; | |
GOTO(vm_next); | |
} | |
BLOCK(vm_lsl) { | |
ra = rb << (rc & 63); | |
GOTO(vm_next); | |
} | |
BLOCK(vm_lsli) { | |
ra = rb << (ic & 63); | |
GOTO(vm_next); | |
} | |
BLOCK(vm_lsr) { | |
ra = rb >> (rc & 63); | |
GOTO(vm_next); | |
} | |
BLOCK(vm_lsri) { | |
ra = rb >> (ic & 63); | |
GOTO(vm_next); | |
} | |
BLOCK(vm_asr) { | |
ra = (i64)rb >> (rc & 63); | |
GOTO(vm_next); | |
} | |
BLOCK(vm_asri) { | |
ra = (i64)rb >> (ic & 63); | |
GOTO(vm_next); | |
} | |
BLOCK(vm_brk) { | |
BREAK; | |
GOTO(vm_next); | |
} | |
BLOCK(vm_hlt) { | |
return ip; | |
} | |
BLOCK(vm_err) { | |
BREAK; | |
return NULL; | |
} | |
// The operand decoding helpers are only valid inside instruction handlers;
// remove them so the short names cannot leak into the code below.
#undef ic
#undef rc
#undef rb
#undef ra
#undef c
#undef b
#undef a
void vm_init(void) { | |
memset(vm_regs, 0, sizeof(vm_regs)); | |
for (int op = 0; op < 256; op++) | |
vm_ops[op] = vm_err; | |
vm_ops[ADD] = vm_add; | |
vm_ops[ADDI] = vm_addi; | |
vm_ops[SUB] = vm_sub; | |
vm_ops[SUBI] = vm_subi; | |
vm_ops[AND] = vm_and; | |
vm_ops[ANDI] = vm_andi; | |
vm_ops[OR] = vm_or; | |
vm_ops[ORI] = vm_ori; | |
vm_ops[XOR] = vm_xor; | |
vm_ops[XORI] = vm_xori; | |
vm_ops[LSL] = vm_lsl; | |
vm_ops[LSLI] = vm_lsli; | |
vm_ops[LSR] = vm_lsr; | |
vm_ops[LSRI] = vm_lsri; | |
vm_ops[ASR] = vm_asr; | |
vm_ops[ASRI] = vm_asri; | |
vm_ops[BRK] = vm_brk; | |
vm_ops[HLT] = vm_hlt; | |
} | |
// Simple one-shot stopwatch.
//   start_timer() records the current time in timer_base.
//   stop_timer()  returns the seconds elapsed since start_timer() and resets
//                 the stopwatch; it returns 0.0 if the stopwatch is not running.
#ifdef _WIN32

// Minimal hand-written declarations of the Win32 performance counter API, so
// that <windows.h> does not have to be included.
#pragma comment(lib, "kernel32.lib")
typedef int BOOL;
typedef int64_t LONGLONG;
typedef struct { LONGLONG QuadPart; } LARGE_INTEGER;
#define WINAPI __stdcall
WINAPI BOOL QueryPerformanceFrequency(LARGE_INTEGER *lpFrequency);
WINAPI BOOL QueryPerformanceCounter(LARGE_INTEGER *lpPerformanceCount);

// Counter value captured by start_timer(); 0 means "not running".
LARGE_INTEGER timer_base;

void start_timer(void) {
    QueryPerformanceCounter(&timer_base);
}

double stop_timer(void) {
    if (timer_base.QuadPart == 0) return 0.0;
    LARGE_INTEGER now;
    QueryPerformanceCounter(&now);
    LARGE_INTEGER freq;
    QueryPerformanceFrequency(&freq);
    // Counter ticks divided by ticks-per-second gives seconds.
    double elapsed = (double)(now.QuadPart - timer_base.QuadPart) / freq.QuadPart;
    timer_base.QuadPart = 0;
    return elapsed;
}

#else

// Portable fallback based on C11 timespec_get() from <time.h>.
// Note: TIME_UTC is calendar time, so a clock adjustment during a measurement
// would distort the result.

// Time captured by start_timer(); all-zero means "not running".
struct timespec timer_base;

void start_timer(void) {
    timespec_get(&timer_base, TIME_UTC);
}

double stop_timer(void) {
    if (timer_base.tv_sec == 0 && timer_base.tv_nsec == 0) return 0.0;
    struct timespec now;
    timespec_get(&now, TIME_UTC);
    double elapsed = (double)(now.tv_sec - timer_base.tv_sec)
                   + 1e-9 * (double)(now.tv_nsec - timer_base.tv_nsec);
    timer_base.tv_sec = 0;
    timer_base.tv_nsec = 0;
    return elapsed;
}

#endif
NOINLINE | |
int64_t delay_loop(int64_t cycles) { | |
int64_t i = cycles; | |
do { | |
RW(i); --i; RW(i); --i; RW(i); --i; RW(i); --i; | |
RW(i); --i; RW(i); --i; RW(i); --i; RW(i); --i; | |
RW(i); --i; RW(i); --i; RW(i); --i; RW(i); --i; | |
RW(i); --i; RW(i); --i; RW(i); --i; RW(i); --i; | |
} while (i > 0); | |
R(i); | |
return i; | |
} | |
// Estimated CPU frequency in Hz; set by calc_cpufreq().
double cpufreq;

// Estimates the CPU frequency by timing delay_loop(): the number of
// decrements actually performed divided by the elapsed seconds is stored in
// cpufreq (this presumes the loop retires one decrement per clock cycle).
void calc_cpufreq(void) {
    const int64_t requested = (int64_t)1 << 33;
    start_timer();
    const int64_t leftover = delay_loop(requested);
    const double seconds = stop_timer();
    // delay_loop returns a value <= 0 when it overshoots; subtracting it
    // yields the true number of decrements.
    cpufreq = (double)(requested - leftover) / seconds;
}
// Returns the next value of a 64-bit xorshift generator (shifts 13, 7, 17).
// The generator state is a function-local static with a fixed seed, so the
// produced sequence is the same on every program run.
u64 random_u64(void) {
    static u64 state = 0x2545F4914F6CDD1D;
    u64 s = state;
    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;
    state = s;
    return s;
}
int main() { | |
printf("Estimating CPU frequency...\n"); | |
calc_cpufreq(); | |
printf("CPU frequency: %.2f GHz\n", 1e-9 * cpufreq); | |
u64 n = 1e9; | |
u32 *ip = malloc(n * sizeof(u32)); | |
for (u64 i = 0; i < n; i++) | |
ip[i] = random_u64() & ~0xF0; | |
ip[n-2] = HLT; | |
ip[n-1] = ERR; | |
for (int i = 0; i < 2; i++) { | |
vm_init(); | |
start_timer(); | |
const u32 *last_ip = vm_exec(ip); | |
double time = stop_timer(); | |
if (last_ip != &ip[n-2]) BREAK; | |
double cycles = time * cpufreq; | |
printf("Run %d: %.2f secs/%llu ins, %.0f MIPS, %.2f cycles/ins\n", i, time, n, 1e-6 * n / time, cycles / n); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment