Last active
August 29, 2015 14:09
-
-
Save wrunk/a6f19ff3d9de23a57fb9 to your computer and use it in GitHub Desktop.
Creating GUIDs in python using the first two chars to get a shard
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Written by Warren Runk
# This file is free software in the public domain.
import base64
import uuid
from collections import Counter
""" | |
******* Deprecated. I went the route of explicit shard in the node section.
See: https://gist.github.com/wrunk/b6d340297e7a9f3d97a0
Get a GUID - URL safe, Base32, all lowercase, looks like:
rwc5wtj73fdqjasiii5texzize
aruud7l6ijarfdk2a3kstwqmnu
p4zkxwv5hvfsbmp7o5b7cktb5i
We also provide the ability to get the bucket for the ID (0-1023)
""" | |
def _get_base_int(c):
    """Return the bucket digit (0-31) for one lowercase base32 character.

    The digits ``'2'``-``'7'`` map to 0-5 and the letters ``'a'``-``'z'``
    map to 6-31. This is a fixed one-to-one numbering used only for
    bucketing; it is not the RFC 4648 symbol value.

    Raises:
        ValueError: if ``c`` is not one of ``a-z`` or ``2-7``. Without this
            check such characters would silently produce colliding or
            out-of-range values (e.g. ``'A'`` -> 15, ``'0'`` -> -2).
    """
    code_point = ord(c)
    if 50 <= code_point <= 55:  # '2'..'7'
        return code_point - 50
    if 97 <= code_point <= 122:  # 'a'..'z'
        return code_point - 91
    raise ValueError("not a lowercase base32 character: %r" % (c,))


def get_bucket_int(gid):
    """Return the bucket (0-1023) that a GUID belongs to.

    Only the first two characters of ``gid`` are used. Each character has
    32 possible values, so two characters give 32 * 32 = 1024 buckets: the
    value of the first character is multiplied by 32 and added to the value
    of the second character.

    Distribution reported by the original author for one million GUIDs:

        Total # of bucket ids 1024
        [MAX] Buck id (0-1023) (642) # of ids generated in this bucket (1076)
        [MIN] Buck id (0-1023) (12) # of ids generated in this bucket (883)
        Median pos 511 977
        Median pos 512 977
        Median pos 513 977

    Raises:
        IndexError: if ``gid`` has fewer than two characters.
        ValueError: if either of the first two characters is not a
            lowercase base32 character (``a-z``, ``2-7``).
    """
    return (_get_base_int(gid[0]) * 32) + _get_base_int(gid[1])
def get_new_guid():
    """Return a new 26-character globally unique ID as a native ``str``.

    The ID is the 16 bytes of a random UUID4, Base32-encoded, lowercased,
    and with the ``=`` padding removed, so it contains only ``a-z`` and
    ``2-7``.
    """
    encoded = base64.b32encode(uuid.uuid4().bytes)
    # b32encode returns bytes on Python 3; convert so the result is a text
    # string there too (on Python 2 bytes is already str, so this is skipped).
    if not isinstance(encoded, str):
        encoded = encoded.decode('ascii')
    # Padding only ever appears at the end of the encoded value.
    return encoded.lower().rstrip('=')
def print_bucket_distro_stats(samples=1000000):
    """Generate GUIDs and print how they are distributed across buckets.

    Prints, in order: the bucket -> count mapping, the number of distinct
    buckets that were hit, the most and least populated buckets, and the
    counts found at positions 511, 512 and 513 of the ascending list of
    counts (the middle when all 1024 buckets are populated).

    Args:
        samples: number of GUIDs to generate. Defaults to one million.
    """
    from pprint import pprint

    # Counter starts every bucket at zero, so each GUID is counted exactly
    # once (a hand-rolled "set to 1, then += 1" tally over-counts by one).
    bucket_counts = Counter(
        get_bucket_int(get_new_guid()) for _ in range(samples)
    )
    pprint(dict(bucket_counts))
    print("Total # of bucket ids %i" % len(bucket_counts))
    if not bucket_counts:
        # Nothing was generated, so there is no min/max/median to report.
        return

    # Derive min/max from the data rather than from a fixed sentinel, so
    # the report stays correct for any sample size.
    ranked = bucket_counts.most_common()
    max_bucket, max_count = ranked[0]
    min_bucket, min_count = ranked[-1]
    print("[MAX] Buck id (0-1023) (%s) # of ids generated in this bucket (%i)"
          % (max_bucket, max_count))
    print("[MIN] Buck id (0-1023) (%s) # of ids generated in this bucket (%i)"
          % (min_bucket, min_count))

    # Sort once and reuse; skip positions that do not exist when a small
    # sample leaves some buckets empty.
    ordered_counts = sorted(bucket_counts.values())
    for pos in (511, 512, 513):
        if pos < len(ordered_counts):
            print("Median pos %i %i" % (pos, ordered_counts[pos]))
def print_sample(count=20):
    """Print freshly generated GUIDs together with their lengths.

    Args:
        count: how many GUIDs to print. Defaults to 20.
    """
    for _ in range(count):
        guid = get_new_guid()
        # A single pre-formatted argument prints identically whether print
        # is a statement (Python 2) or a function (Python 3).
        print("GUID: (%s) LEN (%i)" % (guid, len(guid)))
if __name__ == '__main__':
    # When run as a script: print the bucket distribution report first,
    # then a short sample of generated GUIDs.
    print_bucket_distro_stats()
    print_sample()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment