Last active
November 29, 2016 14:41
-
-
Save ryanwitt/3167463 to your computer and use it in GitHub Desktop.
silly huffman coding example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__all__ = ( | |
'frequency_histogram', | |
'endode_simple', | |
'decode_simple', | |
'huffman_tree', | |
'encode_character', | |
'encode_huffman', | |
'hist2dot', | |
'tree2dot', | |
) | |
phrase = 'A man a plan a canal Panama'.lower() | |
def frequency_histogram(phrase): | |
""" | |
>>> frequency_histogram(phrase) | |
[(10, 'a'), (6, ' '), (4, 'n'), (2, 'p'), (2, 'm'), (2, 'l'), (1, 'c')] | |
""" | |
d = {} | |
for c in phrase: | |
d[c] = d.get(c, 0) + 1 | |
return list(reversed(sorted(zip(d.values(), d.keys())))) | |
def encode_simple(phrase, tree=None): | |
""" | |
>>> ''.join(encode_simple(phrase)) | |
'01011110011010010111011111001101001011111100110011111010111001100111100' | |
""" | |
if tree is None: | |
tree = frequency_histogram(phrase) | |
for c in phrase: | |
for count, character in tree: | |
if character == c: | |
yield '0' | |
break | |
else: | |
yield '1' | |
def decode_simple(phrase, tree): | |
""" | |
>>> ''.join(decode_simple('01011110011010010111011111001101001011111100110011111010111001100111100', frequency_histogram(phrase))) | |
'a man a plan a canal panama' | |
""" | |
cur = iter(tree) | |
for c in phrase: | |
if c == '1': | |
cur.next() | |
elif c == '0': | |
yield cur.next()[1] | |
cur = iter(tree) | |
import heapq | |
def huffman_tree(phrase): | |
""" | |
>>> huffman_tree(phrase) | |
(27, None, (10, 'a', None, None), (17, None, (7, None, (3, None, (1, 'c', None, None), (2, 'l', None, None)), (4, None, (2, 'm', None, None), (2, 'p', None, None))), (10, None, (4, 'n', None, None), (6, ' ', None, None)))) | |
""" | |
tree = [(freq, char, None, None) for freq, char in frequency_histogram(phrase)] # last 2 args are left and right children | |
heapq.heapify(tree) | |
while len(tree) > 1: | |
p1, c1, r1, l1 = heapq.heappop(tree) | |
p2, c2, r2, l2 = heapq.heappop(tree) | |
heapq.heappush(tree, (p1 + p2, None, (p1, c1, r1, l1), (p2, c2, r2, l2))) | |
return tree[0] | |
def encode_character(c, tree, append=''): | |
""" | |
>>> encode_character('a', huffman_tree(phrase)) | |
'0' | |
""" | |
if not tree: | |
return string_so_far | |
freq, character, right_child, left_child = tree | |
if c == character: | |
return append | |
if right_child: | |
r = encode_character(c, right_child, append='1') | |
if r: | |
return append + r | |
if left_child: | |
l = encode_character(c, left_child, append='0') | |
if l: | |
return append + l | |
return None | |
def encode_huffman(phrase, tree=None): | |
""" | |
>>> encode_huffman(phrase) | |
'10000101100100010000100011010010001000011110011011000001001001101011' | |
""" | |
if tree is None: | |
tree = huffman_tree(phrase) | |
return ''.join(encode_character(c, tree) for c in phrase) | |
def hist2dot(hist): | |
print 'digraph g {' | |
last = 'root' | |
i = 0 | |
for freq, c in hist: | |
print last, '-> ', c.replace(' ','space'), ' [label=0];' | |
new = bin(i).replace('b','') | |
print last, '->', new, '[label=1];' | |
last = new | |
i += 1 | |
print '}' | |
def tree2dot(tree): | |
total, char, left, right = tree | |
titles = {} | |
def get_title(node): | |
freq, char, left, right = node | |
if id(node) not in titles: | |
count = len(titles) | |
titles[id(node)] = count | |
return titles[id(node)] | |
def get_node(node): | |
freq, char, left, right = node | |
title = get_title(node) | |
yield '\t%s [label="%sp=%0.3f"];' % ( | |
title, | |
'%s\\n' % repr(char).encode('unicode-escape').replace('"', '\\"') if char else '', | |
1.0 * freq / total, | |
) | |
if right: | |
yield '\t%s -> %s [label=1];' % (title, get_title(right)) | |
if left: | |
yield '\t%s -> %s [label=0];' % (title, get_title(left)) | |
def inner(node): | |
freq, char, left, right = node | |
text = list(get_node(node)) | |
if left: | |
text.extend(inner(left)) | |
if right: | |
text.extend(inner(right)) | |
return text | |
text = ['digraph g {'] | |
text.extend(inner(tree)) | |
text.append('}') | |
return '\n'.join(text); | |
if __name__ == '__main__': | |
import sys | |
if '--help' in sys.argv: | |
print >>sys.stderr, 'usage: huffman.py --help # print this message' | |
print >>sys.stderr, ' huffman.py --test # run tests' | |
print >>sys.stderr, ' huffman.py < file # output the string representation of binary encoding of file' | |
print >>sys.stderr, ' huffman.py --tree < file # output the tree (in dot format) generated from file' | |
raise SystemExit(0) | |
if '--test' in sys.argv: | |
def test_character(c, phrase): | |
print c, encode_character(c, huffman_tree(phrase)), phrase | |
def test_phrase(phrase): | |
print "Phrase: %s" % phrase | |
print "Length: %d" % (len(phrase) * 8) | |
encoded = encode_huffman(phrase) | |
print "Encoded: %s" % encoded | |
print "Code Length: %d" % len(encoded) | |
print "Compression: %f" % (1.0 * len(encoded) / (len(phrase) * 8)) | |
test_character('a', 'a'*50 + 'b'*25 + 'c'*13 + 'd'*12) | |
test_character('b', 'a'*50 + 'b'*25 + 'c'*13 + 'd'*12) | |
test_character('c', 'a'*50 + 'b'*25 + 'c'*13 + 'd'*12) | |
test_character('d', 'a'*50 + 'b'*25 + 'c'*13 + 'd'*12) | |
test_phrase(phrase) | |
test_phrase('a'*50 + 'b'*25 + 'c'*13 + 'd'*12) | |
test_character('a', phrase) | |
test_character(' ', phrase) | |
test_character('m', phrase) | |
test_character('n', phrase) | |
test_character('p', phrase) | |
test_character('c', phrase) | |
test_character('l', phrase) | |
raise SystemExit(0) | |
def calculate_compression(original, encoded): | |
original_bits = len(original.decode('UTF-8')*8) | |
encoded_bits = len(encoded) | |
print >>sys.stderr, "Original bits: %d" % original_bits | |
print >>sys.stderr, "Encoded bits: %d" % encoded_bits | |
print >>sys.stderr, "Compression: %f" % (1.0 * encoded_bits / original_bits) | |
original = sys.stdin.read() | |
tree = huffman_tree(original) | |
encoded = encode_huffman(original, tree) | |
calculate_compression(original, encoded) | |
if '--tree' in sys.argv: | |
digraph = tree2dot(tree) | |
print digraph | |
else: | |
print encoded |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment