-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhuffman.py
More file actions
117 lines (90 loc) · 3.25 KB
/
huffman.py
File metadata and controls
117 lines (90 loc) · 3.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import http.client
import random
def random_string(n):
return "".join(chr(random.randrange(128)) for i in range(n))
def str_to_bin(text):
""" translates text to binary reprersentation using
7 bits per character. """
return "".join(['{:07b}'.format(ord(ch)) for ch in text])
def purify(text):
""" Discards non ascii characters from text"""
return "".join(ch for ch in text
if ord(ch)<128)
def char_count(text):
""" counts the number of occurances of ascii characters -
ord(ch)<128, in a text. Returns a dictionary, with keys
being the observed characters, values being the counts """
d = {}
for ch in text:
if ord(ch)>=128: continue
if ch in d:
d[ch] = 1+d[ch]
else:
d[ch]=1
return d
def build_huffman_tree(letter_count):
""" recieves dictionary with char:count entries
generates a LIST structure representing
the binary Huffman encoding tree"""
queue = [(x, px) for x,px in letter_count.items()]
while len(queue) > 1:
# combine two smallest elements
a, pa = extract_min(queue) # smallest in queue
b, pb = extract_min(queue) # next smallest
chars = [a,b]
weight = pa+pb # combined weight
queue.append((chars,weight)) # insert new node
# print(queue) # to see what whole queue is
# print()
x, px = extract_min(queue) # only root node left
return x
def extract_min(queue):
P = [px for x,px in queue]
return queue.pop(P.index(min(P)))
def generate_code(huff_tree, prefix=""):
""" receives a Huffman tree with embedded encoding,
and a prefix of encodings.
returns a dictionary where characters are
keys and associated binary strings are values."""
if isinstance(huff_tree, str): # a leaf
return {huff_tree: prefix}
else:
lchild, rchild = huff_tree[0], huff_tree[1]
codebook = {}
codebook.update( generate_code(lchild, prefix+'0'))
codebook.update( generate_code(rchild, prefix+'1'))
# oh, the beauty of recursion...
return codebook
def compress(text, encoding_dict):
""" compress text using encoding dictionary """
assert isinstance(text, str)
return "".join(encoding_dict[ch] for ch in text if ord(ch)<=128)
def build_decoding_dict(encoding_dict):
"""build the "reverse" of encoding dictionary"""
return {y:x for (x,y) in encoding_dict.items()}
# return {y:x for x,y in encoding_dict.items()} # OK too
def decompress(bits, decoding_dict):
prefix = ""
result = []
for bit in bits:
prefix += bit
if prefix in decoding_dict:
result.append(decoding_dict[prefix])
prefix = ""
assert prefix == "" # must finish last codeword
return "".join(result) # converts list of chars to a string
def download_wiki():
HOST = "en.wikipedia.org"
GET_METHOD = "GET"
URL = "/wiki/Huffman_coding"
ENCODING = "utf-8"
# Get the HTML raw data
conn = http.client.HTTPConnection(HOST)
conn.request(GET_METHOD, URL)
data = conn.getresponse().read()
conn.close()
return str(data, ENCODING)
def wiki_cycle():
wikitext=download_wiki()
wikicount=char_count(wikitext)
return build_huffman_tree(wikicount)