-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfeatures.py
More file actions
47 lines (39 loc) · 1.37 KB
/
features.py
File metadata and controls
47 lines (39 loc) · 1.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import math
from collections import Counter
# Extracting binary word features from text
def ext_feat(item):
    """Return the set of distinct lowercased, whitespace-separated tokens in *item*.

    Each token acts as a binary "word present" feature; duplicates collapse
    because the result is a set.
    """
    words = item.lower().split()
    return set(words)
# Computing mutual information between a binary feature and binary class label
def compute_mut_inf(feat_count, label_counts, joint_count, N):
    """Return the mutual information, in bits, between a binary feature and a binary label.

    Args:
        feat_count: Counts indexed by feature value ``x`` (0 = absent, 1 = present).
        label_counts: Counts indexed by label value ``y`` (0 or 1).
        joint_count: Counts indexed by the pair ``(x, y)``.
        N: Total number of observations used to turn counts into probabilities.

    Returns:
        The sum over the four ``(x, y)`` cells of
        ``p(x, y) * log2(p(x, y) / (p(x) * p(y)))``; cells with zero joint
        probability contribute nothing. Returns 0.0 when ``N`` is 0.
    """
    if N == 0:
        # No observations: probabilities are undefined, so report "no
        # information" instead of failing with a division by zero.
        return 0.0

    mi = 0.0
    for x in (0, 1):
        for y in (0, 1):
            p_xy = joint_count[(x, y)] / N
            if p_xy == 0:
                # 0 * log(0) is taken as 0; skipping also avoids log2(0).
                continue
            p_x = feat_count[x] / N
            p_y = label_counts[y] / N
            mi += p_xy * math.log2(p_xy / (p_x * p_y))
    return mi
# Calculate MI scores for all features in the dataset
# labels: 1 = spam, 0 = ham
def compute_feature_mut_inf(dataset, labels):
    """Score every feature occurring in *dataset* by its mutual information with the label.

    Args:
        dataset: Sized sequence of per-item feature collections (for example
            the sets returned by ``ext_feat``).
        labels: Class labels aligned with *dataset* (1 = spam, 0 = ham).

    Returns:
        dict mapping each feature to the score ``compute_mut_inf`` returns for
        its presence/absence counts against the labels.

    Raises:
        ValueError: If *labels* provides fewer entries than *dataset* has
            items; the counts would then no longer add up to ``N``.
    """
    N = len(dataset)
    all_feat = set().union(*dataset)

    # Gather every count in one pass over the items instead of rescanning the
    # whole dataset once per feature.
    label_counts = Counter()  # label -> number of items
    doc_freq = Counter()      # feature -> number of items containing it
    present = Counter()       # (feature, label) -> items with that label containing it
    paired = 0
    for feat, y in zip(dataset, labels):
        paired += 1
        label_counts[y] += 1
        # set() so a feature repeated within one item is counted once,
        # matching a plain membership test.
        for f in set(feat):
            doc_freq[f] += 1
            present[(f, y)] += 1

    if paired != N:
        # zip() stops at the shorter input; scoring with N = len(dataset)
        # would silently use probabilities that do not sum to 1.
        raise ValueError(
            f"labels has {paired} entries but dataset has {N} items"
        )

    mi_scores = {}
    for f in all_feat:
        n_with = doc_freq[f]
        feat_counts = {0: N - n_with, 1: n_with}
        joint_counts = {}
        for y in (0, 1):
            n_with_y = present[(f, y)]
            joint_counts[(1, y)] = n_with_y
            # Items with label y that lack the feature are the remainder.
            joint_counts[(0, y)] = label_counts[y] - n_with_y
        mi_scores[f] = compute_mut_inf(
            feat_counts, label_counts, joint_counts, N
        )
    return mi_scores
# Select features whose MI score is at or above a given threshold
def slt_feat(features, mi_scores, threshold=0.01):
    """Return the set of *features* whose MI score is at least *threshold*.

    A feature with no entry in *mi_scores* is treated as having score 0.
    """

    def meets_threshold(name):
        return mi_scores.get(name, 0) >= threshold

    return set(filter(meets_threshold, features))