-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprediction_system.py
More file actions
118 lines (102 loc) · 4.73 KB
/
prediction_system.py
File metadata and controls
118 lines (102 loc) · 4.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import shap
import joblib
import pandas as pd
import numpy as np
def load_resources():
    """Load the trained model artifacts plus a scaled SHAP background dataset.

    Returns:
        Tuple of (model, scaler, feature_names, bg_scaled), or
        (None, None, None, None) when any artifact file is missing.
    """
    try:
        model = joblib.load('best_model.pkl')
        scaler = joblib.load('scaler.pkl')
        feature_names = joblib.load('feature_names.pkl')
        # Rebuild a background dataset for SHAP from the same source the
        # model was trained on: since training drew from load_breast_cancer
        # via train_test_split, re-loading that dataset gives a faithful
        # background sample once it is scaled into the model's input space.
        from sklearn.datasets import load_breast_cancer
        dataset = load_breast_cancer()
        background = pd.DataFrame(dataset.data, columns=dataset.feature_names)
        return model, scaler, feature_names, scaler.transform(background)
    except FileNotFoundError:
        return None, None, None, None
def predict_tumor(feature_dict):
    """
    Predict whether a tumor is benign or malignant and explain the prediction.

    Args:
        feature_dict: Mapping of feature name -> numeric value. Must contain
            every feature the model was trained on (extra keys are ignored).

    Returns:
        dict with keys:
            'prediction' ('Benign' or 'Malignant'),
            'malignant_probability' (float),
            'benign_probability' (float),
            'explanations' (top-5 features by absolute SHAP impact;
             empty dict if SHAP explanation fails).

    Raises:
        ValueError: if the model artifacts have not been created yet.
    """
    model, scaler, feature_names, bg_scaled = load_resources()
    # Explicit None check: sklearn ensembles define __len__, so plain
    # truthiness (`not model`) could misbehave on a loaded estimator.
    if model is None:
        raise ValueError("Model parts not found. Ensure model_training.py is run first.")
    # Create DataFrame from input and reorder columns to the training order.
    input_df = pd.DataFrame([feature_dict])
    input_df = input_df[feature_names]
    # Preprocess and predict.
    scaled_data = scaler.transform(input_df)
    proba = model.predict_proba(scaled_data)[0]
    pred = model.predict(scaled_data)[0]
    class_name = 'Malignant' if pred == 1 else 'Benign'
    # Map probabilities to classes via model.classes_ rather than assuming
    # column order; cast to plain float so the result is JSON-serializable.
    malignant_prob = float(proba[1] if model.classes_[1] == 1 else proba[0])
    benign_prob = float(proba[0] if model.classes_[1] == 1 else proba[1])
    # SHAP Explainability
    try:
        # Index of the 'Malignant' (label == 1) class in the model's outputs.
        class_index = np.where(model.classes_ == 1)[0][0]
        if hasattr(model, 'estimators_'):
            # Ensemble (e.g. RandomForest): TreeExplainer is exact and fast.
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(scaled_data)
            if isinstance(shap_values, list):
                # Older SHAP versions return one array per class.
                contributions = shap_values[class_index][0]
            elif len(shap_values.shape) == 3:
                # Newer versions return (n_samples, n_features, n_classes).
                contributions = shap_values[0, :, class_index]
            else:
                contributions = shap_values[0]  # Fallback
        else:
            # Generic models (LogisticRegression, SVM, ...): model-agnostic
            # KernelExplainer. Summarize the background with k-means so the
            # explainer stays fast.
            bg_summary = shap.kmeans(bg_scaled, 10)
            explainer = shap.KernelExplainer(model.predict_proba, bg_summary)
            shap_values = explainer.shap_values(scaled_data)
            # KernelExplainer typically returns a list of arrays, one per class.
            if isinstance(shap_values, list):
                contributions = shap_values[class_index][0]
            else:
                contributions = shap_values[0]
        # Rank features by absolute SHAP impact and keep the top 5.
        ranked = sorted(zip(feature_names, contributions),
                        key=lambda pair: abs(pair[1]), reverse=True)
        top_5_features = {name: float(value) for name, value in ranked[:5]}
    except Exception as e:
        # Best-effort: a prediction without explanations is still useful.
        print(f"SHAP explanation failed: {e}")
        top_5_features = {}
    return {
        'prediction': class_name,
        'malignant_probability': malignant_prob,
        'benign_probability': benign_prob,
        'explanations': top_5_features
    }
if __name__ == "__main__":
    # Smoke test: generate random feature values and run one prediction.
    import random
    # load_resources returns FOUR values; the original unpacked only three,
    # which raised "ValueError: too many values to unpack" before any test
    # could run. Discard the unused scaler and background explicitly.
    _, _, f_names, _ = load_resources()
    if f_names is not None:
        sample_data = {f: random.uniform(0.1, 10.0) for f in f_names}
        res = predict_tumor(sample_data)
        print("Test Prediction:")
        print(res)
    else:
        print("Please train the model first.")