-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprediction_system.py
More file actions
118 lines (102 loc) · 4.73 KB
/
prediction_system.py
File metadata and controls
118 lines (102 loc) · 4.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import shap
import joblib
import pandas as pd
import numpy as np
def load_resources():
    """Load the trained model artifacts plus a scaled SHAP background dataset.

    Returns:
        Tuple of (model, scaler, feature_names, bg_scaled), or
        (None, None, None, None) when any artifact file is missing.
    """
    try:
        model = joblib.load('best_model.pkl')
        scaler = joblib.load('scaler.pkl')
        feature_names = joblib.load('feature_names.pkl')
        # Rebuild a background dataset for SHAP from the same source the
        # model was trained on: since training drew from load_breast_cancer
        # via train_test_split, re-loading that dataset gives a faithful
        # background sample once it is scaled into the model's input space.
        from sklearn.datasets import load_breast_cancer
        dataset = load_breast_cancer()
        background = pd.DataFrame(dataset.data, columns=dataset.feature_names)
        return model, scaler, feature_names, scaler.transform(background)
    except FileNotFoundError:
        return None, None, None, None
def predict_tumor(feature_dict):
    """
    Predict whether a tumor is benign or malignant and explain the prediction.

    Args:
        feature_dict: Mapping of feature name -> numeric value. Must contain
            every feature the model was trained on (extra keys are ignored).

    Returns:
        dict with keys:
            'prediction' ('Benign' or 'Malignant'),
            'malignant_probability' (float),
            'benign_probability' (float),
            'explanations' (top-5 features by absolute SHAP impact;
             empty dict if SHAP explanation fails).

    Raises:
        ValueError: if the model artifacts have not been created yet.
    """
    model, scaler, feature_names, bg_scaled = load_resources()
    # Explicit None check: sklearn ensembles define __len__, so plain
    # truthiness (`not model`) could misbehave on a loaded estimator.
    if model is None:
        raise ValueError("Model parts not found. Ensure model_training.py is run first.")
    # Create DataFrame from input and reorder columns to the training order.
    input_df = pd.DataFrame([feature_dict])
    input_df = input_df[feature_names]
    # Preprocess and predict.
    scaled_data = scaler.transform(input_df)
    proba = model.predict_proba(scaled_data)[0]
    pred = model.predict(scaled_data)[0]
    class_name = 'Malignant' if pred == 1 else 'Benign'
    # Map probabilities to classes via model.classes_ rather than assuming
    # column order; cast to plain float so the result is JSON-serializable.
    malignant_prob = float(proba[1] if model.classes_[1] == 1 else proba[0])
    benign_prob = float(proba[0] if model.classes_[1] == 1 else proba[1])
    # SHAP Explainability
    try:
        # Index of the 'Malignant' (label == 1) class in the model's outputs.
        class_index = np.where(model.classes_ == 1)[0][0]
        if hasattr(model, 'estimators_'):
            # Ensemble (e.g. RandomForest): TreeExplainer is exact and fast.
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(scaled_data)
            if isinstance(shap_values, list):
                # Older SHAP versions return one array per class.
                contributions = shap_values[class_index][0]
            elif len(shap_values.shape) == 3:
                # Newer versions return (n_samples, n_features, n_classes).
                contributions = shap_values[0, :, class_index]
            else:
                contributions = shap_values[0]  # Fallback
        else:
            # Generic models (LogisticRegression, SVM, ...): model-agnostic
            # KernelExplainer. Summarize the background with k-means so the
            # explainer stays fast.
            bg_summary = shap.kmeans(bg_scaled, 10)
            explainer = shap.KernelExplainer(model.predict_proba, bg_summary)
            shap_values = explainer.shap_values(scaled_data)
            # KernelExplainer typically returns a list of arrays, one per class.
            if isinstance(shap_values, list):
                contributions = shap_values[class_index][0]
            else:
                contributions = shap_values[0]
        # Rank features by absolute SHAP impact and keep the top 5.
        ranked = sorted(zip(feature_names, contributions),
                        key=lambda pair: abs(pair[1]), reverse=True)
        top_5_features = {name: float(value) for name, value in ranked[:5]}
    except Exception as e:
        # Best-effort: a prediction without explanations is still useful.
        print(f"SHAP explanation failed: {e}")
        top_5_features = {}
    return {
        'prediction': class_name,
        'malignant_probability': malignant_prob,
        'benign_probability': benign_prob,
        'explanations': top_5_features
    }
if __name__ == "__main__":
    # Smoke test: generate random feature values and run one prediction.
    import random
    # load_resources returns FOUR values; the original unpacked only three,
    # which raised "ValueError: too many values to unpack" before any test
    # could run. Discard the unused scaler and background explicitly.
    _, _, f_names, _ = load_resources()
    if f_names is not None:
        sample_data = {f: random.uniform(0.1, 10.0) for f in f_names}
        res = predict_tumor(sample_data)
        print("Test Prediction:")
        print(res)
    else:
        print("Please train the model first.")