Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified flask_backend/models/BreastCancer_SVM_v1/chemo_model.pkl
Binary file not shown.
336 changes: 235 additions & 101 deletions flask_backend/models/BreastCancer_SVM_v1/chemo_model_training.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,17 @@

# Breast Cancer Response Prediction (Post-treatment) Combinational Treatment
# This notebook trains a SVM Classifier to predict patient response based on post-treatment intervention data.
"""
Breast Cancer Response Prediction - Chemo Pre-treatment
"""

import warnings
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.svm import SVC
from utils import remove_iqr_outliers
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import GroupShuffleSplit
from utils import remove_iqr_outliers, apply_iqr_bounds
from argparse import ArgumentParser

warnings.filterwarnings("ignore")
Expand All @@ -23,131 +20,268 @@
parser.add_argument("--input", type=str, required=True)
args = parser.parse_args()

# Load Data
print("="*70)
print("CHEMO PRE-TREATMENT MODEL TRAINING")
print("="*70)

# ========================================
# 1. LOAD & FILTER DATA
# ========================================
print("\n[1] Loading and filtering data...")
df = pd.read_csv(args.input)

# Filter for Post_treatment
df_pre_chemo = df[(df['Timeline'] == 'Pre_treatment') & (df['Treatment'] == 'Chemo')].copy()
print(f"Original shape: {df.shape}, Pre-treatment shape: {df_pre_chemo.shape}")
print(f"Original shape: {df.shape}")
print(f"Pre-treatment Chemo shape: {df_pre_chemo.shape}")
print(f"Patients: {df_pre_chemo['Patient_code'].nunique()}")
print(f"Response distribution:\n{df_pre_chemo['Response'].value_counts()}")

# Drop redundant/irrelevant columns
drop_cols = ['Tissue', 'Identifier', 'Timeline', 'defcls', 'Treatment']
df_chemo = df_pre_chemo.drop(columns=drop_cols)

# Encode categorical variables
le = LabelEncoder()
categorical_cols = ['Response']

label_mappings = {}
for col in categorical_cols:
df_chemo[col] = le.fit_transform(df_chemo[col])
label_mappings[col] = dict(zip(le.classes_, le.transform(le.classes_)))
if 'group' in df_pre_chemo.columns:
drop_cols.append('group')
if 'batch' in df_pre_chemo.columns:
drop_cols.append('batch')
if 'myleiden' in df_pre_chemo.columns:
drop_cols.append('myleiden')

#One Hot Encode Origin
categorical_cols = ['Origin']
for col in categorical_cols:
encoder = OneHotEncoder(sparse_output=False)
encoded_array = encoder.fit_transform(df_chemo[[col]])
encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out([col]))
df_chemo = pd.concat([df_chemo.drop(columns=[col]).reset_index(drop=True), encoded_df], axis=1)

#Outlier Removal

feature_cols = ["Expression", "Origin_breast", "Origin_liver", "nGene", "percent_mito", "percent_hsp", "percent_ig", "percent_rp", "nUMI", "PDCD1"]
label_cols = ["Response"]
df_chemo_ = remove_iqr_outliers(df_chemo, feature_cols)
df_chemo = df_pre_chemo.drop(columns=drop_cols)

#Model Training
# ========================================
# 2. TRAIN/TEST SPLIT
# ========================================
print("\n[2] Splitting train/test by patient...")

gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

# groups = your patient IDs
groups = df_chemo_["Patient_code"]

train_idx, test_idx = next(gss.split(df_chemo_, df_chemo_["Response"], groups=groups))

train_df = df_chemo_.iloc[train_idx]
test_df = df_chemo_.iloc[test_idx]

# -------------------------
# Define Model
# -------------------------
groups = df_chemo["Patient_code"]

train_idx, test_idx = next(gss.split(df_chemo, df_chemo["Response"], groups=groups))

train_df = df_chemo.iloc[train_idx].copy()
test_df = df_chemo.iloc[test_idx].copy()

# ========================================
# 3. ENCODE RESPONSE (FIT ON TRAIN ONLY)
# ========================================
print("\n[3] Encoding Response variable (fit on train only)...")

response_encoder = LabelEncoder()
train_df['Response'] = response_encoder.fit_transform(train_df['Response'])
test_df['Response'] = response_encoder.transform(test_df['Response'])

# ========================================
# 4. ONE-HOT ENCODE ORIGIN
# FIT ON ALL UNIQUE VALUES (EXCEPTION TO THE RULE)
# ========================================
print("\n[4] One-hot encoding Origin...")

# Get ALL unique Origin values from both train and test
all_origins = pd.concat([train_df['Origin'], test_df['Origin']]).unique().reshape(-1, 1)

# Fit encoder on all possible categories
origin_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
origin_encoder.fit(all_origins)

print(f"Origin categories: {origin_encoder.categories_[0]}")

# Transform both train and test
train_origin_encoded = origin_encoder.transform(train_df[['Origin']])
test_origin_encoded = origin_encoder.transform(test_df[['Origin']])

# Create DataFrames
train_origin_df = pd.DataFrame(
train_origin_encoded,
columns=origin_encoder.get_feature_names_out(['Origin']),
index=train_df.index
)
test_origin_df = pd.DataFrame(
test_origin_encoded,
columns=origin_encoder.get_feature_names_out(['Origin']),
index=test_df.index
)

# Concatenate
train_df = pd.concat([train_df.drop(columns=['Origin']), train_origin_df], axis=1)
test_df = pd.concat([test_df.drop(columns=['Origin']), test_origin_df], axis=1)

print(f"Origin columns created: {list(origin_encoder.get_feature_names_out(['Origin']))}")

# ========================================
# 5. OUTLIER REMOVAL (FIT ON TRAIN ONLY)
# ========================================
print("\n[5] Removing outliers (fit on train only)...")

feature_cols = ["Expression", "Origin_breast", "Origin_liver", "nGene",
"percent_mito", "percent_hsp", "percent_ig", "percent_rp",
"nUMI", "PDCD1"]

print(f"Train before outlier removal: {len(train_df)} cells")
train_df, iqr_bounds = remove_iqr_outliers(train_df, feature_cols)
print(f"Train after outlier removal: {len(train_df)} cells")

print(f"Test before outlier removal: {len(test_df)} cells")
test_df = apply_iqr_bounds(test_df, iqr_bounds)
print(f"Test after outlier removal: {len(test_df)} cells")

# ========================================
# 6. PREPARE FEATURES
# ========================================
X_train = train_df[feature_cols].values
y_train = train_df['Response'].values
groups_train = train_df['Patient_code'].values

X_test = test_df[feature_cols].values
y_test = test_df['Response'].values

print(f"\n[6] Feature shapes:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

# ========================================
# 7. DEFINE MODEL
# ========================================
def build_model():
return make_pipeline(
StandardScaler(),
SVC(
return Pipeline([
('scaler', StandardScaler()),
('svm', SVC(
kernel="rbf",
C=15,
gamma=0.1,
class_weight="balanced",
probability=True,
random_state=42
)
)
))
])

# ========================================
# 8. CROSS-VALIDATION (GroupKFold)
# ========================================
print("\n[7] Cross-validation with GroupKFold...")

X = train_df[feature_cols]
y = train_df[label_cols]
n_patients = train_df['Patient_code'].nunique()
n_folds = min(3, n_patients)
print(f"Using {n_folds}-fold GroupKFold CV")

kf = KFold(n_splits=3, shuffle=True, random_state=42)
gkf = GroupKFold(n_splits=n_folds)

best_auc = -1
best_model = None
cv_results = []
fold_num = 1
best_cm = None
# -------------------------
# KFold CV Loop
# -------------------------
for train_idx, val_idx in kf.split(X):

print(f"\n==============================")
print(f" Fold {fold_num}")
print("==============================")
for train_idx, val_idx in gkf.split(X_train, y_train, groups_train):
print(f"\n{'='*50}")
print(f"Fold {fold_num}/{n_folds}")
print(f"{'='*50}")

model = build_model()

X_tr, X_test = X.iloc[train_idx], X.iloc[val_idx]
y_tr, y_test = y.iloc[train_idx], y.iloc[val_idx]
X_tr, X_val = X_train[train_idx], X_train[val_idx]
y_tr, y_val = y_train[train_idx], y_train[val_idx]

# Get patient IDs for this fold
train_pats = train_df.iloc[train_idx]['Patient_code'].unique()
val_pats = train_df.iloc[val_idx]['Patient_code'].unique()

print(f"Train patients: {sorted(train_pats)}")
print(f"Val patients: {sorted(val_pats)}")
print(f"Train: {len(X_tr)} cells, Val: {len(X_val)} cells")

# Train
model.fit(X_tr, y_tr)
model.fit(X_tr, y_tr.ravel())

# Predict
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)[:, 1]

# Metrics
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob) if len(np.unique(y_test)) > 1 else np.nan
cm = confusion_matrix(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")
print(f"AUC-ROC: {auc:.4f}")
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# -------------------------
# SAVE BEST MODEL
# -------------------------
if auc > best_auc: # <-- CHANGE TO acc > best_acc IF YOU WANT ACCURACY
best_auc = auc
best_model = model
best_cm = cm
print("🔥 New best model found and stored.")
acc = accuracy_score(y_val, y_pred)
auc = roc_auc_score(y_val, y_prob) if len(np.unique(y_val)) > 1 else np.nan
cm = confusion_matrix(y_val, y_pred)

fold_num += 1
print(f"Accuracy: {acc:.4f}")
print(f"AUC-ROC: {auc:.4f}")
print(f"Confusion Matrix:\n{cm}")

print("\n==============================")
print(" BEST MODEL FROM CV")
print("==============================")
print(f"Best AUC: {best_auc:.4f}")
cv_results.append({
'fold': fold_num,
'accuracy': acc,
'auc': auc
})

#Confusion Matrix
fold_num += 1

print("\nConfusion Matrix:\n", best_cm)
# CV Summary
cv_df = pd.DataFrame(cv_results)
print(f"\n{'='*50}")
print("CROSS-VALIDATION SUMMARY")
print(f"{'='*50}")
print(cv_df.to_string(index=False))
print(f"\nMean Accuracy: {cv_df['accuracy'].mean():.4f} ± {cv_df['accuracy'].std():.4f}")
if not cv_df['auc'].isna().all():
print(f"Mean AUC: {cv_df['auc'].mean():.4f} ± {cv_df['auc'].std():.4f}")

# ========================================
# 9. TRAIN FINAL MODEL (on full training set)
# ========================================
print(f"\n[8] Training final model on FULL training set...")

final_model = build_model()
final_model.fit(X_train, y_train.ravel())

# ========================================
# 10. EVALUATE ON TEST SET
# ========================================
print(f"\n[9] Evaluating on held-out test set...")

y_test_pred = final_model.predict(X_test)
y_test_prob = final_model.predict_proba(X_test)[:, 1] if len(np.unique(y_test)) > 1 else None

test_acc = accuracy_score(y_test, y_test_pred)
test_auc = roc_auc_score(y_test, y_test_prob) if y_test_prob is not None and len(np.unique(y_test)) > 1 else np.nan
test_cm = confusion_matrix(y_test, y_test_pred)

print(f"\n{'='*50}")
print("TEST SET EVALUATION")
print(f"{'='*50}")
print(f"Accuracy: {test_acc:.4f}")
print(f"AUC-ROC: {test_auc:.4f}")
print(f"Confusion Matrix:\n{test_cm}")
if len(np.unique(y_test)) > 1:
print(f"\nClassification Report:\n{classification_report(y_test, y_test_pred, target_names=response_encoder.classes_)}")

# ========================================
# 11. SAVE MODEL & ARTIFACTS
# ========================================
print(f"\n[10] Saving model and preprocessing artifacts...")

model_artifact = {
'model': final_model,
'response_encoder': response_encoder,
'origin_encoder': origin_encoder,
'iqr_bounds': iqr_bounds,
'feature_cols': feature_cols,
'train_patients': sorted(train_df['Patient_code'].unique()),
'test_patients': sorted(test_df['Patient_code'].unique()),
'cv_results': cv_df.to_dict('records'),
'test_metrics': {
'accuracy': test_acc,
'auc': test_auc,
'confusion_matrix': test_cm.tolist()
}
}

# -------------------------
# SAVE BEST MODEL TO .pkl
# -------------------------
with open("chemo_model.pkl", "wb") as f:
pickle.dump(best_model, f)

print("\n✅ Chemo model saved")
pickle.dump(model_artifact, f)

print("Chemo model saved to: chemo_model.pkl")
print("\nModel artifact includes:")
print(" - Trained model (with fitted StandardScaler)")
print(" - Response encoder")
print(" - Origin encoder")
print(" - IQR bounds")
print(" - Feature columns")
print(" - CV results")
print(" - Test metrics")

print("\n" + "="*70)
print("TRAINING COMPLETE")
print("="*70)
Binary file modified flask_backend/models/BreastCancer_SVM_v1/combo_model.pkl
Binary file not shown.
Loading