-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_notebook.py
More file actions
88 lines (72 loc) · 3 KB
/
generate_notebook.py
File metadata and controls
88 lines (72 loc) · 3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""Generate ``data_analysis.ipynb``: an EDA notebook for the scikit-learn
breast-cancer dataset (data loading, inspection, preprocessing, plots, PCA).

Running this script writes ``data_analysis.ipynb`` to the current directory
and prints a confirmation message.
"""
import nbformat as nbf

# Build an empty v4 notebook and attach the cells in presentation order:
# markdown headers interleaved with the code cells they describe.
nb = nbf.v4.new_notebook()
nb['cells'] = [
nbf.v4.new_markdown_cell('# AI-Based Breast Cancer Detection\n## Exploratory Data Analysis'),
nbf.v4.new_markdown_cell('### 1. Data Acquisition\nLoad the Dataset using scikit-learn.'),
# Cell: load the dataset and remap targets so 1 = Malignant, 0 = Benign
# (sklearn's native encoding is the opposite: 0 = malignant, 1 = benign).
nbf.v4.new_code_cell('''import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
# Load Dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target).map({0: 1, 1: 0}) # Change targets: 1 (Malignant), 0 (Benign)
df = X.copy()
df['target'] = y
print("Data shape:", df.shape)
'''),
nbf.v4.new_markdown_cell('### 2. Data Understanding\nDisplay dataset structure, feature names, and class distribution.'),
# Cell: structure, class balance, and summary statistics.
nbf.v4.new_code_cell('''# Dataset Structure
df.info()
print("\\nClass Distribution:")
print(df['target'].value_counts(normalize=True))
# Summary Statistics
df.describe()
'''),
nbf.v4.new_markdown_cell('### 3. Data Preprocessing\nCheck for missing values and handle them if present.'),
# Cell: missing-value check and feature standardization (zero mean, unit
# variance) — the scaled matrix is reused later by the PCA cell.
nbf.v4.new_code_cell('''# Missing values
print("Missing values in dataset:\\n", df.isnull().sum().sum())
# Standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
print("Standardization applied successfully.")
'''),
nbf.v4.new_markdown_cell('### 4. Exploratory Data Analysis\nMeaningful Visualizations.'),
# Cell: correlation heatmap, feature histograms, and class-count bar plot.
nbf.v4.new_code_cell('''# Correlation heatmap
plt.figure(figsize=(15, 12))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, cmap='coolwarm', annot=False)
plt.title("Correlation Heatmap of Biological Features")
plt.show()
# Distribution plot of key tumor features
features_to_plot = ['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness']
df[features_to_plot].hist(bins=15, figsize=(15, 10), layout=(2, 3))
plt.suptitle('Distribution of Key Tumor Features')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
# Class Distribution
plt.figure(figsize=(6,4))
sns.countplot(x='target', data=df)
plt.title('Target Class Distribution (0=Benign, 1=Malignant)')
plt.show()
'''),
nbf.v4.new_markdown_cell('### 5. Dimensionality Reduction\nApply Principal Component Analysis (PCA)'),
# Cell: 2-component PCA scatter plot of the standardized features,
# colored by the remapped target (legend order 0=Benign, 1=Malignant).
nbf.v4.new_code_cell('''from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
plt.figure(figsize=(8, 6))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', alpha=0.6)
plt.xlabel(f"First Principal Component (Variance: {pca.explained_variance_ratio_[0]:.2f})")
plt.ylabel(f"Second Principal Component (Variance: {pca.explained_variance_ratio_[1]:.2f})")
plt.title("2D PCA Visualization of Tumor Data")
plt.legend(handles=scatter.legend_elements()[0], labels=['Benign', 'Malignant'])
plt.grid(True)
plt.show()
''')
]

# Write UTF-8 explicitly: notebooks are UTF-8 JSON, and the original
# relied on the platform default encoding (e.g. cp1252 on Windows),
# which would corrupt or reject any non-ASCII cell content.
with open('data_analysis.ipynb', 'w', encoding='utf-8') as f:
    nbf.write(nb, f)
print("data_analysis.ipynb created successfully.")