-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathimplement.py
More file actions
150 lines (110 loc) · 5.45 KB
/
implement.py
File metadata and controls
150 lines (110 loc) · 5.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Objective 1: Exploratory data analysis
#Load the dataset
df = pd.read_excel("Proj_data.xlsx")
#Basic structure and types
print("Dataset Info:")
print(df.info())
# Summary statistics for all numerical columns
print("\nSummary Statistics:")
print(df.describe())
#Convert TotalCharges to numeric and handle missing data
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df_clean = df.dropna(subset=['TotalCharges']).copy()
df_clean['tenure'] = pd.to_numeric(df_clean['tenure'], errors='coerce')
df_clean['MonthlyCharges'] = pd.to_numeric(df_clean['MonthlyCharges'], errors='coerce')
# Check for missing values
total_missing = df_clean.isnull().sum().sum()
print(f"Total Missing Values in Cleaned Dataset: {total_missing}")
# Objective 2: Tenure and Revenue Analysis
# Correlation between tenure and total charges
tenure_totalcharges_corr = df_clean[['tenure', 'TotalCharges']].corr().iloc[0,1]
print(f"Correlation between tenure and TotalCharges: {tenure_totalcharges_corr:.2f}")
# Summary statistics for tenure
tenure_stats = df_clean['tenure'].describe()
print("Tenure Summary Statistics:")
print(tenure_stats)
# Scatter plot with regression line
# plt.figure(figsize=(8, 6))
# sns.regplot(x='tenure', y='TotalCharges', data=df_clean, scatter_kws={'alpha':0.5}) #alpha for transparency
# plt.title('Relationship between Tenure and TotalCharges')
# plt.xlabel('Tenure (months)')
# plt.ylabel('Total Charges')
# # plt.show()
# # Boxplot for Outlier Detection in TotalCharges
# plt.figure(figsize=(8, 4))
# sns.boxplot(x=df_clean['TotalCharges'])
# plt.title('Outlier Detection in Total Charges')
# plt.show()
# # Detecting Outliers in TotalCharges
# q1 = df_clean['TotalCharges'].quantile(0.25)
# q3 = df_clean['TotalCharges'].quantile(0.75)
# iqr = q3 - q1
# outliers_totalcharges = df_clean[(df_clean['TotalCharges'] < q1 - 1.5 * iqr) | (df_clean['TotalCharges'] > q3 + 1.5 * iqr)]
# print(f"TotalCharges Outliers: {len(outliers_totalcharges)} rows detected")
# # --------------------------------------------
# # Objective 3: Service Adoption and Bundling Opportunities
# services = ['OnlineSecurity', 'OnlineBackup', 'TechSupport', 'StreamingTV', 'StreamingMovies']
# adoption_rates = {}
# for service in services:
# counts = df_clean[service].value_counts(normalize=True) * 100
# adoption_rates[service] = counts.get("Yes", 0)
# print("Service Adoption Rates (in %):")
# print(adoption_rates)
# plt.figure(figsize=(8, 6))
# sns.barplot(x=list(adoption_rates.keys()), y=list(adoption_rates.values()))
# plt.title('Adoption Rates of Additional Services')
# plt.xlabel('Service')
# plt.ylabel('Adoption Rate (%)')
# plt.ylim(0, 100)
# plt.tight_layout()
# plt.show()
# # --------------------------------------------
# # Objective 4: Payment Method Preferences and Billing Differences
# payment_counts = df_clean['PaymentMethod'].value_counts()
# print("Payment Method Frequencies:")
# print(payment_counts)
# plt.figure(figsize=(8, 6))
# sns.barplot(x=payment_counts.index, y=payment_counts.values, palette="viridis")
# plt.title('Payment Method Distribution')
# plt.xlabel('Payment Method')
# plt.ylabel('Number of Customers')
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.show()
# # ANOVA test for MonthlyCharges across Payment Methods
# anova_result = stats.f_oneway(*[group['MonthlyCharges'].dropna().values for name, group in df_clean.groupby('PaymentMethod')])
# print("ANOVA test on MonthlyCharges by PaymentMethod:")
# print(f"F-statistic: {anova_result.statistic:.2f}, p-value: {anova_result.pvalue:.4f}")
# # Boxplot to visualize outliers in MonthlyCharges across Payment Methods
# plt.figure(figsize=(10, 6))
# sns.boxplot(x='PaymentMethod', y='MonthlyCharges', data=df_clean)
# plt.title('Outlier Detection: Monthly Charges by Payment Method')
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.show()
# # Detecting Outliers in MonthlyCharges
# q1_m = df_clean['MonthlyCharges'].quantile(0.25)
# q3_m = df_clean['MonthlyCharges'].quantile(0.75)
# iqr_m = q3_m - q1_m
# outliers_monthlycharges = df_clean[(df_clean['MonthlyCharges'] < q1_m - 1.5 * iqr_m) | (df_clean['MonthlyCharges'] > q3_m + 1.5 * iqr_m)]
# print(f"MonthlyCharges Outliers: {len(outliers_monthlycharges)} rows detected")
# # --------------------------------------------
# # Objective 5: Statistical Test - Payment Method Comparison
# group1 = df_clean[df_clean['PaymentMethod'] == 'Electronic check']['MonthlyCharges'].dropna()
# group2 = df_clean[df_clean['PaymentMethod'] == 'Bank transfer (automatic)']['MonthlyCharges'].dropna()
# t_stat, p_val = stats.ttest_ind(group1, group2, equal_var=False)
# print("T-test comparing MonthlyCharges between 'Electronic check' and 'Bank transfer (automatic'):")
# print(f"t-statistic: {t_stat:.2f}, p-value: {p_val:.4f}")
# # --------------------------------------------
# # Summary:
# # --------------------------------------------
# # Insight 1: Higher tenure is associated with higher total revenue.
# # Insight 2: Varying adoption rates suggest bundling opportunities.
# # Insight 3: Payment method usage varies significantly.
# # Insight 4: Statistical tests indicate real billing differences by method.
# # Insight 5: Outliers present in charges should be monitored for anomalies or special attention.