This project aims to predict the survival of passengers on the Titanic based on various features such as age, sex, class, and fare. The project uses machine learning algorithms to train models on the Titanic dataset and evaluate their performance.
Contents
gender_submission.csv: A sample submission file in the correct format.
README.md: This file, providing an overview of the project.
test.csv: The test dataset used for making predictions.
Titanic_cross_validation.ipynb: Jupyter notebook for cross-validation and model evaluation.
Titanic_model.ipynb: Jupyter notebook for building and training the predictive model.
train.csv: The training dataset used to build the model.
Models
The project uses several machine learning algorithms, including:
Decision Tree Classifier
Linear Support Vector Machine (SVM)
Gradient Boosting Classifier
K-Nearest Neighbors (KNN)
Naive Bayes
Logistic Regression
Performance Metrics
The project evaluates the performance of each model using accuracy, precision, recall, and F1-score.
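For reference, all four metrics are available in scikit-learn; a minimal sketch with placeholder labels (the y_true / y_pred values below are illustrative, not taken from the notebooks):

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Placeholder labels and predictions, purely to show the metric calls
y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]
print('Accuracy :', accuracy_score(y_true, y_pred))
print('Precision:', precision_score(y_true, y_pred))
print('Recall   :', recall_score(y_true, y_pred))
print('F1-score :', f1_score(y_true, y_pred))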
cat = train.select_dtypes(include=['object']).copy()
cat['Pclass'] = train['Pclass']
cat = cat.drop(['Ticket', 'Cabin', 'Name'], axis=1)  # dropped: each has too many distinct values to be useful
cat.columns
Missing values per column in the combined train/test dataframe:

Cabin          1014
Survived        418
Age             263
Embarked          2
Fare              1
PassengerId       0
Pclass            0
Name              0
Sex               0
SibSp             0
Parch             0
Ticket            0
train_test        0
dtype: int64
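A table like the one above comes from counting nulls per column; a minimal sketch, assuming df is the concatenated train/test dataframe with a train_test indicator column:

# Count missing values per column, largest first (matches the table above)
print(df.isnull().sum().sort_values(ascending=False))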
df = df.drop(['Cabin'], axis=1)
# Fill missing Age values with the median age for the passenger's name title (Mr., Mrs., Miss, etc.)
import re
df['name_titles'] = df['Name'].apply(lambda x: re.findall(r',\s(\w*).', x)[0])
df['Age'] = df.groupby('name_titles')['Age'].transform(lambda x: x.fillna(x.median()))
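As a quick sanity check, the title regex captures the word after the comma in a name string (the two names here are just examples of the dataset's format):

import re
for name in ['Braund, Mr. Owen Harris', 'Heikkinen, Miss. Laina']:
    print(re.findall(r',\s(\w*).', name)[0])  # prints Mr, then Miss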
Missing values per column in the raw training set:

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
import pandas as pd
df_bin = pd.DataFrame()  # stores continuous variables cut into bins, e.g. age bins 0-10, 11-20, 21-30, ...
df_con = pd.DataFrame()  # stores continuous variables as-is
train.dtypes
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
# Name, Ticket and Cabin have too many distinct values to bin usefully
print(train.Name.value_counts().count())
print(train.Ticket.value_counts().count())
print(train.Cabin.value_counts().count())
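Equivalently, pandas' nunique() gives the same counts in a single call (both it and the chain above ignore NaN):

print(train['Name'].nunique(), train['Ticket'].nunique(), train['Cabin'].nunique())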
# Function to plot a count and a distribution (vs Survived) of each variable
import matplotlib.pyplot as plt
import seaborn as sns

def plot_count_dist(data, bin_df, label_column, target_column, figsize=(20, 5), use_bin_df=False):
    if use_bin_df:
        fig = plt.figure(figsize=figsize)
        plt.subplot(1, 2, 1)
        sns.countplot(y=target_column, data=bin_df)
        plt.subplot(1, 2, 2)
        sns.distplot(data.loc[data[label_column] == 1][target_column], kde_kws={'label': 'Survived'})
        sns.distplot(data.loc[data[label_column] == 0][target_column], kde_kws={'label': 'Did Not Survive'})
    else:
        fig = plt.figure(figsize=figsize)
        plt.subplot(1, 2, 1)
        sns.countplot(y=target_column, data=data)
        plt.subplot(1, 2, 2)
        sns.distplot(data.loc[data[label_column] == 1][target_column], kde_kws={'label': 'Survived'})
        sns.distplot(data.loc[data[label_column] == 0][target_column], kde_kws={'label': 'Did Not Survive'})
# Add Sex, converted to binary form with np.where (female=1, male=0)
import numpy as np
df_bin['Sex'] = train['Sex']
df_bin['Sex'] = np.where(df_bin['Sex'] == 'female', 1, 0)
df_con['Sex'] = train['Sex']
# Add SibSp = number of siblings/spouses aboard
df_bin['SibSp'] = train['SibSp']
df_con['SibSp'] = train['SibSp']
# Add Parch = number of parents/children aboard
df_bin['Parch'] = train['Parch']
df_con['Parch'] = train['Parch']
plot_count_dist(train, bin_df=df_bin, label_column='Survived', target_column='Parch', figsize=(20, 10))
# Add Fare, the first variable to be cut into bins
df_con['Fare'] = train['Fare']
df_bin['Fare'] = pd.qcut(train['Fare'], 5)  # five equal-frequency bins
plot_count_dist(train, df_bin, 'Survived', 'Fare', use_bin_df=True, figsize=(20, 15))
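pd.qcut splits a column into equal-frequency interval bins; a toy illustration (the fare values below are made up):

import pandas as pd
fares = pd.Series([7.25, 8.05, 13.00, 26.00, 71.28, 512.33])
print(pd.qcut(fares, 3))  # three bins, each holding roughly a third of the values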
# Function to fit a given algorithm and evaluate it with cross-validation
from sklearn import model_selection, metrics

def fit_algo(algo, X_train, y_train, cv):  # cv = number of cross-validation folds; 10 is used below
    model = algo.fit(X_train, y_train)
    accuracy = round(model.score(X_train, y_train) * 100, 2)
    cross_val = model_selection.cross_val_predict(algo, X_train, y_train, cv=cv, n_jobs=-1)
    acc_cv = round(metrics.accuracy_score(y_train, cross_val) * 100, 2)
    # feature_ranks = pd.Series(model.feature_importances_ * 100, index=X_train.columns).sort_values(ascending=False)
    return accuracy, acc_cv  # , feature_ranks (uncommented for the gradient boosting run below)
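The X_train_con / X_train_bin matrices used below are built in the notebooks; a minimal sketch of one plausible construction, assuming one-hot encoding with pd.get_dummies (consistent with the Sex_female / Embarked_S feature names in the output further down; the actual notebooks add more columns, such as Age and Pclass, than the snippets above show):

import pandas as pd
# Hypothetical reconstruction: one-hot encode object columns, keep numeric ones as-is
X_train_con = pd.get_dummies(df_con)
y_train_con = train['Survived']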
Top-Performing Algorithms
from sklearn.ensemble import GradientBoostingClassifier
# The feature_ranks line in fit_algo was uncommented for this run, hence the third return value
acc_grad_boost, cross_grad_boost, feature_ranks = fit_algo(
    GradientBoostingClassifier(n_estimators=300, learning_rate=0.1), X_train_con, y_train_con, 10)
print('General Acc is', acc_grad_boost)
print('With cross val is', cross_grad_boost)
print(feature_ranks)
General Acc is 94.94
With cross val is 84.03

Sex_female    22.476185
Fare          21.774555
Sex_male      17.517075
Age           16.592819
Pclass        12.358341
SibSp          5.821414
Embarked_S     1.495644
Parch          1.172607
Embarked_Q     0.420607
Embarked_C     0.370754
dtype: float64
from sklearn.tree import DecisionTreeClassifier
acc_dTree, cross_dTree = fit_algo(DecisionTreeClassifier(), X_train_con, y_train_con, 10)
print('General Acc is', acc_dTree)
print('With cross val is', cross_dTree)
General Acc is 97.86
With cross val is 76.15

The large gap between training accuracy and cross-validated accuracy indicates that the unpruned decision tree overfits the training data.
Worst-Performing Algorithms
from sklearn.svm import LinearSVC
acc_LSVC, cross_LSVC = fit_algo(LinearSVC(), X_train_bin, y_train_bin, 10)
print('General Acc is', acc_LSVC)
print('With cross val is', cross_LSVC)
General Acc is 80.2
With cross val is 78.85
from sklearn.linear_model import LogisticRegression
acc_log, cross_log = fit_algo(LogisticRegression(), X_train_bin, y_train_bin, 10)
print('General Acc is', acc_log)
print('With cross val is', cross_log)
General Acc is 79.64
With cross val is 78.85
from sklearn.neighbors import KNeighborsClassifier
acc_KN, cross_KN = fit_algo(KNeighborsClassifier(), X_train_bin, y_train_bin, 10)
print('General Acc is', acc_KN)
print('With cross val is', cross_KN)
General Acc is 82.45
With cross val is 78.29
from sklearn.naive_bayes import GaussianNB
acc_gaus, cross_gaus = fit_algo(GaussianNB(), X_train_con, y_train_con, 10)
print('General Acc is', acc_gaus)
print('With cross val is', cross_gaus)
General Acc is 79.08
With cross val is 78.52
from sklearn.linear_model import SGDClassifier
acc_sgdc, cross_sgdc = fit_algo(SGDClassifier(), X_train_bin, y_train_bin, 10)
print('General Acc is', acc_sgdc)
print('With cross val is', cross_sgdc)
General Acc is 80.2
With cross val is 77.28
from sklearn.svm import NuSVC
acc_nusvc, cross_nusvc = fit_algo(NuSVC(), X_train_bin, y_train_bin, 10)
print('General Acc is', acc_nusvc)
print('With cross val is', cross_nusvc)
General Acc is 81.55
With cross val is 80.88
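The prediction step that produces a Kaggle submission lives in Titanic_model.ipynb; a minimal sketch of the expected output format (best_model and X_test are hypothetical names standing in for the trained classifier and the encoded test set):

import pandas as pd
# Kaggle expects exactly two columns, PassengerId and Survived (see gender_submission.csv)
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': best_model.predict(X_test),
})
submission.to_csv('submission.csv', index=False)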
About
A Kaggle competition entry predicting Titanic survival rates.