project-classification/prepare.py at main · Joshua-Click/project-classification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy import stats

def clean_telco(telco_df):
    '''USE THIS FOR INITIAL CLEAN - Cleans the telco dataset for exploring
    leaving the booleans and cleaning null values and dropping dup columns

    arguments: telco_df

    return: a clean dataframe ready to explore'''

    # drop duplicate columns and customer_id for view
    telco_df = telco_df.drop(columns =['payment_type_id','internet_service_type_id','contract_type_id', 'customer_id'])
    # fill nulls and change total_charges to float
    telco_df.total_charges = telco_df.total_charges.str.replace(' ', '0').astype(float)
    # drops (automatic) from the payment_type
    telco_df['payment_type'] = telco_df['payment_type'].str.replace(' (automatic)', '')

    return telco_df

def prep_telco(telco_df):
    '''USE THIS BEFORE MODELING - Preps the telco dataset for machine learning modeling by dropping unnessesary columns
    creating dummy values, and replacing null values in total_charges, keeps customer_id

    arguments: telco_df

    return: a dataframe ready for machine learning'''

    telco_df.set_index('customer_id', inplace=True)
    #drop duplicate columns
    telco_df = telco_df.drop(columns =['payment_type_id','internet_service_type_id','contract_type_id', 'gender', 'partner', 'dependents', 'phone_service', 'multiple_lines', 'paperless_billing', 'streaming_tv', 'streaming_movies'])
    #create dummies
    dummy_list = ['online_security', 'online_backup', 'device_protection', 'tech_support', 'churn', 'contract_type', 'internet_service_type', 'payment_type']
    dummy_df = pd.get_dummies(telco_df[dummy_list], dtype=int, drop_first=True)
    # join dummy & telco_df
    telco_df = pd.concat([telco_df, dummy_df], axis=1)
    # drop str column categories
    cols_to_drop = ['online_security', 'online_backup', 'device_protection', 'tech_support', 'churn', 'contract_type', 'internet_service_type', 'payment_type']
    telco_df = telco_df.drop(columns= cols_to_drop)
    #total_charges.str.replace(' ', '0').astype(float)
    telco_df.total_charges = telco_df.total_charges.str.replace(' ', '0').astype(float)


    return telco_df


def split_telco_data(df, target):
    '''
    split telco data into train, validate, test

    argument: df, target variable

    return: train, validate, test
    '''

    train_val, test = train_test_split(df,
                                   train_size=0.8,
                                   random_state=1108,
                                   stratify=df[target])
    train, validate = train_test_split(train_val,
                                   train_size=0.7,
                                   random_state=1108,
                                   stratify=train_val[target])

    print(f'Train: {len(train)/len(df)}')
    print(f'Validate: {len(validate)/len(df)}')
    print(f'Test: {len(test)/len(df)}')


    return train, validate, test


def next_split(train, validate, test):
    '''This function creates your modeling variables with the train, validate, test
    sets and returns them

    argument: train, validate, test

    return: X_train, X_validate, X_test, y_train, y_validate, y_test'''

    X_train = train.drop(columns=['churn_Yes', 'senior_citizen'])

    X_validate = validate.drop(columns=['churn_Yes', 'senior_citizen'])

    X_test = test.drop(columns=['churn_Yes', 'senior_citizen'])

    y_train = train['churn_Yes']

    y_validate = validate['churn_Yes']

    y_test = test['churn_Yes']

    return X_train, X_validate, X_test, y_train, y_validate, y_test