# -*- coding: utf-8 -*-
"""
Created on Wed Feb 12 13:30:27 2020
@author: Ruchika
"""
####################################################################################################
######################## Load and plot the data ####################################################
############# Each row of the data is (experience, salary, paid_account) ##########################
####################################################################################################
tuples = [(0.7,48000,1),(1.9,48000,0),(2.5,60000,1),(4.2,63000,0),(6,76000,0),(6.5,69000,0),(7.5,76000,0),(8.1,88000,0),(8.7,83000,1),(10,83000,1),(0.8,43000,0),(1.8,60000,0),(10,79000,1),(6.1,76000,0),(1.4,50000,0),(9.1,92000,0),(5.8,75000,0),(5.2,69000,0),(1,56000,0),(6,67000,0),(4.9,74000,0),(6.4,63000,1),(6.2,82000,0),(3.3,58000,0),(9.3,90000,1),(5.5,57000,1),(9.1,102000,0),(2.4,54000,0),(8.2,65000,1),(5.3,82000,0),(9.8,107000,0),(1.8,64000,0),(0.6,46000,1),(0.8,48000,0),(8.6,84000,1),(0.6,45000,0),(0.5,30000,1),(7.3,89000,0),(2.5,48000,1),(5.6,76000,0),(7.4,77000,0),(2.7,56000,0),(0.7,48000,0),(1.2,42000,0),(0.2,32000,1),(4.7,56000,1),(2.8,44000,1),(7.6,78000,0),(1.1,63000,0),(8,79000,1),(2.7,56000,0),(6,52000,1),(4.6,56000,0),(2.5,51000,0),(5.7,71000,0),(2.9,65000,0),(1.1,33000,1),(3,62000,0),(4,71000,0),(2.4,61000,0),(7.5,75000,0),(9.7,81000,1),(3.2,62000,0),(7.9,88000,0),(4.7,44000,1),(2.5,55000,0),(1.6,41000,0),(6.7,64000,1),(6.9,66000,1),(7.9,78000,1),(8.1,102000,0),(5.3,48000,1),(8.5,66000,1),(0.2,56000,0),(6,69000,0),(7.5,77000,0),(8,86000,0),(4.4,68000,0),(4.9,75000,0),(1.5,60000,0),(2.2,50000,0),(3.4,49000,1),(4.2,70000,0),(7.7,98000,0),(8.2,85000,0),(5.4,88000,0),(0.1,46000,0),(1.5,37000,0),(6.3,86000,0),(3.7,57000,0),(8.4,85000,0),(2,42000,0),(5.8,69000,1),(2.7,64000,0),(3.1,63000,0),(1.9,48000,0),(10,72000,1),(0.2,45000,0),(8.6,95000,0),(1.5,64000,0),(9.8,95000,0),(5.3,65000,0),(7.5,80000,0),(9.9,91000,0),(9.7,50000,1),(2.8,68000,0),(3.6,58000,0),(3.9,74000,0),(4.4,76000,0),(2.5,49000,0),(7.2,81000,0),(5.2,60000,1),(2.4,62000,0),(8.9,94000,0),(2.4,63000,0),(6.8,69000,1),(6.5,77000,0),(7,86000,0),(9.4,94000,0),(7.8,72000,1),(0.2,53000,0),(10,97000,0),(5.5,65000,0),(7.7,71000,1),(8.1,66000,1),(9.8,91000,0),(8,84000,0),(2.7,55000,0),(2.8,62000,0),(9.4,79000,0),(2.5,57000,0),(7.4,70000,1),(2.1,47000,0),(5.3,62000,1),(6.3,79000,0),(6.8,58000,1),(5.7,80000,0),(2.2,61000,0),(4.8,62000,0),(3.7,64000,0),(4.1,85000,0),(2.3,51000,0),(3.5,58000,0),(0.9,43000,0),(0.9,54000,0),(4.5,74000,0),(6.5,55000,1),(4.1,41000,1),(7.1,73000,0),(1.1,66000,0),(9.1,81000,1),(8,69000,1),(7.3,72000,1),(3.3,50000,0),(3.9,58000,0),(2.6,49000,0),(1.6,78000,0),(0.7,56000,0),(2.1,36000,1),(7.5,90000,0),(4.8,59000,1),(8.9,95000,0),(6.2,72000,0),(6.3,63000,0),(9.1,100000,0),(7.3,61000,1),(5.6,74000,0),(0.5,66000,0),(1.1,59000,0),(5.1,61000,0),(6.2,70000,0),(6.6,56000,1),(6.3,76000,0),(6.5,78000,0),(5.1,59000,0),(9.5,74000,1),(4.5,64000,0),(2,54000,0),(1,52000,0),(4,69000,0),(6.5,76000,0),(3,60000,0),(4.5,63000,0),(7.8,70000,0),(3.9,60000,1),(0.8,51000,0),(4.2,78000,0),(1.1,54000,0),(6.2,60000,0),(2.9,59000,0),(2.1,52000,0),(8.2,87000,0),(4.8,73000,0),(2.2,42000,1),(9.1,98000,0),(6.5,84000,0),(6.9,73000,0),(5.1,72000,0),(9.1,69000,1),(9.8,79000,1),]
data = [list(row) for row in tuples]
xs = [[1.0] + row[:2] for row in data]  # [1, experience, salary]
ys = [row[2] for row in data]           # paid account (1) or not (0)
import matplotlib.pyplot as plt
experience_0 = [xs[i][1] for i in range(len(xs)) if ys[i] == 0]
experience_1 = [xs[i][1] for i in range(len(xs)) if ys[i] == 1]
salary_0 = [xs[i][2] for i in range(len(xs)) if ys[i] == 0]
salary_1 = [xs[i][2] for i in range(len(xs)) if ys[i] == 1]
plt.scatter(experience_0, salary_0, c='magenta', marker='x', label='unpaid')
plt.scatter(experience_1, salary_1, c='green', marker='<', label='paid')
plt.xlabel('years of experience')
plt.ylabel('annual salary')
plt.legend(loc='lower center')
plt.show()
####################################################################################################
## Least squares fit
####################################################################################################
from working_with_data import rescale
from multiple_regression import least_squares_fit, predict
from gradient_descent import gradient_step
learning_rate = 0.01
rescaled_xs = rescale(xs)  # rescale so that experience and salary are comparable
beta = least_squares_fit(rescaled_xs, ys, learning_rate, 10, 1)
predictions = [predict(x_i, beta) for x_i in rescaled_xs]
plt.scatter(predictions, ys)
plt.xlabel("predicted")
plt.ylabel("actual")
plt.show()
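# Aside (not in the original script): least squares is a weak baseline for a
# 0/1 outcome because nothing constrains its output to [0, 1]. Printing the
# range of the fitted values makes that concrete.
print("least-squares predictions range:", min(predictions), "to", max(predictions))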
####################################################################################################
import math
from typing import List
from Vector_operations_on_data import Vector, dot

def logistic(x: float) -> float:
    """The logistic (sigmoid) function: maps any real x into (0, 1)."""
    return 1.0 / (1 + math.exp(-x))

def logistic_prime(x: float) -> float:
    """Derivative of the logistic function: logistic(x) * (1 - logistic(x))."""
    y = logistic(x)
    return y * (1 - y)
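# A quick sanity check, added for illustration: logistic(0) is exactly 0.5,
# and logistic_prime should agree with a central-difference estimate of the slope.
assert logistic(0) == 0.5
_h = 1e-6
_fd = (logistic(1 + _h) - logistic(1 - _h)) / (2 * _h)  # numeric slope at x = 1
assert abs(logistic_prime(1) - _fd) < 1e-6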
####################################################################################################
################## Maximizing the likelihood is the same as minimizing its negative ###############
########################### _negative_log_likelihood for one data point ###########################
####################################################################################################
def _negative_log_likelihood(x: Vector, y: float, beta: Vector) -> float:
    """The negative log likelihood for one labeled data point (x, y)."""
    if y == 1:
        return -math.log(logistic(dot(x, beta)))
    else:
        return -math.log(1 - logistic(dot(x, beta)))
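# Aside, not from the original: for large |dot(x, beta)|, logistic(...) rounds
# to 0 or 1 and math.log then blows up. A numerically stabler sketch of the
# same quantity rewrites it with log1p; shown here only as an illustration.
def _negative_log_likelihood_stable(x: Vector, y: float, beta: Vector) -> float:
    t = dot(x, beta)
    if y == 1:
        # -log(logistic(t)) == log(1 + exp(-t))
        return math.log1p(math.exp(-t)) if t >= 0 else -t + math.log1p(math.exp(t))
    else:
        # -log(1 - logistic(t)) == log(1 + exp(t))
        return math.log1p(math.exp(t)) if t <= 0 else t + math.log1p(math.exp(-t))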
####################################################################################################
# If we assume the data points are independent (i.i.d.), the overall
# likelihood is the product of the individual likelihoods, so the overall
# log likelihood is the sum of the individual log likelihoods.
####################################################################################################
def negative_log_likelihood(xs: List[Vector],
                            ys: List[float],
                            beta: Vector) -> float:
    return sum(_negative_log_likelihood(x, y, beta)
               for x, y in zip(xs, ys))
from Vector_operations_on_data import vector_sum  # same module that provides Vector and dot
def _negative_log_partial_j(x: Vector, y: float, beta: Vector, j: int) -> float:
    """
    The j-th partial derivative of the negative log likelihood
    for one data point (x, y).
    """
    return -(y - logistic(dot(x, beta))) * x[j]
def _negative_log_gradient(x: Vector, y: float, beta: Vector) -> Vector:
    """The gradient of the negative log likelihood for one data point."""
    return [_negative_log_partial_j(x, y, beta, j)
            for j in range(len(beta))]

def negative_log_gradient(xs: List[Vector],
                          ys: List[float],
                          beta: Vector) -> Vector:
    return vector_sum([_negative_log_gradient(x, y, beta)
                       for x, y in zip(xs, ys)])
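# Optional finite-difference check of the analytic gradient (illustrative; the
# helper name and the test point are made up here, not part of the original).
def _check_gradient(xs_c: List[Vector], ys_c: List[float],
                    beta_c: Vector, j: int, h: float = 1e-6) -> None:
    up = [b + (h if k == j else 0) for k, b in enumerate(beta_c)]
    down = [b - (h if k == j else 0) for k, b in enumerate(beta_c)]
    numeric = (negative_log_likelihood(xs_c, ys_c, up)
               - negative_log_likelihood(xs_c, ys_c, down)) / (2 * h)
    analytic = negative_log_gradient(xs_c, ys_c, beta_c)[j]
    assert abs(numeric - analytic) < 1e-4 * max(1.0, abs(analytic))

_check_gradient(rescaled_xs, ys, [0.1, 0.1, 0.1], j=1)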
####################################################################################################
## Apply the model to the data
####################################################################################################
from machine_learning import train_test_split
import random
import tqdm
random.seed(0)
x_train, x_test, y_train, y_test = train_test_split(rescaled_xs, ys, 0.33)  # hold out 33% for testing
learning_rate = 0.01
# Pick a random starting point
beta = [random.random() for _ in range(3)]
with tqdm.trange(5000) as t:
    for epoch in t:
        gradient = negative_log_gradient(x_train, y_train, beta)
        beta = gradient_step(beta, gradient, -learning_rate)  # step downhill
        loss = negative_log_likelihood(x_train, y_train, beta)
        t.set_description(f"loss: {loss:.3f} beta: {beta}")
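
# A check added for completeness: compare per-example train and test loss
# (normalized because the two splits differ in size); a much larger test
# number would hint at overfitting.
train_loss = negative_log_likelihood(x_train, y_train, beta) / len(x_train)
test_loss = negative_log_likelihood(x_test, y_test, beta) / len(x_test)
print(f"mean train loss: {train_loss:.3f}, mean test loss: {test_loss:.3f}")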
# Convert the coefficients for the rescaled data back to the original scale.
# If rescaled x_j = (x_j - mean_j) / stdev_j, then
#   beta0 + sum_j beta_j * (x_j - mean_j) / stdev_j
#     = (beta0 - sum_j beta_j * mean_j / stdev_j) + sum_j (beta_j / stdev_j) * x_j
from working_with_data import scale
means, stdevs = scale(xs)
beta_unscaled = [(beta[0]
                  - beta[1] * means[1] / stdevs[1]
                  - beta[2] * means[2] / stdevs[2]),
                 beta[1] / stdevs[1],
                 beta[2] / stdevs[2]]
print("beta_unscaled:", beta_unscaled)
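# Sanity check on the unscaling algebra (illustrative): the unscaled
# coefficients should score a raw input the same way beta scores the
# rescaled version of that input.
_k = 0  # any sample index works
assert abs(dot(beta, rescaled_xs[_k]) - dot(beta_unscaled, xs[_k])) < 1e-6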
####################################################################################################
# Test the model's performance on the test data
####################################################################################################
true_positives = false_positives = true_negatives = false_negatives = 0
for x_i, y_i in zip(x_test, y_test):
    prediction = logistic(dot(beta, x_i))
    if y_i == 1 and prediction >= 0.5:    # TP: paid, and we predicted paid
        true_positives += 1
    elif y_i == 1:                        # FN: paid, but we predicted unpaid
        false_negatives += 1
    elif prediction >= 0.5:               # FP: unpaid, but we predicted paid
        false_positives += 1
    else:                                 # TN: unpaid, and we predicted unpaid
        true_negatives += 1

precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
print("precision:", precision, "recall:", recall)
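# Derived metrics, added for completeness: accuracy and the F1 score
# (the harmonic mean of precision and recall).
total = true_positives + false_positives + true_negatives + false_negatives
accuracy = (true_positives + true_negatives) / total
f1 = 2 * precision * recall / (precision + recall)
print(f"accuracy: {accuracy:.3f}, F1 score: {f1:.3f}")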
# Plot predicted probability against actual outcome
predictions = [logistic(dot(beta, x_i)) for x_i in x_test]
plt.scatter(predictions, y_test, marker='+')
plt.xlabel("predicted probability")
plt.ylabel("actual outcome")
plt.title("Logistic Regression Predicted vs. Actual")
plt.show()