# svd2.py
# Dependencies:
# sudo pip install numpy scipy sparsesvd
import os, sys
import time
# from multiprocessing import Process, Queue
import numpy, scipy.sparse
from sparsesvd import sparsesvd
# Dataset dimensions hard-coded below:
# 458293 users
# 17770 movies (matches NUM_MOVIES)
# Training set size: 102416306
# Testing set size: 2749898

# Input file read by add_data_to_matrix(): whitespace-separated
# "user movie date rating" lines.
TRAINING_FILENAME = "all_um.dta"
# Input file read by predict(): whitespace-separated "user movie date" lines.
TESTING_FILENAME = "qual_um.dta"
# File predict() writes one predicted rating per line to.
OUTPUT_FILENAME = "submission3.dta"
# Size in bytes of the training file, looked up when the module is loaded.
# Not referenced by the code visible in this file.
FILESIZE = os.stat(TRAINING_FILENAME).st_size
# Row and column counts of the rating matrix built in the __main__ block.
NUM_USERS = 458293
NUM_MOVIES = 17770
# Expected line counts of the training / testing files; used only to pace
# the progress output.
NUM_TRAINING = 102416306
NUM_TESTING = 2749898
# Number of factors requested from sparsesvd() in learn().
NUM_COMPONENTS = 100
# Number of progress updates printed over a full pass of a data file.
INCR = 1000
# Number of times learn_iter() calls learn().
NUM_LEARN_ITER = 100
def add_data_to_matrix(mat):
    """Fill ``mat`` in place with the ratings stored in TRAINING_FILENAME.

    Every line of the training file is split on whitespace into four fields
    (user, movie, date, rating). The user and movie ids are shifted down by
    one to form the row and column index, the rating is stored as a float,
    and the date field is ignored.

    A progress line is rewritten on stdout every ``NUM_TRAINING // INCR``
    inserted ratings.

    Args:
        mat: a 2-D container supporting ``mat[row, col] = value`` (the script
            passes a scipy.sparse.lil_matrix).

    Returns:
        None; ``mat`` is modified in place.

    Raises:
        ValueError: if a line does not have exactly four fields or a field
            cannot be converted to a number.
    """
    print("Loading data...")
    inserted = 0
    percent = 0.0
    start_time = time.time()
    # The context manager closes the file even if a malformed line raises
    # part-way through the load.
    with open(TRAINING_FILENAME, 'r') as f_training:
        for line in f_training:
            user, movie, _date, rating = line.strip().split()
            # Ids in the file are 1-based; matrix indices are 0-based.
            mat[int(user) - 1, int(movie) - 1] = float(rating)
            inserted += 1
            # Floor division keeps the step an integer so the modulo test
            # fires on exact multiples.
            if inserted % (NUM_TRAINING // INCR) == 0:
                percent += 100.0 / INCR
                # "\r" returns to the start of the line so each update
                # overwrites the previous one.
                sys.stdout.write("\r%.1f%% done, %d ratings inserted (elapsed time: %f s)." % (percent, inserted, time.time() - start_time))
                sys.stdout.flush()
    print("Data loaded.")
def learn(mat):
    """Decompose ``mat`` with sparsesvd, asking for NUM_COMPONENTS factors.

    Prints a start message, the wall-clock time the decomposition took, and
    a completion message.

    Args:
        mat: the matrix handed to sparsesvd (the script passes a
            scipy.sparse.csc_matrix).

    Returns:
        The tuple ``(user_mat, axis_weights, movie_mat)``, in the order
        sparsesvd returns its three results.
        NOTE(review): presumably these are sparsesvd's (ut, s, vt) -- confirm
        against the sparsesvd documentation.
    """
    print("Starting learning process...")
    began = time.time()
    factors = sparsesvd(mat, NUM_COMPONENTS)
    # Unpack explicitly so a result that is not three-valued fails here.
    user_mat, axis_weights, movie_mat = factors
    elapsed = time.time() - began
    print("Matrix decomposition complete (elapsed time: %f s)." % elapsed)
    print("Learning process complete.")
    return (user_mat, axis_weights, movie_mat)
def learn_iter(mat):
    """Call learn() on ``mat`` NUM_LEARN_ITER times and return the last result.

    Args:
        mat: the matrix to decompose; passed unchanged to every learn() call.

    Returns:
        The ``(user_mat, axis_weights, movie_mat)`` tuple from the final
        learn() call. If NUM_LEARN_ITER is 0, ``mat`` itself is returned.
    """
    # learn() returns a 3-tuple of factors, not a matrix, so its result must
    # not be fed back in as the next input. Keep the input and the result in
    # separate names.
    # NOTE(review): every pass decomposes the same input, so passes after the
    # first look redundant -- confirm the intent of NUM_LEARN_ITER.
    result = mat
    for _ in range(NUM_LEARN_ITER):
        result = learn(mat)
    return result
def predict(user_mat, movie_mat):
    """Write one predicted rating per test line to OUTPUT_FILENAME.

    Every line of TESTING_FILENAME is split on whitespace into three fields
    (user, movie, date); the date is ignored. The prediction is the dot
    product of column ``user - 1`` of ``user_mat`` and column ``movie - 1``
    of ``movie_mat``, written with ``str()`` followed by a newline.

    A progress line is rewritten on stdout every ``NUM_TESTING // INCR``
    predictions.

    Args:
        user_mat: 2-D array indexed as ``user_mat[:, user_index]``.
        movie_mat: 2-D array indexed as ``movie_mat[:, movie_index]``.

    Returns:
        None; the predictions are written to OUTPUT_FILENAME.

    Raises:
        ValueError: if a line does not have exactly three fields or an id
            cannot be converted to an integer.
    """
    print("Making %d predictions..." % NUM_TESTING)
    start_time = time.time()
    # Loop-invariant; floor division keeps the step an integer.
    step = NUM_TESTING // INCR
    done = 0
    percent = 0.0
    # Context managers close both files even if a malformed line raises
    # part-way through.
    with open(TESTING_FILENAME, 'r') as f_testing:
        with open(OUTPUT_FILENAME, 'w') as f_out:
            for line in f_testing:
                user, movie, _date = line.strip().split()
                # Ids in the file are 1-based; column indices are 0-based.
                predicted_rating = numpy.dot(user_mat[:, int(user) - 1], movie_mat[:, int(movie) - 1])
                f_out.write(str(predicted_rating) + '\n')
                done += 1
                if done % step == 0:
                    percent += 100.0 / INCR
                    # "\r" makes each update overwrite the previous one.
                    sys.stdout.write("\r%.1f%% done (elapsed time: %f s)." % (percent, time.time() - start_time))
                    sys.stdout.flush()
    print("Predictions complete (elapsed time: %f s)." % (time.time() - start_time))
if __name__ == '__main__':
    # Build the user x movie rating matrix entry by entry in LIL form, then
    # convert it to CSC before it is decomposed.
    training_mat = scipy.sparse.lil_matrix((NUM_USERS, NUM_MOVIES))
    add_data_to_matrix(training_mat)
    training_smat = scipy.sparse.csc_matrix(training_mat)
    (user_mat, axis_weights, movie_mat) = learn_iter(training_smat)
    # predict() takes the plain dot product of a user column and a movie
    # column. The low-rank reconstruction also needs each factor scaled by
    # its singular value (ut.T * diag(s) * vt in the sparsesvd README), so
    # fold the weights into the movie factors first. The movie factors are
    # the ones scaled because NUM_MOVIES is far smaller than NUM_USERS.
    weighted_movie_mat = movie_mat * numpy.asarray(axis_weights)[:, numpy.newaxis]
    predict(user_mat, weighted_movie_mat)