-
Notifications
You must be signed in to change notification settings - Fork 29
Expand file tree
/
Copy pathRank 3: SRK.py
More file actions
113 lines (89 loc) · 4.04 KB
/
Rank 3: SRK.py
File metadata and controls
113 lines (89 loc) · 4.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import sys
from math import sqrt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cross_validation import KFold
from sklearn import ensemble
from sklearn import linear_model as lm
from sklearn.metrics import mean_squared_error as mse
import xgboost as xgb
def runXGB(train_X, train_y, test_X, test_y=None, num_rounds=900, seed_val=1):
    """Train an XGBoost regressor and return its predictions for ``test_X``.

    Args:
        train_X: Training feature matrix, wrapped in an ``xgb.DMatrix``.
        train_y: Training targets, used as the DMatrix label.
        test_X: Feature matrix to predict for.
        test_y: Accepted so existing call sites keep working; it is not
            used by this function.
        num_rounds: Number of boosting rounds passed to ``xgb.train``.
        seed_val: Value of the XGBoost ``seed`` parameter.

    Returns:
        The predictions returned by ``model.predict`` for ``test_X``.
    """
    # NOTE(review): newer XGBoost releases rename "reg:linear" to
    # "reg:squarederror" and replace "silent" with "verbosity"; the values
    # below are kept unchanged -- confirm against the installed version
    # before upgrading.
    params = {
        "objective": "reg:linear",
        "eta": 0.002,
        "min_child_weight": 1,
        "subsample": 0.9,
        "colsample_bytree": 0.8,
        "silent": 1,
        "max_depth": 8,
        "seed": seed_val,
    }
    plst = list(params.items())

    xgtrain = xgb.DMatrix(train_X, label=train_y)
    xgtest = xgb.DMatrix(test_X)

    model = xgb.train(plst, xgtrain, num_rounds)
    return model.predict(xgtest)
def rmse(act_y, pred_y):
    """Return the root mean squared error between actual and predicted values.

    Args:
        act_y: Actual target values.
        pred_y: Predicted target values.
    """
    mean_sq_err = mse(act_y, pred_y)
    return np.sqrt(mean_sq_err)
# Training targets above this value are clipped before any model is fitted.
TARGET_CAP = 20000000
# The XGBoost model is trained on rows from this index onward; the linear
# model uses every row.
XGB_TRAIN_START_ROW = 299
# Weights of the two models in the final blend.
XGB_WEIGHT = 0.8
LM_WEIGHT = 0.2
# Hand-set prediction for a single test row.
# NOTE(review): presumably a known outlier day in the test period -- confirm.
OVERRIDE_ROW = 357
OVERRIDE_VALUE = 70000000


def _add_date_features(df, dates):
    """Add calendar feature columns derived from *dates* to *df* in place.

    Args:
        df: DataFrame that receives the new columns.
        dates: Datetime Series aligned with *df*.
    """
    df["DayOfMonth"] = dates.dt.day
    df["Month"] = dates.dt.month
    # Monday is 0, Sunday is 6.
    df["WeekDay"] = dates.dt.weekday
    df["DayOfYear"] = dates.dt.dayofyear
    # Proleptic Gregorian ordinal; gives the models a monotonically
    # increasing day counter.
    df["DayCount"] = dates.apply(lambda x: x.toordinal())


def main():
    """Build date features, fit XGBoost and linear models, write the blend.

    Reads the train, test and sample-submission CSVs from ``../Data/`` and
    writes the blended predictions to ``sub18_xgb.csv`` in the working
    directory.
    """
    data_path = "../Data/"
    train_file = data_path + "Train_KQyJ5eh.csv"
    test_file = data_path + "Test_HmLwURQ.csv"
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)

    print("Converting to date format")
    train_dates = pd.to_datetime(train_df["Date"], format="%d-%b-%y")
    test_dates = pd.to_datetime(test_df["Date"], format="%d-%b-%y")

    # np.array(...) copies, so clipping the targets leaves train_df untouched.
    train_y = np.array(train_df.Number_SKU_Sold.values)
    train_y[train_y > TARGET_CAP] = TARGET_CAP

    print("Processing Dates..")
    # Year, hour and ISO-week features are left out.
    _add_date_features(train_df, train_dates)
    _add_date_features(test_df, test_dates)

    train_df.drop(["Date", "Number_SKU_Sold"], axis=1, inplace=True)
    test_df.drop(["Date"], axis=1, inplace=True)
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("%s %s" % (train_df.shape, test_df.shape))
    print(train_df.head())
    print(test_df.head())

    train_X = np.array(train_df)
    test_X = np.array(test_df)

    preds_xgb = runXGB(
        train_X[XGB_TRAIN_START_ROW:, :],
        train_y[XGB_TRAIN_START_ROW:],
        test_X,
    )

    reg = lm.LinearRegression()
    reg.fit(train_X, train_y)
    preds_lm = reg.predict(test_X)

    # NOTE(review): this second clip runs after both models are fitted and
    # train_y is not read again, so it has no effect on the output. If a
    # tighter cap was meant for one of the models, it must move above the
    # corresponding fit -- confirm the intent.
    train_y[train_y > 15000000] = 15000000

    preds = XGB_WEIGHT * preds_xgb + LM_WEIGHT * preds_lm
    # Raises IndexError if the test set has fewer than OVERRIDE_ROW + 1 rows.
    preds[OVERRIDE_ROW] = OVERRIDE_VALUE

    # Saving the predictions #
    sample = pd.read_csv(data_path + "Sample_Submission_6FjDs3p.csv")
    sample["Number_SKU_Sold"] = preds
    sample.to_csv("sub18_xgb.csv", index=False)


if __name__ == "__main__":
    main()