Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
128 commits
Select commit Hold shift + click to select a range
5a3b38b
add utils for widedeep
whatbeg Aug 7, 2017
37fc1db
Merge remote-tracking branch 'qiuxin/widedeep' into widedeep
whatbeg Aug 13, 2017
8a28ce8
change return type to Array[Sample]
whatbeg Aug 13, 2017
fbd3852
add WideDeepUtilSpec
whatbeg Aug 13, 2017
0800f7e
change to RDD
whatbeg Aug 14, 2017
81ff893
sparse widedeep Train
whatbeg Aug 29, 2017
e1ab62c
modify Train and Utils
whatbeg Aug 30, 2017
45dd3de
add SparseTensorSpec about concat
whatbeg Aug 30, 2017
b99de09
some change
whatbeg Aug 31, 2017
7b0165e
some change 2
whatbeg Aug 31, 2017
80258bc
some change to concat
whatbeg Aug 31, 2017
d4e95af
change to LF and some changes to SparseTensor
whatbeg Aug 31, 2017
bdb3ef8
fix some bugs
whatbeg Aug 31, 2017
c06cb2f
Merge branch 'widedeep' into widedeep
whatbeg Aug 31, 2017
50ef2e7
little change
whatbeg Aug 31, 2017
c5cece9
change shape from 100 to 1023219 in Train.scala
whatbeg Aug 31, 2017
f4b1c41
change Train, Util to LF instead of CRLF
whatbeg Aug 31, 2017
e74a3f1
add resize for concat
whatbeg Aug 31, 2017
2e98918
toy
whatbeg Aug 31, 2017
723c377
add unit test
whatbeg Aug 31, 2017
824c19a
modify unit test
whatbeg Aug 31, 2017
afaa1a1
change resize
whatbeg Aug 31, 2017
2625f37
debug for sparseTensor
whatbeg Sep 1, 2017
ed1f7f3
change util
whatbeg Sep 1, 2017
7d7dbf8
change format of input
whatbeg Sep 1, 2017
d50c9cc
change format of input 2
whatbeg Sep 1, 2017
896de7a
change format of input 3
whatbeg Sep 1, 2017
0e952be
change format of input 4
whatbeg Sep 1, 2017
0b78a3d
change format of input 5
whatbeg Sep 1, 2017
f99b2f7
fix UtilSpec
whatbeg Sep 1, 2017
d1e3a96
fix integer to float
whatbeg Sep 1, 2017
fdc54a5
smallfix for Util
whatbeg Sep 1, 2017
394e12f
add load2 for debug
whatbeg Sep 1, 2017
a967ef6
add load2 for debug 2
whatbeg Sep 1, 2017
f6bd854
change Util and add UtilSpec
whatbeg Sep 1, 2017
3d728db
add toString
whatbeg Sep 1, 2017
f4bfef3
Merge remote-tracking branch 'origin/widedeep' into widedeep
whatbeg Sep 1, 2017
9f50e41
update WideDeepUtilSpec.scala
whatbeg Sep 4, 2017
9b35ebf
DenseTensor concat res do not resize
whatbeg Sep 4, 2017
fe02c32
change target narrow size
whatbeg Sep 4, 2017
0129590
add 1-dim concat support
whatbeg Sep 4, 2017
5abb0b7
change Utils
whatbeg Sep 4, 2017
ce3b1de
Merge remote-tracking branch 'origin/widedeep' into widedeep
whatbeg Sep 4, 2017
b4ed049
Merge branch 'widedeep' into widedeep
whatbeg Sep 4, 2017
95be86e
some change for DenseTensor and SparseTensor
whatbeg Sep 4, 2017
3c66eac
Merge remote-tracking branch 'origin/widedeep' into widedeep
whatbeg Sep 4, 2017
ad2fc12
some change for Utils
whatbeg Sep 4, 2017
9eb1b8f
add setValidation support
whatbeg Sep 4, 2017
d2b5aff
fix coding of setValidation support
whatbeg Sep 4, 2017
3bad5f0
fix coding of MiniBatchSpec.scala
whatbeg Sep 4, 2017
c2c4457
add set() for sparseTensor
whatbeg Sep 4, 2017
3312680
change label to Tensor(1, 1)
whatbeg Sep 5, 2017
e99db23
change label to Tensor(1, 1) validation
whatbeg Sep 5, 2017
fd13074
add Test for WDUtilSpec
whatbeg Sep 5, 2017
6ef9997
change UtilSpec
whatbeg Sep 5, 2017
cfa3501
debug mode
whatbeg Sep 5, 2017
d1e091c
Train loss method change
whatbeg Sep 5, 2017
20c67cf
change flag to dim1Concat
whatbeg Sep 5, 2017
5d29b0b
debug, delete later
whatbeg Sep 5, 2017
3cd7f68
debug, delete later
whatbeg Sep 5, 2017
e5a8744
debug, delete later
whatbeg Sep 5, 2017
0037c8d
debug, delete later
whatbeg Sep 5, 2017
7d9727b
Utils: change == to contains
whatbeg Sep 5, 2017
4ddf071
fix bugs
whatbeg Sep 5, 2017
d457b59
add dense python widedeep
whatbeg Sep 6, 2017
8cb9fa8
widedeep tutorial
whatbeg Sep 6, 2017
b7403b6
add run_3ksparse.sh
whatbeg Sep 6, 2017
57ff7d6
modify widedeepUtilSpec.scala
whatbeg Sep 6, 2017
9c49127
add trim()
whatbeg Sep 6, 2017
77d78a6
change to 5006
whatbeg Sep 6, 2017
efffef6
change to 5006
whatbeg Sep 6, 2017
6cf7427
change 3kdense and run_3ksparse.sh
whatbeg Sep 6, 2017
b8481bf
Merge remote-tracking branch 'origin/widedeep' into widedeep
whatbeg Sep 6, 2017
fcbeae1
modify run_3k
whatbeg Sep 7, 2017
75ec98a
change coding and widedeep
whatbeg Sep 7, 2017
0f29710
change Train.scala
whatbeg Sep 7, 2017
d03530b
modify run_3ksparse.sh
whatbeg Sep 7, 2017
979ee7b
add ubuntu.sh and 8cores log
whatbeg Sep 7, 2017
738b86b
runlog
whatbeg Sep 7, 2017
42f253e
add dense log
whatbeg Sep 7, 2017
e2532d3
add dense log
whatbeg Sep 7, 2017
74641fb
Merge remote-tracking branch 'origin/widedeep' into widedeep
whatbeg Sep 7, 2017
b538158
debug SparseTensor
whatbeg Sep 8, 2017
90389e8
debug SparseTensor
whatbeg Sep 8, 2017
1077560
debug SparseTensor
whatbeg Sep 8, 2017
8aaefaa
09081700.log
whatbeg Sep 8, 2017
26012f6
debug SparseTensor
whatbeg Sep 10, 2017
cbcf606
debug SparseTensor
whatbeg Sep 11, 2017
3cc1c76
debug SparseTensor
whatbeg Sep 11, 2017
292dcc4
debug SparseTensor
whatbeg Sep 11, 2017
4691ed7
debug SparseTensor
whatbeg Sep 11, 2017
d4864b8
debug SparseTensor
whatbeg Sep 11, 2017
1a5b94d
debug SparseTensor
whatbeg Sep 11, 2017
b085eb1
debug SparseTensor
whatbeg Sep 11, 2017
a32a323
debug SparseTensor
whatbeg Sep 11, 2017
3c24c27
debug SparseTensor
whatbeg Sep 11, 2017
b17c460
debug SparseTensor
whatbeg Sep 11, 2017
1d3c47e
debug SparseTensor
whatbeg Sep 11, 2017
95750b3
debug SparseTensor
whatbeg Sep 11, 2017
20de460
debug SparseTensor
whatbeg Sep 11, 2017
7809f17
debug SparseTensor
whatbeg Sep 11, 2017
0bb441f
debug SparseTensor
whatbeg Sep 11, 2017
8993964
debug SparseTensor
whatbeg Sep 11, 2017
a66bb9a
debug SparseTensor
whatbeg Sep 11, 2017
a23edc8
debug SparseTensor
whatbeg Sep 11, 2017
f91ff6a
debug SparseTensor
whatbeg Sep 11, 2017
a5e3629
debug SparseTensor
whatbeg Sep 11, 2017
b3cb7c5
debug SparseTensor
whatbeg Sep 11, 2017
bd53b42
debug SparseTensor
whatbeg Sep 11, 2017
060852e
debug SparseTensor
whatbeg Sep 11, 2017
b074688
debug SparseTensor
whatbeg Sep 11, 2017
10191f1
debug SparseTensor
whatbeg Sep 11, 2017
2c4e79f
debug SparseTensor
whatbeg Sep 11, 2017
fa4f692
debug SparseTensor
whatbeg Sep 11, 2017
9bb076a
debug SparseTensor
whatbeg Sep 11, 2017
8c092a7
debug SparseTensor
whatbeg Sep 11, 2017
a3cb13d
debug SparseTensor
whatbeg Sep 11, 2017
d2e868e
run_3ksparse.sh
whatbeg Sep 12, 2017
9c0a06b
debug SparseTensor
whatbeg Sep 12, 2017
7b51578
debug SparseTensor
whatbeg Sep 12, 2017
61bfca3
run_3ksparse.sh
whatbeg Sep 12, 2017
6a7f751
debug SparseTensor
whatbeg Sep 12, 2017
b56c502
Merge remote-tracking branch 'origin/widedeep' into widedeep
whatbeg Sep 12, 2017
f58a80a
debug SparseTensor
whatbeg Sep 12, 2017
bbcb637
debug SparseTensor
whatbeg Sep 12, 2017
b5be371
debug SparseTensor
whatbeg Sep 12, 2017
ea77ec6
debug SparseTensor
whatbeg Sep 12, 2017
21d3fd7
debug SparseTensor
whatbeg Sep 13, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,4 @@ nohup.out
.ipynb_checkpoints/
pyspark/dist/
pyspark/build/

*tensor.data
769 changes: 769 additions & 0 deletions LOG/BigDL_3k_dense_09072017.log

Large diffs are not rendered by default.

3,209 changes: 3,209 additions & 0 deletions LOG/BigDL_3k_dense_1152_09072048_F.log

Large diffs are not rendered by default.

3,319 changes: 3,319 additions & 0 deletions LOG/BigDL_3k_sparse_1152_local24_09071646.log

Large diffs are not rendered by default.

3,319 changes: 3,319 additions & 0 deletions LOG/BigDL_3k_sparse_1152_local24_09081700.log

Large diffs are not rendered by default.

2,919 changes: 2,919 additions & 0 deletions LOG/BigDL_3k_sparse_1280_local8_09071536.log

Large diffs are not rendered by default.

16,283 changes: 16,283 additions & 0 deletions census/test.data

Large diffs are not rendered by default.

32,562 changes: 32,562 additions & 0 deletions census/train.data

Large diffs are not rendered by default.

Empty file.
228 changes: 228 additions & 0 deletions pyspark/bigdl/models/widedeep/preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Still in experimental stage!


from optparse import OptionParser
import os
import sys
import copy
import numpy as np
import pandas as pd
import scipy as sp


# Column names, in file order, passed as `names=` to `pd.read_csv` in `get_data`.
CSV_COLUMNS = [
"age", "workclass", "fnlwgt", "education", "education_num",
"marital_status", "occupation", "relationship", "race", "gender",
"capital_gain", "capital_loss", "hours_per_week", "native_country",
"income_bracket"
]

# Integer column positions (0..18) used to index the 2-D ndarray handled by
# `feature_columns` and `make_wide_deep_columns`.
# AGE .. NATIVE_COUNTRY (0..13) line up with the first 14 entries of CSV_COLUMNS.
# AGE_BUCKETS (14) is the position that "income_bracket" occupies in CSV_COLUMNS.
# LABEL (15) is the position right after the 15 CSV columns.
# EDUCATION_OCCUPATION .. NATIVECOUNTRY_OCCUPATION (16..18) address the three
# crossed columns that `feature_columns` appends.
# NOTE(review): `OCCPATION` is a misspelling of "OCCUPATION"; it is kept because
# other code in this file refers to it under that name.
AGE, WORKCLASS, FNLWGT, EDUCATION, EDUCATION_NUM, MARITAL_STATUS, OCCPATION, \
RELATIONSHIP, RACE, GENDER, CAPITAL_GAIN, CAPITAL_LOSS, HOURS_PER_WEEK, NATIVE_COUNTRY, \
AGE_BUCKETS, LABEL, EDUCATION_OCCUPATION, AGEBUCKET_EDUCATION_OCCUPATION, NATIVECOUNTRY_OCCUPATION = range(19)

# Name of the 0/1 target column that `get_data` adds to each DataFrame.
LABEL_COLUMN = "label"
# Names of the string-valued and numeric CSV columns.
# NOTE(review): neither list is referenced in the visible part of this file.
CATEGORICAL_COLUMNS = ["workclass", "education", "marital_status", "occupation",
"relationship", "race", "gender", "native_country"]
CONTINUOUS_COLUMNS = ["age", "education_num", "capital_gain", "capital_loss",
"hours_per_week"]


def get_data(train_file_name='train.data', test_file_name='test.data'):
    """
    load the census train/test CSV files and attach a binary label column

    Rows containing any missing value are dropped. A `LABEL_COLUMN` column is then
    added: 1 when the "income_bracket" text contains ">50K", 0 otherwise.

    :param train_file_name: path of the training CSV file
    :param test_file_name: path of the test CSV file; its first line is skipped
    :return: tuple (df_train, df_test) of pandas DataFrames
    """
    def _load(path, **read_options):
        # Same parser settings for both files; only `read_options` differs.
        frame = pd.read_csv(path,
                            names=CSV_COLUMNS,
                            skipinitialspace=True,
                            engine="python",
                            **read_options)
        frame = frame.dropna(how='any', axis=0)
        above_50k = frame["income_bracket"].apply(lambda bracket: ">50K" in bracket)
        frame[LABEL_COLUMN] = above_50k.astype(int)
        return frame

    train_frame = _load(train_file_name)
    # skip first line of the test file: "|1x3 Cross Validator"
    test_frame = _load(test_file_name, skiprows=1)
    return train_frame, test_frame


def binary_search(val, array, start=0):
    """
    binary search implementation

    Searches the index window [start, len(array) - 1 + start] of a sorted `array`.

    NOTE(review): `array` is always indexed directly with the probe position, so a
    non-zero `start` can probe `array[len(array)]`; the caller visible in this file
    relies on the default of 0.

    :param val: value to search
    :param array: data array to be searched, sorted in ascending order
    :param start: 0 if array starts with 0 else 1
    :return: location of val in array, or bucket fall in if not in array
             (i.e. the position at which val would have to be inserted)
    """
    low = start
    high = len(array) - 1 + start
    while low <= high:
        # Floor division keeps `mid` an integer on Python 3 as well as Python 2;
        # true division would yield a float, which is not a valid index.
        mid = (low + high) // 2
        if array[mid] == val:
            return mid
        elif array[mid] > val:
            high = mid - 1
        else:
            low = mid + 1
    return low


def bucketized_column(column, boundaries, start=0):
    """
    replace every value of a column by the id of the bucket it falls in

    The input column is left untouched: a deep copy is filled with bucket ids and
    returned. Bucket positions come from `binary_search` over `boundaries`.

    :param column: column holding the raw values
    :param boundaries: bucket boundaries handed to `binary_search`
    :param start: offset added to every bucket position (0- or 1-based ids)
    :return: copy of `column` whose values are bucket ids
    """
    bucketized = copy.deepcopy(column)
    for position in range(len(bucketized)):
        bucket = binary_search(bucketized[position], boundaries)
        bucketized[position] = bucket + start
    return bucketized


def cross_column(columns, hash_backet_size=1000, start=1):
    """
    build a hashed cross feature out of several columns.

    For every row the values are converted to str, joined with "_" and hashed with
    the builtin `hash`; the hash is then folded into `hash_backet_size` buckets.

    :param columns: 2-D ndarray, one row per example, one column per feature to cross
    :param hash_backet_size: number of hash buckets
    :param start: offset added to the folded hash, i.e. the id of the first bucket
    :return: 1-D ndarray with one bucket id per row
    """
    n_rows = columns.shape[0]
    assert n_rows > 0 and columns.shape[1] > 0

    def _bucket_id(row):
        key = "_".join(str(item) for item in row)
        # Add-then-mod-again form kept from the original; it guards against a
        # negative remainder.
        folded = (hash(key) % hash_backet_size + hash_backet_size) % hash_backet_size
        return folded + start

    crossed = np.zeros(n_rows)
    for row_index in range(n_rows):
        crossed[row_index] = _bucket_id(columns[row_index, :])
    return crossed

def categorical_column_with_vocabulary_list(column, vocab_list, default=1):
    """
    replace every value of a column by its 1-based position in `vocab_list`

    :param column: 1-D ndarray of raw categorical values
    :param vocab_list: known values; the k-th entry (counting from 1) is encoded as k
    :param default: id used for values that are not in `vocab_list`; note that the
                    default of 1 is also the id of the first vocabulary entry
    :return: 1-D ndarray holding one id per row
    """
    n_rows = column.shape[0]
    assert n_rows > 0 and len(vocab_list) > 0

    word_to_id = {word: position for position, word in enumerate(vocab_list, 1)}

    encoded = np.zeros(n_rows)
    for row in range(n_rows):
        encoded[row] = word_to_id.get(column[row], default)
    return encoded

def sparse_column(column, vocab_size):
    """
    expand 1-based integer ids into one-hot rows.
    For example, with vocab_size 5: 3 -> [0, 0, 1, 0, 0]

    :param column: the whole column with integer ids of this feature, Type: ndarray
    :param vocab_size: length of every one-hot row; ids must lie in [1, vocab_size]
    :return: 2-D ndarray of shape (len(column), vocab_size) with a single 1 per row
    """
    n_rows = column.shape[0]
    assert n_rows > 0 and vocab_size > 0

    one_hot = np.zeros((n_rows, vocab_size))
    for row, raw_id in enumerate(column):
        hot = int(raw_id)
        assert 0 < hot <= vocab_size
        # ids are 1-based, positions inside the row are 0-based
        one_hot[row, hot - 1] = 1
    return one_hot


def feature_columns(df):
    """
    turn the raw census columns into integer ids and append the crossed features

    The categorical columns GENDER, EDUCATION, MARITAL_STATUS, RELATIONSHIP and
    WORKCLASS are replaced in place by 1-based vocabulary ids, OCCPATION and
    NATIVE_COUNTRY by hash bucket ids in [1, 1000], and column AGE_BUCKETS is
    overwritten with the age bucket id. Three crossed columns are then appended.

    The appended columns are ordered so that they sit at the positions named by
    EDUCATION_OCCUPATION, AGEBUCKET_EDUCATION_OCCUPATION and
    NATIVECOUNTRY_OCCUPATION (previously the last two were swapped relative to
    those index constants).

    NOTE(review): the builtin `hash` of a str is salted per process on Python 3
    unless PYTHONHASHSEED is fixed, so the hashed ids are only stable within one run.

    :param df: 2-D ndarray of raw rows; modified in place for the existing columns
    :return: new 2-D ndarray: `df` plus the three crossed columns
    """
    hash_bucket_size = 1000

    age_boundaries = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
    age_bucket = bucketized_column(df[:, AGE], boundaries=age_boundaries, start=1)
    df[:, AGE_BUCKETS] = age_bucket
    df[:, GENDER] = categorical_column_with_vocabulary_list(df[:, GENDER], ["Female", "Male"])
    df[:, EDUCATION] = categorical_column_with_vocabulary_list(df[:, EDUCATION], [  # 16
        "Bachelors", "HS-grad", "11th", "Masters", "9th",
        "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
        "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
        "Preschool", "12th"
    ])
    df[:, MARITAL_STATUS] = categorical_column_with_vocabulary_list(df[:, MARITAL_STATUS], [  # 7
        "Married-civ-spouse", "Divorced", "Married-spouse-absent",
        "Never-married", "Separated", "Married-AF-spouse", "Widowed"
    ])
    df[:, RELATIONSHIP] = categorical_column_with_vocabulary_list(df[:, RELATIONSHIP], [  # 6
        "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
        "Other-relative"
    ])
    df[:, WORKCLASS] = categorical_column_with_vocabulary_list(df[:, WORKCLASS], [  # 9
        "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
        "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
    ])

    # Hash the two high-cardinality string columns into ids 1..hash_bucket_size.
    for i in range(df.shape[0]):
        df[i, OCCPATION] = \
            (hash(df[i, OCCPATION]) % hash_bucket_size + hash_bucket_size) % hash_bucket_size + 1
        df[i, NATIVE_COUNTRY] = \
            (hash(df[i, NATIVE_COUNTRY]) % hash_bucket_size + hash_bucket_size) % hash_bucket_size + 1

    education_occupation = cross_column(
        df[:, [EDUCATION, OCCPATION]], hash_backet_size=hash_bucket_size)
    agebucket_education_occupation = cross_column(
        df[:, [AGE_BUCKETS, EDUCATION, OCCPATION]], hash_backet_size=hash_bucket_size)
    nativecountry_occupation = cross_column(
        df[:, [NATIVE_COUNTRY, OCCPATION]], hash_backet_size=hash_bucket_size)

    # Append in the order of the index constants:
    # EDUCATION_OCCUPATION, AGEBUCKET_EDUCATION_OCCUPATION, NATIVECOUNTRY_OCCUPATION.
    df = np.c_[df, education_occupation, agebucket_education_occupation,
               nativecountry_occupation]
    return df


def make_wide_deep_columns(df):
    """
    assemble the final wide & deep feature matrix followed by the label

    Column layout of the result, left to right: base (wide) columns, crossed
    columns, deep columns, label.

    :param df: 2-D ndarray as returned by `feature_columns`
    :return: 2-D ndarray with one row per example
    """
    def _one_hot(col_index, width):
        return sparse_column(df[:, col_index], width)

    # wide part: plain ids plus the hashed occupation / native country one-hots
    base_part = np.c_[
        df[:, GENDER], df[:, EDUCATION], df[:, MARITAL_STATUS], df[:, RELATIONSHIP],
        df[:, WORKCLASS], _one_hot(OCCPATION, 1000), _one_hot(NATIVE_COUNTRY, 1000),
        df[:, AGE_BUCKETS]
    ]

    # wide part: one-hot encoded crossed features
    crossed_part = np.c_[
        _one_hot(EDUCATION_OCCUPATION, 1000),
        _one_hot(AGEBUCKET_EDUCATION_OCCUPATION, 1000),
        _one_hot(NATIVECOUNTRY_OCCUPATION, 1000)
    ]

    # deep part: small one-hots, two raw ids for embedding (8 dims), continuous values
    deep_part = np.c_[
        _one_hot(WORKCLASS, 9), _one_hot(EDUCATION, 16), _one_hot(GENDER, 2),
        _one_hot(RELATIONSHIP, 6),
        df[:, NATIVE_COUNTRY],
        df[:, OCCPATION],
        df[:, AGE], df[:, EDUCATION_NUM], df[:, CAPITAL_GAIN],
        df[:, CAPITAL_LOSS], df[:, HOURS_PER_WEEK]
    ]

    return np.c_[base_part, crossed_part, deep_part, df[:, LABEL]]


def handle():
    """
    run the whole preprocessing pipeline and write the tensors to ./data

    Reads the train/test files through `get_data` (default file names), converts
    them with `feature_columns` and `make_wide_deep_columns`, and writes the
    results as comma separated integers to "./data/train_tensor.data" and
    "./data/test_tensor.data".

    :return: None
    """
    # np.savetxt does not create missing directories, so make sure the target
    # exists before the expensive preprocessing starts.
    output_dir = "./data"
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    df_train, df_test = get_data()
    df_train = np.array(df_train)
    df_test = np.array(df_test)
    df_train = feature_columns(df_train)
    df_test = feature_columns(df_test)

    train_data = make_wide_deep_columns(df_train[:])
    np.savetxt("./data/train_tensor.data", train_data, fmt="%d", delimiter=',')
    # release the large train matrix before the test matrix is built
    del train_data

    test_data = make_wide_deep_columns(df_test[:])
    np.savetxt("./data/test_tensor.data", test_data, fmt="%d", delimiter=',')
    del test_data

# Module-level entry point: the preprocessing runs whenever this file is executed
# or imported.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so a plain import
# also reads the input files and writes the output files - confirm this is intended.
handle()
Loading