# data_processor_loader.py
import os
import zipfile
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from scipy.interpolate import CubicSpline
from torch.utils.data import Dataset, DataLoader
import torch as t

path = os.getcwd()


class OhioT1DMDataset(Dataset):
    def __init__(self, data_dirs, seq_length):
        self.seq_length = seq_length
        # Initialize an empty DataFrame
        merged_data = pd.DataFrame()
        dataframes = []
        # Load and merge data from all directories provided
        for data_dir in data_dirs:
            for subdir, dirs, files in os.walk(data_dir):
                for file in files:
                    file_path = os.path.join(subdir, file)
                    # Assuming the data is in CSV format
                    data_df = pd.read_csv(file_path)
                    # Append the data to the merged_data DataFrame
                    merged_data = pd.concat([merged_data, data_df])
                    dataframes.append(data_df)
        merged_data.reset_index(inplace=True)
        # Fit the scaler and per-column fill values on the merged data,
        # then preprocess each file's DataFrame separately
        scaler, fill_values = get_scaler(merged_data)
        self.scaler = scaler
        self.preprocessed_dfs = [preprocess(scaler, fill_values, df) for df in dataframes]
        # Convert each preprocessed DataFrame to a PyTorch tensor
        self.data = [t.tensor(df.values, dtype=t.float32) for df in self.preprocessed_dfs]

    def __len__(self):
        # Return the total number of sequences available across all files
        return sum(len(data) - self.seq_length + 1 for data in self.data)

    def __getitem__(self, index):
        # Find which file's tensor this global index falls into
        data_idx = 0
        while index >= len(self.data[data_idx]) - self.seq_length + 1:
            index -= len(self.data[data_idx]) - self.seq_length + 1
            data_idx += 1
        # Extract the sequence
        sequence = self.data[data_idx][index:index + self.seq_length]
        # Split the sequence into inputs and target
        inputs = sequence[:-1, :]  # All but the last time step
        target = sequence[-1, :]   # Only the last time step
        return inputs, target

# Build a DataLoader over sliding-window sequences drawn from a list of data directories
def create_dataloader(data_dirs, seq_length, batch_size):
    # Instantiate the custom dataset
    dataset = OhioT1DMDataset(data_dirs, seq_length)
    # Create a DataLoader
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    def unscale(data):
        # Map a single scaled sequence of shape (1, steps, features) back to original units
        data = data.squeeze(0).detach().cpu().numpy()
        df = pd.DataFrame(data, columns=dataset.preprocessed_dfs[0].columns)
        unscaled = dataset.scaler.inverse_transform(df)
        return t.tensor(unscaled)[None]

    # Attach the helper so callers can undo the MinMaxScaler on sequences or model outputs
    dataloader.unscale = unscale
    return dataloader

# Directories for the combined 2018 and 2020 cohorts; a usage sketch follows below
train_data_dirs = [
    f"{path}/Ohio Data/Ohio2018_processed/train",
    f"{path}/Ohio Data/Ohio2020_processed/train"
]
test_data_dirs = [
    f"{path}/Ohio Data/Ohio2018_processed/test",
    f"{path}/Ohio Data/Ohio2020_processed/test"
]
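
# A minimal usage sketch, guarded so it only runs when this file is executed directly.
# The seq_length and batch_size values are illustrative assumptions (13 five-minute steps,
# i.e. one hour of history plus one target), and the directories above must exist and
# contain the processed OhioT1DM CSV files.
if __name__ == "__main__":
    train_loader = create_dataloader(train_data_dirs, seq_length=13, batch_size=32)
    inputs, target = next(iter(train_loader))
    # inputs: (batch_size, seq_length - 1, n_features); target: (batch_size, n_features)
    print(inputs.shape, target.shape)
    # Undo the scaling on one input sequence to recover original units (e.g. cbg in mg/dL)
    print(train_loader.unscale(inputs[:1]))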

def preprocess(scaler, fill_values, data_df):
    data_df1 = data_df.copy()
    # Identify rows where 'missing_cbg' is 1
    missing_cbg_indices = data_df1[data_df1['missing_cbg'] == 1].index
    # Perform cubic spline interpolation for the 'cbg' column
    cs = CubicSpline(
        data_df1.index[~data_df1.index.isin(missing_cbg_indices)],
        data_df1.loc[~data_df1.index.isin(missing_cbg_indices), 'cbg']
    )
    data_df1.loc[missing_cbg_indices, 'cbg'] = cs(missing_cbg_indices)
    # Drop the timestamp and the missing-data flag
    data_df1 = data_df1.drop(columns=['5minute_intervals_timestamp', 'missing_cbg'])
    # Move 'cbg' to the end
    cbg = data_df1.pop('cbg')
    data_df1 = data_df1.assign(cbg=cbg)
    # Replace any remaining NaNs with the per-column fill values computed in get_scaler
    values = data_df1.values
    values = np.where(np.isnan(values), fill_values, values)
    data_df2 = pd.DataFrame(values, columns=data_df1.columns)
    # Scale with the MinMaxScaler fitted on the merged data
    data_df2 = pd.DataFrame(scaler.transform(data_df2), columns=data_df2.columns)
    return data_df2
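
# Worked example of the cubic-spline gap filling above (illustrative values): if 'cbg' is
# observed at indices 0, 1, 4, 5 with values 100, 105, 120, 125 and flagged missing at
# indices 2 and 3, CubicSpline fits a smooth curve through the observed points and fills
# the gap with 110 and 115 (exactly, because these particular points are collinear).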

def get_scaler(data_df):
    data_df1 = data_df.copy()
    # Identify rows where 'missing_cbg' is 1
    missing_cbg_indices = data_df1[data_df1['missing_cbg'] == 1].index
    # Perform cubic spline interpolation for the 'cbg' column
    cs = CubicSpline(
        data_df1.index[~data_df1.index.isin(missing_cbg_indices)],
        data_df1.loc[~data_df1.index.isin(missing_cbg_indices), 'cbg']
    )
    data_df1.loc[missing_cbg_indices, 'cbg'] = cs(missing_cbg_indices)
    # Move 'cbg' to the end
    cbg = data_df1.pop('cbg')
    data_df1 = data_df1.assign(cbg=cbg)
    # Drop the timestamp, the missing-data flag, and the 'index' column added by reset_index
    data_df1 = data_df1.drop(columns=['5minute_intervals_timestamp', 'missing_cbg', 'index'])
    # Calculate the minimum of each column
    column_mins = data_df1.min()
    # Shift each minimum down by 1% of its absolute value so the fill values sit just below the observed range
    fill_values = column_mins - 0.01 * np.abs(column_mins)
    # Fill missing values with the calculated values
    data_df2 = data_df1.fillna(fill_values)
    # Initialize a scaler, then fit it to the filled features
    scaler = MinMaxScaler()  # default feature_range=(0, 1)
    scaler.fit(data_df2)
    assert np.isnan(fill_values.values).sum() == 0, 'fill_values contains NaN (an entire column is missing)'
    return scaler, fill_values
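
# Worked example of the fill-value rule above (illustrative numbers, not taken from the data):
#   column min =  40.0  ->  fill value = 40.0 - 0.01 * |40.0| = 39.6
#   column min =  -5.0  ->  fill value = -5.0 - 0.01 * |-5.0| = -5.05
# The fill lands just below each column's observed minimum, so after the MinMaxScaler is
# fitted on the filled data, filled entries map to 0 while genuinely observed values map
# above 0 (the two coincide only when a column's minimum is exactly 0).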