# data_processor_loader.py
import os
import zipfile
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from scipy.interpolate import CubicSpline
from torch.utils.data import Dataset, DataLoader
import torch as t

path = os.getcwd()


class OhioT1DMDataset(Dataset):
    def __init__(self, data_dirs, seq_length):
        self.seq_length = seq_length
        # Initialize an empty DataFrame
        merged_data = pd.DataFrame()
        dataframes = []
        # Load and merge data from all directories provided
        for data_dir in data_dirs:
            for subdir, dirs, files in os.walk(data_dir):
                for file in files:
                    file_path = os.path.join(subdir, file)
                    # Assuming the data is in CSV format
                    data_df = pd.read_csv(file_path)
                    # Append the data to the merged_data DataFrame
                    merged_data = pd.concat([merged_data, data_df])
                    dataframes.append(data_df)
        merged_data.reset_index(inplace=True)
        # Fit the scaler and per-column fill values on the merged data,
        # then preprocess each file's DataFrame separately
        scaler, fill_values = get_scaler(merged_data)
        self.scaler = scaler
        self.preprocessed_dfs = [preprocess(scaler, fill_values, df) for df in dataframes]
        # Convert each preprocessed DataFrame to a PyTorch tensor
        self.data = [t.tensor(df.values, dtype=t.float32) for df in self.preprocessed_dfs]

    def __len__(self):
        # Return the total number of sequences available across all files
        return sum(len(data) - self.seq_length + 1 for data in self.data)

    def __getitem__(self, index):
        # Find which file's tensor this global index falls into
        data_idx = 0
        while index >= len(self.data[data_idx]) - self.seq_length + 1:
            index -= len(self.data[data_idx]) - self.seq_length + 1
            data_idx += 1
        # Extract the sequence
        sequence = self.data[data_idx][index:index + self.seq_length]
        # Split the sequence into inputs and target
        inputs = sequence[:-1, :]  # All but the last time step
        target = sequence[-1, :]   # Only the last time step
        return inputs, target

# Build a DataLoader over sliding-window sequences drawn from a list of data directories
def create_dataloader(data_dirs, seq_length, batch_size):
    # Instantiate the custom dataset
    dataset = OhioT1DMDataset(data_dirs, seq_length)
    # Create a DataLoader
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    def unscale(data):
        # Map a single scaled sequence of shape (1, steps, features) back to original units
        data = data.squeeze(0).detach().cpu().numpy()
        df = pd.DataFrame(data, columns=dataset.preprocessed_dfs[0].columns)
        unscaled = dataset.scaler.inverse_transform(df)
        return t.tensor(unscaled)[None]

    # Attach the helper so callers can undo the MinMaxScaler on sequences or model outputs
    dataloader.unscale = unscale
    return dataloader

# Directories for the combined 2018 and 2020 cohorts; a usage sketch follows below
train_data_dirs = [
    f"{path}/Ohio Data/Ohio2018_processed/train",
    f"{path}/Ohio Data/Ohio2020_processed/train"
]
test_data_dirs = [
    f"{path}/Ohio Data/Ohio2018_processed/test",
    f"{path}/Ohio Data/Ohio2020_processed/test"
]
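
# A minimal usage sketch, guarded so it only runs when this file is executed directly.
# The seq_length and batch_size values are illustrative assumptions (13 five-minute steps,
# i.e. one hour of history plus one target), and the directories above must exist and
# contain the processed OhioT1DM CSV files.
if __name__ == "__main__":
    train_loader = create_dataloader(train_data_dirs, seq_length=13, batch_size=32)
    inputs, target = next(iter(train_loader))
    # inputs: (batch_size, seq_length - 1, n_features); target: (batch_size, n_features)
    print(inputs.shape, target.shape)
    # Undo the scaling on one input sequence to recover original units (e.g. cbg in mg/dL)
    print(train_loader.unscale(inputs[:1]))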

def preprocess(scaler, fill_values, data_df):
    data_df1 = data_df.copy()
    # Identify rows where 'missing_cbg' is 1
    missing_cbg_indices = data_df1[data_df1['missing_cbg'] == 1].index
    # Perform cubic spline interpolation for the 'cbg' column
    cs = CubicSpline(
        data_df1.index[~data_df1.index.isin(missing_cbg_indices)],
        data_df1.loc[~data_df1.index.isin(missing_cbg_indices), 'cbg']
    )
    data_df1.loc[missing_cbg_indices, 'cbg'] = cs(missing_cbg_indices)
    # Drop the timestamp and the missing-data flag
    data_df1 = data_df1.drop(columns=['5minute_intervals_timestamp', 'missing_cbg'])
    # Move 'cbg' to the end
    cbg = data_df1.pop('cbg')
    data_df1 = data_df1.assign(cbg=cbg)
    # Replace any remaining NaNs with the per-column fill values computed in get_scaler
    values = data_df1.values
    values = np.where(np.isnan(values), fill_values, values)
    data_df2 = pd.DataFrame(values, columns=data_df1.columns)
    # Scale with the MinMaxScaler fitted on the merged data
    data_df2 = pd.DataFrame(scaler.transform(data_df2), columns=data_df2.columns)
    return data_df2
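
# Worked example of the cubic-spline gap filling above (illustrative values): if 'cbg' is
# observed at indices 0, 1, 4, 5 with values 100, 105, 120, 125 and flagged missing at
# indices 2 and 3, CubicSpline fits a smooth curve through the observed points and fills
# the gap with 110 and 115 (exactly, because these particular points are collinear).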

def get_scaler(data_df):
    data_df1 = data_df.copy()
    # Identify rows where 'missing_cbg' is 1
    missing_cbg_indices = data_df1[data_df1['missing_cbg'] == 1].index
    # Perform cubic spline interpolation for the 'cbg' column
    cs = CubicSpline(
        data_df1.index[~data_df1.index.isin(missing_cbg_indices)],
        data_df1.loc[~data_df1.index.isin(missing_cbg_indices), 'cbg']
    )
    data_df1.loc[missing_cbg_indices, 'cbg'] = cs(missing_cbg_indices)
    # Move 'cbg' to the end
    cbg = data_df1.pop('cbg')
    data_df1 = data_df1.assign(cbg=cbg)
    # Drop the timestamp, the missing-data flag, and the 'index' column added by reset_index
    data_df1 = data_df1.drop(columns=['5minute_intervals_timestamp', 'missing_cbg', 'index'])
    # Calculate the minimum of each column
    column_mins = data_df1.min()
    # Shift each minimum down by 1% of its absolute value so the fill values sit just below the observed range
    fill_values = column_mins - 0.01 * np.abs(column_mins)
    # Fill missing values with the calculated values
    data_df2 = data_df1.fillna(fill_values)
    # Initialize a scaler, then fit it to the filled features
    scaler = MinMaxScaler()  # default feature_range=(0, 1)
    scaler.fit(data_df2)
    assert np.isnan(fill_values.values).sum() == 0, 'fill_values contains NaN (an entire column is missing)'
    return scaler, fill_values
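
# Worked example of the fill-value rule above (illustrative numbers, not taken from the data):
#   column min =  40.0  ->  fill value = 40.0 - 0.01 * |40.0| = 39.6
#   column min =  -5.0  ->  fill value = -5.0 - 0.01 * |-5.0| = -5.05
# The fill lands just below each column's observed minimum, so after the MinMaxScaler is
# fitted on the filled data, filled entries map to 0 while genuinely observed values map
# above 0 (the two coincide only when a column's minimum is exactly 0).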