# PreProcess.py
import ast
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Define chunk size (adjust if memory issues persist)
CHUNK_SIZE = 10000
# Step 1: Initialize lists to store preprocessed data
X_chunks = []
y_reg_chunks = []  # Steer, Accel, Brake
y_cls_chunks = []  # Gear_output (one-hot)

# List of CSV files
files = [
    "Lancer_Dirt.csv", "Lancer_Oval.csv", "Lancer_Road.csv",
    "Corolla_Dirt.csv", "Corolla_Oval.csv", "Corolla_Road.csv",
    "Peugeot_Dirt.csv", "Peugeot_Oval.csv", "Peugeot_Road.csv",
    "datap406.csv", "corolla_data.csv", "Lancer_data.csv",
]
# Expected columns (from sample data)
expected_columns = [
    "Angle", "TrackPos", "DistFromStart", "DistRaced", "Z",
    "SpeedX", "SpeedY", "SpeedZ", "Gear", "RPM", "Fuel", "Damage",
    "RacePos", "CurLapTime", "LastLapTime", "Focus", "Track",
    "Opponents", "WheelSpinVel", "Steer", "Accel", "Brake",
    "Clutch", "ControlFocus", "Gear_output", "Meta",
]
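
# The final input array has 25 columns, in this order (a summary of
# Steps 6-7 below):
#   0-18  Track        19 range-finder readings, clipped to [0, 1] (/200 m)
#   19    TrackPos     mapped [-1, 1] -> [0, 1]
#   20    Angle        mapped [-π, π] -> [0, 1]
#   21    SpeedX       clipped to [0, 1] (/100 m/s)
#   22    Gear         mapped [-1, 6] -> [0, 1]
#   23    RPM          clipped to [0, 1] (/10000)
#   24    MinOpponent  nearest-opponent distance, clipped to [0, 1] (/200 m)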
# Step 2: Process each CSV file in chunks
for file in files:
    if not os.path.exists(file):
        print(f"Error: {file} not found")
        continue
    print(f"Processing {file}...")
    # Read CSV in chunks
    for chunk in pd.read_csv(file, chunksize=CHUNK_SIZE):
        # Verify columns: some logs label the gear command "Gear.1", and
        # some carry prediction columns that are not needed here
        if "Gear.1" in chunk.columns:
            chunk.rename(columns={"Gear.1": "Gear_output"}, inplace=True)
        chunk = chunk.drop(
            columns=["Pred_Accel", "Pred_Gear", "Pred_Brake", "Pred_Steer"],
            errors="ignore",
        )
        if set(chunk.columns) != set(expected_columns):
            # Symmetric difference reports both missing and extra columns
            print(f"Warning: {file} has mismatched columns: "
                  f"{set(chunk.columns) ^ set(expected_columns)}")
            continue
        # Select relevant columns early to reduce memory
        chunk = chunk[["Track", "TrackPos", "Angle", "SpeedX", "Gear", "RPM",
                       "Opponents", "Steer", "Accel", "Brake", "Gear_output"]]
        # Step 3: Parse list columns, stored in the CSV as string
        # representations of Python lists (e.g. "[7.5, 8.1, ...]").
        # ast.literal_eval raises SyntaxError as well as ValueError on
        # malformed input, so catch both.
        try:
            chunk["Track"] = chunk["Track"].apply(ast.literal_eval)
            chunk["Opponents"] = chunk["Opponents"].apply(ast.literal_eval)
        except (ValueError, SyntaxError) as e:
            print(f"Error parsing list columns in {file}: {e}")
            continue
        # Step 4: Clean data
        # Keep only rows whose outputs fall in the valid control ranges
        chunk = chunk[
            (chunk["Steer"].between(-1, 1)) &
            (chunk["Accel"].between(0, 1)) &
            (chunk["Brake"].between(0, 1)) &
            (chunk["Gear_output"].isin([-1, 0, 1, 2, 3, 4, 5, 6]))
        ]
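        # Out-of-range rows are dropped rather than clipped; if a file seems
        # to contribute little data, printing len(chunk) before and after
        # this filter is a quick (optional) diagnostic.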
        # Step 5: Derive MinOpponent (distance to the nearest opponent)
        chunk["MinOpponent"] = chunk["Opponents"].apply(min)
        # Step 6: Create input array
        X_chunk = np.hstack([
            np.array(chunk["Track"].tolist()),  # 19 range-finder columns
            chunk[["TrackPos", "Angle", "SpeedX", "Gear", "RPM",
                   "MinOpponent"]].values,
        ])
        # Step 7: Normalize inputs
        # Track: divide by 200 m (sensor maximum)
        X_chunk[:, :19] = np.clip(X_chunk[:, :19] / 200, 0, 1)
        # TrackPos: map [-1, 1] to [0, 1]
        X_chunk[:, 19] = (X_chunk[:, 19] + 1) / 2
        # Angle: map [-π, π] to [0, 1]
        X_chunk[:, 20] = (X_chunk[:, 20] + np.pi) / (2 * np.pi)
        # SpeedX: divide by 100 m/s
        X_chunk[:, 21] = np.clip(X_chunk[:, 21] / 100, 0, 1)
        # Gear: map [-1, 6] to [0, 1]
        X_chunk[:, 22] = (X_chunk[:, 22] + 1) / 7
        # RPM: divide by 10000
        X_chunk[:, 23] = np.clip(X_chunk[:, 23] / 10000, 0, 1)
        # MinOpponent: divide by 200 m
        X_chunk[:, 24] = np.clip(X_chunk[:, 24] / 200, 0, 1)
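        # Optional sanity check (commented out; a debug aid, not part of the
        # pipeline). TrackPos can stray outside its nominal [-1, 1] range
        # when the car is off-track, so values outside [0, 1] here usually
        # point at off-track samples rather than a bug:
        # lo, hi = X_chunk.min(axis=0), X_chunk.max(axis=0)
        # print(f"feature ranges: min={lo.round(3)}, max={hi.round(3)}")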
        # Step 8: Outputs
        # Regression outputs: Steer, Accel, Brake
        y_reg_chunk = chunk[["Steer", "Accel", "Brake"]].values
        # Classification output: Gear_output (one-hot encoded). The category
        # list is fixed, so fitting per chunk is deterministic and every
        # chunk gets the same eight columns in the same order.
        gear_encoder = OneHotEncoder(
            categories=[[-1, 0, 1, 2, 3, 4, 5, 6]], sparse_output=False
        )
        y_cls_chunk = gear_encoder.fit_transform(chunk[["Gear_output"]])
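        # For example, a row with Gear_output == 2 becomes
        # [0, 0, 0, 1, 0, 0, 0, 0]: a 1 at index 3, the position of 2 in
        # the fixed category order [-1, 0, 1, 2, 3, 4, 5, 6].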
        # Append to chunks
        X_chunks.append(X_chunk)
        y_reg_chunks.append(y_reg_chunk)
        y_cls_chunks.append(y_cls_chunk)

# Step 9: Combine chunks
if not X_chunks:
    raise ValueError("No valid data processed from CSV files")
X = np.vstack(X_chunks)
y = {
    "regression": np.vstack(y_reg_chunks),      # Steer, Accel, Brake
    "classification": np.vstack(y_cls_chunks),  # Gear_output (one-hot)
}
print(f"Combined data shape: X={X.shape}, "
      f"y_regression={y['regression'].shape}, "
      f"y_classification={y['classification'].shape}")
# Step 10: Split data; passing all three arrays to one train_test_split
# call keeps inputs, regression targets, and classification targets
# aligned on the same 80/20 row permutation
(X_train, X_val,
 y_reg_train, y_reg_val,
 y_cls_train, y_cls_val) = train_test_split(
    X, y["regression"], y["classification"],
    test_size=0.2, random_state=42,
)
# Step 11: Save preprocessed data
np.save("X_train.npy", X_train)
np.save("y_reg_train.npy", y_reg_train) # Steer, Accel, Brake
np.save("y_cls_train.npy", y_cls_train) # Gear_output (one-hot)
np.save("X_val.npy", X_val)
np.save("y_reg_val.npy", y_reg_val)
np.save("y_cls_val.npy", y_cls_val)
# Print shapes for verification
print(f"X_train shape: {X_train.shape}") # (~92807, 25)
print(f"y_reg_train shape: {y_reg_train.shape}") # (~92807, 3)
print(f"y_cls_train shape: {y_cls_train.shape}") # (~92807, 8)
print(f"X_val shape: {X_val.shape}")
print(f"y_reg_val shape: {y_reg_val.shape}")
print(f"y_cls_val shape: {y_cls_val.shape}")