Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions dowhy/utils/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def one_hot_encode(data: pd.DataFrame, columns=None, drop_first: bool = False, e

# Columns to keep in the result - not encoded.
columns_to_keep = data.columns.difference(data_to_encode.columns)
df_columns_to_keep = data[columns_to_keep].reset_index(drop=True)
df_columns_to_keep = data[columns_to_keep]

if encoder is None: # Create new encoder
drop = None
Expand All @@ -51,10 +51,12 @@ def one_hot_encode(data: pd.DataFrame, columns=None, drop_first: bool = False, e
else: # Use existing encoder
encoded_data = encoder.transform(data_to_encode)

# Convert the encoded data to a DataFrame
# Convert the encoded data to a DataFrame, preserving the original index so that
# callers relying on index alignment (e.g. distance matching with a data subset) work
# correctly.
columns_encoded = encoder.get_feature_names_out(data_to_encode.columns)

df_encoded = pd.DataFrame(encoded_data, columns=columns_encoded).reset_index(drop=True) # drop index from original
df_encoded = pd.DataFrame(encoded_data, columns=columns_encoded, index=data_to_encode.index)

# Concatenate the encoded DataFrame with the original non-categorical columns
df_result = pd.concat([df_columns_to_keep, df_encoded], axis=1)
Expand Down
26 changes: 26 additions & 0 deletions tests/utils/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,29 @@ def test_one_hot_encode_consistent_with_new_data():
c_z2 = df_encoded2["C_Z"]
assert c_z1[2] == c_z2[1]
assert c_z1[5] == c_z2[5]


def test_one_hot_encode_preserves_index():
    """Check that encoding keeps a non-sequential index intact.

    Regression test for https://github.com/py-why/dowhy/issues/1372.

    Encoding a DataFrame whose index is not the default 0..n-1 range must
    return a frame carrying the very same index labels; otherwise
    index-aligned operations downstream (e.g. pd.concat or boolean .loc
    indexing) stop lining up. Per the linked issue, this situation arises
    when a data-subset refuter hands a sampled slice of the original
    DataFrame to an estimator such as DistanceMatchingEstimator.
    """
    full_frame = pd.DataFrame(
        {
            "cat": ["a", "b", "a", "c", "b"],
            "num": [1.0, 2.0, 3.0, 4.0, 5.0],
        }
    )
    # Positional selection on the default index keeps the labels 1, 3, 4,
    # mimicking a refuter that samples rows without resetting the index.
    sampled_positions = [1, 3, 4]
    sampled = full_frame.iloc[sampled_positions]

    encoded_frame, _encoder = one_hot_encode(sampled)

    assert encoded_frame.index.tolist() == [1, 3, 4], "Index must be preserved after encoding"


def test_one_hot_encode_preserves_index_no_categorical():
    """Check the index survives when the input has only a numeric column."""
    # A purely numeric frame with custom, non-contiguous index labels.
    numeric_only = pd.DataFrame(
        {"x": [1.0, 2.0, 3.0]},
        index=[5, 10, 15],
    )

    encoded_frame, _encoder = one_hot_encode(numeric_only)

    assert encoded_frame.index.tolist() == [5, 10, 15], "Index must be preserved when no encoding is needed"
Loading