py-why · github-actions · Mar 20, 2026 · Apr 17, 2026 · Apr 17, 2026
diff --git a/dowhy/utils/encoding.py b/dowhy/utils/encoding.py
@@ -38,7 +38,7 @@ def one_hot_encode(data: pd.DataFrame, columns=None, drop_first: bool = False, e
 
     # Columns to keep in the result - not encoded.
     columns_to_keep = data.columns.difference(data_to_encode.columns)
-    df_columns_to_keep = data[columns_to_keep].reset_index(drop=True)
+    df_columns_to_keep = data[columns_to_keep]
 
     if encoder is None:  # Create new encoder
         drop = None
@@ -51,10 +51,12 @@ def one_hot_encode(data: pd.DataFrame, columns=None, drop_first: bool = False, e
     else:  # Use existing encoder
         encoded_data = encoder.transform(data_to_encode)
 
-    # Convert the encoded data to a DataFrame
+    # Convert the encoded data to a DataFrame, preserving the original index so that
+    # callers relying on index alignment (e.g. distance matching with a data subset) work
+    # correctly.
     columns_encoded = encoder.get_feature_names_out(data_to_encode.columns)
 
-    df_encoded = pd.DataFrame(encoded_data, columns=columns_encoded).reset_index(drop=True)  # drop index from original
+    df_encoded = pd.DataFrame(encoded_data, columns=columns_encoded, index=data_to_encode.index)
 
     # Concatenate the encoded DataFrame with the original non-categorical columns
     df_result = pd.concat([df_columns_to_keep, df_encoded], axis=1)

diff --git a/tests/utils/test_encoding.py b/tests/utils/test_encoding.py
@@ -85,3 +85,34 @@ def test_one_hot_encode_consistent_with_new_data():
     c_z2 = df_encoded2["C_Z"]
     assert c_z1[2] == c_z2[1]
     assert c_z1[5] == c_z2[5]
+
+
+def test_one_hot_encode_preserves_index():
+    """Regression test for https://github.com/py-why/dowhy/issues/1372.
+
+    When a DataFrame with a non-default (non-sequential) index is encoded,
+    the output must retain the original index so that index-aligned operations
+    downstream (e.g. pd.concat or boolean .loc indexing) continue to work
+    correctly. This scenario occurs whenever a data-subset refuter passes a
+    sampled subset of the original DataFrame to an estimator like
+    DistanceMatchingEstimator.
+    """
+    data = pd.DataFrame({"cat": ["a", "b", "a", "c", "b"], "num": [1.0, 2.0, 3.0, 4.0, 5.0]})
+    # Simulate a data-subset refuter that samples rows without resetting the index.
+    subset = data.iloc[[1, 3, 4]]
+
+    result, _ = one_hot_encode(subset)
+
+    assert list(result.index) == [1, 3, 4], "Index must be preserved after encoding"
+    # Labels alone are not enough: each row must still carry its own values, and a
+    # misaligned column-wise concat would leave NaN gaps in the result.
+    assert list(result["num"]) == [2.0, 4.0, 5.0], "Kept columns must stay aligned with their rows"
+    assert not result.isna().any().any(), "Encoding must not introduce missing values"
+
+
+def test_one_hot_encode_preserves_index_no_categorical():
+    """Index must be preserved even when there are no categorical columns."""
+    data = pd.DataFrame({"x": [1.0, 2.0, 3.0]}, index=[5, 10, 15])
+    result, _ = one_hot_encode(data)
+    assert list(result.index) == [5, 10, 15], "Index must be preserved when no encoding is needed"
+    assert list(result["x"]) == [1.0, 2.0, 3.0], "Values must be unchanged when no encoding is needed"