exploratory-io · kei51e · Jun 3, 2026 · Jun 3, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: exploratory
 Type: Package
 Title: R package for Exploratory
-Version: 15.1.13
-Date: 2026-05-30
+Version: 15.1.14
+Date: 2026-06-03
 Authors@R: c(person("Hideaki", "Hayashi", email = "hideaki@exploratory.io", role = c("aut", "cre")), person("Hide", "Kojima", email = "hide@exploratory.io", role = c("aut")), person("Kan", "Nishida", email = "kan@exploratory.io", role = c("aut")), person("Kei", "Saito", email = "kei@exploratory.io", role = c("aut")), person("Yosuke", "Yasuda", email = "double.y.919.quick@gmail.com", role = c("aut")))
 URL: https://github.com/exploratory-io/exploratory_func
 Description: Functions for Exploratory

diff --git a/R/kmeans.R b/R/kmeans.R
@@ -33,19 +33,37 @@ iterate_kmeans <- function(df, max_centers = 10,
   ret %>% rowwise(center) %>% glance_rowwise(model)
 }
 
+# Pick the row index used for the O(n^2) silhouette dist/computation (#36126).
+# Returns the full index when sample_size is unset / non-positive / not smaller than n;
+# otherwise a sorted random subsample of size sample_size. The RNG seed is set once at the
+# top of exp_kmeans(), so the draw is reproducible across runs.
+silhouette_sample_index <- function(n, sample_size = NULL) {
+  size <- suppressWarnings(as.numeric(sample_size))
+  if (length(size) != 1 || is.na(size) || size < 1 || n <= size) {
+    return(seq_len(n))
+  }
+  sort(sample.int(n, floor(size)))
+}
+
 # internal function to iterate number of centers (k) from 2 to max_centers for silhouette method to find optimal k.
+# When `mat` is supplied it must already be the numeric (and, if normalize_data, scaled)
+# matrix that `dist` was computed from; in that case df/normalize_data are ignored for the
+# matrix build so the kmeans input and the silhouette dist stay consistent (#36126).
 iterate_silhouette <- function(df, max_centers = 10,
                                iter.max = 10,
                                nstart = 1,
                                algorithm = "Hartigan-Wong",
                                trace = FALSE,
                                normalize_data = TRUE,
                                seed = NULL,
-                               dist = NULL) {
-  mat <- as_numeric_matrix_(df, columns = colnames(df))
-  if (normalize_data) {
-    mat <- scale(mat)
-    mat[is.nan(mat)] <- 0
+                               dist = NULL,
+                               mat = NULL) {
+  if (is.null(mat)) {
+    mat <- as_numeric_matrix_(df, columns = colnames(df))
+    if (normalize_data) {
+      mat <- scale(mat)
+      mat[is.nan(mat)] <- 0
+    }
   }
   # Cap k by the number of distinct data points, not just the row count:
   # stats::kmeans() errors with "more cluster centers than distinct data points"
@@ -81,22 +99,32 @@ iterate_silhouette <- function(df, max_centers = 10,
 
 # Compute per-row silhouette widths for an already-built clustering.
 #
-# cluster_ids: integer cluster assignment vector (length n, aligned to mat rows).
-#              For K-Means this is x$kmeans$cluster (integer 1..k).
-# mat:         normalized numeric matrix used for clustering (n rows).
+# cluster_ids: integer cluster assignment vector (length n) for the FULL data.
+#              For K-Means this is x$kmeans$cluster (integer 1..k). The result is always
+#              length n and positionally aligned to it (prcomp.R binds it onto x$df).
+# mat:         normalized numeric matrix for the rows named by sample_idx (sample_size rows
+#              when subsampling, otherwise n rows).
 # d:           optional precomputed stats::dist(mat) to reuse. Computed when NULL.
+# sample_idx:  positions (into 1..n) that mat/d correspond to. NULL means the full data
+#              (mat has n rows). When a strict subset, the silhouette is computed only on
+#              those rows and the scores are scattered back into the length-n result with
+#              NA elsewhere -- this bounds the O(n^2) dist to sample_size rows (#36126).
 #
 # Returns a tibble with n rows and columns silhouette_score, nearest_cluster, cluster_width.
 # Returns all-NA columns (no error) when silhouette is undefined
 # (fewer than 2 clusters, or fewer than 2 distinct points).
-compute_silhouette_per_row <- function(cluster_ids, mat, d = NULL) {
+compute_silhouette_per_row <- function(cluster_ids, mat, d = NULL, sample_idx = NULL) {
   n <- length(cluster_ids)
   na_result <- tibble::tibble(
     silhouette_score = rep(NA_real_, n),
     nearest_cluster = rep(NA_integer_, n),
     cluster_width = rep(NA_real_, n)
   )
-  ids <- as.integer(cluster_ids)
+  if (is.null(sample_idx)) {
+    sample_idx <- seq_len(n)
+  }
+  # mat/d already correspond to sample_idx rows; the cluster ids must be subset to match.
+  ids <- as.integer(cluster_ids[sample_idx])
   if (length(unique(ids)) < 2 || nrow(unique(mat)) < 2) {
     return(na_result)
   }
@@ -111,11 +139,12 @@ compute_silhouette_per_row <- function(cluster_ids, mat, d = NULL) {
   widths <- as.numeric(sil[, "sil_width"])
   clusters <- as.integer(sil[, "cluster"])
   clus_avg <- tapply(widths, clusters, mean, na.rm = TRUE)
-  tibble::tibble(
-    silhouette_score = widths,
-    nearest_cluster = as.integer(sil[, "neighbor"]),
-    cluster_width = as.numeric(clus_avg[as.character(clusters)])
-  )
+  # Scatter the per-sampled-row values back to full length (NA for non-sampled rows).
+  # silhouette() preserves the observation order of `ids`, so widths[i] -> sample_idx[i].
+  na_result$silhouette_score[sample_idx] <- widths
+  na_result$nearest_cluster[sample_idx] <- as.integer(sil[, "neighbor"])
+  na_result$cluster_width[sample_idx] <- as.numeric(clus_avg[as.character(clusters)])
+  na_result
 }
 
 #' analytics function for K-means view
@@ -130,7 +159,11 @@ exp_kmeans <- function(df, ...,
                        max_nrow = NULL,
                        seed = 1,
                        elbow_method_mode=FALSE,
-                       max_centers = 10
+                       max_centers = 10,
+                       # Bound the O(n^2) silhouette dist/computation to this many rows (#36126).
+                       # K-means itself still runs on the full (or max_nrow) data; only the
+                       # silhouette diagnostics are estimated from this subsample. NULL = no cap.
+                       silhouette_sample_size = 5000
                        ) {
   # this evaluates select arguments like starts_with
   selected_cols <- tidyselect::vars_select(names(df), !!! rlang::quos(...))
@@ -198,16 +231,22 @@ exp_kmeans <- function(df, ...,
   # O(n^2) cost and is shared with iterate_silhouette() (decision (a), #36106).
   # Only build the shared dist for the ungrouped (UI) case; grouped models build
   # their own per-group matrix below.
+  # The dist()/silhouette work is bounded to silhouette_sample_size rows (#36126):
+  # scale over the full data (cheap, O(n*p)) so the sampled rows' coordinates stay
+  # consistent with build_kmeans.cols' full-data scaling, THEN subset before dist().
   is_single_model <- length(ret$model) == 1
   shared_sil_mat <- NULL
   shared_sil_dist <- NULL
+  sil_sample_idx <- NULL
   if (is_single_model) {
     kmeans_df_shared <- df %>% dplyr::select(!!!rlang::syms(selected_cols))
     shared_sil_mat <- as_numeric_matrix_(kmeans_df_shared, columns = colnames(kmeans_df_shared))
     if (normalize_data) {
       shared_sil_mat <- scale(shared_sil_mat)
       shared_sil_mat[is.nan(shared_sil_mat)] <- 0
     }
+    sil_sample_idx <- silhouette_sample_index(nrow(shared_sil_mat), silhouette_sample_size)
+    shared_sil_mat <- shared_sil_mat[sil_sample_idx, , drop = FALSE]
     shared_sil_dist <- stats::dist(shared_sil_mat)
   }
 
@@ -234,24 +273,30 @@ exp_kmeans <- function(df, ...,
                                             trace = trace,
                                             normalize_data = normalize_data,
                                             seed = NULL, # Seed is already set once at the top of exp_kmeans(). Skip it.
-                                            dist = shared_sil_dist) # Reuse the single dist when available.
+                                            dist = shared_sil_dist, # Reuse the single (subsampled) dist when available.
+                                            mat = shared_sil_mat) # Use the SAME subsampled scaled matrix so kmeans input and dist match (#36126).
   }
 
   ret <- ret %>% dplyr::mutate(model = purrr::imap(model, function(x, idx) {
     tryCatch({
       y <- kmeans_model_df$model[[idx]]
       x$kmeans <- y # Might need to be more careful on guaranteeing x and y are from same group, but we are not supporting group_by on UI at this point.
       # Always attach per-row silhouette for the chosen clustering (#36106).
+      # The dist/silhouette is bounded to silhouette_sample_size rows; non-sampled rows
+      # get NA scores (compute_silhouette_per_row scatters back to full length, #36126).
       if (!is.null(shared_sil_mat)) {
-        x$silhouette <- compute_silhouette_per_row(y$cluster, shared_sil_mat, shared_sil_dist)
+        x$silhouette <- compute_silhouette_per_row(y$cluster, shared_sil_mat, shared_sil_dist,
+                                                   sample_idx = sil_sample_idx)
       } else {
-        # Grouped case: build the matrix from this group's own data.
+        # Grouped case: build the matrix from this group's own data, then subsample.
         gmat <- as_numeric_matrix_(x$df[, selected_cols, drop = FALSE], columns = selected_cols)
         if (normalize_data) {
           gmat <- scale(gmat)
           gmat[is.nan(gmat)] <- 0
         }
-        x$silhouette <- compute_silhouette_per_row(y$cluster, gmat)
+        g_sample_idx <- silhouette_sample_index(nrow(gmat), silhouette_sample_size)
+        x$silhouette <- compute_silhouette_per_row(y$cluster, gmat[g_sample_idx, , drop = FALSE],
+                                                   sample_idx = g_sample_idx)
       }
       x$sampled_nrow <- sampled_nrow
       x$excluded_nrow <- excluded_nrow

diff --git a/tests/testthat/test_kmeans_1.R b/tests/testthat/test_kmeans_1.R
@@ -221,3 +221,148 @@ test_that("exp_kmeans with strange column name still yields silhouette columns",
   smry <- model_df %>% tidy_rowwise(model, type = "summary")
   expect_true("avg_silhouette" %in% colnames(smry))
 })
+
+# ---- silhouette_sample_size (#36126: bound the O(n^2) silhouette dist) ----
+
+test_that("silhouette_sample_index returns full index unless sample_size < n", {
+  expect_equal(silhouette_sample_index(10, NULL), 1:10)   # NULL -> no cap
+  expect_equal(silhouette_sample_index(10, 0), 1:10)      # non-positive -> no cap
+  expect_equal(silhouette_sample_index(10, 10), 1:10)     # equal -> no cap
+  expect_equal(silhouette_sample_index(10, 20), 1:10)     # larger -> no cap
+  set.seed(1)
+  idx <- silhouette_sample_index(100, 5)                  # smaller -> subsample
+  expect_length(idx, 5)
+  expect_true(all(idx %in% 1:100))
+  expect_false(is.unsorted(idx))                          # returned sorted
+  expect_equal(length(unique(idx)), 5)                    # without replacement
+})
+
+test_that("compute_silhouette_per_row with sample_idx scatters to full length, NA off-sample", {
+  set.seed(1)
+  full_mat <- scale(as.matrix(iris[, 1:4]))
+  km <- stats::kmeans(full_mat, centers = 3)
+  n <- nrow(full_mat)
+  sample_idx <- sort(sample.int(n, 30))
+  res <- compute_silhouette_per_row(km$cluster, full_mat[sample_idx, , drop = FALSE],
+                                    sample_idx = sample_idx)
+  expect_equal(nrow(res), n)                                   # full length, aligned to data
+  expect_setequal(colnames(res), c("silhouette_score", "nearest_cluster", "cluster_width"))
+  expect_equal(sum(!is.na(res$silhouette_score)), 30)          # only sampled rows scored
+  expect_true(all(is.na(res$silhouette_score[-sample_idx])))   # off-sample rows are NA
+  on_sample <- res$silhouette_score[sample_idx]
+  expect_true(all(on_sample >= -1 & on_sample <= 1))
+})
+
+test_that("exp_kmeans bounds the per-row silhouette to silhouette_sample_size (no OOM, aligned)", {
+  df <- mtcars  # 32 rows, no NA
+  model_df <- exp_kmeans(df, cyl, mpg, hp, centers = 3, max_nrow = NULL,
+                         silhouette_sample_size = 10)
+  model <- model_df$model[[1]]
+  # Per-row silhouette stays positionally aligned to all rows ...
+  expect_equal(nrow(model$silhouette), nrow(model$df))
+  # ... but only up to silhouette_sample_size rows actually get a (finite) score.
+  expect_lte(sum(!is.na(model$silhouette$silhouette_score)), 10)
+  expect_gt(sum(!is.na(model$silhouette$silhouette_score)), 0)
+
+  res <- model_df %>% tidy_rowwise(model, type = "data")
+  expect_equal(nrow(res), nrow(model$df))
+  smry <- model_df %>% tidy_rowwise(model, type = "summary")
+  expect_true(all(c("avg_silhouette", "min_silhouette", "pct_negative") %in% colnames(smry)))
+})
+
+test_that("exp_kmeans default (large) silhouette_sample_size leaves small data fully scored", {
+  df <- mtcars
+  model_df <- exp_kmeans(df, cyl, mpg, hp, centers = 3, max_nrow = NULL)  # default 5000 >> 32
+  sil <- model_df$model[[1]]$silhouette
+  expect_equal(sum(!is.na(sil$silhouette_score)), nrow(sil))  # no subsampling for small data
+})
+
+test_that("exp_kmeans silhouette method respects silhouette_sample_size (bounded, no error)", {
+  df <- mtcars
+  model_df <- exp_kmeans(df, cyl, mpg, hp, centers = 3, max_nrow = NULL,
+                         elbow_method_mode = "silhouette", max_centers = 4,
+                         silhouette_sample_size = 15)
+  sil_res <- model_df$model[[1]]$silhouette_result
+  expect_false(is.null(sil_res))
+  expect_true(all(c("center", "avg_silhouette", "min_silhouette", "pct_negative") %in% colnames(sil_res)))
+})
+
+# #1: Grouped case exercises the else branch (per-group subsample via g_sample_idx).
+test_that("exp_kmeans grouped case bounds and aligns per-group silhouette (#36126)", {
+  df <- mtcars %>% dplyr::group_by(am)  # two groups: 19 and 13 rows, both > sample_size below
+  model_df <- exp_kmeans(df, cyl, mpg, hp, centers = 2, max_nrow = NULL,
+                         silhouette_sample_size = 8)
+  expect_equal(nrow(model_df), length(unique(mtcars$am)))  # one model row per group
+  for (i in seq_len(nrow(model_df))) {
+    m <- model_df$model[[i]]
+    # Per-row silhouette stays positionally aligned to the group's rows ...
+    expect_equal(nrow(m$silhouette), nrow(m$df))
+    # ... but at most silhouette_sample_size rows actually get a finite score.
+    n_scored <- sum(!is.na(m$silhouette$silhouette_score))
+    expect_lte(n_scored, 8)
+    expect_gt(n_scored, 0)
+  }
+})
+
+# #2: The scatter must place each sampled row's value at ITS OWN position, not just
+# produce the right count/range. Compare against an independent reference silhouette
+# computed directly on the subsample (silhouette() preserves input observation order).
+test_that("compute_silhouette_per_row scatters each value to its own row, positionally (#36126)", {
+  set.seed(42)
+  full_mat <- scale(as.matrix(iris[, 1:4]))
+  km <- stats::kmeans(full_mat, centers = 3)
+  n <- nrow(full_mat)
+  sample_idx <- sort(sample.int(n, 40))
+
+  sub_mat <- full_mat[sample_idx, , drop = FALSE]
+  ref_sil <- cluster::silhouette(as.integer(km$cluster[sample_idx]), stats::dist(sub_mat))
+  ref_width <- as.numeric(ref_sil[, "sil_width"])
+  ref_neighbor <- as.integer(ref_sil[, "neighbor"])
+  ref_clusters <- as.integer(ref_sil[, "cluster"])
+  ref_clus_avg <- tapply(ref_width, ref_clusters, mean, na.rm = TRUE)
+  ref_cluster_width <- as.numeric(ref_clus_avg[as.character(ref_clusters)])
+
+  res <- compute_silhouette_per_row(km$cluster, sub_mat, sample_idx = sample_idx)
+
+  # Exact, position-by-position equality. A scatter that shuffled positions (but kept
+  # values in [-1, 1]) would pass the existing count/range checks yet fail here.
+  expect_equal(res$silhouette_score[sample_idx], ref_width)
+  expect_equal(res$nearest_cluster[sample_idx], ref_neighbor)
+  expect_equal(res$cluster_width[sample_idx], ref_cluster_width)
+  expect_true(all(is.na(res$silhouette_score[-sample_idx])))  # off-sample untouched
+})
+
+# #3: With the seed fixed, the subsample draw (and thus the scores and NA positions)
+# must be identical across runs -- otherwise the UI diagnostic would flicker per run.
+test_that("exp_kmeans silhouette subsample is reproducible across runs (same seed) (#36126)", {
+  run <- function() {
+    exp_kmeans(mtcars, cyl, mpg, hp, centers = 3, max_nrow = NULL,
+               silhouette_sample_size = 12, seed = 1)$model[[1]]$silhouette
+  }
+  s1 <- run()
+  s2 <- run()
+  expect_equal(s1$silhouette_score, s2$silhouette_score)            # identical scores incl. NAs
+  expect_equal(which(is.na(s1$silhouette_score)),
+               which(is.na(s2$silhouette_score)))                   # identical subsample draw
+  expect_equal(sum(!is.na(s1$silhouette_score)), 12)                # subsampling actually happened
+})
+
+# #4: When `mat` is supplied, iterate_silhouette must use it (and the supplied dist) and
+# ignore df/normalize_data for the matrix build -- this is the kmeans-input/dist consistency
+# fix. A bogus df must not change the result; it must match building from the equivalent df.
+test_that("iterate_silhouette uses supplied mat/dist and ignores df for the matrix build (#36126)", {
+  mat <- scale(as.matrix(mtcars[, c("cyl", "mpg", "hp")]))
+  mat[is.nan(mat)] <- 0
+  d <- stats::dist(mat)
+  df_equiv <- as.data.frame(mat)  # already normalized, so normalize_data=FALSE rebuilds `mat`
+
+  res_built <- iterate_silhouette(df_equiv, max_centers = 4, normalize_data = FALSE, seed = 42)
+  res_mat   <- iterate_silhouette(df_equiv, max_centers = 4, normalize_data = FALSE,
+                                  seed = 42, mat = mat, dist = d)
+  expect_equal(res_mat, res_built)  # supplying mat/dist matches the build-from-df path
+
+  bogus_df <- data.frame(a = seq_len(nrow(mat)), b = rev(seq_len(nrow(mat))))
+  res_bogus <- iterate_silhouette(bogus_df, max_centers = 4, normalize_data = TRUE,
+                                  seed = 42, mat = mat, dist = d)
+  expect_equal(res_bogus, res_mat)  # df is ignored entirely when mat is supplied
+})