diff --git a/DESCRIPTION b/DESCRIPTION index 50e86a90..329d1fb2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: exploratory Type: Package Title: R package for Exploratory -Version: 15.1.13 -Date: 2026-05-30 +Version: 15.1.14 +Date: 2026-06-03 Authors@R: c(person("Hideaki", "Hayashi", email = "hideaki@exploratory.io", role = c("aut", "cre")), person("Hide", "Kojima", email = "hide@exploratory.io", role = c("aut")), person("Kan", "Nishida", email = "kan@exploratory.io", role = c("aut")), person("Kei", "Saito", email = "kei@exploratory.io", role = c("aut")), person("Yosuke", "Yasuda", email = "double.y.919.quick@gmail.com", role = c("aut"))) URL: https://github.com/exploratory-io/exploratory_func Description: Functions for Exploratory diff --git a/R/kmeans.R b/R/kmeans.R index 154e88db..4b8e2ced 100644 --- a/R/kmeans.R +++ b/R/kmeans.R @@ -33,7 +33,22 @@ iterate_kmeans <- function(df, max_centers = 10, ret %>% rowwise(center) %>% glance_rowwise(model) } +# Pick the row index used for the O(n^2) silhouette dist/computation (#36126). +# Returns the full index when sample_size is unset / non-positive / not smaller than n; +# otherwise a sorted random subsample of size sample_size. The RNG seed is set once at the +# top of exp_kmeans(), so the draw is reproducible across runs. +silhouette_sample_index <- function(n, sample_size = NULL) { + size <- suppressWarnings(as.numeric(sample_size)) + if (length(size) != 1 || is.na(size) || size < 1 || n <= size) { + return(seq_len(n)) + } + sort(sample.int(n, floor(size))) +} + # internal function to iterate number of centers (k) from 2 to max_centers for silhouette method to find optimal k. +# When `mat` is supplied it must already be the numeric (and, if normalize_data, scaled) +# matrix that `dist` was computed from; in that case df/normalize_data are ignored for the +# matrix build so the kmeans input and the silhouette dist stay consistent (#36126). iterate_silhouette <- function(df, max_centers = 10, iter.max = 10, nstart = 1, @@ -41,11 +56,14 @@ iterate_silhouette <- function(df, max_centers = 10, trace = FALSE, normalize_data = TRUE, seed = NULL, - dist = NULL) { - mat <- as_numeric_matrix_(df, columns = colnames(df)) - if (normalize_data) { - mat <- scale(mat) - mat[is.nan(mat)] <- 0 + dist = NULL, + mat = NULL) { + if (is.null(mat)) { + mat <- as_numeric_matrix_(df, columns = colnames(df)) + if (normalize_data) { + mat <- scale(mat) + mat[is.nan(mat)] <- 0 + } } # Cap k by the number of distinct data points, not just the row count: # stats::kmeans() errors with "more cluster centers than distinct data points" @@ -81,22 +99,32 @@ iterate_silhouette <- function(df, max_centers = 10, # Compute per-row silhouette widths for an already-built clustering. # -# cluster_ids: integer cluster assignment vector (length n, aligned to mat rows). -# For K-Means this is x$kmeans$cluster (integer 1..k). -# mat: normalized numeric matrix used for clustering (n rows). +# cluster_ids: integer cluster assignment vector (length n) for the FULL data. +# For K-Means this is x$kmeans$cluster (integer 1..k). The result is always +# length n and positionally aligned to it (prcomp.R binds it onto x$df). +# mat: normalized numeric matrix for the rows named by sample_idx (sample_size rows +# when subsampling, otherwise n rows). # d: optional precomputed stats::dist(mat) to reuse. Computed when NULL. +# sample_idx: positions (into 1..n) that mat/d correspond to. NULL means the full data +# (mat has n rows). When a strict subset, the silhouette is computed only on +# those rows and the scores are scattered back into the length-n result with +# NA elsewhere -- this bounds the O(n^2) dist to sample_size rows (#36126). # # Returns a tibble with n rows and columns silhouette_score, nearest_cluster, cluster_width. # Returns all-NA columns (no error) when silhouette is undefined # (fewer than 2 clusters, or fewer than 2 distinct points). -compute_silhouette_per_row <- function(cluster_ids, mat, d = NULL) { +compute_silhouette_per_row <- function(cluster_ids, mat, d = NULL, sample_idx = NULL) { n <- length(cluster_ids) na_result <- tibble::tibble( silhouette_score = rep(NA_real_, n), nearest_cluster = rep(NA_integer_, n), cluster_width = rep(NA_real_, n) ) - ids <- as.integer(cluster_ids) + if (is.null(sample_idx)) { + sample_idx <- seq_len(n) + } + # mat/d already correspond to sample_idx rows; the cluster ids must be subset to match. + ids <- as.integer(cluster_ids[sample_idx]) if (length(unique(ids)) < 2 || nrow(unique(mat)) < 2) { return(na_result) } @@ -111,11 +139,12 @@ compute_silhouette_per_row <- function(cluster_ids, mat, d = NULL) { widths <- as.numeric(sil[, "sil_width"]) clusters <- as.integer(sil[, "cluster"]) clus_avg <- tapply(widths, clusters, mean, na.rm = TRUE) - tibble::tibble( - silhouette_score = widths, - nearest_cluster = as.integer(sil[, "neighbor"]), - cluster_width = as.numeric(clus_avg[as.character(clusters)]) - ) + # Scatter the per-sampled-row values back to full length (NA for non-sampled rows). + # silhouette() preserves the observation order of `ids`, so widths[i] -> sample_idx[i]. + na_result$silhouette_score[sample_idx] <- widths + na_result$nearest_cluster[sample_idx] <- as.integer(sil[, "neighbor"]) + na_result$cluster_width[sample_idx] <- as.numeric(clus_avg[as.character(clusters)]) + na_result } #' analytics function for K-means view @@ -130,7 +159,11 @@ exp_kmeans <- function(df, ..., max_nrow = NULL, seed = 1, elbow_method_mode=FALSE, - max_centers = 10 + max_centers = 10, + # Bound the O(n^2) silhouette dist/computation to this many rows (#36126). + # K-means itself still runs on the full (or max_nrow) data; only the + # silhouette diagnostics are estimated from this subsample. NULL = no cap. + silhouette_sample_size = 5000 ) { # this evaluates select arguments like starts_with selected_cols <- tidyselect::vars_select(names(df), !!! rlang::quos(...)) @@ -198,9 +231,13 @@ exp_kmeans <- function(df, ..., # O(n^2) cost and is shared with iterate_silhouette() (decision (a), #36106). # Only build the shared dist for the ungrouped (UI) case; grouped models build # their own per-group matrix below. + # The dist()/silhouette work is bounded to silhouette_sample_size rows (#36126): + # scale over the full data (cheap, O(n*p)) so the sampled rows' coordinates stay + # consistent with build_kmeans.cols' full-data scaling, THEN subset before dist(). is_single_model <- length(ret$model) == 1 shared_sil_mat <- NULL shared_sil_dist <- NULL + sil_sample_idx <- NULL if (is_single_model) { kmeans_df_shared <- df %>% dplyr::select(!!!rlang::syms(selected_cols)) shared_sil_mat <- as_numeric_matrix_(kmeans_df_shared, columns = colnames(kmeans_df_shared)) @@ -208,6 +245,8 @@ exp_kmeans <- function(df, ..., shared_sil_mat <- scale(shared_sil_mat) shared_sil_mat[is.nan(shared_sil_mat)] <- 0 } + sil_sample_idx <- silhouette_sample_index(nrow(shared_sil_mat), silhouette_sample_size) + shared_sil_mat <- shared_sil_mat[sil_sample_idx, , drop = FALSE] shared_sil_dist <- stats::dist(shared_sil_mat) } @@ -234,7 +273,8 @@ exp_kmeans <- function(df, ..., trace = trace, normalize_data = normalize_data, seed = NULL, # Seed is already set once at the top of exp_kmeans(). Skip it. - dist = shared_sil_dist) # Reuse the single dist when available. + dist = shared_sil_dist, # Reuse the single (subsampled) dist when available. + mat = shared_sil_mat) # Use the SAME subsampled scaled matrix so kmeans input and dist match (#36126). } ret <- ret %>% dplyr::mutate(model = purrr::imap(model, function(x, idx) { @@ -242,16 +282,21 @@ exp_kmeans <- function(df, ..., y <- kmeans_model_df$model[[idx]] x$kmeans <- y # Might need to be more careful on guaranteeing x and y are from same group, but we are not supporting group_by on UI at this point. # Always attach per-row silhouette for the chosen clustering (#36106). + # The dist/silhouette is bounded to silhouette_sample_size rows; non-sampled rows + # get NA scores (compute_silhouette_per_row scatters back to full length, #36126). if (!is.null(shared_sil_mat)) { - x$silhouette <- compute_silhouette_per_row(y$cluster, shared_sil_mat, shared_sil_dist) + x$silhouette <- compute_silhouette_per_row(y$cluster, shared_sil_mat, shared_sil_dist, + sample_idx = sil_sample_idx) } else { - # Grouped case: build the matrix from this group's own data. + # Grouped case: build the matrix from this group's own data, then subsample. gmat <- as_numeric_matrix_(x$df[, selected_cols, drop = FALSE], columns = selected_cols) if (normalize_data) { gmat <- scale(gmat) gmat[is.nan(gmat)] <- 0 } - x$silhouette <- compute_silhouette_per_row(y$cluster, gmat) + g_sample_idx <- silhouette_sample_index(nrow(gmat), silhouette_sample_size) + x$silhouette <- compute_silhouette_per_row(y$cluster, gmat[g_sample_idx, , drop = FALSE], + sample_idx = g_sample_idx) } x$sampled_nrow <- sampled_nrow x$excluded_nrow <- excluded_nrow diff --git a/tests/testthat/test_kmeans_1.R b/tests/testthat/test_kmeans_1.R index 8cd4b87d..cb5daf5d 100644 --- a/tests/testthat/test_kmeans_1.R +++ b/tests/testthat/test_kmeans_1.R @@ -221,3 +221,148 @@ test_that("exp_kmeans with strange column name still yields silhouette columns", smry <- model_df %>% tidy_rowwise(model, type = "summary") expect_true("avg_silhouette" %in% colnames(smry)) }) + +# ---- silhouette_sample_size (#36126: bound the O(n^2) silhouette dist) ---- + +test_that("silhouette_sample_index returns full index unless sample_size < n", { + expect_equal(silhouette_sample_index(10, NULL), 1:10) # NULL -> no cap + expect_equal(silhouette_sample_index(10, 0), 1:10) # non-positive -> no cap + expect_equal(silhouette_sample_index(10, 10), 1:10) # equal -> no cap + expect_equal(silhouette_sample_index(10, 20), 1:10) # larger -> no cap + set.seed(1) + idx <- silhouette_sample_index(100, 5) # smaller -> subsample + expect_length(idx, 5) + expect_true(all(idx %in% 1:100)) + expect_false(is.unsorted(idx)) # returned sorted + expect_equal(length(unique(idx)), 5) # without replacement +}) + +test_that("compute_silhouette_per_row with sample_idx scatters to full length, NA off-sample", { + set.seed(1) + full_mat <- scale(as.matrix(iris[, 1:4])) + km <- stats::kmeans(full_mat, centers = 3) + n <- nrow(full_mat) + sample_idx <- sort(sample.int(n, 30)) + res <- compute_silhouette_per_row(km$cluster, full_mat[sample_idx, , drop = FALSE], + sample_idx = sample_idx) + expect_equal(nrow(res), n) # full length, aligned to data + expect_setequal(colnames(res), c("silhouette_score", "nearest_cluster", "cluster_width")) + expect_equal(sum(!is.na(res$silhouette_score)), 30) # only sampled rows scored + expect_true(all(is.na(res$silhouette_score[-sample_idx]))) # off-sample rows are NA + on_sample <- res$silhouette_score[sample_idx] + expect_true(all(on_sample >= -1 & on_sample <= 1)) +}) + +test_that("exp_kmeans bounds the per-row silhouette to silhouette_sample_size (no OOM, aligned)", { + df <- mtcars # 32 rows, no NA + model_df <- exp_kmeans(df, cyl, mpg, hp, centers = 3, max_nrow = NULL, + silhouette_sample_size = 10) + model <- model_df$model[[1]] + # Per-row silhouette stays positionally aligned to all rows ... + expect_equal(nrow(model$silhouette), nrow(model$df)) + # ... but only up to silhouette_sample_size rows actually get a (finite) score. + expect_lte(sum(!is.na(model$silhouette$silhouette_score)), 10) + expect_gt(sum(!is.na(model$silhouette$silhouette_score)), 0) + + res <- model_df %>% tidy_rowwise(model, type = "data") + expect_equal(nrow(res), nrow(model$df)) + smry <- model_df %>% tidy_rowwise(model, type = "summary") + expect_true(all(c("avg_silhouette", "min_silhouette", "pct_negative") %in% colnames(smry))) +}) + +test_that("exp_kmeans default (large) silhouette_sample_size leaves small data fully scored", { + df <- mtcars + model_df <- exp_kmeans(df, cyl, mpg, hp, centers = 3, max_nrow = NULL) # default 5000 >> 32 + sil <- model_df$model[[1]]$silhouette + expect_equal(sum(!is.na(sil$silhouette_score)), nrow(sil)) # no subsampling for small data +}) + +test_that("exp_kmeans silhouette method respects silhouette_sample_size (bounded, no error)", { + df <- mtcars + model_df <- exp_kmeans(df, cyl, mpg, hp, centers = 3, max_nrow = NULL, + elbow_method_mode = "silhouette", max_centers = 4, + silhouette_sample_size = 15) + sil_res <- model_df$model[[1]]$silhouette_result + expect_false(is.null(sil_res)) + expect_true(all(c("center", "avg_silhouette", "min_silhouette", "pct_negative") %in% colnames(sil_res))) +}) + +# #1: Grouped case exercises the else branch (per-group subsample via g_sample_idx). +test_that("exp_kmeans grouped case bounds and aligns per-group silhouette (#36126)", { + df <- mtcars %>% dplyr::group_by(am) # two groups: 19 and 13 rows, both > sample_size below + model_df <- exp_kmeans(df, cyl, mpg, hp, centers = 2, max_nrow = NULL, + silhouette_sample_size = 8) + expect_equal(nrow(model_df), length(unique(mtcars$am))) # one model row per group + for (i in seq_len(nrow(model_df))) { + m <- model_df$model[[i]] + # Per-row silhouette stays positionally aligned to the group's rows ... + expect_equal(nrow(m$silhouette), nrow(m$df)) + # ... but at most silhouette_sample_size rows actually get a finite score. + n_scored <- sum(!is.na(m$silhouette$silhouette_score)) + expect_lte(n_scored, 8) + expect_gt(n_scored, 0) + } +}) + +# #2: The scatter must place each sampled row's value at ITS OWN position, not just +# produce the right count/range. Compare against an independent reference silhouette +# computed directly on the subsample (silhouette() preserves input observation order). +test_that("compute_silhouette_per_row scatters each value to its own row, positionally (#36126)", { + set.seed(42) + full_mat <- scale(as.matrix(iris[, 1:4])) + km <- stats::kmeans(full_mat, centers = 3) + n <- nrow(full_mat) + sample_idx <- sort(sample.int(n, 40)) + + sub_mat <- full_mat[sample_idx, , drop = FALSE] + ref_sil <- cluster::silhouette(as.integer(km$cluster[sample_idx]), stats::dist(sub_mat)) + ref_width <- as.numeric(ref_sil[, "sil_width"]) + ref_neighbor <- as.integer(ref_sil[, "neighbor"]) + ref_clusters <- as.integer(ref_sil[, "cluster"]) + ref_clus_avg <- tapply(ref_width, ref_clusters, mean, na.rm = TRUE) + ref_cluster_width <- as.numeric(ref_clus_avg[as.character(ref_clusters)]) + + res <- compute_silhouette_per_row(km$cluster, sub_mat, sample_idx = sample_idx) + + # Exact, position-by-position equality. A scatter that shuffled positions (but kept + # values in [-1, 1]) would pass the existing count/range checks yet fail here. + expect_equal(res$silhouette_score[sample_idx], ref_width) + expect_equal(res$nearest_cluster[sample_idx], ref_neighbor) + expect_equal(res$cluster_width[sample_idx], ref_cluster_width) + expect_true(all(is.na(res$silhouette_score[-sample_idx]))) # off-sample untouched +}) + +# #3: With the seed fixed, the subsample draw (and thus the scores and NA positions) +# must be identical across runs -- otherwise the UI diagnostic would flicker per run. +test_that("exp_kmeans silhouette subsample is reproducible across runs (same seed) (#36126)", { + run <- function() { + exp_kmeans(mtcars, cyl, mpg, hp, centers = 3, max_nrow = NULL, + silhouette_sample_size = 12, seed = 1)$model[[1]]$silhouette + } + s1 <- run() + s2 <- run() + expect_equal(s1$silhouette_score, s2$silhouette_score) # identical scores incl. NAs + expect_equal(which(is.na(s1$silhouette_score)), + which(is.na(s2$silhouette_score))) # identical subsample draw + expect_equal(sum(!is.na(s1$silhouette_score)), 12) # subsampling actually happened +}) + +# #4: When `mat` is supplied, iterate_silhouette must use it (and the supplied dist) and +# ignore df/normalize_data for the matrix build -- this is the kmeans-input/dist consistency +# fix. A bogus df must not change the result; it must match building from the equivalent df. +test_that("iterate_silhouette uses supplied mat/dist and ignores df for the matrix build (#36126)", { + mat <- scale(as.matrix(mtcars[, c("cyl", "mpg", "hp")])) + mat[is.nan(mat)] <- 0 + d <- stats::dist(mat) + df_equiv <- as.data.frame(mat) # already normalized, so normalize_data=FALSE rebuilds `mat` + + res_built <- iterate_silhouette(df_equiv, max_centers = 4, normalize_data = FALSE, seed = 42) + res_mat <- iterate_silhouette(df_equiv, max_centers = 4, normalize_data = FALSE, + seed = 42, mat = mat, dist = d) + expect_equal(res_mat, res_built) # supplying mat/dist matches the build-from-df path + + bogus_df <- data.frame(a = seq_len(nrow(mat)), b = rev(seq_len(nrow(mat)))) + res_bogus <- iterate_silhouette(bogus_df, max_centers = 4, normalize_data = TRUE, + seed = 42, mat = mat, dist = d) + expect_equal(res_bogus, res_mat) # df is ignored entirely when mat is supplied +})