Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: exploratory
Type: Package
Title: R package for Exploratory
Version: 15.1.13
Date: 2026-05-30
Version: 15.1.14
Date: 2026-06-03
Authors@R: c(person("Hideaki", "Hayashi", email = "hideaki@exploratory.io", role = c("aut", "cre")), person("Hide", "Kojima", email = "hide@exploratory.io", role = c("aut")), person("Kan", "Nishida", email = "kan@exploratory.io", role = c("aut")), person("Kei", "Saito", email = "kei@exploratory.io", role = c("aut")), person("Yosuke", "Yasuda", email = "double.y.919.quick@gmail.com", role = c("aut")))
URL: https://github.com/exploratory-io/exploratory_func
Description: Functions for Exploratory
Expand Down
85 changes: 65 additions & 20 deletions R/kmeans.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,19 +33,37 @@ iterate_kmeans <- function(df, max_centers = 10,
ret %>% rowwise(center) %>% glance_rowwise(model)
}

# Pick the row index used for the O(n^2) silhouette dist/computation (#36126).
# Returns the full index when sample_size is unset / non-positive / not smaller than n;
# otherwise a sorted random subsample of size sample_size. The RNG seed is set once at the
# top of exp_kmeans(), so the draw is reproducible across runs.
silhouette_sample_index <- function(n, sample_size = NULL) {
size <- suppressWarnings(as.numeric(sample_size))
if (length(size) != 1 || is.na(size) || size < 1 || n <= size) {
return(seq_len(n))
}
sort(sample.int(n, floor(size)))
}
Comment thread
kei51e marked this conversation as resolved.

# internal function to iterate number of centers (k) from 2 to max_centers for silhouette method to find optimal k.
# When `mat` is supplied it must already be the numeric (and, if normalize_data, scaled)
# matrix that `dist` was computed from; in that case df/normalize_data are ignored for the
# matrix build so the kmeans input and the silhouette dist stay consistent (#36126).
iterate_silhouette <- function(df, max_centers = 10,
iter.max = 10,
nstart = 1,
algorithm = "Hartigan-Wong",
trace = FALSE,
normalize_data = TRUE,
seed = NULL,
dist = NULL) {
mat <- as_numeric_matrix_(df, columns = colnames(df))
if (normalize_data) {
mat <- scale(mat)
mat[is.nan(mat)] <- 0
dist = NULL,
mat = NULL) {
if (is.null(mat)) {
mat <- as_numeric_matrix_(df, columns = colnames(df))
if (normalize_data) {
mat <- scale(mat)
mat[is.nan(mat)] <- 0
}
}
# Cap k by the number of distinct data points, not just the row count:
# stats::kmeans() errors with "more cluster centers than distinct data points"
Expand Down Expand Up @@ -81,22 +99,32 @@ iterate_silhouette <- function(df, max_centers = 10,

# Compute per-row silhouette widths for an already-built clustering.
#
# cluster_ids: integer cluster assignment vector (length n, aligned to mat rows).
# For K-Means this is x$kmeans$cluster (integer 1..k).
# mat: normalized numeric matrix used for clustering (n rows).
# cluster_ids: integer cluster assignment vector (length n) for the FULL data.
# For K-Means this is x$kmeans$cluster (integer 1..k). The result is always
# length n and positionally aligned to it (prcomp.R binds it onto x$df).
# mat: normalized numeric matrix for the rows named by sample_idx (sample_size rows
# when subsampling, otherwise n rows).
# d: optional precomputed stats::dist(mat) to reuse. Computed when NULL.
# sample_idx: positions (into 1..n) that mat/d correspond to. NULL means the full data
# (mat has n rows). When a strict subset, the silhouette is computed only on
# those rows and the scores are scattered back into the length-n result with
# NA elsewhere -- this bounds the O(n^2) dist to sample_size rows (#36126).
#
# Returns a tibble with n rows and columns silhouette_score, nearest_cluster, cluster_width.
# Returns all-NA columns (no error) when silhouette is undefined
# (fewer than 2 clusters, or fewer than 2 distinct points).
compute_silhouette_per_row <- function(cluster_ids, mat, d = NULL) {
compute_silhouette_per_row <- function(cluster_ids, mat, d = NULL, sample_idx = NULL) {
n <- length(cluster_ids)
na_result <- tibble::tibble(
silhouette_score = rep(NA_real_, n),
nearest_cluster = rep(NA_integer_, n),
cluster_width = rep(NA_real_, n)
)
ids <- as.integer(cluster_ids)
if (is.null(sample_idx)) {
sample_idx <- seq_len(n)
}
# mat/d already correspond to sample_idx rows; the cluster ids must be subset to match.
ids <- as.integer(cluster_ids[sample_idx])
if (length(unique(ids)) < 2 || nrow(unique(mat)) < 2) {
return(na_result)
}
Expand All @@ -111,11 +139,12 @@ compute_silhouette_per_row <- function(cluster_ids, mat, d = NULL) {
widths <- as.numeric(sil[, "sil_width"])
clusters <- as.integer(sil[, "cluster"])
clus_avg <- tapply(widths, clusters, mean, na.rm = TRUE)
tibble::tibble(
silhouette_score = widths,
nearest_cluster = as.integer(sil[, "neighbor"]),
cluster_width = as.numeric(clus_avg[as.character(clusters)])
)
# Scatter the per-sampled-row values back to full length (NA for non-sampled rows).
# silhouette() preserves the observation order of `ids`, so widths[i] -> sample_idx[i].
na_result$silhouette_score[sample_idx] <- widths
na_result$nearest_cluster[sample_idx] <- as.integer(sil[, "neighbor"])
na_result$cluster_width[sample_idx] <- as.numeric(clus_avg[as.character(clusters)])
na_result
}

#' analytics function for K-means view
Expand All @@ -130,7 +159,11 @@ exp_kmeans <- function(df, ...,
max_nrow = NULL,
seed = 1,
elbow_method_mode=FALSE,
max_centers = 10
max_centers = 10,
# Bound the O(n^2) silhouette dist/computation to this many rows (#36126).
# K-means itself still runs on the full (or max_nrow) data; only the
# silhouette diagnostics are estimated from this subsample. NULL = no cap.
silhouette_sample_size = 5000
) {
# this evaluates select arguments like starts_with
selected_cols <- tidyselect::vars_select(names(df), !!! rlang::quos(...))
Expand Down Expand Up @@ -198,16 +231,22 @@ exp_kmeans <- function(df, ...,
# O(n^2) cost and is shared with iterate_silhouette() (decision (a), #36106).
# Only build the shared dist for the ungrouped (UI) case; grouped models build
# their own per-group matrix below.
# The dist()/silhouette work is bounded to silhouette_sample_size rows (#36126):
# scale over the full data (cheap, O(n*p)) so the sampled rows' coordinates stay
# consistent with build_kmeans.cols' full-data scaling, THEN subset before dist().
is_single_model <- length(ret$model) == 1
shared_sil_mat <- NULL
shared_sil_dist <- NULL
sil_sample_idx <- NULL
if (is_single_model) {
kmeans_df_shared <- df %>% dplyr::select(!!!rlang::syms(selected_cols))
shared_sil_mat <- as_numeric_matrix_(kmeans_df_shared, columns = colnames(kmeans_df_shared))
if (normalize_data) {
shared_sil_mat <- scale(shared_sil_mat)
shared_sil_mat[is.nan(shared_sil_mat)] <- 0
}
sil_sample_idx <- silhouette_sample_index(nrow(shared_sil_mat), silhouette_sample_size)
shared_sil_mat <- shared_sil_mat[sil_sample_idx, , drop = FALSE]
shared_sil_dist <- stats::dist(shared_sil_mat)
}

Expand All @@ -234,24 +273,30 @@ exp_kmeans <- function(df, ...,
trace = trace,
normalize_data = normalize_data,
seed = NULL, # Seed is already set once at the top of exp_kmeans(). Skip it.
dist = shared_sil_dist) # Reuse the single dist when available.
dist = shared_sil_dist, # Reuse the single (subsampled) dist when available.
mat = shared_sil_mat) # Use the SAME subsampled scaled matrix so kmeans input and dist match (#36126).
}

ret <- ret %>% dplyr::mutate(model = purrr::imap(model, function(x, idx) {
tryCatch({
y <- kmeans_model_df$model[[idx]]
x$kmeans <- y # Might need to be more careful on guaranteeing x and y are from same group, but we are not supporting group_by on UI at this point.
# Always attach per-row silhouette for the chosen clustering (#36106).
# The dist/silhouette is bounded to silhouette_sample_size rows; non-sampled rows
# get NA scores (compute_silhouette_per_row scatters back to full length, #36126).
if (!is.null(shared_sil_mat)) {
x$silhouette <- compute_silhouette_per_row(y$cluster, shared_sil_mat, shared_sil_dist)
x$silhouette <- compute_silhouette_per_row(y$cluster, shared_sil_mat, shared_sil_dist,
sample_idx = sil_sample_idx)
} else {
# Grouped case: build the matrix from this group's own data.
# Grouped case: build the matrix from this group's own data, then subsample.
gmat <- as_numeric_matrix_(x$df[, selected_cols, drop = FALSE], columns = selected_cols)
if (normalize_data) {
gmat <- scale(gmat)
gmat[is.nan(gmat)] <- 0
}
x$silhouette <- compute_silhouette_per_row(y$cluster, gmat)
g_sample_idx <- silhouette_sample_index(nrow(gmat), silhouette_sample_size)
x$silhouette <- compute_silhouette_per_row(y$cluster, gmat[g_sample_idx, , drop = FALSE],
sample_idx = g_sample_idx)
}
x$sampled_nrow <- sampled_nrow
x$excluded_nrow <- excluded_nrow
Expand Down
145 changes: 145 additions & 0 deletions tests/testthat/test_kmeans_1.R
Original file line number Diff line number Diff line change
Expand Up @@ -221,3 +221,148 @@ test_that("exp_kmeans with strange column name still yields silhouette columns",
smry <- model_df %>% tidy_rowwise(model, type = "summary")
expect_true("avg_silhouette" %in% colnames(smry))
})

# ---- silhouette_sample_size (#36126: bound the O(n^2) silhouette dist) ----

test_that("silhouette_sample_index returns full index unless sample_size < n", {
expect_equal(silhouette_sample_index(10, NULL), 1:10) # NULL -> no cap
expect_equal(silhouette_sample_index(10, 0), 1:10) # non-positive -> no cap
expect_equal(silhouette_sample_index(10, 10), 1:10) # equal -> no cap
expect_equal(silhouette_sample_index(10, 20), 1:10) # larger -> no cap
set.seed(1)
idx <- silhouette_sample_index(100, 5) # smaller -> subsample
expect_length(idx, 5)
expect_true(all(idx %in% 1:100))
expect_false(is.unsorted(idx)) # returned sorted
expect_equal(length(unique(idx)), 5) # without replacement
})

test_that("compute_silhouette_per_row with sample_idx scatters to full length, NA off-sample", {
set.seed(1)
full_mat <- scale(as.matrix(iris[, 1:4]))
km <- stats::kmeans(full_mat, centers = 3)
n <- nrow(full_mat)
sample_idx <- sort(sample.int(n, 30))
res <- compute_silhouette_per_row(km$cluster, full_mat[sample_idx, , drop = FALSE],
sample_idx = sample_idx)
expect_equal(nrow(res), n) # full length, aligned to data
expect_setequal(colnames(res), c("silhouette_score", "nearest_cluster", "cluster_width"))
expect_equal(sum(!is.na(res$silhouette_score)), 30) # only sampled rows scored
expect_true(all(is.na(res$silhouette_score[-sample_idx]))) # off-sample rows are NA
on_sample <- res$silhouette_score[sample_idx]
expect_true(all(on_sample >= -1 & on_sample <= 1))
})

test_that("exp_kmeans bounds the per-row silhouette to silhouette_sample_size (no OOM, aligned)", {
df <- mtcars # 32 rows, no NA
model_df <- exp_kmeans(df, cyl, mpg, hp, centers = 3, max_nrow = NULL,
silhouette_sample_size = 10)
model <- model_df$model[[1]]
# Per-row silhouette stays positionally aligned to all rows ...
expect_equal(nrow(model$silhouette), nrow(model$df))
# ... but only up to silhouette_sample_size rows actually get a (finite) score.
expect_lte(sum(!is.na(model$silhouette$silhouette_score)), 10)
expect_gt(sum(!is.na(model$silhouette$silhouette_score)), 0)

res <- model_df %>% tidy_rowwise(model, type = "data")
expect_equal(nrow(res), nrow(model$df))
smry <- model_df %>% tidy_rowwise(model, type = "summary")
expect_true(all(c("avg_silhouette", "min_silhouette", "pct_negative") %in% colnames(smry)))
})

test_that("exp_kmeans default (large) silhouette_sample_size leaves small data fully scored", {
df <- mtcars
model_df <- exp_kmeans(df, cyl, mpg, hp, centers = 3, max_nrow = NULL) # default 5000 >> 32
sil <- model_df$model[[1]]$silhouette
expect_equal(sum(!is.na(sil$silhouette_score)), nrow(sil)) # no subsampling for small data
})

test_that("exp_kmeans silhouette method respects silhouette_sample_size (bounded, no error)", {
df <- mtcars
model_df <- exp_kmeans(df, cyl, mpg, hp, centers = 3, max_nrow = NULL,
elbow_method_mode = "silhouette", max_centers = 4,
silhouette_sample_size = 15)
sil_res <- model_df$model[[1]]$silhouette_result
expect_false(is.null(sil_res))
expect_true(all(c("center", "avg_silhouette", "min_silhouette", "pct_negative") %in% colnames(sil_res)))
})

# #1: Grouped case exercises the else branch (per-group subsample via g_sample_idx).
test_that("exp_kmeans grouped case bounds and aligns per-group silhouette (#36126)", {
df <- mtcars %>% dplyr::group_by(am) # two groups: 19 and 13 rows, both > sample_size below
model_df <- exp_kmeans(df, cyl, mpg, hp, centers = 2, max_nrow = NULL,
silhouette_sample_size = 8)
expect_equal(nrow(model_df), length(unique(mtcars$am))) # one model row per group
for (i in seq_len(nrow(model_df))) {
m <- model_df$model[[i]]
# Per-row silhouette stays positionally aligned to the group's rows ...
expect_equal(nrow(m$silhouette), nrow(m$df))
# ... but at most silhouette_sample_size rows actually get a finite score.
n_scored <- sum(!is.na(m$silhouette$silhouette_score))
expect_lte(n_scored, 8)
expect_gt(n_scored, 0)
}
})

# #2: The scatter must place each sampled row's value at ITS OWN position, not just
# produce the right count/range. Compare against an independent reference silhouette
# computed directly on the subsample (silhouette() preserves input observation order).
test_that("compute_silhouette_per_row scatters each value to its own row, positionally (#36126)", {
set.seed(42)
full_mat <- scale(as.matrix(iris[, 1:4]))
km <- stats::kmeans(full_mat, centers = 3)
n <- nrow(full_mat)
sample_idx <- sort(sample.int(n, 40))

sub_mat <- full_mat[sample_idx, , drop = FALSE]
ref_sil <- cluster::silhouette(as.integer(km$cluster[sample_idx]), stats::dist(sub_mat))
ref_width <- as.numeric(ref_sil[, "sil_width"])
ref_neighbor <- as.integer(ref_sil[, "neighbor"])
ref_clusters <- as.integer(ref_sil[, "cluster"])
ref_clus_avg <- tapply(ref_width, ref_clusters, mean, na.rm = TRUE)
ref_cluster_width <- as.numeric(ref_clus_avg[as.character(ref_clusters)])

res <- compute_silhouette_per_row(km$cluster, sub_mat, sample_idx = sample_idx)

# Exact, position-by-position equality. A scatter that shuffled positions (but kept
# values in [-1, 1]) would pass the existing count/range checks yet fail here.
expect_equal(res$silhouette_score[sample_idx], ref_width)
expect_equal(res$nearest_cluster[sample_idx], ref_neighbor)
expect_equal(res$cluster_width[sample_idx], ref_cluster_width)
expect_true(all(is.na(res$silhouette_score[-sample_idx]))) # off-sample untouched
})

# #3: With the seed fixed, the subsample draw (and thus the scores and NA positions)
# must be identical across runs -- otherwise the UI diagnostic would flicker per run.
test_that("exp_kmeans silhouette subsample is reproducible across runs (same seed) (#36126)", {
run <- function() {
exp_kmeans(mtcars, cyl, mpg, hp, centers = 3, max_nrow = NULL,
silhouette_sample_size = 12, seed = 1)$model[[1]]$silhouette
}
s1 <- run()
s2 <- run()
expect_equal(s1$silhouette_score, s2$silhouette_score) # identical scores incl. NAs
expect_equal(which(is.na(s1$silhouette_score)),
which(is.na(s2$silhouette_score))) # identical subsample draw
expect_equal(sum(!is.na(s1$silhouette_score)), 12) # subsampling actually happened
})

# #4: When `mat` is supplied, iterate_silhouette must use it (and the supplied dist) and
# ignore df/normalize_data for the matrix build -- this is the kmeans-input/dist consistency
# fix. A bogus df must not change the result; it must match building from the equivalent df.
test_that("iterate_silhouette uses supplied mat/dist and ignores df for the matrix build (#36126)", {
mat <- scale(as.matrix(mtcars[, c("cyl", "mpg", "hp")]))
mat[is.nan(mat)] <- 0
d <- stats::dist(mat)
df_equiv <- as.data.frame(mat) # already normalized, so normalize_data=FALSE rebuilds `mat`

res_built <- iterate_silhouette(df_equiv, max_centers = 4, normalize_data = FALSE, seed = 42)
res_mat <- iterate_silhouette(df_equiv, max_centers = 4, normalize_data = FALSE,
seed = 42, mat = mat, dist = d)
expect_equal(res_mat, res_built) # supplying mat/dist matches the build-from-df path

bogus_df <- data.frame(a = seq_len(nrow(mat)), b = rev(seq_len(nrow(mat))))
res_bogus <- iterate_silhouette(bogus_df, max_centers = 4, normalize_data = TRUE,
seed = 42, mat = mat, dist = d)
expect_equal(res_bogus, res_mat) # df is ignored entirely when mat is supplied
})