From aec38740378efde015036563b66e6684002979b0 Mon Sep 17 00:00:00 2001 From: Kei Saito Date: Tue, 2 Jun 2026 22:31:53 -0700 Subject: [PATCH 1/3] feat(#30488): support polychoric correlation method in do_cor Add method = "polychoric" to do_cor (Analytics Correlation). Since stats::cor()/cor.test() do not support it, compute the correlation matrix and its standard-error matrix in a single polycor::hetcor( ML = TRUE, std.err = TRUE) call, then derive a two-sided z-test p value (z = rho / SE). Columns are coerced to ordered factors so polychoric is used for every pair; non-estimable pairs (constant or near-perfect columns) come back as NA, which the UI treats as not-significant. - R/stats_wrapper.R: polychoric branch in do_cor_internal; the existing pearson/spearman path is unchanged - DESCRIPTION: add polycor to Imports - tests: polychoric core behaviour, constant-column safety, and complex column names (spaces / multibyte / symbols) The tam-side UI/config lives in the paired change. Co-Authored-By: Claude Opus 4.8 (1M context) --- DESCRIPTION | 1 + R/stats_wrapper.R | 90 ++++++++++++++++++----------- tests/testthat/test_stats_wrapper.R | 52 +++++++++++++++++ 3 files changed, 110 insertions(+), 33 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 50e86a902..5eeae2f41 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,6 +14,7 @@ Imports: prediction, urltools, psych, + polycor, dplyr, tidyr, tibble, diff --git a/R/stats_wrapper.R b/R/stats_wrapper.R index 8ea32a67b..4e39b10cf 100644 --- a/R/stats_wrapper.R +++ b/R/stats_wrapper.R @@ -45,7 +45,7 @@ do_cor <- function(df, ..., skv = NULL, fun.aggregate=mean, fill=0){ #' @param dimension A column you want to use as a dimension to calculate the correlations. #' @param value A column for the values you want to use to calculate the correlations. #' @param use Operation type for dealing with missing values. This can be one of "everything", "all.obs", "complete.obs", "na.or.complete", or "pairwise.complete.obs" -#' @param method Method of calculation. This can be one of "pearson", "kendall", or "spearman". +#' @param method Method of calculation. This can be one of "pearson", "kendall", "spearman", or "polychoric". #' @param fun.aggregate Set an aggregate function when there are multiple entries for the key column per each category. #' @param time_unit Unit of time to aggregate key_col if key_col is Date or POSIXct. NULL doesn't aggregate. #' @return correlations between pairs of groups @@ -175,7 +175,7 @@ do_cor.kv_ <- function(df, #' @param df data frame in tidy format #' @param ... Arguments to select columns to calculate correlation. #' @param use Operation type for dealing with missing values. This can be one of "everything", "all.obs", "complete.obs", "na.or.complete", or "pairwise.complete.obs" -#' @param method Method of calculation. This can be one of "pearson", "kendall", or "spearman". +#' @param method Method of calculation. This can be one of "pearson", "kendall", "spearman", or "polychoric". #' @return correlations between pairs of columns #' @export do_cor.cols <- function(df, ..., use = "pairwise.complete.obs", method = "pearson", @@ -261,42 +261,66 @@ do_cor_internal <- function(mat, use, method, diag, output_cols, na.rm) { sorted_colnames <- stringr::str_sort(colnames(mat)) mat <- mat[,sorted_colnames] - cor_mat <- cor(mat, use = use, method = method) + # Create a matrix of P-values for Analytics View case. + dim <- length(sorted_colnames) + + if (identical(method, "polychoric")) { + # Polychoric correlation is for ordinal variables (e.g. survey scales 1-5). + # stats::cor()/cor.test() do not support it, so we use polycor::hetcor(), which + # returns the full correlation matrix and the matrix of standard errors in one call. + # We coerce every column to an ordered factor so that polychoric (not Pearson or + # polyserial) is used for every pair, and derive a two-sided z-test P value from + # rho / SE. Non-estimable pairs (constant or near-perfect columns) come back as NA, + # which the UI treats as not-significant. + loadNamespace("polycor") + ordered_df <- as.data.frame(lapply(seq_len(dim), function(k) { + ordered(mat[, k]) + })) + colnames(ordered_df) <- sorted_colnames + # use is "pairwise.complete.obs" by default, matching the other methods. + # (hetcor's own default is listwise "complete.obs", which we do not want.) + het <- polycor::hetcor(ordered_df, ML = TRUE, std.err = TRUE, use = use) + cor_mat <- het$correlations + tvalue_mat <- het$correlations / het$std.errors # z value; off-diagonal NA SE stays NA. + pvalue_mat <- 2 * stats::pnorm(-abs(tvalue_mat)) + diag(tvalue_mat) <- Inf # For i=j case, statistic should be Inf and P value 0. + diag(pvalue_mat) <- 0 + } else { + cor_mat <- cor(mat, use = use, method = method) + pvalue_mat <- matrix(NA, dim, dim) + tvalue_mat <- matrix(NA, dim, dim) + for (i in 2:dim) { + for (j in 1:(i-1)) { + tryCatch({ + cor_test_res <- cor.test(mat[,i], mat[,j], method = method) + pvalue_mat[i, j] <- cor_test_res$p.value + tvalue_mat[i, j] <- cor_test_res$statistic + }, error = function(e) { + if (e$message == "not enough finite observations") { + # This is the error cor.test returns when there is not enough non-NA data. + # Rather than stopping, set NA as the result, and we will handle it as a not-significant case on the UI. + pvalue_mat[i, j] <- NA + tvalue_mat[i, j] <- NA + } + else { + stop(e) + } + }) + pvalue_mat[j, i] <- pvalue_mat[i, j] + tvalue_mat[j, i] <- tvalue_mat[i, j] + } + } + for (i in 1:dim) { # For i=j case, P value should be always 0 and t statistic should be Inf. + pvalue_mat[i, i] <- 0 + tvalue_mat[i, i] <- Inf + } + } + ret <- mat_to_df(cor_mat, cnames=output_cols[1:3], diag=diag, na.rm=na.rm, zero.rm=FALSE) - - # Create a matrix of P-values for Analytics View case. - dim <- length(sorted_colnames) - pvalue_mat <- matrix(NA, dim, dim) - tvalue_mat <- matrix(NA, dim, dim) - for (i in 2:dim) { - for (j in 1:(i-1)) { - tryCatch({ - cor_test_res <- cor.test(mat[,i], mat[,j], method = method) - pvalue_mat[i, j] <- cor_test_res$p.value - tvalue_mat[i, j] <- cor_test_res$statistic - }, error = function(e) { - if (e$message == "not enough finite observations") { - # This is the error cor.test returns when there is not enough non-NA data. - # Rather than stopping, set NA as the result, and we will handle it as a not-significant case on the UI. - pvalue_mat[i, j] <- NA - tvalue_mat[i, j] <- NA - } - else { - stop(e) - } - }) - pvalue_mat[j, i] <- pvalue_mat[i, j] - tvalue_mat[j, i] <- tvalue_mat[i, j] - } - } - for (i in 1:dim) { # For i=j case, P value should be always 0 and t statistic should be Inf. - pvalue_mat[i, i] <- 0 - tvalue_mat[i, i] <- Inf - } colnames(pvalue_mat) <- sorted_colnames rownames(pvalue_mat) <- sorted_colnames colnames(tvalue_mat) <- sorted_colnames diff --git a/tests/testthat/test_stats_wrapper.R b/tests/testthat/test_stats_wrapper.R index fd6bd78c2..2f9d41ffb 100644 --- a/tests/testthat/test_stats_wrapper.R +++ b/tests/testthat/test_stats_wrapper.R @@ -120,6 +120,58 @@ test_that("do_cor should skip group with only one row.", { expect_equal(nrow(res %>% filter(z==F)), 0) }) +test_that("do_cor with polychoric method", { + # Polychoric correlation is for ordinal variables. The latent variables behind x and y + # are correlated at 0.7, while z is independent of them. + set.seed(123) + n <- 200 + cut5 <- function(z) as.integer(cut(z, breaks = c(-Inf, -0.84, -0.25, 0.25, 0.84, Inf))) + z1 <- rnorm(n); z2 <- 0.7 * z1 + sqrt(1 - 0.49) * rnorm(n); z3 <- rnorm(n) + df <- data.frame(x = cut5(z1), y = cut5(z2), z = cut5(z3)) + model_df <- df %>% do_cor(`x`, `y`, `z`, method = "polychoric", distinct = FALSE, diag = TRUE, return_type = "model") + res <- model_df %>% tidy_rowwise(model, type = 'cor') + expect_equal(nrow(res), 9) # All 9 combinations. + xy <- res %>% filter(pair.name.x == "x", pair.name.y == "y") + expect_true(xy$correlation > 0.5) # Strong positive polychoric correlation. + expect_true(xy$p_value < 0.05) # Statistically significant. + expect_true(is.finite(xy$statistic)) # z value is populated. + diag_row <- res %>% filter(pair.name.x == "x", pair.name.y == "x") + expect_equal(diag_row$correlation, 1) # Diagonal correlation is 1. + expect_equal(diag_row$p_value, 0) # Diagonal P value is 0. +}) + +test_that("do_cor with polychoric method handles a constant column without error", { + set.seed(123) + n <- 100 + cut5 <- function(z) as.integer(cut(z, breaks = c(-Inf, -0.84, -0.25, 0.25, 0.84, Inf))) + z1 <- rnorm(n); z2 <- 0.7 * z1 + sqrt(1 - 0.49) * rnorm(n) + df <- data.frame(x = cut5(z1), y = cut5(z2), w = rep(3L, n)) # w is constant. + # hetcor warns for the non-estimable pairs involving the constant column; that is expected. + model_df <- suppressWarnings(df %>% do_cor(`x`, `y`, `w`, method = "polychoric", distinct = FALSE, diag = TRUE, return_type = "model")) + res <- model_df %>% tidy_rowwise(model, type = 'cor') + xy <- res %>% filter(pair.name.x == "x", pair.name.y == "y") + expect_equal(nrow(xy), 1) # The estimable pair is still computed. + expect_true(xy$correlation > 0.5) + # The constant-column pair has NA correlation and is dropped by na.rm in mat_to_df. + expect_equal(nrow(res %>% filter(pair.name.x == "x", pair.name.y == "w")), 0) +}) + +test_that("do_cor with polychoric method handles complex column names", { + set.seed(123) + n <- 100 + cut5 <- function(z) as.integer(cut(z, breaks = c(-Inf, -0.84, -0.25, 0.25, 0.84, Inf))) + z1 <- rnorm(n); z2 <- 0.7 * z1 + sqrt(1 - 0.49) * rnorm(n) + sname <- "航空 会社 !\"#$%&'()*+, -./:;<=>?@[]^_'{|}~ 表" + df <- data.frame(a = cut5(z1), b = cut5(z2), check.names = FALSE) + names(df) <- c(sname, "plain") + model_df <- df %>% do_cor(tidyselect::everything(), method = "polychoric", distinct = FALSE, diag = TRUE, return_type = "model") + res <- model_df %>% tidy_rowwise(model, type = 'cor') + expect_true(sname %in% as.character(res$pair.name.x)) # Complex name survives the round trip. + pair <- res %>% filter(as.character(pair.name.x) == sname, pair.name.y == "plain") + expect_equal(nrow(pair), 1) + expect_true(is.finite(pair$correlation)) +}) + test_that("test do_svd.kv with fill", { test_df <- data.frame( rand=runif(20, min = 0, max=10), From 570f9871207c936325845db17bd37942691aa508 Mon Sep 17 00:00:00 2001 From: Kei Saito Date: Tue, 2 Jun 2026 22:38:02 -0700 Subject: [PATCH 2/3] Bump the version --- DESCRIPTION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5eeae2f41..405f31c8a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: exploratory Type: Package Title: R package for Exploratory -Version: 15.1.13 -Date: 2026-05-30 +Version: 15.1.14 +Date: 2026-06-02 Authors@R: c(person("Hideaki", "Hayashi", email = "hideaki@exploratory.io", role = c("aut", "cre")), person("Hide", "Kojima", email = "hide@exploratory.io", role = c("aut")), person("Kan", "Nishida", email = "kan@exploratory.io", role = c("aut")), person("Kei", "Saito", email = "kei@exploratory.io", role = c("aut")), person("Yosuke", "Yasuda", email = "double.y.919.quick@gmail.com", role = c("aut"))) URL: https://github.com/exploratory-io/exploratory_func Description: Functions for Exploratory From 5674892cf8036759b5c67a623058cb205f312e2a Mon Sep 17 00:00:00 2001 From: Kei Saito Date: Tue, 2 Jun 2026 23:31:46 -0700 Subject: [PATCH 3/3] fix(#30488): map unsupported `use` values for polychoric and expand tests polycor::hetcor() only accepts "complete.obs" and "pairwise.complete.obs", so do_cor(method="polychoric") errored when `use` was "everything", "all.obs", or "na.or.complete" -- values the other methods accept via cor()/cor.test(). Map those to "pairwise.complete.obs". Add tests for the use mapping, grouped (Repeat By) data, and NA values via pairwise complete obs, and tighten the value-correctness assertions so the strong pair recovers the latent ~0.7 (not inflated) and the independent pair is near-zero and not significant. Co-Authored-By: Claude Opus 4.8 (1M context) --- R/stats_wrapper.R | 10 +++-- tests/testthat/test_stats_wrapper.R | 61 ++++++++++++++++++++++++++++- 2 files changed, 67 insertions(+), 4 deletions(-) diff --git a/R/stats_wrapper.R b/R/stats_wrapper.R index 4e39b10cf..b78554e24 100644 --- a/R/stats_wrapper.R +++ b/R/stats_wrapper.R @@ -277,9 +277,13 @@ do_cor_internal <- function(mat, use, method, diag, output_cols, na.rm) { ordered(mat[, k]) })) colnames(ordered_df) <- sorted_colnames - # use is "pairwise.complete.obs" by default, matching the other methods. - # (hetcor's own default is listwise "complete.obs", which we do not want.) - het <- polycor::hetcor(ordered_df, ML = TRUE, std.err = TRUE, use = use) + # hetcor only supports "complete.obs" and "pairwise.complete.obs", while the other + # methods (via cor()/cor.test()) also accept "everything", "all.obs", and + # "na.or.complete". Map those to pairwise (the do_cor default) so polychoric does not + # error where the other methods would succeed. (hetcor's own default is listwise + # "complete.obs", which we do not want.) + het_use <- if (identical(use, "complete.obs")) "complete.obs" else "pairwise.complete.obs" + het <- polycor::hetcor(ordered_df, ML = TRUE, std.err = TRUE, use = het_use) cor_mat <- het$correlations tvalue_mat <- het$correlations / het$std.errors # z value; off-diagonal NA SE stays NA. pvalue_mat <- 2 * stats::pnorm(-abs(tvalue_mat)) diff --git a/tests/testthat/test_stats_wrapper.R b/tests/testthat/test_stats_wrapper.R index 2f9d41ffb..28dc149f6 100644 --- a/tests/testthat/test_stats_wrapper.R +++ b/tests/testthat/test_stats_wrapper.R @@ -132,9 +132,12 @@ test_that("do_cor with polychoric method", { res <- model_df %>% tidy_rowwise(model, type = 'cor') expect_equal(nrow(res), 9) # All 9 combinations. xy <- res %>% filter(pair.name.x == "x", pair.name.y == "y") - expect_true(xy$correlation > 0.5) # Strong positive polychoric correlation. + expect_true(xy$correlation > 0.5 && xy$correlation < 0.9) # Recovers the true latent 0.7, not inflated. expect_true(xy$p_value < 0.05) # Statistically significant. expect_true(is.finite(xy$statistic)) # z value is populated. + xz <- res %>% filter(pair.name.x == "x", pair.name.y == "z") + expect_true(abs(xz$correlation) < 0.3) # z is independent of x: near-zero polychoric correlation. + expect_true(xz$p_value > 0.05) # The independent pair is not statistically significant. diag_row <- res %>% filter(pair.name.x == "x", pair.name.y == "x") expect_equal(diag_row$correlation, 1) # Diagonal correlation is 1. expect_equal(diag_row$p_value, 0) # Diagonal P value is 0. @@ -172,6 +175,62 @@ test_that("do_cor with polychoric method handles complex column names", { expect_true(is.finite(pair$correlation)) }) +test_that("do_cor with polychoric method accepts use values hetcor does not support", { + # hetcor only accepts "complete.obs" and "pairwise.complete.obs", but the public + # do_cor API (and the other methods via cor()/cor.test()) also accept "everything", + # "all.obs", and "na.or.complete". Those must be mapped, not passed through as an error. + set.seed(123) + n <- 100 + cut5 <- function(z) as.integer(cut(z, breaks = c(-Inf, -0.84, -0.25, 0.25, 0.84, Inf))) + z1 <- rnorm(n); z2 <- 0.7 * z1 + sqrt(1 - 0.49) * rnorm(n) + df <- data.frame(x = cut5(z1), y = cut5(z2)) + for (u in c("everything", "all.obs", "na.or.complete", "complete.obs")) { + model_df <- df %>% do_cor(`x`, `y`, method = "polychoric", use = u, distinct = FALSE, diag = TRUE, return_type = "model") + res <- model_df %>% tidy_rowwise(model, type = 'cor') + xy <- res %>% filter(pair.name.x == "x", pair.name.y == "y") + expect_equal(nrow(xy), 1, info = u) # The pair is computed regardless of the use value. + expect_true(xy$correlation > 0.5, info = u) + } +}) + +test_that("do_cor with polychoric method for grouped (repeat-by) data", { + # Repeat By on Analytics View maps to group_by(). Each group must get its own + # polychoric correlation. Group A has a positive relationship, group B a negative one. + set.seed(123) + n <- 100 + cut5 <- function(z) as.integer(cut(z, breaks = c(-Inf, -0.84, -0.25, 0.25, 0.84, Inf))) + mk <- function(rho) { + z1 <- rnorm(n); z2 <- rho * z1 + sqrt(1 - rho^2) * rnorm(n) + data.frame(x = cut5(z1), y = cut5(z2)) + } + df <- dplyr::bind_rows(cbind(mk(0.7), grp = "A"), cbind(mk(-0.7), grp = "B")) + model_df <- df %>% group_by(grp) %>% do_cor(`x`, `y`, method = "polychoric", distinct = FALSE, diag = TRUE, return_type = "model") + res <- model_df %>% tidy_rowwise(model, type = 'cor') + expect_setequal(unique(as.character(res$grp)), c("A", "B")) # Both groups produced results. + a_xy <- res %>% filter(grp == "A", pair.name.x == "x", pair.name.y == "y") + b_xy <- res %>% filter(grp == "B", pair.name.x == "x", pair.name.y == "y") + expect_true(a_xy$correlation > 0.4) # Positive correlation in group A. + expect_true(b_xy$correlation < -0.4) # Negative correlation in group B, computed independently. +}) + +test_that("do_cor with polychoric method handles NA values via pairwise complete obs", { + # Survey data routinely has missing responses (NA). The default use="pairwise.complete.obs" + # must drop NAs pairwise rather than error, so every pair is still estimated. + set.seed(123) + n <- 200 + cut5 <- function(z) as.integer(cut(z, breaks = c(-Inf, -0.84, -0.25, 0.25, 0.84, Inf))) + z1 <- rnorm(n); z2 <- 0.7 * z1 + sqrt(1 - 0.49) * rnorm(n); z3 <- rnorm(n) + x <- cut5(z1); y <- cut5(z2); z <- cut5(z3) + x[1:20] <- NA; y[15:30] <- NA; z[40:60] <- NA # Missing responses at different rows per column. + df <- data.frame(x = x, y = y, z = z) + model_df <- df %>% do_cor(`x`, `y`, `z`, method = "polychoric", distinct = FALSE, diag = TRUE, return_type = "model") + res <- model_df %>% tidy_rowwise(model, type = 'cor') + expect_equal(nrow(res), 9) # Every pair is still estimated from the pairwise-complete rows. + xy <- res %>% filter(pair.name.x == "x", pair.name.y == "y") + expect_true(is.finite(xy$correlation)) # NAs did not break the estimate. + expect_true(xy$correlation > 0.4) # The x-y relationship is still recovered despite the NAs. +}) + test_that("test do_svd.kv with fill", { test_df <- data.frame( rand=runif(20, min = 0, max=10),