exploratory-io · kei51e · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: exploratory
 Type: Package
 Title: R package for Exploratory
-Version: 15.1.13
-Date: 2026-05-30
+Version: 15.1.14
+Date: 2026-06-02
 Authors@R: c(person("Hideaki", "Hayashi", email = "hideaki@exploratory.io", role = c("aut", "cre")), person("Hide", "Kojima", email = "hide@exploratory.io", role = c("aut")), person("Kan", "Nishida", email = "kan@exploratory.io", role = c("aut")), person("Kei", "Saito", email = "kei@exploratory.io", role = c("aut")), person("Yosuke", "Yasuda", email = "double.y.919.quick@gmail.com", role = c("aut")))
 URL: https://github.com/exploratory-io/exploratory_func
 Description: Functions for Exploratory
@@ -14,6 +14,7 @@ Imports:
     prediction, 
     urltools,
     psych,
+    polycor,
     dplyr,
     tidyr,
     tibble,

diff --git a/R/stats_wrapper.R b/R/stats_wrapper.R
@@ -45,7 +45,7 @@ do_cor <- function(df, ..., skv = NULL, fun.aggregate=mean, fill=0){
 #' @param dimension A column you want to use as a dimension to calculate the correlations.
 #' @param value A column for the values you want to use to calculate the correlations.
 #' @param use Operation type for dealing with missing values. This can be one of "everything", "all.obs", "complete.obs", "na.or.complete", or "pairwise.complete.obs"
-#' @param method Method of calculation. This can be one of "pearson", "kendall", or "spearman".
+#' @param method Method of calculation. This can be one of "pearson", "kendall", "spearman", or "polychoric".
 #' @param fun.aggregate  Set an aggregate function when there are multiple entries for the key column per each category.
 #' @param time_unit Unit of time to aggregate key_col if key_col is Date or POSIXct. NULL doesn't aggregate.
 #' @return correlations between pairs of groups
@@ -175,7 +175,7 @@ do_cor.kv_ <- function(df,
 #' @param df data frame in tidy format
 #' @param ... Arguments to select columns to calculate correlation.
 #' @param use Operation type for dealing with missing values. This can be one of "everything", "all.obs", "complete.obs", "na.or.complete", or "pairwise.complete.obs"
-#' @param method Method of calculation. This can be one of "pearson", "kendall", or "spearman".
+#' @param method Method of calculation. This can be one of "pearson", "kendall", "spearman", or "polychoric".
 #' @return correlations between pairs of columns
 #' @export
 do_cor.cols <- function(df, ..., use = "pairwise.complete.obs", method = "pearson",
@@ -261,42 +261,70 @@ do_cor_internal <- function(mat, use, method, diag, output_cols, na.rm) {
   sorted_colnames <- stringr::str_sort(colnames(mat))
   mat <- mat[,sorted_colnames]
 
-  cor_mat <- cor(mat, use = use, method = method)
+  # Create a matrix of P-values for Analytics View case.
+  dim <- length(sorted_colnames)
+
+  if (identical(method, "polychoric")) {
+    # Polychoric correlation is for ordinal variables (e.g. survey scales 1-5).
+    # stats::cor()/cor.test() do not support it, so we use polycor::hetcor(), which
+    # returns the full correlation matrix and the matrix of standard errors in one call.
+    # We coerce every column to an ordered factor so that polychoric (not Pearson or
+    # polyserial) is used for every pair, and derive a two-sided z-test P value from
+    # rho / SE. Non-estimable pairs (constant or near-perfect columns) come back as NA,
+    # which the UI treats as not-significant.
+    loadNamespace("polycor")
+    ordered_df <- as.data.frame(lapply(seq_len(dim), function(k) {
+      ordered(mat[, k])
+    }))
+    colnames(ordered_df) <- sorted_colnames
+    # hetcor only supports "complete.obs" and "pairwise.complete.obs", while the other
+    # methods (via cor()/cor.test()) also accept "everything", "all.obs", and
+    # "na.or.complete". Map those to pairwise (the do_cor default) so polychoric does not
+    # error where the other methods would succeed. (hetcor's own default is listwise
+    # "complete.obs", which we do not want.)
+    het_use <- if (identical(use, "complete.obs")) "complete.obs" else "pairwise.complete.obs"
+    het <- polycor::hetcor(ordered_df, ML = TRUE, std.err = TRUE, use = het_use)
+    cor_mat <- het$correlations
+    tvalue_mat <- het$correlations / het$std.errors # z value; off-diagonal NA SE stays NA.
+    pvalue_mat <- 2 * stats::pnorm(-abs(tvalue_mat))
+    diag(tvalue_mat) <- Inf # For i=j case, statistic should be Inf and P value 0.
+    diag(pvalue_mat) <- 0
+  } else {
+    cor_mat <- cor(mat, use = use, method = method)
+    pvalue_mat <- matrix(NA, dim, dim)
+    tvalue_mat <- matrix(NA, dim, dim)
+    for (i in 2:dim) {
+      for (j in 1:(i-1)) {
+        tryCatch({
+          cor_test_res <- cor.test(mat[,i], mat[,j], method = method)
+          pvalue_mat[i, j] <- cor_test_res$p.value
+          tvalue_mat[i, j] <- cor_test_res$statistic
+        }, error = function(e) {
+          if (e$message == "not enough finite observations") {
+            # This is the error cor.test returns when there is not enough non-NA data.
+            # Rather than stopping, set NA as the result, and we will handle it as a not-significant case on the UI.
+            pvalue_mat[i, j] <- NA
+            tvalue_mat[i, j] <- NA
+          }
+          else {
+            stop(e)
+          }
+        })
+        pvalue_mat[j, i] <- pvalue_mat[i, j]
+        tvalue_mat[j, i] <- tvalue_mat[i, j]
+      }
+    }
+    for (i in 1:dim) { # For i=j case, P value should be always 0 and t statistic should be Inf.
+      pvalue_mat[i, i] <- 0
+      tvalue_mat[i, i] <- Inf
+    }
+  }
+
   ret <- mat_to_df(cor_mat,
                    cnames=output_cols[1:3],
                    diag=diag,
                    na.rm=na.rm,
                    zero.rm=FALSE)
-
-  # Create a matrix of P-values for Analytics View case.
-  dim <- length(sorted_colnames)
-  pvalue_mat <- matrix(NA, dim, dim)
-  tvalue_mat <- matrix(NA, dim, dim)
-  for (i in 2:dim) {
-    for (j in 1:(i-1)) {
-      tryCatch({
-        cor_test_res <- cor.test(mat[,i], mat[,j], method = method)
-        pvalue_mat[i, j] <- cor_test_res$p.value
-        tvalue_mat[i, j] <- cor_test_res$statistic
-      }, error = function(e) {
-        if (e$message == "not enough finite observations") {
-          # This is the error cor.test returns when there is not enough non-NA data.
-          # Rather than stopping, set NA as the result, and we will handle it as a not-significant case on the UI.
-          pvalue_mat[i, j] <- NA
-          tvalue_mat[i, j] <- NA
-        }
-        else {
-          stop(e)
-        }
-      })
-      pvalue_mat[j, i] <- pvalue_mat[i, j]
-      tvalue_mat[j, i] <- tvalue_mat[i, j]
-    }
-  }
-  for (i in 1:dim) { # For i=j case, P value should be always 0 and t statistic should be Inf.
-    pvalue_mat[i, i] <- 0
-    tvalue_mat[i, i] <- Inf
-  }
   colnames(pvalue_mat) <- sorted_colnames
   rownames(pvalue_mat) <- sorted_colnames
   colnames(tvalue_mat) <- sorted_colnames

diff --git a/tests/testthat/test_stats_wrapper.R b/tests/testthat/test_stats_wrapper.R
@@ -120,6 +120,117 @@ test_that("do_cor should skip group with only one row.", {
   expect_equal(nrow(res %>% filter(z==F)), 0)
 })
 
+test_that("do_cor with polychoric method", {
+  # Polychoric correlation is for ordinal variables. The latent variables behind x and y
+  # are correlated at 0.7, while z is independent of them.
+  set.seed(123)
+  n <- 200
+  cut5 <- function(z) as.integer(cut(z, breaks = c(-Inf, -0.84, -0.25, 0.25, 0.84, Inf)))
+  z1 <- rnorm(n); z2 <- 0.7 * z1 + sqrt(1 - 0.49) * rnorm(n); z3 <- rnorm(n)
+  df <- data.frame(x = cut5(z1), y = cut5(z2), z = cut5(z3))
+  model_df <- df %>% do_cor(`x`, `y`, `z`, method = "polychoric", distinct = FALSE, diag = TRUE, return_type = "model")
+  res <- model_df %>% tidy_rowwise(model, type = 'cor')
+  expect_equal(nrow(res), 9) # All 9 combinations.
+  xy <- res %>% filter(pair.name.x == "x", pair.name.y == "y")
+  expect_true(xy$correlation > 0.5 && xy$correlation < 0.9) # Recovers the true latent 0.7, not inflated.
+  expect_true(xy$p_value < 0.05) # Statistically significant.
+  expect_true(is.finite(xy$statistic)) # z value is populated.
+  xz <- res %>% filter(pair.name.x == "x", pair.name.y == "z")
+  expect_true(abs(xz$correlation) < 0.3) # z is independent of x: near-zero polychoric correlation.
+  expect_true(xz$p_value > 0.05) # The independent pair is not statistically significant.
+  diag_row <- res %>% filter(pair.name.x == "x", pair.name.y == "x")
+  expect_equal(diag_row$correlation, 1) # Diagonal correlation is 1.
+  expect_equal(diag_row$p_value, 0) # Diagonal P value is 0.
+})
+
+test_that("do_cor with polychoric method handles a constant column without error", {
+  set.seed(123)
+  n <- 100
+  cut5 <- function(z) as.integer(cut(z, breaks = c(-Inf, -0.84, -0.25, 0.25, 0.84, Inf)))
+  z1 <- rnorm(n); z2 <- 0.7 * z1 + sqrt(1 - 0.49) * rnorm(n)
+  df <- data.frame(x = cut5(z1), y = cut5(z2), w = rep(3L, n)) # w is constant.
+  # hetcor warns for the non-estimable pairs involving the constant column; that is expected.
+  model_df <- suppressWarnings(df %>% do_cor(`x`, `y`, `w`, method = "polychoric", distinct = FALSE, diag = TRUE, return_type = "model"))
+  res <- model_df %>% tidy_rowwise(model, type = 'cor')
+  xy <- res %>% filter(pair.name.x == "x", pair.name.y == "y")
+  expect_equal(nrow(xy), 1) # The estimable pair is still computed.
+  expect_true(xy$correlation > 0.5)
+  # The constant-column pair has NA correlation and is dropped by na.rm in mat_to_df.
+  expect_equal(nrow(res %>% filter(pair.name.x == "x", pair.name.y == "w")), 0)
+})
+
+test_that("do_cor with polychoric method handles complex column names", {
+  set.seed(123)
+  n <- 100
+  cut5 <- function(z) as.integer(cut(z, breaks = c(-Inf, -0.84, -0.25, 0.25, 0.84, Inf)))
+  z1 <- rnorm(n); z2 <- 0.7 * z1 + sqrt(1 - 0.49) * rnorm(n)
+  sname <- "航空 会社 !\"#$%&'()*+, -./:;<=>?@[]^_'{|}~ 表"
+  df <- data.frame(a = cut5(z1), b = cut5(z2), check.names = FALSE)
+  names(df) <- c(sname, "plain")
+  model_df <- df %>% do_cor(tidyselect::everything(), method = "polychoric", distinct = FALSE, diag = TRUE, return_type = "model")
+  res <- model_df %>% tidy_rowwise(model, type = 'cor')
+  expect_true(sname %in% as.character(res$pair.name.x)) # Complex name survives the round trip.
+  pair <- res %>% filter(as.character(pair.name.x) == sname, pair.name.y == "plain")
+  expect_equal(nrow(pair), 1)
+  expect_true(is.finite(pair$correlation))
+})
+
+test_that("do_cor with polychoric method accepts use values hetcor does not support", {
+  # hetcor only accepts "complete.obs" and "pairwise.complete.obs", but the public
+  # do_cor API (and the other methods via cor()/cor.test()) also accept "everything",
+  # "all.obs", and "na.or.complete". Those must be mapped, not passed through as an error.
+  set.seed(123)
+  n <- 100
+  cut5 <- function(z) as.integer(cut(z, breaks = c(-Inf, -0.84, -0.25, 0.25, 0.84, Inf)))
+  z1 <- rnorm(n); z2 <- 0.7 * z1 + sqrt(1 - 0.49) * rnorm(n)
+  df <- data.frame(x = cut5(z1), y = cut5(z2))
+  for (u in c("everything", "all.obs", "na.or.complete", "complete.obs")) {
+    model_df <- df %>% do_cor(`x`, `y`, method = "polychoric", use = u, distinct = FALSE, diag = TRUE, return_type = "model")
+    res <- model_df %>% tidy_rowwise(model, type = 'cor')
+    xy <- res %>% filter(pair.name.x == "x", pair.name.y == "y")
+    expect_equal(nrow(xy), 1, info = u) # The pair is computed regardless of the use value.
+    expect_true(xy$correlation > 0.5, info = u)
+  }
+})
+
+test_that("do_cor with polychoric method for grouped (repeat-by) data", {
+  # Repeat By on Analytics View maps to group_by(). Each group must get its own
+  # polychoric correlation. Group A has a positive relationship, group B a negative one.
+  set.seed(123)
+  n <- 100
+  cut5 <- function(z) as.integer(cut(z, breaks = c(-Inf, -0.84, -0.25, 0.25, 0.84, Inf)))
+  mk <- function(rho) {
+    z1 <- rnorm(n); z2 <- rho * z1 + sqrt(1 - rho^2) * rnorm(n)
+    data.frame(x = cut5(z1), y = cut5(z2))
+  }
+  df <- dplyr::bind_rows(cbind(mk(0.7), grp = "A"), cbind(mk(-0.7), grp = "B"))
+  model_df <- df %>% group_by(grp) %>% do_cor(`x`, `y`, method = "polychoric", distinct = FALSE, diag = TRUE, return_type = "model")
+  res <- model_df %>% tidy_rowwise(model, type = 'cor')
+  expect_setequal(unique(as.character(res$grp)), c("A", "B")) # Both groups produced results.
+  a_xy <- res %>% filter(grp == "A", pair.name.x == "x", pair.name.y == "y")
+  b_xy <- res %>% filter(grp == "B", pair.name.x == "x", pair.name.y == "y")
+  expect_true(a_xy$correlation > 0.4) # Positive correlation in group A.
+  expect_true(b_xy$correlation < -0.4) # Negative correlation in group B, computed independently.
+})
+
+test_that("do_cor with polychoric method handles NA values via pairwise complete obs", {
+  # Survey data routinely has missing responses (NA). The default use="pairwise.complete.obs"
+  # must drop NAs pairwise rather than error, so every pair is still estimated.
+  set.seed(123)
+  n <- 200
+  cut5 <- function(z) as.integer(cut(z, breaks = c(-Inf, -0.84, -0.25, 0.25, 0.84, Inf)))
+  z1 <- rnorm(n); z2 <- 0.7 * z1 + sqrt(1 - 0.49) * rnorm(n); z3 <- rnorm(n)
+  x <- cut5(z1); y <- cut5(z2); z <- cut5(z3)
+  x[1:20] <- NA; y[15:30] <- NA; z[40:60] <- NA # Missing responses at different rows per column.
+  df <- data.frame(x = x, y = y, z = z)
+  model_df <- df %>% do_cor(`x`, `y`, `z`, method = "polychoric", distinct = FALSE, diag = TRUE, return_type = "model")
+  res <- model_df %>% tidy_rowwise(model, type = 'cor')
+  expect_equal(nrow(res), 9) # Every pair is still estimated from the pairwise-complete rows.
+  xy <- res %>% filter(pair.name.x == "x", pair.name.y == "y")
+  expect_true(is.finite(xy$correlation)) # NAs did not break the estimate.
+  expect_true(xy$correlation > 0.4) # The x-y relationship is still recovered despite the NAs.
+})
+
 test_that("test do_svd.kv with fill", {
   test_df <- data.frame(
     rand=runif(20, min = 0, max=10),