Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: exploratory
Type: Package
Title: R package for Exploratory
Version: 15.1.13
Date: 2026-05-30
Version: 15.1.14
Date: 2026-06-02
Authors@R: c(person("Hideaki", "Hayashi", email = "hideaki@exploratory.io", role = c("aut", "cre")), person("Hide", "Kojima", email = "hide@exploratory.io", role = c("aut")), person("Kan", "Nishida", email = "kan@exploratory.io", role = c("aut")), person("Kei", "Saito", email = "kei@exploratory.io", role = c("aut")), person("Yosuke", "Yasuda", email = "double.y.919.quick@gmail.com", role = c("aut")))
URL: https://github.com/exploratory-io/exploratory_func
Description: Functions for Exploratory
Expand All @@ -14,6 +14,7 @@ Imports:
prediction,
urltools,
psych,
polycor,
dplyr,
tidyr,
tibble,
Expand Down
94 changes: 61 additions & 33 deletions R/stats_wrapper.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ do_cor <- function(df, ..., skv = NULL, fun.aggregate=mean, fill=0){
#' @param dimension A column you want to use as a dimension to calculate the correlations.
#' @param value A column for the values you want to use to calculate the correlations.
#' @param use Operation type for dealing with missing values. This can be one of "everything", "all.obs", "complete.obs", "na.or.complete", or "pairwise.complete.obs"
#' @param method Method of calculation. This can be one of "pearson", "kendall", or "spearman".
#' @param method Method of calculation. This can be one of "pearson", "kendall", "spearman", or "polychoric".
#' @param fun.aggregate Set an aggregate function when there are multiple entries for the key column per each category.
#' @param time_unit Unit of time to aggregate key_col if key_col is Date or POSIXct. NULL doesn't aggregate.
#' @return correlations between pairs of groups
Expand Down Expand Up @@ -175,7 +175,7 @@ do_cor.kv_ <- function(df,
#' @param df data frame in tidy format
#' @param ... Arguments to select columns to calculate correlation.
#' @param use Operation type for dealing with missing values. This can be one of "everything", "all.obs", "complete.obs", "na.or.complete", or "pairwise.complete.obs"
#' @param method Method of calculation. This can be one of "pearson", "kendall", or "spearman".
#' @param method Method of calculation. This can be one of "pearson", "kendall", "spearman", or "polychoric".
#' @return correlations between pairs of columns
#' @export
do_cor.cols <- function(df, ..., use = "pairwise.complete.obs", method = "pearson",
Expand Down Expand Up @@ -261,42 +261,70 @@ do_cor_internal <- function(mat, use, method, diag, output_cols, na.rm) {
sorted_colnames <- stringr::str_sort(colnames(mat))
mat <- mat[,sorted_colnames]

cor_mat <- cor(mat, use = use, method = method)
# Create a matrix of P-values for Analytics View case.
dim <- length(sorted_colnames)

if (identical(method, "polychoric")) {
# Polychoric correlation is for ordinal variables (e.g. survey scales 1-5).
# stats::cor()/cor.test() do not support it, so we use polycor::hetcor(), which
# returns the full correlation matrix and the matrix of standard errors in one call.
# We coerce every column to an ordered factor so that polychoric (not Pearson or
# polyserial) is used for every pair, and derive a two-sided z-test P value from
# rho / SE. Non-estimable pairs (constant or near-perfect columns) come back as NA,
# which the UI treats as not-significant.
loadNamespace("polycor")
ordered_df <- as.data.frame(lapply(seq_len(dim), function(k) {
ordered(mat[, k])
}))
colnames(ordered_df) <- sorted_colnames
# hetcor only supports "complete.obs" and "pairwise.complete.obs", while the other
# methods (via cor()/cor.test()) also accept "everything", "all.obs", and
# "na.or.complete". Map those to pairwise (the do_cor default) so polychoric does not
# error where the other methods would succeed. (hetcor's own default is listwise
# "complete.obs", which we do not want.)
het_use <- if (identical(use, "complete.obs")) "complete.obs" else "pairwise.complete.obs"
het <- polycor::hetcor(ordered_df, ML = TRUE, std.err = TRUE, use = het_use)
cor_mat <- het$correlations
tvalue_mat <- het$correlations / het$std.errors # z value; off-diagonal NA SE stays NA.
pvalue_mat <- 2 * stats::pnorm(-abs(tvalue_mat))
diag(tvalue_mat) <- Inf # For i=j case, statistic should be Inf and P value 0.
diag(pvalue_mat) <- 0
} else {
cor_mat <- cor(mat, use = use, method = method)
pvalue_mat <- matrix(NA, dim, dim)
tvalue_mat <- matrix(NA, dim, dim)
for (i in 2:dim) {
for (j in 1:(i-1)) {
tryCatch({
cor_test_res <- cor.test(mat[,i], mat[,j], method = method)
pvalue_mat[i, j] <- cor_test_res$p.value
tvalue_mat[i, j] <- cor_test_res$statistic
}, error = function(e) {
if (e$message == "not enough finite observations") {
# This is the error cor.test returns when there is not enough non-NA data.
# Rather than stopping, set NA as the result, and we will handle it as a not-significant case on the UI.
pvalue_mat[i, j] <- NA
tvalue_mat[i, j] <- NA
}
else {
stop(e)
}
})
pvalue_mat[j, i] <- pvalue_mat[i, j]
tvalue_mat[j, i] <- tvalue_mat[i, j]
}
}
for (i in 1:dim) { # For i=j case, P value should be always 0 and t statistic should be Inf.
pvalue_mat[i, i] <- 0
tvalue_mat[i, i] <- Inf
}
}

ret <- mat_to_df(cor_mat,
cnames=output_cols[1:3],
diag=diag,
na.rm=na.rm,
zero.rm=FALSE)

# Create a matrix of P-values for Analytics View case.
dim <- length(sorted_colnames)
pvalue_mat <- matrix(NA, dim, dim)
tvalue_mat <- matrix(NA, dim, dim)
for (i in 2:dim) {
for (j in 1:(i-1)) {
tryCatch({
cor_test_res <- cor.test(mat[,i], mat[,j], method = method)
pvalue_mat[i, j] <- cor_test_res$p.value
tvalue_mat[i, j] <- cor_test_res$statistic
}, error = function(e) {
if (e$message == "not enough finite observations") {
# This is the error cor.test returns when there is not enough non-NA data.
# Rather than stopping, set NA as the result, and we will handle it as a not-significant case on the UI.
pvalue_mat[i, j] <- NA
tvalue_mat[i, j] <- NA
}
else {
stop(e)
}
})
pvalue_mat[j, i] <- pvalue_mat[i, j]
tvalue_mat[j, i] <- tvalue_mat[i, j]
}
}
for (i in 1:dim) { # For i=j case, P value should be always 0 and t statistic should be Inf.
pvalue_mat[i, i] <- 0
tvalue_mat[i, i] <- Inf
}
colnames(pvalue_mat) <- sorted_colnames
rownames(pvalue_mat) <- sorted_colnames
colnames(tvalue_mat) <- sorted_colnames
Expand Down
111 changes: 111 additions & 0 deletions tests/testthat/test_stats_wrapper.R
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,117 @@ test_that("do_cor should skip group with only one row.", {
expect_equal(nrow(res %>% filter(z==F)), 0)
})

test_that("do_cor with polychoric method", {
# Polychoric correlation is for ordinal variables. The latent variables behind x and y
# are correlated at 0.7, while z is independent of them.
set.seed(123)
n <- 200
cut5 <- function(z) as.integer(cut(z, breaks = c(-Inf, -0.84, -0.25, 0.25, 0.84, Inf)))
z1 <- rnorm(n); z2 <- 0.7 * z1 + sqrt(1 - 0.49) * rnorm(n); z3 <- rnorm(n)
df <- data.frame(x = cut5(z1), y = cut5(z2), z = cut5(z3))
model_df <- df %>% do_cor(`x`, `y`, `z`, method = "polychoric", distinct = FALSE, diag = TRUE, return_type = "model")
res <- model_df %>% tidy_rowwise(model, type = 'cor')
expect_equal(nrow(res), 9) # All 9 combinations.
xy <- res %>% filter(pair.name.x == "x", pair.name.y == "y")
expect_true(xy$correlation > 0.5 && xy$correlation < 0.9) # Recovers the true latent 0.7, not inflated.
expect_true(xy$p_value < 0.05) # Statistically significant.
expect_true(is.finite(xy$statistic)) # z value is populated.
xz <- res %>% filter(pair.name.x == "x", pair.name.y == "z")
expect_true(abs(xz$correlation) < 0.3) # z is independent of x: near-zero polychoric correlation.
expect_true(xz$p_value > 0.05) # The independent pair is not statistically significant.
diag_row <- res %>% filter(pair.name.x == "x", pair.name.y == "x")
expect_equal(diag_row$correlation, 1) # Diagonal correlation is 1.
expect_equal(diag_row$p_value, 0) # Diagonal P value is 0.
})

test_that("do_cor with polychoric method handles a constant column without error", {
set.seed(123)
n <- 100
cut5 <- function(z) as.integer(cut(z, breaks = c(-Inf, -0.84, -0.25, 0.25, 0.84, Inf)))
z1 <- rnorm(n); z2 <- 0.7 * z1 + sqrt(1 - 0.49) * rnorm(n)
df <- data.frame(x = cut5(z1), y = cut5(z2), w = rep(3L, n)) # w is constant.
# hetcor warns for the non-estimable pairs involving the constant column; that is expected.
model_df <- suppressWarnings(df %>% do_cor(`x`, `y`, `w`, method = "polychoric", distinct = FALSE, diag = TRUE, return_type = "model"))
res <- model_df %>% tidy_rowwise(model, type = 'cor')
xy <- res %>% filter(pair.name.x == "x", pair.name.y == "y")
expect_equal(nrow(xy), 1) # The estimable pair is still computed.
expect_true(xy$correlation > 0.5)
# The constant-column pair has NA correlation and is dropped by na.rm in mat_to_df.
expect_equal(nrow(res %>% filter(pair.name.x == "x", pair.name.y == "w")), 0)
})

test_that("do_cor with polychoric method handles complex column names", {
set.seed(123)
n <- 100
cut5 <- function(z) as.integer(cut(z, breaks = c(-Inf, -0.84, -0.25, 0.25, 0.84, Inf)))
z1 <- rnorm(n); z2 <- 0.7 * z1 + sqrt(1 - 0.49) * rnorm(n)
sname <- "航空 会社 !\"#$%&'()*+, -./:;<=>?@[]^_'{|}~ 表"
df <- data.frame(a = cut5(z1), b = cut5(z2), check.names = FALSE)
names(df) <- c(sname, "plain")
model_df <- df %>% do_cor(tidyselect::everything(), method = "polychoric", distinct = FALSE, diag = TRUE, return_type = "model")
res <- model_df %>% tidy_rowwise(model, type = 'cor')
expect_true(sname %in% as.character(res$pair.name.x)) # Complex name survives the round trip.
pair <- res %>% filter(as.character(pair.name.x) == sname, pair.name.y == "plain")
expect_equal(nrow(pair), 1)
expect_true(is.finite(pair$correlation))
})

test_that("do_cor with polychoric method accepts use values hetcor does not support", {
# hetcor only accepts "complete.obs" and "pairwise.complete.obs", but the public
# do_cor API (and the other methods via cor()/cor.test()) also accept "everything",
# "all.obs", and "na.or.complete". Those must be mapped, not passed through as an error.
set.seed(123)
n <- 100
cut5 <- function(z) as.integer(cut(z, breaks = c(-Inf, -0.84, -0.25, 0.25, 0.84, Inf)))
z1 <- rnorm(n); z2 <- 0.7 * z1 + sqrt(1 - 0.49) * rnorm(n)
df <- data.frame(x = cut5(z1), y = cut5(z2))
for (u in c("everything", "all.obs", "na.or.complete", "complete.obs")) {
model_df <- df %>% do_cor(`x`, `y`, method = "polychoric", use = u, distinct = FALSE, diag = TRUE, return_type = "model")
res <- model_df %>% tidy_rowwise(model, type = 'cor')
xy <- res %>% filter(pair.name.x == "x", pair.name.y == "y")
expect_equal(nrow(xy), 1, info = u) # The pair is computed regardless of the use value.
expect_true(xy$correlation > 0.5, info = u)
}
})

test_that("do_cor with polychoric method for grouped (repeat-by) data", {
# Repeat By on Analytics View maps to group_by(). Each group must get its own
# polychoric correlation. Group A has a positive relationship, group B a negative one.
set.seed(123)
n <- 100
cut5 <- function(z) as.integer(cut(z, breaks = c(-Inf, -0.84, -0.25, 0.25, 0.84, Inf)))
mk <- function(rho) {
z1 <- rnorm(n); z2 <- rho * z1 + sqrt(1 - rho^2) * rnorm(n)
data.frame(x = cut5(z1), y = cut5(z2))
}
df <- dplyr::bind_rows(cbind(mk(0.7), grp = "A"), cbind(mk(-0.7), grp = "B"))
model_df <- df %>% group_by(grp) %>% do_cor(`x`, `y`, method = "polychoric", distinct = FALSE, diag = TRUE, return_type = "model")
res <- model_df %>% tidy_rowwise(model, type = 'cor')
expect_setequal(unique(as.character(res$grp)), c("A", "B")) # Both groups produced results.
a_xy <- res %>% filter(grp == "A", pair.name.x == "x", pair.name.y == "y")
b_xy <- res %>% filter(grp == "B", pair.name.x == "x", pair.name.y == "y")
expect_true(a_xy$correlation > 0.4) # Positive correlation in group A.
expect_true(b_xy$correlation < -0.4) # Negative correlation in group B, computed independently.
})

test_that("do_cor with polychoric method handles NA values via pairwise complete obs", {
# Survey data routinely has missing responses (NA). The default use="pairwise.complete.obs"
# must drop NAs pairwise rather than error, so every pair is still estimated.
set.seed(123)
n <- 200
cut5 <- function(z) as.integer(cut(z, breaks = c(-Inf, -0.84, -0.25, 0.25, 0.84, Inf)))
z1 <- rnorm(n); z2 <- 0.7 * z1 + sqrt(1 - 0.49) * rnorm(n); z3 <- rnorm(n)
x <- cut5(z1); y <- cut5(z2); z <- cut5(z3)
x[1:20] <- NA; y[15:30] <- NA; z[40:60] <- NA # Missing responses at different rows per column.
df <- data.frame(x = x, y = y, z = z)
model_df <- df %>% do_cor(`x`, `y`, `z`, method = "polychoric", distinct = FALSE, diag = TRUE, return_type = "model")
res <- model_df %>% tidy_rowwise(model, type = 'cor')
expect_equal(nrow(res), 9) # Every pair is still estimated from the pairwise-complete rows.
xy <- res %>% filter(pair.name.x == "x", pair.name.y == "y")
expect_true(is.finite(xy$correlation)) # NAs did not break the estimate.
expect_true(xy$correlation > 0.4) # The x-y relationship is still recovered despite the NAs.
})

test_that("test do_svd.kv with fill", {
test_df <- data.frame(
rand=runif(20, min = 0, max=10),
Expand Down