Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 28 additions & 174 deletions R/analyse.R
Original file line number Diff line number Diff line change
Expand Up @@ -94,99 +94,26 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU
stop("data's class is not ModelArray!")
}

# Validate that the formula's response matches the requested scalar
.validate_formula_response(formula, scalar)

# Early collision check: referenced scalar predictors vs phenotypes columns
all_vars <- all.vars(formula)
lhs_name <- tryCatch(as.character(formula[[2]]), error = function(e) NULL)
rhs_vars <- setdiff(all_vars, lhs_name)
scalar_names <- names(scalars(data))
scalar_predictors <- intersect(rhs_vars, scalar_names)
.check_name_collisions(phenotypes, scalar_names, c(scalar, scalar_predictors), context = "modeling")

## element.subset:
if (is.null(element.subset)) { # request all elements
num.element.total <- numElementsTotal(modelarray = data, scalar_name = scalar)
element.subset <- 1:num.element.total
}
# checker for min and max of element.subset; and whether elements are integer
if (min(element.subset) < 1) {
stop("Minimal value in element.subset should >= 1")
}
if (max(element.subset) > nrow(scalars(data)[[scalar]])) {
stop(
paste0(
"Maximal value in element.subset should <= number of elements = ",
as.character(nrow(scalars(data)[[scalar]]))
)
)
}
if (class(element.subset) != "integer") {
stop("Please enter integers for element.subset!")
}

### sanity check: whether they match: modelarray's source file list and phenotypes' source file list:
sources.modelarray <- sources(data)[[scalar]]
sources.phenotypes <- phenotypes[["source_file"]]
if (is.null(sources.phenotypes)) {
stop(paste0("Did not find column 'source_file' in argument 'phenotypes'. Please check!"))
}

## length should be the same:
if (length(sources.modelarray) != length(sources.phenotypes)) {
stop(
paste0(
"The length of source file list from phenotypes's column 'source_file' ",
"is not the same as that in ModelArray 'data'! Please check out! ",
"The latter one can be accessed by: sources(data)[[scalar]]"
)
)
}

## check if the list is unique:
if (length(sources.modelarray) != length(unique(sources.modelarray))) {
stop(
paste0(
"The source files in ModelArray 'data' are not unique! Please check out! ",
"It can be accessed by: sources(data)[[scalar]]"
)
)
}
if (length(sources.phenotypes) != length(unique(sources.phenotypes))) {
stop(
paste0(
"The source files from phenotypes's column 'source_file' ",
"are not unique! Please check out and remove the duplicated one!"
)
)
}
.validate_element_subset(data, scalar, element.subset)

if (identical(sources.modelarray, sources.phenotypes)) {
# identical, pass
} else { # not identical (but length is the same):
# check if two lists can be matched (i.e. no unmatched source filename)
if ((all(sources.modelarray %in% sources.phenotypes)) && ((all(sources.phenotypes %in% sources.modelarray)))) {
# can be matched, just the order is different. Use match() function:
reorder_idx <- match(
sources.modelarray, # vector of values in the order we want
sources.phenotypes
) # vector to be reordered
# apply to phenotypes:
phenotypes <- phenotypes[reorder_idx, ]
# reset the row name, just to be safe for later adding scalar values...
# see ModelArray_paper/notebooks/test_match_sourceFiles.Rmd
row.names(phenotypes) <- NULL
if (!identical(phenotypes[["source_file"]], sources.modelarray)) {
stop("matching source file names were not successful...")
}
} else {
stop(
paste0(
"phenotypes's column 'source_file' have different element(s) from source file list",
" in ModelArray 'data'! Please check out! ",
"The latter one can be accessed by: sources(data)[[scalar]]"
)
)
}

# stop(
# paste0(
# "The source file list from phenotypes's column 'source_file' is not identical to that in ModelArray 'data'! ",
# "Please check out! ",
# "The latter one can be accessed by: sources(data)[[scalar]] "
# )
# )
}
### sanity check and alignment of phenotypes to sources
phenotypes <- .align_phenotypes_to_sources_or_error(sources(data)[[scalar]], phenotypes, scalar)



Expand Down Expand Up @@ -683,26 +610,23 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N
stop("data's class is not ModelArray!")
}

# Validate that the formula's response matches the requested scalar
.validate_formula_response(formula, scalar)

# Early collision check: referenced scalar predictors vs phenotypes columns
all_vars <- all.vars(formula)
lhs_name <- tryCatch(as.character(formula[[2]]), error = function(e) NULL)
rhs_vars <- setdiff(all_vars, lhs_name)
scalar_names <- names(scalars(data))
scalar_predictors <- intersect(rhs_vars, scalar_names)
.check_name_collisions(phenotypes, scalar_names, c(scalar, scalar_predictors), context = "modeling")

## element.subset:
if (is.null(element.subset)) { # request all elements
num.element.total <- numElementsTotal(modelarray = data, scalar_name = scalar)
element.subset <- 1:num.element.total
}
# checker for min and max of element.subset; and whether elements are integer
if (min(element.subset) < 1) {
stop("Minimal value in element.subset should >= 1")
}
if (max(element.subset) > nrow(scalars(data)[[scalar]])) {
stop(
paste0(
"Maximal value in element.subset should <= number of elements = ",
as.character(nrow(scalars(data)[[scalar]]))
)
)
}
if (class(element.subset) != "integer") {
stop("Please enter integers for element.subset!")
}
.validate_element_subset(data, scalar, element.subset)

# check if the formula is valid in terms of mgcv::gam()
tryCatch(
Expand All @@ -729,78 +653,8 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N
checker_gam_formula(formula, gam.formula.breakdown)


### sanity check: whether they match: modelarray's source file list and phenotypes' source file list:
sources.modelarray <- sources(data)[[scalar]]
sources.phenotypes <- phenotypes[["source_file"]]
if (is.null(sources.phenotypes)) {
stop(paste0("Did not find column 'source_file' in argument 'phenotypes'. Please check!"))
}

## length should be the same:
if (length(sources.modelarray) != length(sources.phenotypes)) {
stop(
paste0(
"The length of source file list from phenotypes's column 'source_file'",
" is not the same as that in ModelArray 'data'! Please check out! ",
"The latter one can be accessed by: sources(data)[[scalar]]"
)
)
}

## check if the list is unique:
if (length(sources.modelarray) != length(unique(sources.modelarray))) {
stop(
paste0(
"The source files in ModelArray 'data' are not unique! Please check out!",
" It can be accessed by: sources(data)[[scalar]]"
)
)
}
if (length(sources.phenotypes) != length(unique(sources.phenotypes))) {
stop(
paste0(
"The source files from phenotypes's column 'source_file' are not unique! ",
"Please check out and remove the duplicated one!"
)
)
}

if (identical(sources.modelarray, sources.phenotypes)) {
# identical, pass
} else { # not identical (but length is the same):
# check if two lists can be matched (i.e. no unmatched source filename)
if ((all(sources.modelarray %in% sources.phenotypes)) && ((all(sources.phenotypes %in% sources.modelarray)))) {
# can be matched, just the order is different. Use match() function:
reorder_idx <- match(
sources.modelarray, # vector of values in the order we want
sources.phenotypes
) # vector to be reordered
# apply to phenotypes:
phenotypes <- phenotypes[reorder_idx, ]
row.names(phenotypes) <- NULL
# reset the row name, just to be safe for later adding scalar values...
# see ModelArray_paper/notebooks/test_match_sourceFiles.Rmd
if (!identical(phenotypes[["source_file"]], sources.modelarray)) {
stop("matching source file names were not successful...")
}
} else {
stop(
paste0(
"phenotypes's column 'source_file' have different element(s) from source file ",
"list in ModelArray 'data'! Please check out! The latter one can be accessed by: ",
"sources(data)[[scalar]]"
)
)
}

# stop(
# paste0(
# "The source file list from phenotypes's column 'source_file' is not identical ",
# "to that in ModelArray 'data'! Please check out! The latter one can be accessed by: ",
# "sources(data)[[scalar]] "
# )
# )
}
### sanity check and alignment of phenotypes to sources
phenotypes <- .align_phenotypes_to_sources_or_error(sources(data)[[scalar]], phenotypes, scalar)



Expand Down
94 changes: 94 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -489,3 +489,97 @@ bind_cols_check_emptyTibble <- function(a, b) {

c
}


# Internal helpers for multi-scalar validation/alignment
# These are intentionally lightweight and operate on vectors/indices to avoid data copies.
# @noRd
.validate_formula_response <- function(formula, scalar) {
lhs_name <- tryCatch(as.character(formula[[2]]), error = function(e) NULL)
if (is.null(lhs_name) || lhs_name != scalar) {
stop(paste0(
"The formula's response variable ('",
if (is.null(lhs_name)) "<unknown>" else lhs_name,
"') must match the 'scalar' argument ('", scalar, "')."
))
}
invisible(TRUE)
}

# @noRd
.validate_element_subset <- function(data, scalar, element.subset) {
if (min(element.subset) < 1) {
stop("Minimal value in element.subset should >= 1")
}
if (max(element.subset) > nrow(scalars(data)[[scalar]])) {
stop(paste0(
"Maximal value in element.subset should <= number of elements = ",
as.character(nrow(scalars(data)[[scalar]]))
))
}
if (class(element.subset) != "integer") {
stop("Please enter integers for element.subset!")
}
invisible(TRUE)
}

# context: one of "modeling" or "wrapping" to preserve exact wording
# used_scalars: character vector of scalar names referenced (response and/or predictors)
# @noRd
.check_name_collisions <- function(phenotypes, scalar_names, used_scalars, context = "modeling") {
collisions <- intersect(used_scalars, colnames(phenotypes))
if (length(collisions) > 0) {
tail_msg <- if (identical(context, "wrapping")) "before wrapping." else "before modeling."
stop(paste0(
"Column name collision between phenotypes and scalar names: ",
paste(collisions, collapse = ", "),
". Please rename or remove these columns from phenotypes ", tail_msg
))
}
invisible(TRUE)
}

# Returns reordered phenotypes (may be identical) or errors with existing messages
# @noRd
.align_phenotypes_to_sources_or_error <- function(model_sources, phenotypes, scalar) {
sources.phenotypes <- phenotypes[["source_file"]]
if (is.null(sources.phenotypes)) {
stop(paste0("Did not find column 'source_file' in argument 'phenotypes'. Please check!"))
}
if (length(model_sources) != length(sources.phenotypes)) {
stop(paste0(
"The length of source file list from phenotypes's column 'source_file' ",
"is not the same as that in ModelArray 'data'! Please check out! ",
"The latter one can be accessed by: sources(data)[[scalar]]"
))
}
if (length(model_sources) != length(unique(model_sources))) {
stop(paste0(
"The source files in ModelArray 'data' are not unique! Please check out! ",
"It can be accessed by: sources(data)[[scalar]]"
))
}
if (length(sources.phenotypes) != length(unique(sources.phenotypes))) {
stop(paste0(
"The source files from phenotypes's column 'source_file' ",
"are not unique! Please check out and remove the duplicated one!"
))
}
if (identical(model_sources, sources.phenotypes)) {
return(phenotypes)
}
if ((all(model_sources %in% sources.phenotypes)) && (all(sources.phenotypes %in% model_sources))) {
reorder_idx <- match(model_sources, sources.phenotypes)
phenotypes <- phenotypes[reorder_idx, ]
row.names(phenotypes) <- NULL
if (!identical(phenotypes[["source_file"]], model_sources)) {
stop("matching source file names were not successful...")
}
return(phenotypes)
}
stop(paste0(
"phenotypes's column 'source_file' have different element(s) from source file list",
" in ModelArray 'data'! Please check out! ",
"The latter one can be accessed by: sources(data)[[scalar]]"
))
}
Loading
Loading