From dfb48b464809aab1393ab63ed383c28856e56c6f Mon Sep 17 00:00:00 2001 From: Siya Kale Date: Thu, 11 Jun 2026 06:30:12 +0530 Subject: [PATCH] Add validation for non-numeric observed variables in prepare_data --- Project.toml | 30 +++++++++-------- ext/SEMNLOptExt/NLopt.jl | 2 +- src/additional_functions/helper.jl | 6 ++-- src/additional_functions/params_array.jl | 10 ++---- src/frontend/StatsAPI.jl | 12 +++---- .../specification/EnsembleParameterTable.jl | 9 ++++- src/frontend/specification/ParameterTable.jl | 33 ++++++++----------- src/frontend/specification/checks.jl | 7 ++-- src/frontend/specification/documentation.jl | 1 - src/implied/RAM/generic.jl | 7 +++- src/observed/abstract.jl | 14 ++++++++ src/observed/data.jl | 1 + src/optimizer/abstract.jl | 5 ++- src/optimizer/optim.jl | 2 +- test/examples/helper.jl | 8 ++--- test/examples/multigroup/build_models.jl | 3 +- test/examples/multigroup/multigroup.jl | 8 ++--- test/examples/proximal/l0.jl | 3 +- test/unit_tests/StatsAPI.jl | 9 ++--- test/unit_tests/data_input_formats.jl | 9 +++++ test/unit_tests/model.jl | 1 - test/unit_tests/unit_tests.jl | 2 +- test/unit_tests/unit_tests_interactive.jl | 2 +- 23 files changed, 104 insertions(+), 80 deletions(-) diff --git a/Project.toml b/Project.toml index 376347083..b0f40572c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,13 +1,14 @@ name = "StructuralEquationModels" uuid = "383ca8c5-e4ff-4104-b0a9-f7b279deed53" -authors = ["Maximilian Ernst", "Aaron Peikert"] version = "0.4.2" +authors = ["Maximilian Ernst", "Aaron Peikert"] [deps] DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41" +JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899" LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3" LineSearches = "d3d80556-e9d4-5f37-9878-2ab0fcc64255" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -21,36 +22,37 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" StenoGraphs = "78862bba-adae-4a83-bb4d-33c106177f81" -Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7" SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b" +Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7" + +[weakdeps] +NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd" +ProximalAlgorithms = "140ffc9f-1907-541a-a177-7475e0a401e9" + +[extensions] +SEMNLOptExt = "NLopt" +SEMProximalOptExt = "ProximalAlgorithms" [compat] -julia = "1.9, 1.10, 1.11" -StenoGraphs = "0.2 - 0.3, 0.4.1 - 0.5" DataFrames = "1" Distributions = "0.25" FiniteDiff = "2" +JuliaFormatter = "2.6.11" LineSearches = "7" NLSolversBase = "7" NLopt = "0.6, 1" Optim = "1" PrettyTables = "2" ProximalAlgorithms = "0.7" +StatsAPI = "1" StatsBase = "0.33, 0.34" -Symbolics = "4, 5, 6" +StenoGraphs = "0.2 - 0.3, 0.4.1 - 0.5" SymbolicUtils = "1.4 - 1.5, 1.7, 2, 3" -StatsAPI = "1" +Symbolics = "4, 5, 6" +julia = "1.9, 1.10, 1.11" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] test = ["Test"] - -[weakdeps] -NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd" -ProximalAlgorithms = "140ffc9f-1907-541a-a177-7475e0a401e9" - -[extensions] -SEMNLOptExt = "NLopt" -SEMProximalOptExt = "ProximalAlgorithms" diff --git a/ext/SEMNLOptExt/NLopt.jl b/ext/SEMNLOptExt/NLopt.jl index c5e0ad6cb..27bc30039 100644 --- a/ext/SEMNLOptExt/NLopt.jl +++ b/ext/SEMNLOptExt/NLopt.jl @@ -19,7 +19,7 @@ function SemOptimizerNLopt(; applicable(iterate, equality_constraints) && !isa(equality_constraints, NamedTuple) || (equality_constraints = [equality_constraints]) applicable(iterate, inequality_constraints) && - !isa(inequality_constraints, NamedTuple) || + !isa(inequality_constraints, NamedTuple) || (inequality_constraints = [inequality_constraints]) return SemOptimizerNLopt( algorithm, diff --git a/src/additional_functions/helper.jl b/src/additional_functions/helper.jl index 6cbcb0573..d6a1fc6c8 100644 --- a/src/additional_functions/helper.jl +++ b/src/additional_functions/helper.jl @@ -21,7 +21,7 @@ function batch_inv!(fun, model) end # computes A*S*B -> C, where ind gives the entries of S that are 1 -function sparse_outer_mul!(C, A, B, ind) +function sparse_outer_mul!(C, A, B, ind) fill!(C, 0.0) for i in 1:length(ind) BLAS.ger!(1.0, A[:, ind[i][1]], B[ind[i][2], :], C) @@ -29,14 +29,14 @@ function sparse_outer_mul!(C, A, B, ind) end # computes A*∇m, where ∇m ind gives the entries of ∇m that are 1 -function sparse_outer_mul!(C, A, ind) +function sparse_outer_mul!(C, A, ind) fill!(C, 0.0) @views C .= sum(A[:, ind], dims = 2) return C end # computes A*S*B -> C, where ind gives the entries of S that are 1 -function sparse_outer_mul!(C, A, B::Vector, ind) +function sparse_outer_mul!(C, A, B::Vector, ind) fill!(C, 0.0) @views @inbounds for i in 1:length(ind) C .+= B[ind[i][2]] .* A[:, ind[i][1]] diff --git a/src/additional_functions/params_array.jl b/src/additional_functions/params_array.jl index 1031e349e..8cd89032e 100644 --- a/src/additional_functions/params_array.jl +++ b/src/additional_functions/params_array.jl @@ -199,11 +199,7 @@ materialize!( kwargs..., ) = materialize!(parent(dest), src, params; kwargs...) -function sparse_materialize( - ::Type{T}, - arr::ParamsMatrix, - params::AbstractVector, -) where {T} +function sparse_materialize(::Type{T}, arr::ParamsMatrix, params::AbstractVector) where {T} nparams(arr) == length(params) || throw( DimensionMismatch( "Number of values ($(length(params))) does not match the number of parameter ($(nparams(arr)))", @@ -251,8 +247,8 @@ sparse_gradient(arr::ParamsArray{T}) where {T} = sparse_gradient(T, arr) # range of parameters that are referenced in the matrix function params_range(arr::ParamsArray; allow_gaps::Bool = false) - first_i = findfirst(i -> arr.param_ptr[i+1] > arr.param_ptr[i], 1:nparams(arr)-1) - last_i = findlast(i -> arr.param_ptr[i+1] > arr.param_ptr[i], 1:nparams(arr)-1) + first_i = findfirst(i -> arr.param_ptr[i+1] > arr.param_ptr[i], 1:(nparams(arr)-1)) + last_i = findlast(i -> arr.param_ptr[i+1] > arr.param_ptr[i], 1:(nparams(arr)-1)) if !allow_gaps && !isnothing(first_i) && !isnothing(last_i) for i in first_i:last_i diff --git a/src/frontend/StatsAPI.jl b/src/frontend/StatsAPI.jl index edd677e34..b00c451af 100644 --- a/src/frontend/StatsAPI.jl +++ b/src/frontend/StatsAPI.jl @@ -13,11 +13,7 @@ Note that the function combines the duplicate occurences of the same parameter in `partable` and will raise an error if the values do not match. """ -function params!( - out::AbstractVector, - partable::ParameterTable, - col::Symbol = :estimate, -) +function params!(out::AbstractVector, partable::ParameterTable, col::Symbol = :estimate) (length(out) == nparams(partable)) || throw( DimensionMismatch( "The length of parameter values vector ($(length(out))) does not match the number of parameters ($(nparams(partable)))", @@ -75,4 +71,8 @@ Synonymous to [`nsamples`](@ref). """ nobs(model::AbstractSem) = nsamples(model) -coeftable(model::AbstractSem; level::Real=0.95) = throw(ArgumentError("StructuralEquationModels does not support the `CoefTable` interface; see [`ParameterTable`](@ref) instead.")) \ No newline at end of file +coeftable(model::AbstractSem; level::Real = 0.95) = throw( + ArgumentError( + "StructuralEquationModels does not support the `CoefTable` interface; see [`ParameterTable`](@ref) instead.", + ), +) diff --git a/src/frontend/specification/EnsembleParameterTable.jl b/src/frontend/specification/EnsembleParameterTable.jl index 14169dd94..fef9a7d35 100644 --- a/src/frontend/specification/EnsembleParameterTable.jl +++ b/src/frontend/specification/EnsembleParameterTable.jl @@ -30,7 +30,14 @@ function EnsembleParameterTable( param_labels = if isnothing(param_labels) # collect all SEM parameters in ensemble if not specified # and apply the set to all partables - unique(mapreduce(SEM.param_labels, vcat, values(spec_ensemble), init = Vector{Symbol}())) + unique( + mapreduce( + SEM.param_labels, + vcat, + values(spec_ensemble), + init = Vector{Symbol}(), + ), + ) else copy(param_labels) end diff --git a/src/frontend/specification/ParameterTable.jl b/src/frontend/specification/ParameterTable.jl index 2af269372..225ae415d 100644 --- a/src/frontend/specification/ParameterTable.jl +++ b/src/frontend/specification/ParameterTable.jl @@ -34,7 +34,9 @@ function ParameterTable( latent_vars::Union{AbstractVector{Symbol}, Nothing} = nothing, param_labels::Union{AbstractVector{Symbol}, Nothing} = nothing, ) - param_labels = isnothing(param_labels) ? unique!(filter(!=(:const), columns[:label])) : copy(param_labels) + param_labels = + isnothing(param_labels) ? unique!(filter(!=(:const), columns[:label])) : + copy(param_labels) check_param_labels(param_labels, columns[:label]) return ParameterTable( columns, @@ -389,7 +391,6 @@ function update_se_hessian!( return update_partable!(partable, :se, param_labels(fit), se) end - """ lavaan_params!(out::AbstractVector, partable_lav, partable::ParameterTable, @@ -438,8 +439,8 @@ function lavaan_params!( lav_ind = findallrows( r -> r[:lhs] == String(to) && - r[:op] == "~1" && - (isnothing(lav_group) || r[:group] == lav_group), + r[:op] == "~1" && + (isnothing(lav_group) || r[:group] == lav_group), partable_lav, ) else @@ -458,20 +459,20 @@ function lavaan_params!( lav_ind = findallrows( r -> ( - (r[:lhs] == String(from) && r[:rhs] == String(to)) || - (r[:lhs] == String(to) && r[:rhs] == String(from)) - ) && - r[:op] == lav_type && - (isnothing(lav_group) || r[:group] == lav_group), + (r[:lhs] == String(from) && r[:rhs] == String(to)) || + (r[:lhs] == String(to) && r[:rhs] == String(from)) + ) && + r[:op] == lav_type && + (isnothing(lav_group) || r[:group] == lav_group), partable_lav, ) else lav_ind = findallrows( r -> r[:lhs] == String(from) && - r[:rhs] == String(to) && - r[:op] == lav_type && - (isnothing(lav_group) || r[:group] == lav_group), + r[:rhs] == String(to) && + r[:op] == lav_type && + (isnothing(lav_group) || r[:group] == lav_group), partable_lav, ) end @@ -524,10 +525,4 @@ lavaan_params( partable::ParameterTable, lav_col::Symbol = :est, lav_group = nothing, -) = lavaan_params!( - fill(NaN, nparams(partable)), - partable_lav, - partable, - lav_col, - lav_group, -) +) = lavaan_params!(fill(NaN, nparams(partable)), partable_lav, partable, lav_col, lav_group) diff --git a/src/frontend/specification/checks.jl b/src/frontend/specification/checks.jl index 5ef41c59d..2d00be26d 100644 --- a/src/frontend/specification/checks.jl +++ b/src/frontend/specification/checks.jl @@ -4,8 +4,11 @@ function check_param_labels( param_refs::Union{AbstractVector{Symbol}, Nothing}, ) dup_param_labels = nonunique(param_labels) - isempty(dup_param_labels) || - throw(ArgumentError("Duplicate parameter labels detected: $(join(dup_param_labels, ", "))")) + isempty(dup_param_labels) || throw( + ArgumentError( + "Duplicate parameter labels detected: $(join(dup_param_labels, ", "))", + ), + ) any(==(:const), param_labels) && throw(ArgumentError("Parameters constain reserved :const name")) diff --git a/src/frontend/specification/documentation.jl b/src/frontend/specification/documentation.jl index e46620fbc..a3a8d2659 100644 --- a/src/frontend/specification/documentation.jl +++ b/src/frontend/specification/documentation.jl @@ -37,7 +37,6 @@ Return the vector of parameter labels (in the same order as [`params`](@ref)). """ param_labels(spec::SemSpecification) = spec.param_labels - """ `ParameterTable`s contain the specification of a structural equation model. diff --git a/src/implied/RAM/generic.jl b/src/implied/RAM/generic.jl index fd2ef61b5..9ce1f799a 100644 --- a/src/implied/RAM/generic.jl +++ b/src/implied/RAM/generic.jl @@ -163,7 +163,12 @@ end ### methods ############################################################################################ -function update!(targets::EvaluationTargets, implied::RAM, model::AbstractSemSingle, param_labels) +function update!( + targets::EvaluationTargets, + implied::RAM, + model::AbstractSemSingle, + param_labels, +) materialize!(implied.A, implied.ram_matrices.A, param_labels) materialize!(implied.S, implied.ram_matrices.S, param_labels) if !isnothing(implied.M) diff --git a/src/observed/abstract.jl b/src/observed/abstract.jl index cf5000e4f..16acfbade 100644 --- a/src/observed/abstract.jl +++ b/src/observed/abstract.jl @@ -124,6 +124,20 @@ function prepare_data( end # make sure data_mtx is a dense matrix (required for methods like mean_and_cov()) data_mtx = convert(Matrix, data_ordered) + # Validate that all columns contain numeric data + for j in axes(data_mtx, 2) + if !all(x -> x isa Number || ismissing(x), data_mtx[:, j]) + varname = + isnothing(obs_vars_reordered) ? "column $j" : + string(obs_vars_reordered[j]) + + throw( + ArgumentError( + "Observed variable '$varname' contains non-numeric data. SEM models require numeric variables.", + ), + ) + end + end elseif data isa NTuple{2, Integer} # given the dimensions of the data matrix, but no data itself data_mtx = nothing nobs_vars = data[2] diff --git a/src/observed/data.jl b/src/observed/data.jl index 7ba38edf5..fa66f4e78 100644 --- a/src/observed/data.jl +++ b/src/observed/data.jl @@ -40,6 +40,7 @@ function SemObservedData(; ) data, obs_vars, _ = prepare_data(data, observed_vars, specification; observed_var_prefix) + obs_mean, obs_cov = mean_and_cov(data, 1) return SemObservedData(data, obs_vars, obs_cov, vec(obs_mean), size(data, 1)) diff --git a/src/optimizer/abstract.jl b/src/optimizer/abstract.jl index 3b1e98842..c1ad72592 100644 --- a/src/optimizer/abstract.jl +++ b/src/optimizer/abstract.jl @@ -41,7 +41,7 @@ function fit(optim::SemOptimizer, model::AbstractSem; start_val = nothing, kwarg end fit(model::AbstractSem; engine::Symbol = :Optim, start_val = nothing, kwargs...) = -fit(SemOptimizer(; engine, kwargs...), model; start_val, kwargs...) + fit(SemOptimizer(; engine, kwargs...), model; start_val, kwargs...) # fallback method fit(optim::SemOptimizer, model::AbstractSem, start_params; kwargs...) = @@ -56,8 +56,7 @@ prepare_start_params(start_val::Nothing, model::AbstractSem; kwargs...) = start_simple(model; kwargs...) # first argument is a function -prepare_start_params(start_val, model::AbstractSem; kwargs...) = - start_val(model; kwargs...) +prepare_start_params(start_val, model::AbstractSem; kwargs...) = start_val(model; kwargs...) function prepare_start_params(start_val::AbstractVector, model::AbstractSem; kwargs...) (length(start_val) == nparams(model)) || throw( diff --git a/src/optimizer/optim.jl b/src/optimizer/optim.jl index bd57942d8..2d782473a 100644 --- a/src/optimizer/optim.jl +++ b/src/optimizer/optim.jl @@ -67,7 +67,7 @@ SemOptimizer{:Optim}(args...; kwargs...) = SemOptimizerOptim(args...; kwargs...) SemOptimizerOptim(; algorithm = LBFGS(), - options = Optim.Options(;f_reltol = 1e-10, x_abstol = 1.5e-8), + options = Optim.Options(; f_reltol = 1e-10, x_abstol = 1.5e-8), kwargs..., ) = SemOptimizerOptim(algorithm, options) diff --git a/test/examples/helper.jl b/test/examples/helper.jl index 4ff9bd507..acc3ccd08 100644 --- a/test/examples/helper.jl +++ b/test/examples/helper.jl @@ -90,12 +90,8 @@ function test_estimates( skip::Bool = false, ) actual = StructuralEquationModels.params(partable, col) - expected = StructuralEquationModels.lavaan_params( - partable_lav, - partable, - lav_col, - lav_group, - ) + expected = + StructuralEquationModels.lavaan_params(partable_lav, partable, lav_col, lav_group) @test !any(isnan, actual) @test !any(isnan, expected) diff --git a/test/examples/multigroup/build_models.jl b/test/examples/multigroup/build_models.jl index f6a7a230d..4b211b0f7 100644 --- a/test/examples/multigroup/build_models.jl +++ b/test/examples/multigroup/build_models.jl @@ -8,7 +8,8 @@ model_g1 = Sem(specification = specification_g1, data = dat_g1, implied = RAMSym model_g2 = Sem(specification = specification_g2, data = dat_g2, implied = RAM) -@test SEM.param_labels(model_g1.implied.ram_matrices) == SEM.param_labels(model_g2.implied.ram_matrices) +@test SEM.param_labels(model_g1.implied.ram_matrices) == + SEM.param_labels(model_g2.implied.ram_matrices) # test the different constructors model_ml_multigroup = SemEnsemble(model_g1, model_g2) diff --git a/test/examples/multigroup/multigroup.jl b/test/examples/multigroup/multigroup.jl index 239bf713c..43de554ce 100644 --- a/test/examples/multigroup/multigroup.jl +++ b/test/examples/multigroup/multigroup.jl @@ -9,11 +9,11 @@ dat = example_data("holzinger_swineford") dat_missing = example_data("holzinger_swineford_missing") solution_lav = example_data("holzinger_swineford_solution") -dat_g1 = dat[dat.school.=="Pasteur", :] -dat_g2 = dat[dat.school.=="Grant-White", :] +dat_g1 = dat[dat.school .== "Pasteur", :] +dat_g2 = dat[dat.school .== "Grant-White", :] -dat_miss_g1 = dat_missing[dat_missing.school.=="Pasteur", :] -dat_miss_g2 = dat_missing[dat_missing.school.=="Grant-White", :] +dat_miss_g1 = dat_missing[dat_missing.school .== "Pasteur", :] +dat_miss_g2 = dat_missing[dat_missing.school .== "Grant-White", :] dat.school = ifelse.(dat.school .== "Pasteur", :Pasteur, :Grant_White) dat_missing.school = ifelse.(dat_missing.school .== "Pasteur", :Pasteur, :Grant_White) diff --git a/test/examples/proximal/l0.jl b/test/examples/proximal/l0.jl index 374f8e58a..4d642bd2b 100644 --- a/test/examples/proximal/l0.jl +++ b/test/examples/proximal/l0.jl @@ -62,6 +62,7 @@ fit_prox = fit(model_prox, engine = :Proximal, operator_g = prox_operator) @test fit_prox.optimization_result.result[:iterations] < 1000 @test solution(fit_prox)[31] == 0.0 @test abs( - StructuralEquationModels.minimum(fit_prox) - StructuralEquationModels.minimum(sem_fit), + StructuralEquationModels.minimum(fit_prox) - + StructuralEquationModels.minimum(sem_fit), ) < 1.0 end diff --git a/test/unit_tests/StatsAPI.jl b/test/unit_tests/StatsAPI.jl index 8648fc363..5907ee7b5 100644 --- a/test/unit_tests/StatsAPI.jl +++ b/test/unit_tests/StatsAPI.jl @@ -5,10 +5,7 @@ end partable = ParameterTable(graph, observed_vars = [:a, :b], latent_vars = Symbol[]) update_partable!(partable, :estimate, param_labels(partable), [3.1415]) data = randn(100, 2) -model = Sem( - specification = partable, - data = data -) +model = Sem(specification = partable, data = data) model_fit = fit(model) @testset "params" begin @@ -25,5 +22,5 @@ end end @testset "coeftable" begin - @test_throws "StructuralEquationModels does not support" coeftable(model) -end \ No newline at end of file + @test_throws "StructuralEquationModels does not support" coeftable(model) +end diff --git a/test/unit_tests/data_input_formats.jl b/test/unit_tests/data_input_formats.jl index 183b067f5..3244023de 100644 --- a/test/unit_tests/data_input_formats.jl +++ b/test/unit_tests/data_input_formats.jl @@ -494,3 +494,12 @@ end # SemObservedCovariance ) end # meanstructure end # SemObservedMissing +@testset "Non-numeric observed variable" begin + bad_data = Any[ + 1.0 "a"; + 2.0 "b"; + 3.0 "c" + ] + + @test_throws ArgumentError SemObservedData(data = bad_data) +end diff --git a/test/unit_tests/model.jl b/test/unit_tests/model.jl index 2bf5dedaf..d2c1b7a02 100644 --- a/test/unit_tests/model.jl +++ b/test/unit_tests/model.jl @@ -25,7 +25,6 @@ graph = @StenoGraph begin y8 ↔ y4 + y6 end - ram_matrices = RAMMatrices(ParameterTable(graph, observed_vars = obs_vars, latent_vars = lat_vars)) diff --git a/test/unit_tests/unit_tests.jl b/test/unit_tests/unit_tests.jl index 7189addd4..4d7dad7cf 100644 --- a/test/unit_tests/unit_tests.jl +++ b/test/unit_tests/unit_tests.jl @@ -7,7 +7,7 @@ available_tests = Dict( "data_input_formats" => "SemObserved", "specification" => "SemSpecification", "model" => "Sem model", - "StatsAPI" => "StatsAPI" + "StatsAPI" => "StatsAPI", ) # Determine which tests to run based on command-line arguments diff --git a/test/unit_tests/unit_tests_interactive.jl b/test/unit_tests/unit_tests_interactive.jl index cf082fa60..10a384403 100644 --- a/test/unit_tests/unit_tests_interactive.jl +++ b/test/unit_tests/unit_tests_interactive.jl @@ -7,4 +7,4 @@ try catch e @warn "Error initializing Test Env" exception=(e, catch_backtrace()) end -include("unit_tests.jl") \ No newline at end of file +include("unit_tests.jl")