-
Notifications
You must be signed in to change notification settings - Fork 32
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* VIF and GVIF * manual entry * Julia 1.6 compat * remove dead code * VERSION check * tests * tolerance * let documenter find the StatsAPI docstring on its own * efficiency ftw * piracy check * fix _find_intercept with nesting * Aqua 0.7 * add R run for reference values as comment --------- Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
- Loading branch information
Showing
7 changed files
with
247 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
# putting this behind a function barrier means that the model matrix | ||
# can potentially be freed immediately if it's large and constructed on the fly | ||
function _find_intercept(model::RegressionModel) | ||
modelmat = modelmatrix(model) | ||
cols = eachcol(modelmat) | ||
# collect is necessary for Julia 1.6 | ||
# but it's :just: an array of references to views, so shouldn't be too | ||
# expensive | ||
@static if VERSION < v"1.7" | ||
cols = collect(cols) | ||
end | ||
return findfirst(Base.Fix1(all, isone), cols) | ||
end | ||
|
||
_find_intercept(form::FormulaTerm) = _find_intercept(form.rhs) | ||
# we need these in case the RHS is a single term | ||
_find_intercept(::AbstractTerm) = nothing | ||
_find_intercept(::InterceptTerm{true}) = 1 | ||
_find_intercept(m::MatrixTerm) = _find_intercept(m.terms) | ||
function _find_intercept(t::TupleTerm) | ||
return findfirst(!isnothing ∘ _find_intercept, t) | ||
end | ||
|
||
# borrowed from Effects.jl | ||
function get_matrix_term(x) | ||
x = collect_matrix_terms(x) | ||
x = x isa MatrixTerm ? x : first(x) | ||
x isa MatrixTerm || throw(ArgumentError("couldn't extract matrix term from $x")) | ||
# this is a work around for some weirdness that happens in MixedModels.jl | ||
x = first(x.terms) isa MatrixTerm ? only(x.terms) : x | ||
return x | ||
end | ||
|
||
function StatsAPI.vif(model::RegressionModel) | ||
vc = vcov(model) | ||
Base.require_one_based_indexing(vc) | ||
|
||
intercept = _find_intercept(model) | ||
isnothing(intercept) && | ||
throw(ArgumentError("VIF is only defined for models with an intercept term")) | ||
|
||
# copy just in case vc was a reference to an internal structure | ||
vc = StatsBase.cov2cor!(copy(vc), stderror(model)) | ||
m = view(vc, axes(vc, 1) .!= intercept, axes(vc, 2) .!= intercept) | ||
size(m, 2) > 1 || | ||
throw(ArgumentError("VIF not meaningful for models with only one non-intercept term")) | ||
# NB: The correlation matrix is positive definite and hence invertible | ||
# unless there is perfect rank deficiency, hence the warning in the docstring. | ||
return diag(inv(Symmetric(m))) | ||
end | ||
|
||
""" | ||
gvif(model::RegressionModel; scale=false) | ||
Compute the generalized variance inflation factor (GVIF). | ||
If `scale=true`, then each GVIF is scaled by the degrees of freedom | ||
for (number of coefficients associated with) the predictor: ``GVIF^(1 / (2*df))``. | ||
Returns a vector of inflation factors computed for each term except | ||
for the intercept. | ||
In other words, the corresponding coefficients are `termnames(m)[2:end]`. | ||
The [GVIF](https://doi.org/10.2307/2290467) | ||
measures the increase in the variance of a (group of) parameter's estimate in a model | ||
with multiple parameters relative to the variance of a parameter's estimate in a | ||
model containing only that parameter. For continuous, numerical predictors, the GVIF | ||
is the same as the VIF, but for categorical predictors, the GVIF provides a single | ||
number for the entire group of contrast-coded coefficients associated with a categorical | ||
predictor. | ||
See also [`termnames`](@ref), [`vif`](@ref). | ||
!!! warning | ||
This method will fail if there is (numerically) perfect multicollinearity, | ||
i.e. rank deficiency (in the fixed effects). In that case though, the VIF | ||
isn't particularly informative anyway. | ||
# References | ||
Fox, J., & Monette, G. (1992). Generalized Collinearity Diagnostics. | ||
Journal of the American Statistical Association, 87(417), 178. doi:10.2307/2290467 | ||
""" | ||
function StatsAPI.gvif(model::RegressionModel; scale=false) | ||
form = formula(model) | ||
intercept = _find_intercept(form) | ||
isnothing(intercept) && | ||
throw(ArgumentError("GVIF only defined for models with an intercept term")) | ||
vc = vcov(model) | ||
Base.require_one_based_indexing(vc) | ||
|
||
vc = StatsBase.cov2cor!(copy(vc), stderror(model)) | ||
m = view(vc, axes(vc, 1) .!= intercept, axes(vc, 2) .!= intercept) | ||
size(m, 2) > 1 || | ||
throw(ArgumentError("GVIF not meaningful for models with only one non-intercept term")) | ||
|
||
tn = last(termnames(model)) | ||
tn = view(tn, axes(tn, 1) .!= intercept) | ||
trms = get_matrix_term(form.rhs).terms | ||
# MatrixTerms.terms is a tuple or vector so always 1-based indexing | ||
trms = trms[1:length(trms) .!= intercept] | ||
|
||
df = width.(trms) | ||
vals = zeros(eltype(m), length(tn)) | ||
logdetm = logdet(m) | ||
acc = 0 | ||
for idx in axes(vals, 1) | ||
wt = df[idx] | ||
trm_only = [acc < i <= (acc + wt) for i in axes(m, 2)] | ||
trm_excl = .!trm_only | ||
vals[idx] = exp(logdet(view(m, trm_only, trm_only)) + | ||
logdet(view(m, trm_excl, trm_excl)) - | ||
logdetm) | ||
acc += wt | ||
end | ||
|
||
if scale | ||
vals .= vals .^ (1 ./ (2 .* df)) | ||
end | ||
return vals | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
using StatsAPI: RegressionModel | ||
|
||
struct MyRegressionModel <: RegressionModel | ||
end | ||
|
||
StatsAPI.modelmatrix(::MyRegressionModel) = [1 2; 3 4] | ||
StatsAPI.vcov(::MyRegressionModel) = [1 0; 0 1] | ||
|
||
struct MyRegressionModel2 <: RegressionModel | ||
end | ||
|
||
StatsAPI.modelmatrix(::MyRegressionModel2) = [1 2; 1 2] | ||
StatsAPI.vcov(::MyRegressionModel2) = [1 0; 0 1] | ||
|
||
struct MyRegressionModel3 <: RegressionModel | ||
end | ||
|
||
StatsAPI.modelmatrix(::MyRegressionModel3) = [1 2 3; 1 2 3] | ||
StatsAPI.vcov(::MyRegressionModel3) = [1 0 0; 0 1 0; 0 0 1] | ||
|
||
Base.@kwdef struct Duncan <: RegressionModel | ||
coefnames::Vector{String} | ||
coef::Vector{Float64} | ||
stderror::Vector{Float64} | ||
modelmatrix::Matrix{Float64} | ||
vcov::Matrix{Float64} | ||
formula::FormulaTerm | ||
end | ||
|
||
for f in [:coefnames, :coef, :stderror, :modelmatrix, :vcov] | ||
@eval StatsAPI.$f(model::Duncan) = model.$f | ||
end | ||
|
||
StatsModels.formula(model::Duncan) = model.formula | ||
|
||
@testset "VIF input checks" begin | ||
# no intercept term | ||
@test_throws ArgumentError vif(MyRegressionModel()) | ||
|
||
# only one non intercept term | ||
@test_throws ArgumentError vif(MyRegressionModel2()) | ||
|
||
# vcov is identity, so the VIF is just 1 | ||
@test vif(MyRegressionModel3()) ≈ [1, 1] | ||
end | ||
|
||
# Reference values from car::vif in R: | ||
# > library(car) | ||
# > data(Duncan) | ||
# > lm1 = lm(prestige ~ 1 + income + education, Duncan) | ||
# > vif(lm1) | ||
# income education | ||
# 2.1049 2.1049 | ||
# > lm2 = lm(prestige ~ 1 + income + education + type, Duncan) | ||
# > vif(lm2) | ||
# GVIF Df GVIF^(1/(2*Df)) | ||
# income 2.209178 1 1.486330 | ||
# education 5.297584 1 2.301648 | ||
# type 5.098592 2 1.502666 | ||
|
||
|
||
@testset "GVIF and VIF are the same for continuous predictors" begin | ||
# these are copied from a GLM fit to the car::duncan data | ||
duncan2 = Duncan(; coefnames=["(Intercept)", "Income", "Education"], | ||
formula=term(:Prestige) ~ InterceptTerm{true}() + ContinuousTerm(:Income, 1,1,1,1) + | ||
ContinuousTerm(:Education, 1,1,1,1), | ||
coef=[-6.064662922103356, 0.5987328215294941, 0.5458339094008812], | ||
stderror=[4.271941174529124, 0.11966734981235407, 0.0982526413303999], | ||
# we actually don't need the whole matrix -- just enough to find an intercept | ||
modelmatrix=[1.0 62.0 86.0], | ||
vcov=[18.2495 -0.151845 -0.150706 | ||
-0.151845 0.0143203 -0.00851855 | ||
-0.150706 -0.00851855 0.00965358]) | ||
@test vif(duncan2) ≈ [2.1049, 2.1049] atol=1e-5 | ||
# two different ways of calculating the same quantity | ||
@test vif(duncan2) ≈ gvif(duncan2) | ||
end | ||
|
||
@testset "GVIF" begin | ||
cm = StatsModels.ContrastsMatrix(DummyCoding("bc", ["bc", "prof", "wc"]), ["bc", "prof", "wc"]) | ||
duncan3 = Duncan(; coefnames=["(Intercept)", "Income", "Education", "Type: prof", "Type: wc"], | ||
formula=term(:Prestige) ~ InterceptTerm{true}() + ContinuousTerm(:Income, 1,1,1,1) + | ||
ContinuousTerm(:Education, 1,1,1,1) + CategoricalTerm(:Type, cm), | ||
coef=[0.185028, 0.597546, 0.345319, 16.6575, -14.6611], | ||
stderror=[3.71377, 0.0893553, 0.113609, 6.99301, 6.10877], | ||
# we actually don't need the whole matrix -- just enough to find an intercept | ||
modelmatrix=[1.0 62.0 86.0 1.0 0.0], | ||
vcov=[13.7921 -0.115637 -0.257486 14.0947 7.9022 | ||
-0.115637 0.00798437 -0.00292449 -0.126011 -0.109049 | ||
-0.257486 -0.00292449 0.012907 -0.616651 -0.38812 | ||
14.0947 -0.126011 -0.616651 48.9021 30.2139 | ||
7.9022 -0.109049 -0.38812 30.2139 37.3171]) | ||
@test gvif(duncan3) ≈ [2.209178, 5.297584, 5.098592] atol=1e-4 | ||
@test gvif(duncan3; scale=true) ≈ [1.486330, 2.301648, 1.502666] atol=1e-5 | ||
end | ||
|
||
@testset "utils" begin | ||
int = InterceptTerm{true}() | ||
noint = InterceptTerm{false}() | ||
xterm = term(:x) | ||
@test StatsModels._find_intercept(xterm) === nothing | ||
@test StatsModels._find_intercept(int) == 1 | ||
@test StatsModels._find_intercept(noint) === nothing | ||
@test StatsModels._find_intercept(MatrixTerm((xterm, int))) == 2 | ||
@test StatsModels.get_matrix_term(MatrixTerm(MatrixTerm((xterm, int)))) == MatrixTerm((xterm, int)) | ||
end |
2c73de3
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@JuliaRegistrator register
2c73de3
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Registration pull request created: JuliaRegistries/General/91326
After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.
This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via: