Skip to content

Commit

Permalink
VIF and GVIF (#300)
Browse files Browse the repository at this point in the history
* VIF and GVIF

* manual entry

* Julia 1.6 compat

* remove dead code

* VERSION check

* tests

* tolerance

* let documenter find the StatsAPI docstring on its own

* efficiency ftw

* piracy check

* fix _find_intercept with nesting

* Aqua 0.7

* add R run for reference values as comment

---------

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
  • Loading branch information
palday and nalimilan authored Sep 13, 2023
1 parent 1632bff commit 2c73de3
Show file tree
Hide file tree
Showing 7 changed files with 247 additions and 10 deletions.
4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@ StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
Aqua = "0.6"
Aqua = "0.7"
CategoricalArrays = "0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.10"
DataAPI = "1.1"
DataFrames = "1"
DataStructures = "0.17, 0.18"
ShiftedArrays = "1, 2"
StatsAPI = "1"
StatsAPI = "1.7"
StatsBase = "0.33.5, 0.34"
StatsFuns = "0.9, 1.0"
Tables = "0.2, 1"
Expand Down
4 changes: 2 additions & 2 deletions docs/make.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using Documenter, StatsModels
using Documenter, StatsModels, StatsAPI

DocMeta.setdocmeta!(StatsModels, :DocTestSetup, :(using StatsModels, StatsBase); recursive=true)

Expand All @@ -15,7 +15,7 @@ makedocs(
"Temporal variables and Time Series Terms" => "temporal_terms.md",
"API documentation" => "api.md"
],
modules = [StatsModels],
modules = [StatsModels, StatsAPI],
doctestfilters = [r"([a-z]*) => \1", r"getfield\(.*##[0-9]+#[0-9]+"],
strict=Documenter.except(:missing_docs)
)
Expand Down
6 changes: 4 additions & 2 deletions docs/src/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,12 @@ apply_schema

```@docs
fit
response
modelmatrix
gvif
lrtest
formula
modelmatrix
response
vif
```

### Traits
Expand Down
8 changes: 6 additions & 2 deletions src/StatsModels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ using DataStructures
using DataAPI
using DataAPI: levels
using Printf: @sprintf
using StatsAPI: coefnames, fit, predict, dof
using StatsAPI: coefnames, dof, fit, gvif, predict, vif
using StatsFuns: chisqccdf

using SparseArrays
Expand Down Expand Up @@ -70,7 +70,10 @@ export
omitsintercept,
hasresponse,

lrtest
lrtest,

vif,
gvif

const SPECIALS = (:+, :&, :*, :~)

Expand All @@ -84,6 +87,7 @@ include("formula.jl")
include("modelframe.jl")
include("statsmodel.jl")
include("lrtest.jl")
include("vif.jl")
include("deprecated.jl")

end # module StatsModels
121 changes: 121 additions & 0 deletions src/vif.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# putting this behind a function barrier means that the model matrix
# can potentially be freed immediately if it's large and constructed on the fly
function _find_intercept(model::RegressionModel)
modelmat = modelmatrix(model)
cols = eachcol(modelmat)
# collect is necessary for Julia 1.6
# but it's :just: an array of references to views, so shouldn't be too
# expensive
@static if VERSION < v"1.7"
cols = collect(cols)
end
return findfirst(Base.Fix1(all, isone), cols)
end

_find_intercept(form::FormulaTerm) = _find_intercept(form.rhs)
# we need these in case the RHS is a single term
_find_intercept(::AbstractTerm) = nothing
_find_intercept(::InterceptTerm{true}) = 1
_find_intercept(m::MatrixTerm) = _find_intercept(m.terms)
function _find_intercept(t::TupleTerm)
return findfirst(!isnothing _find_intercept, t)
end

# borrowed from Effects.jl
function get_matrix_term(x)
x = collect_matrix_terms(x)
x = x isa MatrixTerm ? x : first(x)
x isa MatrixTerm || throw(ArgumentError("couldn't extract matrix term from $x"))
# this is a work around for some weirdness that happens in MixedModels.jl
x = first(x.terms) isa MatrixTerm ? only(x.terms) : x
return x
end

function StatsAPI.vif(model::RegressionModel)
vc = vcov(model)
Base.require_one_based_indexing(vc)

intercept = _find_intercept(model)
isnothing(intercept) &&
throw(ArgumentError("VIF is only defined for models with an intercept term"))

# copy just in case vc was a reference to an internal structure
vc = StatsBase.cov2cor!(copy(vc), stderror(model))
m = view(vc, axes(vc, 1) .!= intercept, axes(vc, 2) .!= intercept)
size(m, 2) > 1 ||
throw(ArgumentError("VIF not meaningful for models with only one non-intercept term"))
# NB: The correlation matrix is positive definite and hence invertible
# unless there is perfect rank deficiency, hence the warning in the docstring.
return diag(inv(Symmetric(m)))
end

"""
gvif(model::RegressionModel; scale=false)
Compute the generalized variance inflation factor (GVIF).
If `scale=true`, then each GVIF is scaled by the degrees of freedom
for (number of coefficients associated with) the predictor: ``GVIF^(1 / (2*df))``.
Returns a vector of inflation factors computed for each term except
for the intercept.
In other words, the corresponding coefficients are `termnames(m)[2:end]`.
The [GVIF](https://doi.org/10.2307/2290467)
measures the increase in the variance of a (group of) parameter's estimate in a model
with multiple parameters relative to the variance of a parameter's estimate in a
model containing only that parameter. For continuous, numerical predictors, the GVIF
is the same as the VIF, but for categorical predictors, the GVIF provides a single
number for the entire group of contrast-coded coefficients associated with a categorical
predictor.
See also [`termnames`](@ref), [`vif`](@ref).
!!! warning
This method will fail if there is (numerically) perfect multicollinearity,
i.e. rank deficiency (in the fixed effects). In that case though, the VIF
isn't particularly informative anyway.
# References
Fox, J., & Monette, G. (1992). Generalized Collinearity Diagnostics.
Journal of the American Statistical Association, 87(417), 178. doi:10.2307/2290467
"""
function StatsAPI.gvif(model::RegressionModel; scale=false)
form = formula(model)
intercept = _find_intercept(form)
isnothing(intercept) &&
throw(ArgumentError("GVIF only defined for models with an intercept term"))
vc = vcov(model)
Base.require_one_based_indexing(vc)

vc = StatsBase.cov2cor!(copy(vc), stderror(model))
m = view(vc, axes(vc, 1) .!= intercept, axes(vc, 2) .!= intercept)
size(m, 2) > 1 ||
throw(ArgumentError("GVIF not meaningful for models with only one non-intercept term"))

tn = last(termnames(model))
tn = view(tn, axes(tn, 1) .!= intercept)
trms = get_matrix_term(form.rhs).terms
# MatrixTerms.terms is a tuple or vector so always 1-based indexing
trms = trms[1:length(trms) .!= intercept]

df = width.(trms)
vals = zeros(eltype(m), length(tn))
logdetm = logdet(m)
acc = 0
for idx in axes(vals, 1)
wt = df[idx]
trm_only = [acc < i <= (acc + wt) for i in axes(m, 2)]
trm_excl = .!trm_only
vals[idx] = exp(logdet(view(m, trm_only, trm_only)) +
logdet(view(m, trm_excl, trm_excl)) -
logdetm)
acc += wt
end

if scale
vals .= vals .^ (1 ./ (2 .* df))
end
return vals
end
8 changes: 6 additions & 2 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,15 @@ my_tests = ["ambiguity.jl",
"contrasts.jl",
"extension.jl",
"traits.jl",
"protect.jl"]
"protect.jl",
"vif.jl"]

@testset "StatsModels" begin
@testset "aqua" begin
Aqua.test_all(StatsModels; ambiguities=false)
# because VIF and GVIF are defined in StatsAPI for RegressionModel,
# which is also defined there, it's flagged as piracy. But
# we're the offical implementers so it's privateering.
Aqua.test_all(StatsModels; ambiguities=false, piracy=(treat_as_own=[vif, gvif],))
end

for tf in my_tests
Expand Down
106 changes: 106 additions & 0 deletions test/vif.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
using StatsAPI: RegressionModel

struct MyRegressionModel <: RegressionModel
end

StatsAPI.modelmatrix(::MyRegressionModel) = [1 2; 3 4]
StatsAPI.vcov(::MyRegressionModel) = [1 0; 0 1]

struct MyRegressionModel2 <: RegressionModel
end

StatsAPI.modelmatrix(::MyRegressionModel2) = [1 2; 1 2]
StatsAPI.vcov(::MyRegressionModel2) = [1 0; 0 1]

struct MyRegressionModel3 <: RegressionModel
end

StatsAPI.modelmatrix(::MyRegressionModel3) = [1 2 3; 1 2 3]
StatsAPI.vcov(::MyRegressionModel3) = [1 0 0; 0 1 0; 0 0 1]

Base.@kwdef struct Duncan <: RegressionModel
coefnames::Vector{String}
coef::Vector{Float64}
stderror::Vector{Float64}
modelmatrix::Matrix{Float64}
vcov::Matrix{Float64}
formula::FormulaTerm
end

for f in [:coefnames, :coef, :stderror, :modelmatrix, :vcov]
@eval StatsAPI.$f(model::Duncan) = model.$f
end

StatsModels.formula(model::Duncan) = model.formula

@testset "VIF input checks" begin
# no intercept term
@test_throws ArgumentError vif(MyRegressionModel())

# only one non intercept term
@test_throws ArgumentError vif(MyRegressionModel2())

# vcov is identity, so the VIF is just 1
@test vif(MyRegressionModel3()) [1, 1]
end

# Reference values from car::vif in R:
# > library(car)
# > data(Duncan)
# > lm1 = lm(prestige ~ 1 + income + education, Duncan)
# > vif(lm1)
# income education
# 2.1049 2.1049
# > lm2 = lm(prestige ~ 1 + income + education + type, Duncan)
# > vif(lm2)
# GVIF Df GVIF^(1/(2*Df))
# income 2.209178 1 1.486330
# education 5.297584 1 2.301648
# type 5.098592 2 1.502666


@testset "GVIF and VIF are the same for continuous predictors" begin
# these are copied from a GLM fit to the car::duncan data
duncan2 = Duncan(; coefnames=["(Intercept)", "Income", "Education"],
formula=term(:Prestige) ~ InterceptTerm{true}() + ContinuousTerm(:Income, 1,1,1,1) +
ContinuousTerm(:Education, 1,1,1,1),
coef=[-6.064662922103356, 0.5987328215294941, 0.5458339094008812],
stderror=[4.271941174529124, 0.11966734981235407, 0.0982526413303999],
# we actually don't need the whole matrix -- just enough to find an intercept
modelmatrix=[1.0 62.0 86.0],
vcov=[18.2495 -0.151845 -0.150706
-0.151845 0.0143203 -0.00851855
-0.150706 -0.00851855 0.00965358])
@test vif(duncan2) [2.1049, 2.1049] atol=1e-5
# two different ways of calculating the same quantity
@test vif(duncan2) gvif(duncan2)
end

@testset "GVIF" begin
cm = StatsModels.ContrastsMatrix(DummyCoding("bc", ["bc", "prof", "wc"]), ["bc", "prof", "wc"])
duncan3 = Duncan(; coefnames=["(Intercept)", "Income", "Education", "Type: prof", "Type: wc"],
formula=term(:Prestige) ~ InterceptTerm{true}() + ContinuousTerm(:Income, 1,1,1,1) +
ContinuousTerm(:Education, 1,1,1,1) + CategoricalTerm(:Type, cm),
coef=[0.185028, 0.597546, 0.345319, 16.6575, -14.6611],
stderror=[3.71377, 0.0893553, 0.113609, 6.99301, 6.10877],
# we actually don't need the whole matrix -- just enough to find an intercept
modelmatrix=[1.0 62.0 86.0 1.0 0.0],
vcov=[13.7921 -0.115637 -0.257486 14.0947 7.9022
-0.115637 0.00798437 -0.00292449 -0.126011 -0.109049
-0.257486 -0.00292449 0.012907 -0.616651 -0.38812
14.0947 -0.126011 -0.616651 48.9021 30.2139
7.9022 -0.109049 -0.38812 30.2139 37.3171])
@test gvif(duncan3) [2.209178, 5.297584, 5.098592] atol=1e-4
@test gvif(duncan3; scale=true) [1.486330, 2.301648, 1.502666] atol=1e-5
end

@testset "utils" begin
int = InterceptTerm{true}()
noint = InterceptTerm{false}()
xterm = term(:x)
@test StatsModels._find_intercept(xterm) === nothing
@test StatsModels._find_intercept(int) == 1
@test StatsModels._find_intercept(noint) === nothing
@test StatsModels._find_intercept(MatrixTerm((xterm, int))) == 2
@test StatsModels.get_matrix_term(MatrixTerm(MatrixTerm((xterm, int)))) == MatrixTerm((xterm, int))
end

2 comments on commit 2c73de3

@palday
Copy link
Member Author

@palday palday commented on 2c73de3 Sep 13, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/91326

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.7.3 -m "<description of version>" 2c73de38d5158ae2d478c1e783c89f66751814cc
git push origin v0.7.3

Please sign in to comment.