From e6b5f231c17e5723864c573ef7ef8d4fb0992b0c Mon Sep 17 00:00:00 2001
From: Dave Kleinschmidt
Date: Mon, 28 Dec 2020 19:25:41 -0500
Subject: [PATCH] fix doctests and add missing docstrings (#192)

* first round of doctest(; fix=true): coeftables, hypothesis coding

* use Documenter 0.25 series; docstrings for response/modelmatrix

* include hypothesis_matrix in contrasts doc page

* DataFrames.DataFrame -> DataFrame

* build docs on 1.5

* bump patch version

* specify rng in internals

* add StableRNGs to project

* stablerng everywhere there's a rand

* don't need rand here

* don't seed RNG in setdocmeta

* run doctest(; fix=true)

* missed some fixes...

* use julia-repl for the ugly hypmat text and add some explanatory txt

* bump patch version

* fix a few more stray small exponents

* Revert "fix a few more stray small exponents"

This reverts commit c458cdf4a6352f93c3d049eebcbcc5a333812c9c.
---
 Project.toml               |   2 +-
 docs/Project.toml          |   3 +-
 docs/make.jl               |   6 +-
 docs/src/contrasts.md      |   1 +
 docs/src/formula.md        | 126 ++++++++++++------------
 docs/src/internals.md      | 194 ++++++++++++++++++-------------------
 docs/src/temporal_terms.md |  18 ++--
 src/contrasts.jl           |  66 +++++++++----
 src/lrtest.jl              |   8 +-
 src/modelframe.jl          |  43 ++++++++
 src/schema.jl              |  12 ++-
 src/terms.jl               |  68 +++++++------
 12 files changed, 316 insertions(+), 231 deletions(-)

diff --git a/Project.toml b/Project.toml
index 2014359a..2b928948 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "StatsModels"
 uuid = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
-version = "0.6.16"
+version = "0.6.17"

 [deps]
 DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
diff --git a/docs/Project.toml b/docs/Project.toml
index 3b85a329..256dab37 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -2,9 +2,10 @@
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
+StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

 [compat]
-Documenter = "~0.24"
+Documenter = "0.25"
diff --git a/docs/make.jl b/docs/make.jl
index 121a98d3..dd92ebe8 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,5 +1,8 @@
 using Documenter, StatsModels

+DocMeta.setdocmeta!(StatsModels, :DocTestSetup, :(using StatsModels, StatsBase); recursive=true)
+
+
 using Pkg
 Pkg.precompile()

@@ -12,7 +15,8 @@ makedocs(
         "Contrast coding categorical variables" => "contrasts.md",
         "Temporal variables and Time Series Terms" => "temporal_terms.md",
         "API documentation" => "api.md"
-    ]
+    ],
+    modules = [StatsModels]
 )

 deploydocs(
diff --git a/docs/src/contrasts.md b/docs/src/contrasts.md
index 16b36119..bea29ba5 100644
--- a/docs/src/contrasts.md
+++ b/docs/src/contrasts.md
@@ -51,6 +51,7 @@ EffectsCoding
 HelmertCoding
 SeqDiffCoding
 HypothesisCoding
+hypothesis_matrix
 ```

 ### Special internal contrasts
diff --git a/docs/src/formula.md b/docs/src/formula.md
index eed82670..850f7d3c 100644
--- a/docs/src/formula.md
+++ b/docs/src/formula.md
@@ -2,8 +2,6 @@
 CurrentModule = StatsModels
 DocTestSetup = quote
     using StatsModels
-    using Random
-    Random.seed!(1)
 end
 ```

@@ -42,6 +40,8 @@ Here is an example of the `@formula` in action:
 ```jldoctest 1
 julia> using StatsModels, DataFrames

+julia> using StableRNGs; rng = StableRNG(1);
+
 julia> f = @formula(y ~ 1 + a + b + c + b&c)
 FormulaTerm
 Response:
   y(unknown)
 Predictors:
   1
   a(unknown)
   b(unknown)
   c(unknown)
   b(unknown) & c(unknown)
-julia> df = DataFrame(y = rand(9), a = 1:9, b = rand(9), c = repeat(["d","e","f"], 3)) -9×4 DataFrames.DataFrame -│ Row │ y │ a │ b │ c │ -│ │ Float64 │ Int64 │ Float64 │ String │ -├─────┼────────────┼───────┼───────────┼────────┤ -│ 1 │ 0.236033 │ 1 │ 0.986666 │ d │ -│ 2 │ 0.346517 │ 2 │ 0.555751 │ e │ -│ 3 │ 0.312707 │ 3 │ 0.437108 │ f │ -│ 4 │ 0.00790928 │ 4 │ 0.424718 │ d │ -│ 5 │ 0.488613 │ 5 │ 0.773223 │ e │ -│ 6 │ 0.210968 │ 6 │ 0.28119 │ f │ -│ 7 │ 0.951916 │ 7 │ 0.209472 │ d │ -│ 8 │ 0.999905 │ 8 │ 0.251379 │ e │ -│ 9 │ 0.251662 │ 9 │ 0.0203749 │ f │ +julia> df = DataFrame(y = rand(rng, 9), a = 1:9, b = rand(rng, 9), c = repeat(["d","e","f"], 3)) +9×4 DataFrame + Row │ y a b c + │ Float64 Int64 Float64 String +─────┼──────────────────────────────────── + 1 │ 0.585195 1 0.236782 d + 2 │ 0.0773379 2 0.943741 e + 3 │ 0.716628 3 0.445671 f + 4 │ 0.320357 4 0.763679 d + 5 │ 0.653093 5 0.145071 e + 6 │ 0.236639 6 0.021124 f + 7 │ 0.709684 7 0.152545 d + 8 │ 0.557787 8 0.617492 e + 9 │ 0.05079 9 0.481531 f julia> f = apply_schema(f, schema(f, df)) FormulaTerm @@ -83,15 +83,15 @@ julia> resp, pred = modelcols(f, df); julia> pred 9×7 Array{Float64,2}: - 1.0 1.0 0.986666 0.0 0.0 0.0 0.0 - 1.0 2.0 0.555751 1.0 0.0 0.555751 0.0 - 1.0 3.0 0.437108 0.0 1.0 0.0 0.437108 - 1.0 4.0 0.424718 0.0 0.0 0.0 0.0 - 1.0 5.0 0.773223 1.0 0.0 0.773223 0.0 - 1.0 6.0 0.28119 0.0 1.0 0.0 0.28119 - 1.0 7.0 0.209472 0.0 0.0 0.0 0.0 - 1.0 8.0 0.251379 1.0 0.0 0.251379 0.0 - 1.0 9.0 0.0203749 0.0 1.0 0.0 0.0203749 + 1.0 1.0 0.236782 0.0 0.0 0.0 0.0 + 1.0 2.0 0.943741 1.0 0.0 0.943741 0.0 + 1.0 3.0 0.445671 0.0 1.0 0.0 0.445671 + 1.0 4.0 0.763679 0.0 0.0 0.0 0.0 + 1.0 5.0 0.145071 1.0 0.0 0.145071 0.0 + 1.0 6.0 0.021124 0.0 1.0 0.0 0.021124 + 1.0 7.0 0.152545 0.0 0.0 0.0 0.0 + 1.0 8.0 0.617492 1.0 0.0 0.617492 0.0 + 1.0 9.0 0.481531 0.0 1.0 0.0 0.481531 julia> coefnames(f) ("y", ["(Intercept)", "a", "b", "c: e", "c: f", "b & c: e", "b & c: f"]) @@ -224,13 +224,13 @@ StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Array{Float64,1}},GLM.De :(log(y)) ~ 1 + a + b Coefficients: -────────────────────────────────────────────────────────────────────────────── - Estimate Std. Error t value Pr(>|t|) Lower 95% Upper 95% -────────────────────────────────────────────────────────────────────────────── -(Intercept) -4.16168 2.98788 -1.39285 0.2131 -11.4727 3.14939 -a 0.357482 0.342126 1.04489 0.3363 -0.479669 1.19463 -b 2.32528 3.13735 0.741159 0.4866 -5.35154 10.0021 -────────────────────────────────────────────────────────────────────────────── +────────────────────────────────────────────────────────────────────────── + Coef. Std. Error t Pr(>|t|) Lower 95% Upper 95% +────────────────────────────────────────────────────────────────────────── +(Intercept) 0.0698025 0.928295 0.08 0.9425 -2.20165 2.34126 +a -0.105669 0.128107 -0.82 0.4410 -0.419136 0.207797 +b -1.63199 1.12678 -1.45 0.1977 -4.38911 1.12513 +────────────────────────────────────────────────────────────────────────── julia> df.log_y = log.(df.y); @@ -240,13 +240,13 @@ StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Array{Float64,1}},GLM.De log_y ~ 1 + a + b Coefficients: -────────────────────────────────────────────────────────────────────────────── - Estimate Std. 
Error t value Pr(>|t|) Lower 95% Upper 95% -────────────────────────────────────────────────────────────────────────────── -(Intercept) -4.16168 2.98788 -1.39285 0.2131 -11.4727 3.14939 -a 0.357482 0.342126 1.04489 0.3363 -0.479669 1.19463 -b 2.32528 3.13735 0.741159 0.4866 -5.35154 10.0021 -────────────────────────────────────────────────────────────────────────────── +────────────────────────────────────────────────────────────────────────── + Coef. Std. Error t Pr(>|t|) Lower 95% Upper 95% +────────────────────────────────────────────────────────────────────────── +(Intercept) 0.0698025 0.928295 0.08 0.9425 -2.20165 2.34126 +a -0.105669 0.128107 -0.82 0.4410 -0.419136 0.207797 +b -1.63199 1.12678 -1.45 0.1977 -4.38911 1.12513 +────────────────────────────────────────────────────────────────────────── ``` @@ -256,15 +256,15 @@ interpretation of `+`, `*`, and `&`: ```jldoctest 1 julia> modelmatrix(@formula(y ~ 1 + b + identity(1+b)), df) 9×3 Array{Float64,2}: - 1.0 0.986666 1.98667 - 1.0 0.555751 1.55575 - 1.0 0.437108 1.43711 - 1.0 0.424718 1.42472 - 1.0 0.773223 1.77322 - 1.0 0.28119 1.28119 - 1.0 0.209472 1.20947 - 1.0 0.251379 1.25138 - 1.0 0.0203749 1.02037 + 1.0 0.236782 1.23678 + 1.0 0.943741 1.94374 + 1.0 0.445671 1.44567 + 1.0 0.763679 1.76368 + 1.0 0.145071 1.14507 + 1.0 0.021124 1.02112 + 1.0 0.152545 1.15255 + 1.0 0.617492 1.61749 + 1.0 0.481531 1.48153 ``` ## Constructing a formula programmatically @@ -350,13 +350,15 @@ simulated coefficients. ```jldoctest julia> using GLM, DataFrames, StatsModels -julia> data = DataFrame(a = rand(100), b = repeat(["d", "e", "f", "g"], 25)); +julia> using StableRNGs; rng = StableRNG(1); + +julia> data = DataFrame(a = rand(rng, 100), b = repeat(["d", "e", "f", "g"], 25)); julia> X = StatsModels.modelmatrix(@formula(y ~ 1 + a*b).rhs, data); julia> β_true = 1:8; -julia> ϵ = randn(100)*0.1; +julia> ϵ = randn(rng, 100)*0.1; julia> data.y = X*β_true .+ ϵ; @@ -366,18 +368,18 @@ StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Array{Float64,1}},GLM.De y ~ 1 + a + b + a & b Coefficients: -────────────────────────────────────────────────────────────────────────── - Estimate Std. Error t value Pr(>|t|) Lower 95% Upper 95% -────────────────────────────────────────────────────────────────────────── -(Intercept) 0.98878 0.0384341 25.7266 <1e-43 0.912447 1.06511 -a 2.00843 0.0779388 25.7694 <1e-43 1.85364 2.16323 -b: e 3.03726 0.0616371 49.2764 <1e-67 2.91484 3.15967 -b: f 4.03909 0.0572857 70.5078 <1e-81 3.92531 4.15286 -b: g 5.02948 0.0587224 85.6484 <1e-88 4.91285 5.14611 -a & b: e 5.9385 0.10753 55.2264 <1e-71 5.72494 6.15207 -a & b: f 6.9073 0.112483 61.4075 <1e-75 6.6839 7.1307 -a & b: g 7.93918 0.111285 71.3407 <1e-81 7.71816 8.16021 -────────────────────────────────────────────────────────────────────────── +─────────────────────────────────────────────────────────────────────── + Coef. Std. 
Error t Pr(>|t|) Lower 95% Upper 95% +─────────────────────────────────────────────────────────────────────── +(Intercept) 1.01518 0.0400546 25.34 <1e-42 0.935626 1.09473 +a 1.97476 0.0701427 28.15 <1e-46 1.83545 2.11407 +b: e 3.01269 0.0571186 52.74 <1e-69 2.89925 3.12614 +b: f 4.01918 0.065827 61.06 <1e-75 3.88844 4.14992 +b: g 4.99176 0.0593715 84.08 <1e-88 4.87385 5.10968 +a & b: e 5.98288 0.0954641 62.67 <1e-76 5.79328 6.17248 +a & b: f 6.98622 0.107871 64.76 <1e-77 6.77197 7.20046 +a & b: g 7.92541 0.109873 72.13 <1e-82 7.70719 8.14362 +─────────────────────────────────────────────────────────────────────── ``` diff --git a/docs/src/internals.md b/docs/src/internals.md index 0e0554a7..bbe018fa 100644 --- a/docs/src/internals.md +++ b/docs/src/internals.md @@ -1,8 +1,6 @@ ```@meta DocTestSetup = quote using StatsModels - using Random - Random.seed!(1) end DocTestFilters = [r"([a-z]*) => \1"] ``` @@ -136,20 +134,22 @@ function. By default, it will create a schema for every column in the data: ```jldoctest 1 julia> using DataFrames # for pretty printing---any Table will do -julia> df = DataFrame(y = rand(9), a = 1:9, b = rand(9), c = repeat(["a","b","c"], 3)) -9×4 DataFrames.DataFrame -│ Row │ y │ a │ b │ c │ -│ │ Float64 │ Int64 │ Float64 │ String │ -├─────┼────────────┼───────┼───────────┼────────┤ -│ 1 │ 0.236033 │ 1 │ 0.986666 │ a │ -│ 2 │ 0.346517 │ 2 │ 0.555751 │ b │ -│ 3 │ 0.312707 │ 3 │ 0.437108 │ c │ -│ 4 │ 0.00790928 │ 4 │ 0.424718 │ a │ -│ 5 │ 0.488613 │ 5 │ 0.773223 │ b │ -│ 6 │ 0.210968 │ 6 │ 0.28119 │ c │ -│ 7 │ 0.951916 │ 7 │ 0.209472 │ a │ -│ 8 │ 0.999905 │ 8 │ 0.251379 │ b │ -│ 9 │ 0.251662 │ 9 │ 0.0203749 │ c │ +julia> using StableRNGs; rng = StableRNG(1); + +julia> df = DataFrame(y = rand(rng, 9), a = 1:9, b = rand(rng, 9), c = repeat(["a","b","c"], 3)) +9×4 DataFrame + Row │ y a b c + │ Float64 Int64 Float64 String +─────┼──────────────────────────────────── + 1 │ 0.585195 1 0.236782 a + 2 │ 0.0773379 2 0.943741 b + 3 │ 0.716628 3 0.445671 c + 4 │ 0.320357 4 0.763679 a + 5 │ 0.653093 5 0.145071 b + 6 │ 0.236639 6 0.021124 c + 7 │ 0.709684 7 0.152545 a + 8 │ 0.557787 8 0.617492 b + 9 │ 0.05079 9 0.481531 c julia> schema(df) StatsModels.Schema with 4 entries: @@ -313,27 +313,27 @@ julia> resp, pred = modelcols(f, df); julia> resp 9-element Array{Float64,1}: - 0.23603334566204692 - 0.34651701419196046 - 0.3127069683360675 - 0.00790928339056074 - 0.4886128300795012 - 0.21096820215853596 - 0.951916339835734 - 0.9999046588986136 - 0.25166218303197185 + 0.5851946422124186 + 0.07733793456911231 + 0.7166282400543453 + 0.3203570514066232 + 0.6530930076222579 + 0.2366391513734556 + 0.7096838914472361 + 0.5577872440804086 + 0.05079002172175784 julia> pred 9×7 Array{Float64,2}: - 1.0 1.0 0.986666 0.0 0.0 0.0 0.0 - 1.0 2.0 0.555751 1.0 0.0 0.555751 0.0 - 1.0 3.0 0.437108 0.0 1.0 0.0 0.437108 - 1.0 4.0 0.424718 0.0 0.0 0.0 0.0 - 1.0 5.0 0.773223 1.0 0.0 0.773223 0.0 - 1.0 6.0 0.28119 0.0 1.0 0.0 0.28119 - 1.0 7.0 0.209472 0.0 0.0 0.0 0.0 - 1.0 8.0 0.251379 1.0 0.0 0.251379 0.0 - 1.0 9.0 0.0203749 0.0 1.0 0.0 0.0203749 + 1.0 1.0 0.236782 0.0 0.0 0.0 0.0 + 1.0 2.0 0.943741 1.0 0.0 0.943741 0.0 + 1.0 3.0 0.445671 0.0 1.0 0.0 0.445671 + 1.0 4.0 0.763679 0.0 0.0 0.0 0.0 + 1.0 5.0 0.145071 1.0 0.0 0.145071 0.0 + 1.0 6.0 0.021124 0.0 1.0 0.0 0.021124 + 1.0 7.0 0.152545 0.0 0.0 0.0 0.0 + 1.0 8.0 0.617492 1.0 0.0 0.617492 0.0 + 1.0 9.0 0.481531 0.0 1.0 0.0 0.481531 ``` @@ -343,7 +343,7 @@ julia> pred julia> using Tables julia> modelcols(f, first(Tables.rowtable(df))) -(0.23603334566204692, 
[1.0, 1.0, 0.9866663668987996, 0.0, 0.0, 0.0, 0.0]) +(0.5851946422124186, [1.0, 1.0, 0.236781883208121, 0.0, 0.0, 0.0, 0.0]) ``` @@ -357,14 +357,14 @@ b(continuous) & c(DummyCoding:3→2) julia> modelcols(t, df) 9×2 Array{Float64,2}: 0.0 0.0 - 0.555751 0.0 - 0.0 0.437108 + 0.943741 0.0 + 0.0 0.445671 0.0 0.0 - 0.773223 0.0 - 0.0 0.28119 + 0.145071 0.0 + 0.0 0.021124 0.0 0.0 - 0.251379 0.0 - 0.0 0.0203749 + 0.617492 0.0 + 0.0 0.481531 ``` @@ -454,15 +454,15 @@ StatsBase.coefnames(p::PolyTerm) = coefnames(p.term) .* "^" .* string.(1:p.deg) Now, we can use `poly` in a formula: ```jldoctest 1 -julia> data = DataFrame(y = rand(4), a = rand(4), b = [1:4;]) -4×3 DataFrames.DataFrame -│ Row │ y │ a │ b │ -│ │ Float64 │ Float64 │ Int64 │ -├─────┼────────────┼──────────┼───────┤ -│ 1 │ 0.236033 │ 0.488613 │ 1 │ -│ 2 │ 0.346517 │ 0.210968 │ 2 │ -│ 3 │ 0.312707 │ 0.951916 │ 3 │ -│ 4 │ 0.00790928 │ 0.999905 │ 4 │ +julia> data = DataFrame(y = rand(rng, 4), a = rand(rng, 4), b = [1:4;]) +4×3 DataFrame + Row │ y a b + │ Float64 Float64 Int64 +─────┼─────────────────────────── + 1 │ 0.752223 0.757746 1 + 2 │ 0.314815 0.419294 2 + 3 │ 0.858522 0.412607 3 + 4 │ 0.698713 0.454589 4 julia> f = @formula(y ~ 1 + poly(b, 2) * a) FormulaTerm @@ -486,10 +486,10 @@ Predictors: julia> modelcols(f.rhs, data) 4×6 Array{Float64,2}: - 1.0 1.0 1.0 0.488613 0.488613 0.488613 - 1.0 2.0 4.0 0.210968 0.421936 0.843873 - 1.0 3.0 9.0 0.951916 2.85575 8.56725 - 1.0 4.0 16.0 0.999905 3.99962 15.9985 + 1.0 1.0 1.0 0.757746 0.757746 0.757746 + 1.0 2.0 4.0 0.419294 0.838587 1.67717 + 1.0 3.0 9.0 0.412607 1.23782 3.71347 + 1.0 4.0 16.0 0.454589 1.81836 7.27343 julia> coefnames(f.rhs) 6-element Array{String,1}: @@ -508,9 +508,9 @@ and of `b^2` (but not `a^2` or `b^1`): ```jldoctest 1 julia> using GLM -julia> sim_dat = DataFrame(a=rand(100).-0.5, b=randn(100).-0.5); +julia> sim_dat = DataFrame(a=rand(rng, 100).-0.5, b=randn(rng, 100).-0.5); -julia> sim_dat.y = randn(100) .+ 1 .+ 2*sim_dat.a .+ 3*sim_dat.b.^2; +julia> sim_dat.y = randn(rng, 100) .+ 1 .+ 2*sim_dat.a .+ 3*sim_dat.b.^2; julia> fit(LinearModel, @formula(y ~ 1 + poly(a,2) + poly(b,2)), sim_dat) StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Array{Float64,1}},GLM.DensePredChol{Float64,LinearAlgebra.Cholesky{Float64,Array{Float64,2}}}},Array{Float64,2}} @@ -518,15 +518,15 @@ StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Array{Float64,1}},GLM.De y ~ 1 + poly(a, 2) + poly(b, 2) Coefficients: -──────────────────────────────────────────────────────────────────────────── - Estimate Std. Error t value Pr(>|t|) Lower 95% Upper 95% -──────────────────────────────────────────────────────────────────────────── -(Intercept) 0.693521 0.159469 4.34893 <1e-4 0.376934 1.01011 -a^1 2.1042 0.383115 5.49235 <1e-6 1.34362 2.86478 -a^2 2.34395 1.39244 1.68334 0.0956 -0.420386 5.10829 -b^1 -0.180113 0.148698 -1.21127 0.2288 -0.475317 0.11509 -b^2 2.89786 0.0794572 36.4707 <1e-56 2.74012 3.05561 -──────────────────────────────────────────────────────────────────────────── +────────────────────────────────────────────────────────────────────────── + Coef. Std. 
Error t Pr(>|t|) Lower 95% Upper 95% +────────────────────────────────────────────────────────────────────────── +(Intercept) 0.89288 0.181485 4.92 <1e-5 0.532586 1.25317 +a^1 2.73324 0.349194 7.83 <1e-11 2.04001 3.42648 +a^2 -1.0114 1.34262 -0.75 0.4531 -3.67684 1.65404 +b^1 0.214424 0.136868 1.57 0.1205 -0.0572944 0.486142 +b^2 3.15133 0.0811794 38.82 <1e-59 2.99016 3.31249 +────────────────────────────────────────────────────────────────────────── ``` ### [Making special syntax "runtime friendly"] (@id extend-runtime) @@ -627,15 +627,15 @@ StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Array{Float64,1}},GLM.De y ~ 1 + poly(a, 2) + poly(b, 2) Coefficients: -──────────────────────────────────────────────────────────────────────────── - Estimate Std. Error t value Pr(>|t|) Lower 95% Upper 95% -──────────────────────────────────────────────────────────────────────────── -(Intercept) 0.693521 0.159469 4.34893 <1e-4 0.376934 1.01011 -a^1 2.1042 0.383115 5.49235 <1e-6 1.34362 2.86478 -a^2 2.34395 1.39244 1.68334 0.0956 -0.420386 5.10829 -b^1 -0.180113 0.148698 -1.21127 0.2288 -0.475317 0.11509 -b^2 2.89786 0.0794572 36.4707 <1e-56 2.74012 3.05561 -──────────────────────────────────────────────────────────────────────────── +────────────────────────────────────────────────────────────────────────── + Coef. Std. Error t Pr(>|t|) Lower 95% Upper 95% +────────────────────────────────────────────────────────────────────────── +(Intercept) 0.89288 0.181485 4.92 <1e-5 0.532586 1.25317 +a^1 2.73324 0.349194 7.83 <1e-11 2.04001 3.42648 +a^2 -1.0114 1.34262 -0.75 0.4531 -3.67684 1.65404 +b^1 0.214424 0.136868 1.57 0.1205 -0.0572944 0.486142 +b^2 3.15133 0.0811794 38.82 <1e-59 2.99016 3.31249 +────────────────────────────────────────────────────────────────────────── ``` ### Defining the context where special syntax applies @@ -673,10 +673,10 @@ Predictors: julia> modelcols(f.rhs, data) 4×4 Array{Float64,2}: - 1.0 1.0 0.488613 0.488613 - 1.0 4.0 0.210968 0.843873 - 1.0 9.0 0.951916 8.56725 - 1.0 16.0 0.999905 15.9985 + 1.0 1.0 0.757746 0.757746 + 1.0 4.0 0.419294 1.67717 + 1.0 9.0 0.412607 3.71347 + 1.0 16.0 0.454589 7.27343 julia> coefnames(f.rhs) 4-element Array{String,1}: @@ -705,10 +705,10 @@ Predictors: julia> modelcols(f2.rhs, data) 4×6 Array{Float64,2}: - 1.0 1.0 1.0 0.488613 0.488613 0.488613 - 1.0 2.0 4.0 0.210968 0.421936 0.843873 - 1.0 3.0 9.0 0.951916 2.85575 8.56725 - 1.0 4.0 16.0 0.999905 3.99962 15.9985 + 1.0 1.0 1.0 0.757746 0.757746 0.757746 + 1.0 2.0 4.0 0.419294 0.838587 1.67717 + 1.0 3.0 9.0 0.412607 1.23782 3.71347 + 1.0 4.0 16.0 0.454589 1.81836 7.27343 julia> coefnames(f2.rhs) 6-element Array{String,1}: @@ -724,9 +724,9 @@ The definitions of these methods control how models of each type are _fit_ from a formula with a call to `poly`: ```jldoctest 1 -julia> sim_dat = DataFrame(b=randn(100)); +julia> sim_dat = DataFrame(b=randn(rng, 100)); -julia> sim_dat.y = randn(100) .+ 1 .+ 2*sim_dat.b .+ 3*sim_dat.b.^2; +julia> sim_dat.y = randn(rng, 100) .+ 1 .+ 2*sim_dat.b .+ 3*sim_dat.b.^2; julia> fit(LinearModel, @formula(y ~ 1 + poly(b,2)), sim_dat) StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Array{Float64,1}},GLM.DensePredChol{Float64,LinearAlgebra.Cholesky{Float64,Array{Float64,2}}}},Array{Float64,2}} @@ -734,12 +734,12 @@ StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Array{Float64,1}},GLM.De y ~ 1 + :(poly(b, 2)) Coefficients: -─────────────────────────────────────────────────────────────────────────── - Estimate Std. 
Error  t value  Pr(>|t|)  Lower 95%  Upper 95%
-───────────────────────────────────────────────────────────────────────────
-(Intercept)  0.911363    0.310486   2.93528    0.0042   0.295214    1.52751
-poly(b, 2)    2.94442    0.191024  15.4139     <1e-27    2.56534     3.3235
-───────────────────────────────────────────────────────────────────────────
+───────────────────────────────────────────────────────────────────────
+                Coef.  Std. Error      t  Pr(>|t|)  Lower 95%  Upper 95%
+───────────────────────────────────────────────────────────────────────
+(Intercept)  1.28118    0.324615   3.95    0.0001   0.636991    1.92537
+poly(b, 2)   2.95861    0.174347  16.97    <1e-30    2.61262    3.30459
+───────────────────────────────────────────────────────────────────────

 julia> fit(GeneralizedLinearModel, @formula(y ~ 1 + poly(b,2)), sim_dat, Normal())
 StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Array{Float64,1},Normal{Float64},IdentityLink},GLM.DensePredChol{Float64,LinearAlgebra.Cholesky{Float64,Array{Float64,2}}}},Array{Float64,2}}

 y ~ 1 + poly(b, 2)

 Coefficients:
-──────────────────────────────────────────────────────────────────────────
-             Estimate  Std. Error  z value  Pr(>|z|)  Lower 95%  Upper 95%
-──────────────────────────────────────────────────────────────────────────
-(Intercept)  0.829374    0.131582   6.3031     <1e-9   0.571478    1.08727
-b^1           2.13096    0.100552  21.1926    <1e-98    1.93388    2.32804
-b^2            3.1132   0.0813107  38.2877    <1e-99    2.95384    3.27257
-──────────────────────────────────────────────────────────────────────────
+────────────────────────────────────────────────────────────────────────
+                Coef.  Std. Error      z  Pr(>|z|)  Lower 95%  Upper 95%
+────────────────────────────────────────────────────────────────────────
+(Intercept)  0.906356   0.132613    6.83    <1e-11    0.64644    1.16627
+b^1           2.03194   0.0908937  22.36    <1e-99    1.85379    2.21008
+b^2           3.02886   0.0707228  42.83    <1e-99    2.89025    3.16748
+────────────────────────────────────────────────────────────────────────
 ```

diff --git a/docs/src/temporal_terms.md b/docs/src/temporal_terms.md
index a4db94f0..271222e8 100644
--- a/docs/src/temporal_terms.md
+++ b/docs/src/temporal_terms.md
@@ -21,15 +21,15 @@ Below is a simple example:
 julia> using StatsModels, DataFrames

 julia> df = DataFrame(y=1:5, x=2:2:10)
-5×2 DataFrames.DataFrame
-│ Row │ y     │ x     │
-│     │ Int64 │ Int64 │
-├─────┼───────┼───────┤
-│ 1   │ 1     │ 2     │
-│ 2   │ 2     │ 4     │
-│ 3   │ 3     │ 6     │
-│ 4   │ 4     │ 8     │
-│ 5   │ 5     │ 10    │
+5×2 DataFrame
+ Row │ y      x
+     │ Int64  Int64
+─────┼──────────────
+   1 │     1      2
+   2 │     2      4
+   3 │     3      6
+   4 │     4      8
+   5 │     5     10

 julia> f = @formula(y ~ x + lag(x, 2) + lead(x, 2))
 FormulaTerm
diff --git a/src/contrasts.jl b/src/contrasts.jl
index c8461a3e..2cc11e0d 100644
--- a/src/contrasts.jl
+++ b/src/contrasts.jl
@@ -488,6 +488,8 @@ julia> sdiff_hypothesis = [-1 1 0 0
 Contrasts are derived from the hypothesis matrix by taking the pseudoinverse:

 ```jldoctest hyp
+julia> using LinearAlgebra
+
 julia> sdiff_contrasts = pinv(sdiff_hypothesis)
 4×3 Array{Float64,2}:
 -0.75  -0.5  -0.25
@@ -660,13 +662,18 @@ non-zero mean). If `tolerance != 0` (the default), the hypotheses are rounded
 to `Int`s if possible and `Rational`s if not, using the given tolerance. If
 `tolerance == 0`, then the hypothesis matrix is returned as-is.
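
> **Reviewer's aside (not part of the patch):** the contrasts–hypotheses relationship described in this docstring is a pseudoinverse round trip, which is easy to sanity-check at the REPL. The sketch below reuses the `sdiff_hypothesis` values from the doctest above and assumes only the standard-library `LinearAlgebra`:

```julia
using LinearAlgebra

# 3 hypotheses (rows) × 4 levels (columns), as in the SeqDiffCoding doctest above
sdiff_hypothesis = [-1  1  0  0
                     0 -1  1  0
                     0  0 -1  1]

# contrasts matrix: one row per data level, one column per predictor
sdiff_contrasts = pinv(sdiff_hypothesis)

# taking the pseudoinverse again recovers the hypotheses (up to floating point)
@assert pinv(sdiff_contrasts) ≈ sdiff_hypothesis
```
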
+
+The orientation of the hypothesis matrix is _opposite_ that of the contrast
+matrix: each row of the contrasts matrix is a data level and each column is a
+predictor, whereas each row of the hypothesis matrix is the interpretation of a
+predictor with weights for each level given in the columns.
+
 Note that this assumes a *balanced design* where there are the same number of
 observations in every cell. This is only important for non-orthogonal contrasts
 (including contrasts that are not orthogonal with the intercept).

 # Examples

-```jldoctest
+```jldoctest hypmat
 julia> cmat = StatsModels.contrasts_matrix(DummyCoding(), 1, 4)
 4×3 Array{Float64,2}:
  0.0  0.0  0.0
  1.0  0.0  0.0
  0.0  1.0  0.0
  0.0  0.0  1.0

 julia> StatsModels.hypothesis_matrix(cmat)
 4×4 Array{Int64,2}:
- 1  -1  -1  -1
- 0   1   0   0
- 0   0   1   0
- 0   0   0   1
-
-julia> StatsModels.hypothesis_matrix(cmat, intercept=false) # wrong without intercept!!
-4×3 Array{Int64,2}:
- 0  0  0
- 1  0  0
- 0  1  0
- 0  0  1
+  1  0  0  0
+ -1  1  0  0
+ -1  0  1  0
+ -1  0  0  1
+```
+
+For non-centered contrasts like `DummyCoding`, without including the intercept
+the hypothesis matrix is incorrect. So while `intercept=true` is the default for
+non-centered contrasts, you can see the (wrong) hypothesis matrix when ignoring
+it by forcing `intercept=false`:
+
+```jldoctest hypmat
+julia> StatsModels.hypothesis_matrix(cmat, intercept=false)
+3×4 Array{Int64,2}:
+ 0  1  0  0
+ 0  0  1  0
+ 0  0  0  1
+```
+
+The default behavior is to coerce to the nearest integer or rational value, with
+a tolerance of the `tolerance` kwarg (defaults to `1e-5`). The raw
+pseudo-inverse matrix can be obtained as `Float64` by setting `tolerance=0`:
+
+```julia-repl
 julia> StatsModels.hypothesis_matrix(cmat, tolerance=0) # ugly
-4×4 Adjoint{Float64,Array{Float64,2}}:
-  1.0          -1.0          -1.0          -1.0
- -2.23753e-16   1.0           4.94472e-17   1.04958e-16
-  6.91749e-18  -2.42066e-16   1.0          -1.31044e-16
- -1.31485e-16   9.93754e-17   9.93754e-17   1.0
+4×4 Array{Float64,2}:
+  1.0          -2.23753e-16   6.91749e-18  -1.31485e-16
+ -1.0           1.0          -2.42066e-16   9.93754e-17
+ -1.0           4.94472e-17   1.0           9.93754e-17
+ -1.0           1.04958e-16  -1.31044e-16   1.0
+```
+
+Finally, the hypothesis matrix for a constructed `ContrastsMatrix` (as stored by
+`CategoricalTerm`s) can also be extracted:
+
+```jldoctest hypmat
 julia> StatsModels.hypothesis_matrix(StatsModels.ContrastsMatrix(DummyCoding(), ["a", "b", "c", "d"]))
 4×4 Array{Int64,2}:
- 1  -1  -1  -1
- 0   1   0   0
- 0   0   1   0
- 0   0   0   1
+  1  0  0  0
+ -1  1  0  0
+ -1  0  1  0
+ -1  0  0  1
 ```
 """
diff --git a/src/lrtest.jl b/src/lrtest.jl
index 84a270fc..8f115b08 100644
--- a/src/lrtest.jl
+++ b/src/lrtest.jl
@@ -50,21 +50,25 @@ julia> model = glm(@formula(Result ~ 1 + Treatment), dat, Binomial(), LogitLink(
 julia> bigmodel = glm(@formula(Result ~ 1 + Treatment + Other), dat, Binomial(), LogitLink());

 julia> lrtest(nullmodel, model, bigmodel)
+┌ Warning: Could not check whether models are nested as model type TableRegressionModel does not implement 
isnested: results may not be meaningful +└ @ StatsModels ~/.julia/dev/StatsModels/src/lrtest.jl:108 Likelihood-ratio test: 3 models fitted on 12 observations ────────────────────────────────────────────── DOF ΔDOF Deviance ΔDeviance p(>Chisq) ────────────────────────────────────────────── -[1] 4 14.0571 +[1] 4 14.0571 [2] 2 -2 15.9559 1.8988 0.3870 [3] 1 -1 16.3006 0.3447 0.5571 ────────────────────────────────────────────── diff --git a/src/modelframe.jl b/src/modelframe.jl index c5759ebc..cf31d122 100644 --- a/src/modelframe.jl +++ b/src/modelframe.jl @@ -84,6 +84,28 @@ ModelFrame(f::FormulaTerm, data; model=StatisticalModel, contrasts=Dict{Symbol,A StatsBase.modelmatrix(f::FormulaTerm, data; kwargs...) = modelmatrix(f.rhs, data; kwargs...) +""" + modelmatrix(t::AbstractTerm, data; hints=Dict(), mod=StatisticalModel) + modelmatrix(mf::ModelFrame; data=mf.data) + +Return the model matrix based on a term and a data source. If the term `t` is a +[`FormulaTerm`](@ref), this uses the right-hand side (predictor terms) of the +formula; otherwise all columns are generated. If a [`ModelFrame`](@ref) is +provided instead of an `AbstractTerm`, the wrapped table is used as the data +source by default. + +Like [`response`](@ref), this will compute and apply a [`Schema`](@ref) before +calling [`modelcols`](@ref) if necessary. The optional `hints` and `mod` +keyword arguments are passed to [`apply_schema`](@ref). + +!!! note + + `modelmatrix` is provided as a convenience for interactive use. For + modeling packages that wish to support a formula-based interface, it is + recommended to use the [`schema`](@ref) -- [`apply_schema`](@ref) -- + [`modelcols`](@ref) pipeline directly + +""" function StatsBase.modelmatrix(t::Union{AbstractTerm, TupleTerm}, data; hints=Dict{Symbol,Any}(), mod::Type{M}=StatisticalModel) where M Tables.istable(data) || @@ -91,6 +113,27 @@ function StatsBase.modelmatrix(t::Union{AbstractTerm, TupleTerm}, data; t = has_schema(t) ? t : apply_schema(t, schema(t, data, hints), M) modelcols(collect_matrix_terms(t), columntable(data)) end + +""" + response(f::FormulaTerm, data; hints=Dict(), mod=StatisticalModel) + response(mf::ModelFrame; data=mf.data) + +Return the response (left-hand side) of a formula generated by a data source. +If a [`ModelFrame`](@ref) is provided instead of an `AbstractTerm`, the wrapped +table is used by default. + +Like [`modelmatrix`](@ref), this will compute and apply a [`Schema`](@ref) +before calling [`modelcols`](@ref) if necessary. The optional `hints` and `mod` +keyword arguments are passed to [`apply_schema`](@ref). + +!!! note + + `response` is provided as a convenience for interactive use. 
For + modeling packages that wish to support a formula-based interface, it is + recommended to use the [`schema`](@ref) -- [`apply_schema`](@ref) -- + [`modelcols`](@ref) pipeline directly + +""" function StatsBase.response(f::FormulaTerm, data; hints=Dict{Symbol,Any}(), mod::Type{M}=StatisticalModel) where M diff --git a/src/schema.jl b/src/schema.jl index cb0bd698..3b031db3 100644 --- a/src/schema.jl +++ b/src/schema.jl @@ -72,19 +72,21 @@ mapping `Term`s to their concrete instantiations (`ContinuousTerm` or # Example ```jldoctest 1 -julia> d = (x=sample([:a, :b, :c], 10), y=rand(10)); +julia> using StableRNGs; rng = StableRNG(1); + +julia> d = (x=sample(rng, [:a, :b, :c], 10), y=rand(rng, 10)); julia> ts = [Term(:x), Term(:y)]; julia> schema(ts, d) StatsModels.Schema with 2 entries: - y => y x => x + y => y julia> schema(ts, d, Dict(:x => HelmertCoding())) StatsModels.Schema with 2 entries: - y => y x => x + y => y julia> schema(term(:y), d, Dict(:y => CategoricalTerm)) StatsModels.Schema with 1 entry: @@ -97,8 +99,8 @@ same in a container, but when printed alone are different: ```jldoctest 1 julia> sch = schema(ts, d) StatsModels.Schema with 2 entries: - y => y x => x + y => y julia> term(:x) x(unknown) @@ -160,7 +162,7 @@ a(EffectsCoding:3→2) julia> concrete_term(term(:a), [1, 2, 3], Dict(:a=>EffectsCoding())) a(EffectsCoding:3→2) -julia> concrete_term(term(:a), (a = [1, 2, 3], b = rand(3))) +julia> concrete_term(term(:a), (a = [1, 2, 3], b = [0.0, 0.5, 1.0])) a(continuous) ``` """ diff --git a/src/terms.jl b/src/terms.jl index a12ceb55..be86b7b0 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -99,7 +99,7 @@ Predictors: (a,b)->log(1 + a + b) julia> typeof(f.rhs) -FunctionTerm{typeof(log),var"##1#2",(:a, :b)} +FunctionTerm{typeof(log),var"#1#2",(:a, :b)} julia> f.rhs.forig(1 + 3 + 4) 2.0794415416798357 @@ -142,7 +142,9 @@ Generated by combining multiple `AbstractTerm`s with `&` (which is what calls to # Example ```jldoctest -julia> d = (y = rand(9), a = 1:9, b = rand(9), c = repeat(["d","e","f"], 3)); +julia> using StableRNGs; rng = StableRNG(1); + +julia> d = (y = rand(rng, 9), a = 1:9, b = rand(rng, 9), c = repeat(["d","e","f"], 3)); julia> t = InteractionTerm(term.((:a, :b, :c))) a(unknown) & b(unknown) & c(unknown) @@ -155,18 +157,18 @@ a(continuous) & b(continuous) & c(DummyCoding:3→2) julia> modelcols(t, d) 9×2 Array{Float64,2}: - 0.0 0.0 - 1.09793 0.0 - 0.0 2.6946 - 0.0 0.0 - 4.67649 0.0 - 0.0 4.47245 - 0.0 0.0 - 0.64805 0.0 - 0.0 6.97926 + 0.0 0.0 + 1.88748 0.0 + 0.0 1.33701 + 0.0 0.0 + 0.725357 0.0 + 0.0 0.126744 + 0.0 0.0 + 4.93994 0.0 + 0.0 4.33378 julia> modelcols(t.terms, d) -([1, 2, 3, 4, 5, 6, 7, 8, 9], [0.8865801492659497, 0.5489667874821704, 0.8981985570141182, 0.5043129521484462, 0.9352977047074365, 0.7454079139997376, 0.4898716849925324, 0.08100620947201143, 0.7754728346104993], [0.0 0.0; 1.0 0.0; … ; 1.0 0.0; 0.0 1.0]) +([1, 2, 3, 4, 5, 6, 7, 8, 9], [0.236781883208121, 0.9437409715735081, 0.4456708824294644, 0.7636794266904741, 0.14507148958283067, 0.021124039581375875, 0.15254507694061115, 0.617492416565387, 0.48153065407402607], [0.0 0.0; 1.0 0.0; … ; 1.0 0.0; 0.0 1.0]) ``` """ struct InteractionTerm{Ts} <: AbstractTerm @@ -439,7 +441,9 @@ terms. To create a single matrix, wrap the tuple in a [`MatrixTerm`](@ref). 
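
> **Reviewer's aside (not part of the patch):** before the patch's own example below, here is a minimal end-to-end sketch of the `schema` → `apply_schema` → `modelcols` pipeline that the new docstrings recommend, and how the `modelmatrix`/`response` conveniences relate to it. The table `tbl` and formula here are made up for illustration:

```julia
using StatsModels

# hypothetical data: any Tables.jl-compatible table works, including a NamedTuple
tbl = (y = [1.0, 2.0, 3.0, 4.0],
       x = [0.5, 1.0, 1.5, 2.0],
       g = ["a", "b", "a", "b"])

f = @formula(y ~ 1 + x + g)
f = apply_schema(f, schema(f, tbl))  # terms become concrete (g gets dummy coding)

y, X = modelcols(f, tbl)             # response vector and predictor matrix

# the convenience wrappers documented above should produce the same columns
@assert modelmatrix(f, tbl) ≈ X
@assert response(f, tbl) ≈ y
```
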
# Example ```jldoctest -julia> d = (a = [1:9;], b = rand(9), c = repeat(["d","e","f"], 3)); +julia> using StableRNGs; rng = StableRNG(1); + +julia> d = (a = [1:9;], b = rand(rng, 9), c = repeat(["d","e","f"], 3)); julia> ts = apply_schema(term.((:a, :b, :c)), schema(d)) a(continuous) @@ -447,31 +451,31 @@ b(continuous) c(DummyCoding:3→2) julia> cols = modelcols(ts, d) -([1, 2, 3, 4, 5, 6, 7, 8, 9], [0.7184176729016183, 0.4881665815778522, 0.7081609847641785, 0.7743011281211944, 0.584295963367869, 0.32493666547553657, 0.9894077965577408, 0.3331747574477202, 0.6532298571732302], [0.0 0.0; 1.0 0.0; … ; 1.0 0.0; 0.0 1.0]) +([1, 2, 3, 4, 5, 6, 7, 8, 9], [0.5851946422124186, 0.07733793456911231, 0.7166282400543453, 0.3203570514066232, 0.6530930076222579, 0.2366391513734556, 0.7096838914472361, 0.5577872440804086, 0.05079002172175784], [0.0 0.0; 1.0 0.0; … ; 1.0 0.0; 0.0 1.0]) julia> reduce(hcat, cols) 9×4 Array{Float64,2}: - 1.0 0.718418 0.0 0.0 - 2.0 0.488167 1.0 0.0 - 3.0 0.708161 0.0 1.0 - 4.0 0.774301 0.0 0.0 - 5.0 0.584296 1.0 0.0 - 6.0 0.324937 0.0 1.0 - 7.0 0.989408 0.0 0.0 - 8.0 0.333175 1.0 0.0 - 9.0 0.65323 0.0 1.0 + 1.0 0.585195 0.0 0.0 + 2.0 0.0773379 1.0 0.0 + 3.0 0.716628 0.0 1.0 + 4.0 0.320357 0.0 0.0 + 5.0 0.653093 1.0 0.0 + 6.0 0.236639 0.0 1.0 + 7.0 0.709684 0.0 0.0 + 8.0 0.557787 1.0 0.0 + 9.0 0.05079 0.0 1.0 julia> modelcols(MatrixTerm(ts), d) 9×4 Array{Float64,2}: - 1.0 0.718418 0.0 0.0 - 2.0 0.488167 1.0 0.0 - 3.0 0.708161 0.0 1.0 - 4.0 0.774301 0.0 0.0 - 5.0 0.584296 1.0 0.0 - 6.0 0.324937 0.0 1.0 - 7.0 0.989408 0.0 0.0 - 8.0 0.333175 1.0 0.0 - 9.0 0.65323 0.0 1.0 + 1.0 0.585195 0.0 0.0 + 2.0 0.0773379 1.0 0.0 + 3.0 0.716628 0.0 1.0 + 4.0 0.320357 0.0 0.0 + 5.0 0.653093 1.0 0.0 + 6.0 0.236639 0.0 1.0 + 7.0 0.709684 0.0 0.0 + 8.0 0.557787 1.0 0.0 + 9.0 0.05079 0.0 1.0 ``` """ modelcols(ts::TupleTerm, d::NamedTuple) = modelcols.(ts, Ref(d))
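
> **Closing reviewer's note (not part of the patch):** the `temporal_terms.md` hunk above only shows the updated `DataFrame` printing, so here is a minimal sketch of what the `lag`/`lead` formula from that section produces, using the same toy table as the doctest. The exact printed output is not reproduced here; the comments describe the documented behavior:

```julia
using StatsModels, DataFrames

df = DataFrame(y = 1:5, x = 2:2:10)

f = @formula(y ~ x + lag(x, 2) + lead(x, 2))
f = apply_schema(f, schema(f, df))

resp, pred = modelcols(f, df)
# `pred` has columns x, lag(x, 2), and lead(x, 2); rows that would index
# outside the table are filled with `missing` (the first two rows of the
# lag column, the last two rows of the lead column)
```
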