Use DataTables instead of DataFrames

The code is currently written to work with Nullable.
JuliaStats · Feb 19, 2017 · 9a5ba5c · 9a5ba5c
1 parent 331347c
commit 9a5ba5c
Show file tree

Hide file tree

Showing 10 changed files with 68 additions and 66 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -11,7 +11,7 @@ notifications:
 # uncomment the following lines to override the default test script
 script:
   - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
-  - julia -e 'Pkg.clone(pwd()); Pkg.checkout("DataFrames", "dfk/statsmodel-purge"); Pkg.build("StatsModels"); Pkg.test("StatsModels"; coverage=true)'
+  - julia -e 'Pkg.clone(pwd()); Pkg.clone("https://github.com/JuliaData/DataTables.jl.git"); Pkg.build("StatsModels"); Pkg.test("StatsModels"; coverage=true)'
 after_success:
   # build and deploy documentation with Documenter.jl
   - julia -e 'cd(Pkg.dir("StatsModels")); Pkg.add("Documenter"); include(joinpath("docs", "make.jl"))'

diff --git a/docs/src/formula.md b/docs/src/formula.md
@@ -14,7 +14,7 @@ fields with possibly heterogeneous types.  One of the primary goals of
 `StatsModels` is to make it simpler to transform tabular data into matrix format
 suitable for statistical modeling.
 
-At the moment, "tabular data" means an `AbstractDataFrame`.  Ultimately, the
+At the moment, "tabular data" means an `AbstractDataTable`.  Ultimately, the
 goal is to support any tabular data format that adheres to a minimal API,
 **regardless of backend**.
 
@@ -88,7 +88,7 @@ dropterm
 
 The main use of `Formula`s is for fitting statistical models based on tabular
 data.  From the user's perspective, this is done by `fit` methods that take a
-`Formula` and a `DataFrame` instead of numeric matrices.
+`Formula` and a `DataTable` instead of numeric matrices.
 
 Internally, this is accomplished in three stages:
 

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -21,5 +21,5 @@ developers when dealing with statistical models and tabular data.
     * `RegressionModel`
 
 Much of this package was formerly part
-of [`DataFrames`](https://www.github.com/JuliaStats/DataFrames.jl)
+of [`DataTables`](https://www.github.com/JuliaData/DataTables.jl)
 and [`StatsBase`](https://www.github.com/JuliaStats/StatsBase.jl).
diff --git a/src/StatsModels.jl b/src/StatsModels.jl
@@ -3,7 +3,7 @@ __precompile__(true)
 module StatsModels
 
 using Compat
-using DataFrames
+using DataTables
 using StatsBase
 using NullableArrays
 using CategoricalArrays

diff --git a/src/modelframe.jl b/src/modelframe.jl
@@ -1,5 +1,5 @@
 """
-Wrapper which combines Formula (Terms) and an AbstractDataFrame
+Wrapper which combines Formula (Terms) and an AbstractDataTable
 
 This wrapper encapsulates all the information that's required to transform data
 of the same structure as the wrapped data frame into a model matrix.  This goes
@@ -13,19 +13,19 @@ then creates the necessary contrasts matrices and stores the results.
 # Constructors
 
 ```julia
-ModelFrame(f::Formula, df::AbstractDataFrame; contrasts::Dict = Dict())
-ModelFrame(ex::Expr, d::AbstractDataFrame; contrasts::Dict = Dict())
-ModelFrame(terms::Terms, df::AbstractDataFrame; contrasts::Dict = Dict())
+ModelFrame(f::Formula, df::AbstractDataTable; contrasts::Dict = Dict())
+ModelFrame(ex::Expr, d::AbstractDataTable; contrasts::Dict = Dict())
+ModelFrame(terms::Terms, df::AbstractDataTable; contrasts::Dict = Dict())
 # Inner constructors:
-ModelFrame(df::AbstractDataFrame, terms::Terms, missing::BitArray)
-ModelFrame(df::AbstractDataFrame, terms::Terms, missing::BitArray, contrasts::Dict{Symbol, ContrastsMatrix})
+ModelFrame(df::AbstractDataTable, terms::Terms, missing::BitArray)
+ModelFrame(df::AbstractDataTable, terms::Terms, missing::BitArray, contrasts::Dict{Symbol, ContrastsMatrix})
 ```
 
 # Arguments
 
 * `f::Formula`: Formula whose left hand side is the *response* and right hand
   side are the *predictors*.
-* `df::AbstractDataFrame`: The data being modeled.  This is used at this stage
+* `df::AbstractDataTable`: The data being modeled.  This is used at this stage
   to determine which variables are categorical, and otherwise held for
   [`ModelMatrix`](@ref).
 * `contrasts::Dict`: An optional Dict of contrast codings for each categorical
@@ -41,13 +41,13 @@ ModelFrame(df::AbstractDataFrame, terms::Terms, missing::BitArray, contrasts::Di
 # Examples
 
 ```julia
-julia> df = DataFrame(x = 1:4, y = 5:9)
+julia> df = DataTable(x = 1:4, y = 5:8)
 julia> mf = ModelFrame(y ~ 1 + x, df)
 ```
 
 """
 type ModelFrame
-    df::AbstractDataFrame
+    df::AbstractDataTable
     terms::Terms
     msng::BitArray
     ## mapping from df keys to contrasts matrices
@@ -69,7 +69,7 @@ is_categorical(::AbstractArray) = true
 ##
 ## This modifies the Terms, setting `trms.is_non_redundant = true` for all non-
 ## redundant evaluation terms.
-function check_non_redundancy!(trms::Terms, df::AbstractDataFrame)
+function check_non_redundancy!(trms::Terms, df::AbstractDataTable)
 
     (n_eterms, n_terms) = size(trms.factors)
 
@@ -123,7 +123,7 @@ end
 ## Combine actual DF columns and contrast types if necessary to compute the
 ## actual contrasts matrices, levels, and term names (using DummyCoding
 ## as the default)
-function evalcontrasts(df::AbstractDataFrame, contrasts::Dict = Dict())
+function evalcontrasts(df::AbstractDataTable, contrasts::Dict = Dict())
     evaledContrasts = Dict()
     for (term, col) in eachcol(df)
         is_categorical(col) || continue
@@ -136,17 +136,17 @@ function evalcontrasts(df::AbstractDataFrame, contrasts::Dict = Dict())
 end
 
 ## Default NULL handler.  Others can be added as keyword arguments
-function null_omit(df::DataFrame)
+function null_omit(df::DataTable)
     cc = complete_cases(df)
     df[cc,:], cc
 end
 
 _droplevels!(x::Any) = x
 _droplevels!(x::Union{CategoricalArray, NullableCategoricalArray}) = droplevels!(x)
 
-function ModelFrame(trms::Terms, d::AbstractDataFrame;
+function ModelFrame(trms::Terms, d::AbstractDataTable;
                     contrasts::Dict = Dict())
-    df, msng = null_omit(DataFrame(map(x -> d[x], trms.eterms)))
+    df, msng = null_omit(DataTable(map(x -> d[x], trms.eterms)))
     names!(df, convert(Vector{Symbol}, map(string, trms.eterms)))
     for c in eachcol(df) _droplevels!(c[2]) end
 
@@ -158,9 +158,9 @@ function ModelFrame(trms::Terms, d::AbstractDataFrame;
     ModelFrame(df, trms, msng, evaledContrasts)
 end
 
-ModelFrame(df::AbstractDataFrame, term::Terms, msng::BitArray) = ModelFrame(df, term, msng, evalcontrasts(df))
-ModelFrame(f::Formula, d::AbstractDataFrame; kwargs...) = ModelFrame(Terms(f), d; kwargs...)
-ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), d; kwargs...)
+ModelFrame(df::AbstractDataTable, term::Terms, msng::BitArray) = ModelFrame(df, term, msng, evalcontrasts(df))
+ModelFrame(f::Formula, d::AbstractDataTable; kwargs...) = ModelFrame(Terms(f), d; kwargs...)
+ModelFrame(ex::Expr, d::AbstractDataTable; kwargs...) = ModelFrame(Formula(ex), d; kwargs...)
 
 """
     setcontrasts!(mf::ModelFrame, new_contrasts::Dict)

diff --git a/src/statsmodel.jl b/src/statsmodel.jl
@@ -31,23 +31,23 @@ macro delegate(source, targets)
     return result
 end
 
-# Wrappers for DataFrameStatisticalModel and DataFrameRegressionModel
-immutable DataFrameStatisticalModel{M,T} <: StatisticalModel
+# Wrappers for DataTableStatisticalModel and DataTableRegressionModel
+immutable DataTableStatisticalModel{M,T} <: StatisticalModel
     model::M
     mf::ModelFrame
     mm::ModelMatrix{T}
 end
 
-immutable DataFrameRegressionModel{M,T} <: RegressionModel
+immutable DataTableRegressionModel{M,T} <: RegressionModel
     model::M
     mf::ModelFrame
     mm::ModelMatrix{T}
 end
 
-for (modeltype, dfmodeltype) in ((:StatisticalModel, DataFrameStatisticalModel),
-                                 (:RegressionModel, DataFrameRegressionModel))
+for (modeltype, dfmodeltype) in ((:StatisticalModel, DataTableStatisticalModel),
+                                 (:RegressionModel, DataTableRegressionModel))
     @eval begin
-        function StatsBase.fit{T<:$modeltype}(::Type{T}, f::Formula, df::AbstractDataFrame,
+        function StatsBase.fit{T<:$modeltype}(::Type{T}, f::Formula, df::AbstractDataTable,
                                               args...; contrasts::Dict = Dict(), kwargs...)
             mf = ModelFrame(f, df, contrasts=contrasts)
             mm = ModelMatrix(mf)
@@ -58,24 +58,24 @@ for (modeltype, dfmodeltype) in ((:StatisticalModel, DataFrameStatisticalModel),
 end
 
 # Delegate functions from StatsBase that use our new types
-typealias DataFrameModels @compat(Union{DataFrameStatisticalModel, DataFrameRegressionModel})
-@delegate DataFrameModels.model [StatsBase.coef, StatsBase.confint,
+typealias DataTableModels @compat(Union{DataTableStatisticalModel, DataTableRegressionModel})
+@delegate DataTableModels.model [StatsBase.coef, StatsBase.confint,
                                  StatsBase.deviance, StatsBase.nulldeviance,
                                  StatsBase.loglikelihood, StatsBase.nullloglikelihood,
                                  StatsBase.dof, StatsBase.dof_residual, StatsBase.nobs,
                                  StatsBase.stderr, StatsBase.vcov]
-@delegate DataFrameRegressionModel.model [StatsBase.residuals, StatsBase.model_response,
+@delegate DataTableRegressionModel.model [StatsBase.residuals, StatsBase.model_response,
                                           StatsBase.predict, StatsBase.predict!]
 # Need to define these manually because of ambiguity using @delegate
-StatsBase.r2(mm::DataFrameRegressionModel) = r2(mm.model)
-StatsBase.adjr2(mm::DataFrameRegressionModel) = adjr2(mm.model)
-StatsBase.r2(mm::DataFrameRegressionModel, variant::Symbol) = r2(mm.model, variant)
-StatsBase.adjr2(mm::DataFrameRegressionModel, variant::Symbol) = adjr2(mm.model, variant)
+StatsBase.r2(mm::DataTableRegressionModel) = r2(mm.model)
+StatsBase.adjr2(mm::DataTableRegressionModel) = adjr2(mm.model)
+StatsBase.r2(mm::DataTableRegressionModel, variant::Symbol) = r2(mm.model, variant)
+StatsBase.adjr2(mm::DataTableRegressionModel, variant::Symbol) = adjr2(mm.model, variant)
 
 # Predict function that takes data frame as predictor instead of matrix
-function StatsBase.predict(mm::DataFrameRegressionModel, df::AbstractDataFrame; kwargs...)
+function StatsBase.predict(mm::DataTableRegressionModel, df::AbstractDataTable; kwargs...)
     # copy terms, removing outcome if present (ModelFrame will complain if a
-    # term is not found in the DataFrame and we don't want to remove elements with missing y)
+    # term is not found in the DataTable and we don't want to remove elements with missing y)
     newTerms = dropresponse!(mm.mf.terms)
     # create new model frame/matrix
     mf = ModelFrame(newTerms, df; contrasts = mm.mf.contrasts)
@@ -89,7 +89,7 @@ end
 
 
 # coeftable implementation
-function StatsBase.coeftable(model::DataFrameModels)
+function StatsBase.coeftable(model::DataTableModels)
     ct = coeftable(model.model)
     cfnames = coefnames(model.mf)
     if length(ct.rownms) == length(cfnames)
@@ -99,7 +99,7 @@ function StatsBase.coeftable(model::DataFrameModels)
 end
 
 # show function that delegates to coeftable
-function Base.show(io::IO, model::DataFrameModels)
+function Base.show(io::IO, model::DataTableModels)
     try
         ct = coeftable(model)
         println(io, "$(typeof(model))")

diff --git a/test/contrasts.jl b/test/contrasts.jl
@@ -1,11 +1,12 @@
 module TestContrasts
 
 using Base.Test
-using DataFrames
+using DataTables
+using CategoricalArrays
 using StatsModels
 
 
-d = DataFrame(x = CategoricalVector([:a, :b, :c, :a, :a, :b]))
+d = DataTable(x = CategoricalVector([:a, :b, :c, :a, :a, :b]))
 
 mf = ModelFrame(Formula(nothing, :x), d)
 

diff --git a/test/formula.jl b/test/formula.jl
@@ -6,7 +6,7 @@ using Compat
 
 # TODO:
 # - grouped variables in formulas with interactions
-# - is it fast?  Can expand() handle DataFrames?
+# - is it fast?  Can expand() handle DataTables?
 # - deal with intercepts
 # - implement ^2 for datavector's
 # - support more transformations with I()?

diff --git a/test/modelmatrix.jl b/test/modelmatrix.jl
@@ -2,18 +2,19 @@ module TestModelMatrix
 
 using Base.Test
 using StatsModels
-using DataFrames
+using DataTables
 using Compat
+using CategoricalArrays
 
-# for testing while DataFrames still exports these:
+# for testing while DataTables still exports these:
 import StatsModels: @formula, Formula, ModelMatrix, ModelFrame, DummyCoding, EffectsCoding, HelmertCoding, ContrastsCoding, setcontrasts!, coefnames
 
 
 ## Tests for constructing ModelFrame and ModelMatrix
 
 sparsetype = SparseMatrixCSC{Float64,Int}
 
-d = DataFrame()
+d = DataTable()
 d[:y] = [1:4;]
 d[:x1] = [5:8;]
 d[:x2] = [9:12;]
@@ -50,59 +51,59 @@ mm = ModelMatrix(mf)
 @test coefnames(mf)[2:end] == ["x1p: 6", "x1p: 7", "x1p: 8"]
 @test mm.m == ModelMatrix{sparsetype}(mf).m
 
-#test_group("create a design matrix from interactions from two DataFrames")
+#test_group("create a design matrix from interactions from two DataTables")
 ## this was removed in commit dead4562506badd7e84a2367086f5753fa49bb6a
 
-## b = DataFrame()
+## b = DataTable()
 ## b["x2"] = DataVector(x2)
 ## df = interaction_design_matrix(a,b)
 ## @test df[:,1] == DataVector([0, 10., 0, 0])
 ## @test df[:,2] == DataVector([0, 0, 11., 0])
 ## @test df[:,3] == DataVector([0, 0, 0, 12.])
 
-#test_group("expanding an singleton expression/symbol into a DataFrame")
+#test_group("expanding a singleton expression/symbol into a DataTable")
 ## generalized expand was dropped, too
 ## df = deepcopy(d)
 ## r = expand(:x2, df)
-## @test isa(r, DataFrame)
+## @test isa(r, DataTable)
 ## @test r[:,1] == DataVector([9,10,11,12])  # TODO: test float vs int return
 
 ## df = deepcopy(d)
 ## ex = :(log(x2))
 ## r = expand(ex, df)
-## @test isa(r, DataFrame)
+## @test isa(r, DataTable)
 ## @test r[:,1] == DataVector(log([9,10,11,12]))
 
 # ex = :(x1 & x2)
 # r = expand(ex, df)
-# @test isa(r, DataFrame)
+# @test isa(r, DataTable)
 # @test ncol(r) == 1
 # @test r[:,1] == DataArray([45, 60, 77, 96])
 
 ## r = expand(:(x1 + x2), df)
-## @test isa(r, DataFrame)
+## @test isa(r, DataTable)
 ## @test ncol(r) == 2
 ## @test r[:,1] == DataVector(df["x1"])
 ## @test r[:,2] == DataVector(df["x2"])
 
 ## df["x1"] = CategoricalArray(x1)
 ## r = expand(:x1, df)
-## @test isa(r, DataFrame)
+## @test isa(r, DataTable)
 ## @test ncol(r) == 3
-## @test r == expand(CategoricalArray(x1), "x1", DataFrame())
+## @test r == expand(CategoricalArray(x1), "x1", DataTable())
 
 ## r = expand(:(x1 + x2), df)
-## @test isa(r, DataFrame)
+## @test isa(r, DataTable)
 ## @test ncol(r) == 4
-## @test r[:,1:3] == expand(CategoricalArray(x1), "x1", DataFrame())
+## @test r[:,1:3] == expand(CategoricalArray(x1), "x1", DataTable())
 ## @test r[:,4] == DataVector(df["x2"])
 
 ## df["x2"] = CategoricalArray(x2)
 ## r = expand(:(x1 + x2), df)
-## @test isa(r, DataFrame)
+## @test isa(r, DataTable)
 ## @test ncol(r) == 6
-## @test r[:,1:3] == expand(CategoricalArray(x1), "x1", DataFrame())
-## @test r[:,4:6] == expand(CategoricalArray(x2), "x2", DataFrame())
+## @test r[:,1:3] == expand(CategoricalArray(x1), "x1", DataTable())
+## @test r[:,4:6] == expand(CategoricalArray(x2), "x2", DataTable())
 
 #test_group("Creating a model matrix using full formulas: y => x1 + x2, etc")
 
@@ -236,7 +237,7 @@ mm = ModelMatrix(mf)
 ##
 ## FAILS: behavior is wrong when no lower-order terms (1+x1+x2+x1&x2...)
 ##
-## df = DataFrame(y=1:27,
+## df = DataTable(y=1:27,
 ##                x1 = CategoricalArray(vec([x for x in 1:3, y in 4:6, z in 7:9])),
 ##                x2 = CategoricalArray(vec([y for x in 1:3, y in 4:6, z in 7:9])),
 ##                x3 = CategoricalArray(vec([z for x in 1:3, y in 4:6, z in 7:9])))
@@ -296,7 +297,7 @@ mm.m == float(model_response(mf))
 
 ## Promote non-redundant categorical terms to full rank
 
-d = DataFrame(x = Compat.repeat([:a, :b], outer = 4),
+d = DataTable(x = Compat.repeat([:a, :b], outer = 4),
               y = Compat.repeat([:c, :d], inner = 2, outer = 2),
               z = Compat.repeat([:e, :f], inner = 4))
 [categorical!(d, name) for name in names(d)]
@@ -434,7 +435,7 @@ mm = ModelMatrix(mf)
 
 
 # Ensure that random effects terms are dropped from coefnames
-df = DataFrame(x = [1,2,3], y = [4,5,6])
+df = DataTable(x = [1,2,3], y = [4,5,6])
 mf = ModelFrame(@formula(y ~ 1 + (1 | x)), df)
 @test coefnames(mf) == ["(Intercept)"]
 
@@ -444,7 +445,7 @@ mf = ModelFrame(@formula(y ~ 0 + (1 | x)), df)
 
 
 # Ensure X is not a view on df column
-df = DataFrame(x = [1.0,2.0,3.0], y = [4.0,5.0,6.0])
+df = DataTable(x = [1.0,2.0,3.0], y = [4.0,5.0,6.0])
 mf = ModelFrame(@formula(y ~ 0 + x), df)
 X = ModelMatrix(mf).m
 X[1] = 0.0