-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathmbpls.R
226 lines (180 loc) · 9.17 KB
/
mbpls.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
mbpls <- function(dudiY, ktabX, scale = TRUE, option = c("uniform", "none"), scannf = TRUE, nf = 2) {
## -------------------------------------------------------------------------------
## Some tests
##--------------------------------------------------------------------------------
if (!inherits(dudiY, "dudi"))
stop("object 'dudi' expected")
if (!inherits(ktabX, "ktab"))
stop("object 'ktab' expected")
if (any(row.names(ktabX) != row.names(dudiY$tab)))
stop("ktabX and dudiY must have the same rows")
if (!(all.equal(ktabX$lw/sum(ktabX$lw), dudiY$lw/sum(dudiY$lw))))
stop("ktabX and dudiY must have the same row weights")
if (nrow(dudiY$tab) < 6)
stop("Minimum six rows are required")
if (any(ktabX$blo < 2))
stop("Minimum two variables per explanatory block are required")
if (!(is.logical(scale)))
stop("Non convenient selection for scaling")
if (!(is.logical(scannf)))
stop("Non convenient selection for scannf")
if (nf < 0)
nf <- 2
## Only works with centred pca (dudi.pca with center=TRUE) with uniform row weights
if (!any(dudi.type(dudiY$call) == c(3,4)))
stop("Only implemented for centred pca")
# Vérifier la formule / arrondi
#if (any(dudiY$lw != 1/nrow(dudiY$tab)))
# stop("Only implemented for uniform row weights")
option <- match.arg(option)
## -------------------------------------------------------------------------------
## Arguments and data transformation
## -------------------------------------------------------------------------------
## Preparation of the data frames
Y <- as.matrix(dudiY$tab)
nblo <- length(ktabX$blo)
Xk <- lapply(unclass(ktabX)[1 : nblo], scalewt, wt = ktabX$lw, center = TRUE, scale = scale)
nr <- nrow(Y)
ncolY <- ncol(Y)
## Block weighting
if (option[1] == "uniform"){
Y <- Y / sqrt(sum(dudiY$eig)) ## Here we use biased variance. We should use Y <- Y / sqrt(nr/(nr-1)*sum(dudiY$eig)) for unbiased estimators
for (k in 1 : nblo){
Xk[[k]] <- Xk[[k]] / sqrt((nblo/nr) * sum(diag(crossprod(Xk[[k]])))) ## same : Xk[[k]] <- Xk[[k]] / sqrt((nblo/(nr-1)) * sum(diag(crossprod(Xk[[k]])))) for unbiased estimators
}
}
X <- cbind.data.frame(Xk)
colnames(X) <- col.names(ktabX)
ncolX <- ncol(X)
maxdim <- qr(X)$rank
##-----------------------------------------------------------------------
## Prepare the outputs
##-----------------------------------------------------------------------
## Yc1 (V in Bougeard et al): was c1
## lY (U): was ls
## Ajout: de Yco (cov(Y, lX)) -> norme total = eig
## lX (T): was li
## faX (W*): was Wstar
## TlX (Tk): was Tk
## Tfa (Wk): was Wk Tc1 !!!!!!!!!
## Ajout: cov2 (cov^2(lY, Tl1))
## XYcoef: (Beta) was beta
## bip, bipc
## vip, vipc
## Suppression: W
## Suppression: l1
## Suppression de C (remplacé par Yco)
## Suppression de Ak (remplacé par cov2)
dimlab <- paste("Ax", 1:maxdim, sep = "")
res <- list(tabX = X, tabY = as.data.frame(Y), nf = nf, lw = ktabX$lw, X.cw = ktabX$cw, blo = ktabX$blo, rank = maxdim, eig = rep(0, maxdim), TL = ktabX$TL, TC = ktabX$TC)
res$Yc1 <- matrix(0, nrow = ncolY, ncol = maxdim, dimnames = list(colnames(dudiY$tab), dimlab))
res$lX <- res$lY <- matrix(0, nrow = nr, ncol = maxdim, dimnames = list(row.names(dudiY$tab), dimlab))
res$cov2 <- Ak <- matrix(0, nrow = nblo, ncol = maxdim, dimnames = list(names(ktabX$blo), dimlab))
res$Tc1 <- lapply(1:nblo, function(k) matrix(0, nrow = ncol(Xk[[k]]), ncol = maxdim, dimnames = list(colnames(Xk[[k]]), dimlab)))
res$TlX <- rep(list(matrix(0, nrow = nr, ncol = maxdim, dimnames = list(row.names(dudiY$tab), dimlab))), nblo)
res$faX <- matrix(0, nrow = ncolX, ncol = maxdim, dimnames = list(col.names(ktabX), dimlab))
lX1 <- res$lX
W <- res$faX
##-----------------------------------------------------------------------
## Compute components and loadings by an iterative algorithm
##-----------------------------------------------------------------------
Y <- as.matrix(Y)
X <- as.matrix(X)
f1 <- function(x) crossprod(x * res$lw, Y)
for(h in 1 : maxdim) {
## iterative algorithm
## Compute the matrix M for the eigenanalysis
M <- lapply(lapply(Xk, f1), crossprod)
M <- Reduce("+", M)
## Compute the loadings V and the components U (Y dataset)
eig.M <- eigen(M)
if (eig.M$values[1] < sqrt(.Machine$double.eps)) {
res$rank <- h-1 ## update the rank
break
}
res$eig[h] <- eig.M$values[1]
res$Yc1[, h] <- eig.M$vectors[, 1, drop = FALSE]
res$lY[, h] <- Y %*% res$Yc1[, h]
## Compute the loadings Wk and the components Tk (Xk datasets)
covutk <- rep(0, nblo)
for (k in 1 : nblo) {
res$Tc1[[k]][, h] <- crossprod(Xk[[k]] * res$lw, res$lY[, h])
res$Tc1[[k]][, h] <- res$Tc1[[k]][, h] / sqrt(sum(res$Tc1[[k]][, h]^2))
res$TlX[[k]][, h] <- Xk[[k]] %*% res$Tc1[[k]][, h]
covutk[k] <- crossprod(res$lY[, h] * res$lw, res$TlX[[k]][, h])
res$cov2[k, h] <- covutk[k]^2
}
for(k in 1 : nblo) {
Ak[k, h] <- covutk[k] / sqrt(sum(res$cov2[,h]))
res$lX[, h] <- res$lX[, h] + Ak[k, h] * res$TlX[[k]][, h]
}
lX1[, h] <- res$lX[, h] / sqrt(sum(res$lX[, h]^2))
## use ginv to avoid NA in coefficients (collinear system)
W[, h] <- tcrossprod(ginv(crossprod(X)), X) %*% res$lX[, h]
## Deflation of the Xk datasets on the global components T
Xk <- lapply(Xk, function(y) stats::lm.wfit(x = as.matrix(res$lX[, h]), y = y, w = res$lw)$residuals)
X <- as.matrix(cbind.data.frame(Xk))
}
##-----------------------------------------------------------------------
## Compute regressions coefficients
##-----------------------------------------------------------------------
## Use of the original (and not the deflated) datasets X and Y
X <- as.matrix(res$tabX)
Y <- as.matrix(res$tabY)
## Computing the regression coefficients of X onto the global components T (Wstar)
## res$faX <- lm.wfit(x = X, y = res$lX, w = res$lw)$coefficients ## lm is not used to avoid NA coefficients in the case of not full rank matrices
res$faX[, 1] <- W[, 1, drop = FALSE]
A <- diag(ncolX)
if(maxdim >= 2){
for(h in 2:maxdim){
a <- crossprod(lX1[, h-1], X) / sqrt(sum(res$lX[, h-1]^2))
A <- A %*% (diag(ncolX) - W[, h-1] %*% a)
res$faX[, h] <- A %*% W[, h]
X <- X - tcrossprod(lX1[, h-1]) %*% X
}
}
## Computing the regression coefficients of X onto Y (Beta)
res$Yco <- t(Y) %*% diag(res$lw) %*% res$lX
norm.li <- diag(crossprod(res$lX * sqrt(res$lw)))
##res$C <- t(lm.wfit(x = res$lX, y = Y, w = res$lw)$coefficients)
##res$XYcoef <- lapply(1:ncolY, function(x) t(apply(sweep(res$faX, 2 , res$C[x,], "*"), 1, cumsum)))
res$XYcoef <- lapply(1:ncolY, function(x) t(apply(sweep(res$faX, 2 , res$Yco[x,] / norm.li, "*"), 1, cumsum)))
names(res$XYcoef) <- colnames(dudiY$tab)
##-----------------------------------------------------------------------
## Variable and block importances
##-----------------------------------------------------------------------
## Block importances
res$bip <- Ak^2
if (nblo == 1 | res$rank ==1)
res$bipc <- res$bip
else
res$bipc <- t(sweep(apply(sweep(res$bip, 2, res$eig, "*") , 1, cumsum), 1, cumsum(res$eig), "/"))
## Variable importances
WcarreAk <- res$faX^2 * res$bip[rep(1:nblo, ktabX$blo),]
res$vip <- sweep(WcarreAk, 2, colSums(WcarreAk), "/")
if (nblo == 1 | res$rank ==1)
res$vipc <- res$vip
else
res$vipc <- t(sweep(apply(sweep(res$vip, 2, res$eig, "*") , 1, cumsum), 1, cumsum(res$eig), "/"))
##-----------------------------------------------------------------------
## Modify the outputs
##-----------------------------------------------------------------------
if (scannf) {
graphics::barplot(res$eig[1:res$rank])
cat("Select the number of global components: ")
res$nf <- as.integer(readLines(n = 1))
messageScannf(match.call(), res$nf)
}
if(res$nf > res$rank)
res$nf <- res$rank
## keep results for the nf dimensions (except eigenvalues and lX)
res$eig <- res$eig[1:res$rank]
res$lX <- res$lX[, 1:res$rank]
res$Tc1 <- do.call("rbind", res$Tc1)
res$TlX <- do.call("rbind", res$TlX)
res <- utils::modifyList(res, lapply(res[c("Yc1", "Yco", "lY", "Tc1", "TlX", "cov2", "faX", "vip", "vipc", "bip", "bipc")], function(x) x[, 1:res$nf, drop = FALSE]))
res$XYcoef <- lapply(res$XYcoef, function(x) x[, 1:res$nf, drop = FALSE])
res$call <- match.call()
class(res) <- c("multiblock", "mbpls")
return(res)
}