R/mbpls.R

mbpls <- function(dudiY, ktabX, scale = TRUE, option = c("uniform", "none"), scannf = TRUE, nf = 2) {

    ## -------------------------------------------------------------------------------
    ##			       Some tests
    ##--------------------------------------------------------------------------------
    
    if (!inherits(dudiY, "dudi"))
        stop("object 'dudi' expected")
    if (!inherits(ktabX, "ktab"))
        stop("object 'ktab' expected")
    if (any(row.names(ktabX) != row.names(dudiY$tab)))
        stop("ktabX and dudiY must have the same rows")
    if (!(all.equal(ktabX$lw/sum(ktabX$lw), dudiY$lw/sum(dudiY$lw))))
        stop("ktabX and dudiY must have the same row weights")
    if (nrow(dudiY$tab) < 6)
        stop("Minimum six rows are required")
    if (any(ktabX$blo < 2))
        stop("Minimum two variables per explanatory block are required")
    if (!(is.logical(scale)))
        stop("Non convenient selection for scaling")
    if (!(is.logical(scannf)))
        stop("Non convenient selection for scannf")
    if (nf < 0)
        nf <- 2
    
    ## Only works with centred pca (dudi.pca with center=TRUE) with uniform row weights
    if (!any(dudi.type(dudiY$call) == c(3,4)))
        stop("Only implemented for centred pca")

    # Vérifier la formule / arrondi
    #if (any(dudiY$lw != 1/nrow(dudiY$tab)))
    #    stop("Only implemented for uniform row weights")

    option <- match.arg(option)
    
    ## -------------------------------------------------------------------------------
    ##			Arguments and data transformation
    ## -------------------------------------------------------------------------------
    
    ## Preparation of the data frames
    Y <- as.matrix(dudiY$tab)
    nblo  <- length(ktabX$blo)
    Xk <- lapply(unclass(ktabX)[1 : nblo], scalewt, wt = ktabX$lw, center = TRUE, scale = scale)
      
    nr <- nrow(Y) 
    ncolY <- ncol(Y) 
    
    ## Block weighting
    if (option[1] == "uniform"){
        Y <- Y / sqrt(sum(dudiY$eig)) ## Here we use biased variance. We should use Y <- Y / sqrt(nr/(nr-1)*sum(dudiY$eig)) for unbiased estimators
        for (k in 1 : nblo){
            Xk[[k]] <- Xk[[k]] / sqrt((nblo/nr) * sum(diag(crossprod(Xk[[k]])))) ## same : Xk[[k]] <- Xk[[k]] / sqrt((nblo/(nr-1)) * sum(diag(crossprod(Xk[[k]])))) for unbiased estimators
        }
    }
    
    X  <- cbind.data.frame(Xk)
    colnames(X) <- col.names(ktabX)
    ncolX <- ncol(X)

    maxdim <- qr(X)$rank
       
    ##-----------------------------------------------------------------------
    ##                         Prepare the outputs
    ##-----------------------------------------------------------------------

    ## Yc1 (V in Bougeard et al): was c1
    ## lY (U): was ls
    ## Ajout: de Yco (cov(Y, lX)) -> norme total = eig 
    
    ## lX (T): was li
    ## faX (W*): was Wstar  
   
    ## TlX (Tk): was Tk           
    ## Tfa (Wk): was Wk             Tc1 !!!!!!!!!
    ## Ajout: cov2 (cov^2(lY, Tl1))
    ## XYcoef: (Beta) was beta

    ## bip, bipc
    ## vip, vipc
    
    ## Suppression: W
    ## Suppression: l1
    ## Suppression de C (remplacÃ© par Yco)
    ## Suppression de Ak (remplacÃ© par cov2)
    
    dimlab <- paste("Ax", 1:maxdim, sep = "")
    res <- list(tabX = X, tabY = as.data.frame(Y), nf = nf, lw = ktabX$lw, X.cw = ktabX$cw, blo = ktabX$blo, rank = maxdim, eig = rep(0, maxdim), TL = ktabX$TL, TC = ktabX$TC)

    res$Yc1 <- matrix(0, nrow = ncolY, ncol = maxdim, dimnames = list(colnames(dudiY$tab), dimlab))
    res$lX <- res$lY <- matrix(0, nrow = nr, ncol = maxdim, dimnames = list(row.names(dudiY$tab), dimlab))
    res$cov2 <- Ak <- matrix(0, nrow = nblo, ncol = maxdim, dimnames = list(names(ktabX$blo), dimlab))
    res$Tc1 <- lapply(1:nblo, function(k)  matrix(0, nrow = ncol(Xk[[k]]), ncol = maxdim, dimnames = list(colnames(Xk[[k]]), dimlab)))
    res$TlX <- rep(list(matrix(0, nrow = nr, ncol = maxdim, dimnames = list(row.names(dudiY$tab), dimlab))), nblo)
    res$faX <- matrix(0, nrow = ncolX, ncol = maxdim, dimnames = list(col.names(ktabX), dimlab))
    lX1 <- res$lX
    W <- res$faX
    ##-----------------------------------------------------------------------
    ##     Compute components and loadings by an iterative algorithm
    ##-----------------------------------------------------------------------

    Y <- as.matrix(Y)
    X <- as.matrix(X)
      
    f1 <- function(x) crossprod(x * res$lw, Y)

    for(h in 1 : maxdim) {
        
        ## iterative algorithm
        
        ## Compute the matrix M for the eigenanalysis
        M <- lapply(lapply(Xk, f1), crossprod)
        M <- Reduce("+", M)
        
        ## Compute the loadings V and the components U (Y dataset)
        eig.M <- eigen(M)
        
        if (eig.M$values[1] < sqrt(.Machine$double.eps)) {
            res$rank <- h-1 ## update the rank
            break
        }

        res$eig[h] <- eig.M$values[1]    
        res$Yc1[, h]  <- eig.M$vectors[, 1, drop = FALSE]
        res$lY[, h]  <- Y %*% res$Yc1[, h]
        
        ## Compute the loadings Wk and the components Tk (Xk datasets)
        
        covutk <- rep(0, nblo)   
        for (k in 1 : nblo) {
            res$Tc1[[k]][, h] <- crossprod(Xk[[k]] * res$lw, res$lY[, h])
            res$Tc1[[k]][, h] <- res$Tc1[[k]][, h] / sqrt(sum(res$Tc1[[k]][, h]^2))
            res$TlX[[k]][, h] <- Xk[[k]] %*% res$Tc1[[k]][, h]
            
            covutk[k] <- crossprod(res$lY[, h] * res$lw, res$TlX[[k]][, h])
            res$cov2[k, h] <- covutk[k]^2  
        }
        
        for(k in 1 : nblo) {
            Ak[k, h] <- covutk[k] / sqrt(sum(res$cov2[,h]))
            res$lX[, h]  <- res$lX[, h] + Ak[k, h] * res$TlX[[k]][, h]
        }

        lX1[, h] <- res$lX[, h] / sqrt(sum(res$lX[, h]^2))
        ## use ginv to avoid NA in coefficients (collinear system)
        W[, h]  <- tcrossprod(ginv(crossprod(X)), X) %*% res$lX[, h]
        
        ## Deflation of the Xk datasets on the global components T
        Xk <- lapply(Xk, function(y) stats::lm.wfit(x = as.matrix(res$lX[, h]), y = y, w = res$lw)$residuals)
        X  <- as.matrix(cbind.data.frame(Xk))
    }

    ##-----------------------------------------------------------------------
    ##     Compute regressions coefficients
    ##-----------------------------------------------------------------------
    
    ## Use of the original (and not the deflated) datasets X and Y	
    X <- as.matrix(res$tabX)
    Y <- as.matrix(res$tabY)

    ## Computing the regression coefficients of X onto the global components T (Wstar)
    ## res$faX <- lm.wfit(x = X, y = res$lX, w = res$lw)$coefficients ## lm is not used to avoid NA coefficients in the case of not full rank matrices
    res$faX[, 1] <- W[, 1, drop = FALSE]
    A <- diag(ncolX)
    if(maxdim >= 2){
        for(h in 2:maxdim){
            a <- crossprod(lX1[, h-1], X) / sqrt(sum(res$lX[, h-1]^2))
            A <- A %*% (diag(ncolX) - W[, h-1] %*% a)
            res$faX[, h] <- A %*% W[, h]
            X <- X - tcrossprod(lX1[, h-1]) %*% X
        }
    }  
    ##  Computing the regression coefficients of X onto Y (Beta)
    res$Yco <-  t(Y) %*% diag(res$lw) %*% res$lX
    norm.li <- diag(crossprod(res$lX * sqrt(res$lw)))
    ##res$C  <- t(lm.wfit(x = res$lX, y = Y, w = res$lw)$coefficients)
    ##res$XYcoef <- lapply(1:ncolY, function(x) t(apply(sweep(res$faX, 2 , res$C[x,], "*"), 1, cumsum)))
    res$XYcoef <- lapply(1:ncolY, function(x) t(apply(sweep(res$faX, 2 , res$Yco[x,] / norm.li, "*"), 1, cumsum)))
    names(res$XYcoef) <- colnames(dudiY$tab)
    
    ##-----------------------------------------------------------------------
    ##   		Variable and block importances
    ##-----------------------------------------------------------------------

    ## Block importances
    res$bip <- Ak^2
    
    if (nblo == 1 | res$rank ==1)
        res$bipc <- res$bip
    else 
        res$bipc <- t(sweep(apply(sweep(res$bip, 2, res$eig, "*") , 1, cumsum), 1, cumsum(res$eig), "/"))
   
    ## Variable importances
    WcarreAk <- res$faX^2 * res$bip[rep(1:nblo, ktabX$blo),]
    res$vip  <- sweep(WcarreAk, 2, colSums(WcarreAk), "/")
    if (nblo == 1 | res$rank ==1)
        res$vipc <- res$vip
    else 
        res$vipc <- t(sweep(apply(sweep(res$vip, 2, res$eig, "*") , 1, cumsum), 1, cumsum(res$eig), "/"))
    
    ##-----------------------------------------------------------------------
    ##			         Modify the outputs
    ##-----------------------------------------------------------------------
    
    if (scannf) {
        graphics::barplot(res$eig[1:res$rank])
        cat("Select the number of global components: ")
        res$nf <- as.integer(readLines(n = 1))
        messageScannf(match.call(), res$nf)
    }

    if(res$nf > res$rank)
        res$nf <- res$rank
    
    ## keep results for the nf dimensions (except eigenvalues and lX)
    res$eig <- res$eig[1:res$rank]
    res$lX  <- res$lX[, 1:res$rank]
    res$Tc1 <- do.call("rbind", res$Tc1)
    res$TlX <- do.call("rbind", res$TlX)
    
    res <- utils::modifyList(res, lapply(res[c("Yc1", "Yco", "lY", "Tc1", "TlX", "cov2", "faX", "vip", "vipc", "bip", "bipc")], function(x) x[, 1:res$nf, drop = FALSE]))
    res$XYcoef <- lapply(res$XYcoef, function(x) x[, 1:res$nf, drop = FALSE])
    res$call   <- match.call()
    class(res) <- c("multiblock", "mbpls")
    return(res)
}