## ----setup, include=FALSE------------------------------------------------ knitr::opts_chunk$set(cache=TRUE) options(knitr.kable.NA = '-') ## ---- message = FALSE---------------------------------------------------- # Let's revisit the plastic film data library(heplots) library(tidyverse) Y <- Plastic %>% select(tear, gloss, opacity) %>% as.matrix X <- model.matrix(~ rate, data = Plastic) head(X) ## ------------------------------------------------------------------------ (B_hat <- solve(crossprod(X)) %*% t(X) %*% Y) ## ------------------------------------------------------------------------ # Compare with lm output fit <- lm(cbind(tear, gloss, opacity) ~ rate, data = Plastic) coef(fit) ## ------------------------------------------------------------------------ Y_hat <- fitted(fit) residuals <- residuals(fit) crossprod(Y_hat, residuals) crossprod(X, residuals) # Is this really zero? isZero <- function(mat) { all.equal(mat, matrix(0, ncol = ncol(mat), nrow = nrow(mat)), check.attributes = FALSE) } isZero(crossprod(Y_hat, residuals)) isZero(crossprod(X, residuals)) ## ---- message = FALSE---------------------------------------------------- library(candisc) dataset <- HSB[,c("math", "sci")] (corr_est <- cor(dataset)[1,2]) # Choose a number of bootstrap samples B <- 5000 corr_boot <- replicate(B, { samp_boot <- sample(nrow(dataset), replace = TRUE) dataset_boot <- dataset[samp_boot,] cor(dataset_boot)[1,2] }) quantile(corr_boot, probs = c(0.025, 0.975)) ## ---- message = FALSE---------------------------------------------------- hist(corr_boot, breaks = 50) abline(v = corr_est, col = 'red', lty = 2) ## ---- message=FALSE------------------------------------------------------ B_boot <- replicate(B, { samp_boot <- sample(nrow(Y), replace = TRUE) X_boot <- X[samp_boot,] Y_boot <- Y[samp_boot,] solve(crossprod(X_boot)) %*% t(X_boot) %*% Y_boot }) # The output is a 3-dim array dim(B_boot) B_boot[,,1] # CI for effect of rate on tear quantile(B_boot["rateHigh", "tear",], probs = c(0.025, 0.975)) # CI for effect of rate on gloss quantile(B_boot["rateHigh", "gloss",], probs = c(0.025, 0.975)) # CI for effect of rate on opacity quantile(B_boot["rateHigh", "opacity",], probs = c(0.025, 0.975)) ## ---- message=FALSE, eval = FALSE---------------------------------------- ## library(ggforce) ## ## B_boot["rateHigh",,] %>% ## t() %>% ## as.data.frame() %>% ## ggplot(aes(x = .panel_x, y = .panel_y)) + ## geom_point() + ## geom_autodensity() + ## geom_density2d() + ## facet_matrix(vars(everything()), ## layer.diag = 2, ## layer.upper = 3) ## ---- message=FALSE, echo = FALSE---------------------------------------- # Need to repeat code above with echo = FALSE # In order to get the plot on a separate slide library(ggforce) B_boot["rateHigh",,] %>% t() %>% as.data.frame() %>% ggplot(aes(x = .panel_x, y = .panel_y)) + geom_point() + geom_autodensity() + geom_density2d() + facet_matrix(vars(everything()), layer.diag = 2, layer.upper = 3) ## ------------------------------------------------------------------------ # There is some correlation, but not much B_boot["rateHigh",,] %>% t() %>% cor() ## ------------------------------------------------------------------------ # Recall our model fit <- lm(cbind(tear, gloss, opacity) ~ rate, data = Plastic) new_x <- data.frame(rate = factor("High", levels = c("Low", "High"))) (prediction <- predict(fit, newdata = new_x)) ## ------------------------------------------------------------------------ X <- model.matrix(fit) S <- crossprod(resid(fit))/(nrow(Plastic) - ncol(X)) new_x <- model.matrix(~rate, new_x) quad_form <- drop(new_x %*% solve(crossprod(X)) %*% t(new_x)) # Estimation covariance (est_cov <- S * quad_form) # Forecasting covariance (fct_cov <- S *(1 + quad_form)) ## ------------------------------------------------------------------------ # Estimation CIs cbind(drop(prediction) - 1.96*sqrt(diag(est_cov)), drop(prediction) + 1.96*sqrt(diag(est_cov))) # Forecasting CIs cbind(drop(prediction) - 1.96*sqrt(diag(fct_cov)), drop(prediction) + 1.96*sqrt(diag(fct_cov))) ## ---- warning = FALSE---------------------------------------------------- # Going back to our example full_model <- lm(cbind(tear, gloss, opacity) ~ rate*additive, data = Plastic) anova(full_model, test = "Wilks") %>% broom::tidy() %>% knitr::kable(digits = 3) anova(full_model, test = "Roy") %>% broom::tidy() %>% knitr::kable(digits = 3) ## ---- warning = FALSE---------------------------------------------------- # Fit a model with only rate rate_model <- lm(cbind(tear, gloss, opacity) ~ rate, data = Plastic) # Removing the dfs from approx anova(full_model, rate_model, test = "Wilks") %>% broom::tidy() %>% dplyr::select(-num.Df, -den.Df) %>% knitr::kable(digits = 3) anova(full_model, rate_model, test = "Roy") %>% broom::tidy() %>% dplyr::select(-num.Df, -den.Df) %>% knitr::kable(digits = 3) ## ------------------------------------------------------------------------ # Let's look at the eigenvalues E <- crossprod(residuals(full_model)) H <- crossprod(residuals(rate_model)) - E result <- eigen(H %*% solve(E), only.values = TRUE) result$values[seq_len(2)] ## ----eval = -1----------------------------------------------------------- AIC(full_model) # Error in logLik.lm(full_model) : # 'logLik.lm' does not support multiple responses class(full_model) ## ------------------------------------------------------------------------ logLik.mlm <- function(object, ...) { resids <- residuals(object) Sigma_ML <- crossprod(resids)/nrow(resids) ans <- sum(mvtnorm::dmvnorm(resids, sigma = Sigma_ML, log = TRUE)) df <- prod(dim(coef(object))) + choose(ncol(Sigma_ML) + 1, 2) attr(ans, "df") <- df class(ans) <- "logLik" return(ans) } ## ------------------------------------------------------------------------ logLik(full_model) AIC(full_model) AIC(rate_model) ## ------------------------------------------------------------------------ # Model selection lhs <- "cbind(tear, gloss, opacity) ~" rhs_form <- c("1", "rate", "additive", "rate+additive", "rate*additive") purrr::map_df(rhs_form, function(rhs) { form <- formula(paste(lhs, rhs)) fit <- lm(form, data = Plastic) return(data.frame(model = rhs, aic = AIC(fit), stringsAsFactors = FALSE)) }) ## ---- message = FALSE---------------------------------------------------- library(openintro) model <- lm(cbind(startPr, totalPr) ~ nBids + cond + sellerRate + wheels + stockPhoto, data = marioKart) X <- model.matrix(model) P <- X %*% solve(crossprod(X)) %*% t(X) lev_values <- diag(P) hist(lev_values, 50) ## ------------------------------------------------------------------------ n <- nrow(marioKart) resids <- residuals(model) S <- crossprod(resids)/(n - ncol(X)) S_inv <- solve(S) const <- lev_values/((1 - lev_values)^2*ncol(X)) cook_values <- const * diag(resids %*% S_inv %*% t(resids)) hist(cook_values, 50) ## ------------------------------------------------------------------------ # Cut-off value (cutoff <- qchisq(0.5, ncol(S)*(n - ncol(X)))) which(cook_values > cutoff)