## ----setup, include=FALSE------------------------------------------------
knitr::opts_chunk$set(cache=TRUE)


## ---- message=FALSE------------------------------------------------------
library(dslabs)
library(tidyverse)

dataset <- gapminder %>% 
  filter(year == 2012, 
         !is.na(infant_mortality)) %>% 
  select(infant_mortality, 
         life_expectancy, 
         fertility) %>% 
  as.matrix()


## ---- message=FALSE------------------------------------------------------
# Assume we know Sigma
Sigma <- matrix(c(555, -170, 30, -170, 65, -10, 
                  30, -10, 2), ncol = 3)

mu_hat <- colMeans(dataset) 
mu_hat


## ---- message=FALSE------------------------------------------------------
# Test mu = mu_0
mu_0 <- c(25, 50, 3)
test_statistic <- nrow(dataset) * t(mu_hat - mu_0) %*% 
  solve(Sigma) %*% (mu_hat - mu_0)

drop(test_statistic) > qchisq(0.95, df = 3)


## ---- message=FALSE------------------------------------------------------
n <- nrow(dataset); p <- ncol(dataset)

# Test mu = mu_0
mu_0 <- c(25, 50, 3)
test_statistic <- n * t(mu_hat - mu_0) %*% 
  solve(cov(dataset)) %*% (mu_hat - mu_0)

critical_val <- (n - 1)*p*qf(0.95, df1 = p,
                             df2 = n - p)/(n-p)

drop(test_statistic) > critical_val


## ------------------------------------------------------------------------
n <- nrow(dataset); p <- ncol(dataset)

# Test mu = mu_0
mu_0 <- c(25, 50, 3)
test_statistic <- n * t(mu_hat - mu_0) %*% 
  solve(cov(dataset)) %*% (mu_hat - mu_0)

critical_val <- (n - 1)*p*qf(0.95, df1 = p,
                             df2 = n - p)/(n-p)
sample_cov <- diag(cov(dataset))

cbind(mu_hat - sqrt(critical_val*
                      sample_cov/n),
      mu_hat + sqrt(critical_val*
                      sample_cov/n))


## ------------------------------------------------------------------------
U <- matrix(c(1, 0, 0,
              0, 1, 0), 
            ncol = 2)
R <- n*solve(t(U) %*% cov(dataset) %*% U)
transf <- chol(R)


## ------------------------------------------------------------------------
# First create a circle of radius c
theta_vect <- seq(0, 2*pi, length.out = 100)
circle <- sqrt(critical_val) * cbind(cos(theta_vect), sin(theta_vect))
# Then turn into ellipse
ellipse <- circle %*% t(solve(transf)) + 
  matrix(mu_hat[1:2], ncol = 2, 
         nrow = nrow(circle), 
         byrow = TRUE)


## ------------------------------------------------------------------------
# Eigendecomposition
decomp <- eigen(t(U) %*% cov(dataset) %*% U)
first <- sqrt(decomp$values[1]) *
  decomp$vectors[,1] * sqrt(critical_val)
second <- sqrt(decomp$values[2]) * 
  decomp$vectors[,2] * sqrt(critical_val)


## ---- echo = FALSE-------------------------------------------------------
plot(ellipse, type = 'l',
     xlab = colnames(dataset)[1],
     ylab = colnames(dataset)[2])
lines(x = c(0 + mu_hat[1], first[1]/sqrt(n) + mu_hat[1]),
      y = c(0 + mu_hat[2], first[2]/sqrt(n) + mu_hat[2]))
lines(x = c(0 + mu_hat[1], -first[1]/sqrt(n) + mu_hat[1]),
      y = c(0 + mu_hat[2], -first[2]/sqrt(n) + mu_hat[2]))
lines(x = c(0 + mu_hat[1], second[1]/sqrt(n) + mu_hat[1]),
      y = c(0 + mu_hat[2], second[2]/sqrt(n) + mu_hat[2]))
lines(x = c(0 + mu_hat[1], -second[1]/sqrt(n) + mu_hat[1]),
      y = c(0 + mu_hat[2], -second[2]/sqrt(n) + mu_hat[2]))
points(x = mu_hat[1],
       y = mu_hat[2])

# Add 1d projections
axis_proj <- cbind(mu_hat - sqrt(critical_val*
                                   sample_cov/n),
                   mu_hat + sqrt(critical_val*
                                   sample_cov/n))
abline(v = axis_proj[1,], lty = 2)
abline(h = axis_proj[2,], lty = 2)


## ---- message=FALSE------------------------------------------------------
# Let's focus on only two variables
dataset <- gapminder %>% 
  filter(year == 2012, 
         !is.na(infant_mortality)) %>% 
  select(infant_mortality, 
         life_expectancy) %>% 
  as.matrix()

n <- nrow(dataset); p <- ncol(dataset)


## ------------------------------------------------------------------------
alpha <- 0.05
mu_hat <- colMeans(dataset) 
sample_cov <- diag(cov(dataset))

# Simultaneous CIs
critical_val <- (n - 1)*p*qf(1-0.5*alpha, df1 = p,
                             df2 = n - p)/(n-p)

simul_ci <- cbind(mu_hat - sqrt(critical_val*
                                  sample_cov/n),
                  mu_hat + sqrt(critical_val*
                                  sample_cov/n))


## ------------------------------------------------------------------------
# Univariate without correction
univ_ci <- cbind(mu_hat - qt(1-0.5*alpha, n - 1) *
                   sqrt(sample_cov/n),
                 mu_hat + qt(1-0.5*alpha, n - 1) *
                   sqrt(sample_cov/n))


## ------------------------------------------------------------------------
# Bonferroni adjustment
bonf_ci <- cbind(mu_hat - qt(1-0.5*alpha/p, n - 1) *
                   sqrt(sample_cov/n),
                 mu_hat + qt(1-0.5*alpha/p, n - 1) *
                   sqrt(sample_cov/n))


## ------------------------------------------------------------------------
simul_ci
univ_ci
bonf_ci


## ---- echo = FALSE-------------------------------------------------------
transf_mat <- solve(chol(solve(cov(dataset)/n)))
# First create a circle of radius c
theta_vect <- seq(0, 2*pi, length.out = 100)
circle <- sqrt(critical_val) * cbind(cos(theta_vect), sin(theta_vect))
ellipse1 <- circle %*% t(transf_mat)
ellipse2 <- t(apply(ellipse1, 1, function(row) row + mu_hat))
colnames(ellipse2) <- c("X", "Y")

data_ellipse <- data.frame(ellipse2)

bind_rows(
  data.frame(t(simul_ci)) %>% mutate(type = 'T2-intervals'),
  data.frame(t(univ_ci)) %>% mutate(type = 'Unadjusted'),
  data.frame(t(bonf_ci)) %>% mutate(type = 'Bonferroni')
  ) %>% 
  ggplot() +
  geom_polygon(data = data_ellipse,
               aes(X, Y),
               fill = 'grey60')+
  geom_vline(aes(xintercept = infant_mortality,
                 linetype = type)) +
  geom_hline(aes(yintercept = life_expectancy,
                 linetype = type)) +
  theme_minimal() +
  geom_point(x = mu_hat[1],
             y = mu_hat[2],
             size = 2) +
  xlab("Infant mortality") +
  ylab("Life Expectancy") +
  theme(legend.position = "top",
        legend.title=element_blank()) +
  scale_linetype_discrete(breaks = c('T2-intervals',
                                     'Bonferroni',
                                     'Unadjusted'))