关于stat的笔记 - estimation | Underlake Reading Room

为stat331附录的estimation相关代码.

Simple Linear Regression

y_i =β_0 +β_1*x_i +ε_i, ε_i ∼N(0,σ )

我们通过Least Squares Estimation来estimate $β_0$ 和 $β_1$ .
(省略证明的步骤)

\hat{β_1 ^{LS}} = \frac{\sum (y_i - \bar{y})(x_i - \bar{x})}{\sum (x_i - \bar{x})^2}

\hat{β_1 ^{LS}} = \frac{S_{xy}}{S_{xx}}

\hat{β_0 ^{LS}} = \bar{y} - \hat{β_1 ^{LS}}

Maximum Likelihood Estimation和Least Squares Estimation估算的 $β_0$ 和 $β_1$ 是一样的.

\hat{σ}_{ML}^2=\frac{\sum_{i=1}^ne_i^2}{n}

\hat{σ}^2=\frac{\sum_{i=1}^ne_i^2}{n-2}

当n大于50时,这两个没什么区别.

注: $S_{xy} = \sum (x_i - \bar(x))(y_i - \bar(y))$
同理 $S_{xx} = \sum (x_i - \bar(x))^2$

#--------------------------------------------------------------------------
# lecture 2
#--------------------------------------------------------------------------
## correlation demo:
N <- 2000 ## sample size
b0 <- 0   ## intercept
b1 <- 0.5 ## slope
sig <- 0.2 ## sd of errors (sigma)

## generate data
x <- rnorm(N,0,1) ## covariates
y <- rnorm(N,b0+b1*x,sig) ## outcomes
y2 <- 2*y

par(mfrow=c(1,2))
plot(y~x,xlim=c(-4,4),ylim=c(-3,3)) ## scatterplot
abline(b0,b1,lwd=2,col="blue") ## true line
round(cor(y,x),2) ## correlation

plot(y2~x,xlim=c(-4,4),ylim=c(-3,3)) ## scatterplot
abline(b0,2*b1,lwd=2,col="blue") ## true line
round(cor(y2,x),2) ## correlation



#-------------------------------
### Back to Data
# load Dr. Gladstone's brain data
brain <- read.csv(file = "brainhead.csv")
head(brain) # take a peek at the data
summary(brain) # summarize the data
# create variable names that require less typing
x <- brain$head.size # head size
y <- brain$brain.wgt # brain weight

# histograms
par(mfrow=c(1,2))
hist(x,main="Head Size (cm^3)")
hist(y,main="Brain Weight (grams)")
par(mfrow=c(1,1))

### point estimates for beta0, beta1, and sigma
# prepare some quantities
xbar <- mean(x)
ybar <- mean(y)
Sxy <- sum( (y-ybar) * (x-xbar) )
Sxx <- sum( (x-xbar)^2 )
n <- length(y) # number of observations

# beta1 (MLE)
b1.hat <- cov(y,x)/var(x) # quick way: sample covar. / sample var.
c(b1.hat, Sxy/Sxx) # compare to longer way

# beta0 (MLE)
b0.hat <- ybar - b1.hat * xbar

# sigma2 (Unbiased estimate)
yfit <- b0.hat + b1.hat * x # fitted values
e <- y - yfit # residuals
sig2.hat <- sum(e^2)/(n-2)
sig.hat <- sqrt(sig2.hat)

# display
round(c(b0.hat,b1.hat,sig.hat),3)

# OLS using built-in R functions
# 做model

M <- lm(y ~ x) # fit (l)inear (m)odel
M
# The return object is called a list
typeof(M)
names(M) # see ?lm for meaning of each element

# regression coefficients (betas)
round(M$coefficients,3) # access via list element
round(coef(M),3) # access via accessor function.  see ?lm for others.
round(c(b0.hat, b1.hat),3)

Confidence Intervals

注意confidence intervals分为知道sigma和不知道sigma.
知道sigma用normal distribution,不知道用t distribution.

通过MLE的distribution得知

Z = \frac{\hat{\beta_1} - \beta_1}{\sigma / \sqrt{S_{xx}}} ∼N(0,1)

V = \frac{1}{\sigma^2}\sum_{i=1}^n e_i^2 ∼X_{n-2}^2

在知道sigma的情况下,95%的 $\beta_1 = \hat{\beta_1} \pm 1.96\frac{\sigma}{\sqrt{S_{xx}}}$
1.96是Z常数.

不知道sigma的情况下,我们使用 $\frac{\hat{\beta_1} - \beta_1}{\hat{\sigma} / \sqrt{S_{xx}}} ∼t_{n-2}^2$ ,95%的 $\beta_1 = \hat{\beta_1} \pm q\frac{\hat{\sigma}}{\sqrt{S_{xx}}}$
q由r计算.写作 $t_{1-\alpha/2,n-2}$

1	qt(p = 0.05/2, df = n-2, lower.tail = FALSE)

虽然老师讲了很多的代码但不如直接用r自带的confint built-in function好了.


conf <- 0.95 # size of confidence interval (1-alpha)
qval <- -qt(p = (1-conf)/2, df = n-2) # inverse-cdf of a t(n-2)
qval <- qt(p = (1-conf)/2, df = n-2, lower.tail = FALSE)

# CI for beta1
s1 <- sig.hat / sqrt(Sxx) # standard error

b1.CI <- b1.hat + c(-1,1) * qval * s1 # confidence interval
names(b1.CI) <- c("L", "U")
round(b1.CI,digits=3)


# CI for beta0
s0 <- sig.hat * sqrt(1/n + xbar^2/Sxx) # standard error

b0.CI <- b0.hat + c(-1,1) * qval * s0
names(b0.CI) <- c("L", "U")
round(b0.CI,digits=3)

# Compare to built-in functions
confint(M)

Hypothesis Testing


# H0: beta1 = 0
# test statistic: T = beta1.hat/s1
# T | H0 ~ t_(n-2)
# pval = P( |T| > |Tobs|  |  H0 )

Tobs <- b1.hat/s1 # observed value of test statistic
Tobs

# want P(|T| > |Tobs| | H0)
pval <- pt(q = abs(Tobs),               # |Tobs|
           df = n-2,                    # degrees of freedom
           lower.tail = FALSE)          # return P( T > |Tobs|)
pval <- 2*pval                          # add P( T < |Tobs|)
pval

# compare to built in function
summary(M)

Multiple Linear Regression

也就是多个系数的model.

其中有系数是categorical的,需要特殊处理.


#--- categorical predictors ------------------------------------------------
# data
mercury <- read.csv("fishermen_mercury.csv")
head(mercury)

# predict MeHg as a function of weight and fishpart
M <- lm(MeHg ~ weight + fishpart, data = mercury)
summary(M)

# since fishpart is coded as:
# None = 0, Muscle = 1, Muscle/Whole = 2, Whole = 3
# model assumes increase in expected MeHg from None to Muscle is
# exactly equal to increase in expected MeHg from Muscle to Muscle/Whole.
# Imagine if we encoded: None = 0, M = 10, MW = 11, W = 20?

# the "design matrix" X for the problem
head(model.matrix(M))

#--- fishpart as categorical predictor -------------------------------------

# convert fishpart to non-numeric categorical variable
fishpart2 <- mercury$fishpart
table(fishpart2)

# "levels" of variable fishpart
fplevels <- c("none", "muscle", "muscle_whole", "whole")
# using for-loop
for(ii in 0:3) {
  fishpart2[fishpart2 == ii] <- fplevels[ii+1]
}
# without for-loop
all(fplevels[mercury$fishpart+1] == fishpart2)


# in R, categorical variables are coded as "factor" variables
tmp <- factor(fishpart2)
head(tmp)
levels(tmp) # alphabetical group ordering

fishpart2 <- factor(fishpart2, levels = fplevels) # custom ordering
levels(fishpart2)

mercury$fishpart <- fishpart2 # replace in dataset

# run regression in R:
Mb <- lm(MeHg ~ weight + fishpart, data = mercury) # intercept model (I)
Mc <- lm(MeHg ~ weight + fishpart - 1,         # no-intercept model (0)
         data = mercury)          # note the "-1" to remove intercept



# compare design matrices at selected values
ind <- c(102, 35, 2, 1, 14, 106)

mercury[ind, c("MeHg", "weight", "fishpart")] # original dataset
model.matrix(Mb)[ind,] # with intercept model
model.matrix(Mc)[ind,] # no intercept model

# interpretation of parameters:
coef(Mb)
# fishpartmuscle: estimate of E[MeHg | ...]

coef(Mc)
# fishpartmuscle: estimate of ...

# note that model predictions are identical:
range(predict(Mb) - predict(Mc))

<figure class="highlight r"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line"></span><br><span class="line">&lt;img src=<span class="string">&quot;/blog/2021/05/12/P194/P194_12.png&quot;</span> <span class="built_in">class</span>=<span class="string">&quot;&quot;</span>&gt;</span><br><span class="line"></span><br></pre></td></tr></table></figure>

<p>#— Hypothesis testing —————————————————-</p>
<h1 id="Question-How-to-test-H0-fishpart-not-associated-with-mercury-in-presence-of-weight"><a href="#Question-How-to-test-H0-fishpart-not-associated-with-mercury-in-presence-of-weight" class="headerlink" title="Question: How to test H0: fishpart not associated with mercury in presence of weight?"></a>Question: How to test H0: fishpart not associated with mercury in presence of weight?</h1><p>Mfull &lt;- lm(MeHg ~ fishpart + weight, data = mercury) # full model<br>coef(Mfull)</p>
<h1 id="degrees-of-freedom"><a href="#degrees-of-freedom" class="headerlink" title="degrees of freedom"></a>degrees of freedom</h1><p>dff &lt;- Mfull$df</p>
<h1 id="F-statistic-using-subset-of-MLE-of-full-model-betastar"><a href="#F-statistic-using-subset-of-MLE-of-full-model-betastar" class="headerlink" title="F-statistic using subset of MLE of full model (betastar)"></a>F-statistic using subset of MLE of full model (betastar)</h1><p>betastar.ind &lt;- 2:4 # indices corresponding to beta’s for fishpart<br>betastar.hat &lt;- coef(Mfull)[betastar.ind] # subset of MLE</p>
<h1 id="vcov-Mfull-sigma-hat-2-X’X-1"><a href="#vcov-Mfull-sigma-hat-2-X’X-1" class="headerlink" title="vcov(Mfull) = sigma.hat^2 * (X’X)^{-1}"></a>vcov(Mfull) = sigma.hat^2 * (X’X)^{-1}</h1><p>betastar.ve &lt;- vcov(Mfull)[betastar.ind,betastar.ind] # betastar.ve = sigma.hat^2 * V</p>
<h1 id="long-hand-calculation"><a href="#long-hand-calculation" class="headerlink" title="long hand calculation"></a>long hand calculation</h1><p>X &lt;- model.matrix(Mfull) # design matrix for full model<br>V &lt;- solve(crossprod(X))[betastar.ind,betastar.ind] # crossprod(X) = t(X) %*% X<br>range(betastar.ve - sigma(Mfull)^2 * V)</p>
<h1 id="Fobs-betastar-hat’-V-1-betastar-hat-q-sigma-hat-2"><a href="#Fobs-betastar-hat’-V-1-betastar-hat-q-sigma-hat-2" class="headerlink" title="Fobs = (betastar.hat’ V^{-1} betastar.hat / q) / sigma.hat^2"></a>Fobs = (betastar.hat’ V^{-1} betastar.hat / q) / sigma.hat^2</h1><h1 id="note-betastar-ve-already-includes-the-denominator"><a href="#note-betastar-ve-already-includes-the-denominator" class="headerlink" title="note: betastar.ve already includes the denominator"></a>note: betastar.ve already includes the denominator</h1><p>Fobs &lt;- t(betastar.hat) %*% solve(betastar.ve, betastar.hat) / length(betastar.hat)</p>
<h1 id="p-value-large-values-of-Fobs-are-more-evidence-against-H0"><a href="#p-value-large-values-of-Fobs-are-more-evidence-against-H0" class="headerlink" title="p-value: large values of Fobs are more evidence against H0"></a>p-value: large values of Fobs are more evidence against H0</h1><p>pf(Fobs, df1 = length(betastar.hat), df2 = dff, lower.tail = FALSE)</p>

其中系数也可能有nolinear的情况.

插播stat444老师的名言:
all models are false but some are useful.

在Assignment1中的例子,按照scatter plot看出跟exponential像,所以plot exponential.

log.sigma2.y = log(sigma2.y)
plot(Week.x, log.sigma2.y , pch=19, cex.axis = 1.5, cex.lab = 1.5, ylim=c(0,12))
var.model = lm(log.sigma2.y ~ Week.x) # exponential
abline(var.model , col="blue" , lwd = 1.5) #从log(y)中求出系数
summary(model)

plot(Week.x, sigma2.y , pch=19, cex.axis = 1.5, cex.lab = 1.5, ylim=c(0,36000))
var.model.exp = function(x,
  a0 = var.model$coeffient[1],
  a1 = var.model$coeffient[2]){
    return(exp(a0+a1*x))
  } # 提取a0和a1
x.plot = seq(1, 17, length = 100)
y.plot = var.model.exp(x.plot)
lines(y.plot~x.plot, col="blue", lwd=1.5) #再plot

从R-squared中看出数据比较大(偏差比较大),model可以再优化.

ANOVA

(这个在stat系列中有记载过,再次备份一下)
ANOVA: How much of the variability in Y is explained by our model?

### LECTURE 11
#--- first make categorical predictors (from previous lectures) ------------------------------------------------
# data
mercury <- read.csv("fishermen_mercury.csv")
head(mercury)

# predict MeHg as a function of weight and fishpart
M <- lm(MeHg ~ weight + fishpart, data = mercury)
summary(M)

# since fishpart is coded as:
# None = 0, Muscle = 1, Muscle/Whole = 2, Whole = 3
# model assumes increase in expected MeHg from None to Muscle is
# exactly equal to increase in expected MeHg from Muscle to Muscle/Whole.
# Imagine if we encoded: None = 0, M = 10, MW = 11, W = 20?

# the "design matrix" X for the problem
head(model.matrix(M))

#--- fishpart as categorical predictor -------------------------------------

# convert fishpart to non-numeric categorical variable
fishpart2 <- mercury$fishpart
table(fishpart2)

# "levels" of variable fishpart
fplevels <- c("none", "muscle", "muscle_whole", "whole")
# using for-loop
for(ii in 0:3) {
  fishpart2[fishpart2 == ii] <- fplevels[ii+1]
}
# without for-loop
all(fplevels[mercury$fishpart+1] == fishpart2)


# in R, categorical variables are coded as "factor" variables
tmp <- factor(fishpart2)
head(tmp)
levels(tmp) # alphabetical group ordering

fishpart2 <- factor(fishpart2, levels = fplevels) # custom ordering
levels(fishpart2)

mercury$fishpart <- fishpart2 # replace in dataset



#--- ANOVA ----------------------------------------------------

# Fit model
M1 <- lm(MeHg ~ fishpart + weight, data = mercury) #

## ANOVA Decomposition
SSTotal <- sum((mercury$MeHg-mean(mercury$MeHg))^2)
SSReg <- sum((fitted(M1)-mean(mercury$MeHg))^2)
R2 <- SSReg/SSTotal
SSRes <- sum((mercury$MeHg-fitted(M1))^2)
round(SSTotal,10)==round(SSReg+SSRes,10)

# F test for all covariates
# testing \beta_1=\beta_2=\beta_3=\beta_4=0
n <- nrow(mercury)
Fobs <- (SSReg/4)/(SSRes/(n-5)) ## by hand
pf(Fobs,4,n-5,lower.tail=F)
summary(M1) ## R fn



#--- F test  ----------------------------------------------------

# Question: How to test H0: fishpart has no effect in presence of weight?

# full model:
Mfull <- lm(MeHg ~ factor(fishpart) + weight, data = mercury)

# reduced model: all levels of fishpart have same intercept
Mred <- lm(MeHg ~ weight, data = mercury)

# degrees of freedom in full and reduced models
dff <- Mfull$df
dfr <- Mred$df

# F-statistic, using LR definition
SSRes_full <- sum(residuals(Mfull)^2) # sum-of-squares for Mfull
SSRes_red <- sum(residuals(Mred)^2) # sum-of-squares for Mred
Fobs <- (SSRes_red-SSRes_full)/(dfr-dff) / (SSRes_full/dff)
c(Fobs, (SSRes_red-SSRes_full)/(dfr-dff) / sigma(Mfull)^2) # equivalent denominator

# F-statistic using subset of MLE of full model
beta.ind <- 2:4 # indices corresponding to beta's for fishpart
beta.hat <- coef(Mfull)[beta.ind] # subset of MLE
beta.ve <- vcov(Mfull)[beta.ind,beta.ind] # beta.ve = sigma.hat^2 * V
Fobs2 <- t(beta.hat) %*% solve(beta.ve, beta.hat) / length(beta.hat)

# check two version are identical
c(Fobs, Fobs2)

# p-value: large values of Fobs are more evidence against H0
pf(Fobs, df1 = (dfr-dff), df2 = dff, lower.tail = FALSE)

# fully R version:
anova(Mred, Mfull)

Goodness of Fit

测试一个model是否合适可以通过测试以下数据:

R^2

,where

R^2 = \frac{SSR_{eg}}{SST_{ot}} = 1 - \frac{SSR_{es}}{SST_{ot}}

Interpretation: proportion of variability explained by the model.
Problem: will never decrease when adding more variables.

因此可以测试Adjusted $R^2$ .

此外也可以测试AIC.


library(mvtnorm)
nn <- 200 # n obs
n.cov <- 20 # 20 covs
n.cov.plot <- 20
X <- rmvnorm(nn,rep(0,n.cov),diag(rep(1,n.cov)))
## generate y as function of first 4 covs
y <- X[,1]+X[,2]+X[,3]+X[,4]+rnorm(nn,0,1)
dat <- data.frame(y,X)
# calculate R^2 and R^2_adj upon adding in garbage predictors one at a time
R2 <-  rep(NA,n.cov.plot-1)
R2a <- rep(NA, n.cov.plot-1)
aic <- rep(NA, n.cov.plot-1)
bic <- rep(NA, n.cov.plot-1)
mse <- rep(NA, n.cov.plot-1)
for(ii in 2:n.cov.plot) {
  M <- lm(y ~ ., data = dat[,1:ii])
  R2[ ii-1] <- summary(M)$r.squared
  R2a[ii-1] <- summary(M)$adj.r.squared
  aic[ii-1] <- AIC(M)
  bic[ii-1] <- BIC(M)
  mse[ii-1] <- sigma(M)  
}



plot(2:n.cov.plot, mse, xlab = "# of Predictors", ylab = "",
     main = "MSE",
     pch = 16, type = "b")

plot(2:n.cov.plot, R2, xlab = "# of Predictors", ylab = "",
     main = "Coefficient of Determination",
     pch = 16, type = "b", ylim = range(R2, R2a))
points(2:n.cov.plot, R2a, pch = 16, type = "b", col = "red")
legend(x = "bottomright", legend = expression(R^2, R[adj]^2),
       fill = c("black", "red"))

plot(2:n.cov.plot, aic, xlab = "# of Predictors", ylab = "",
     main = "Criteria",pch = 16, type = "b", ylim = range(aic, bic))
points(2:n.cov.plot, bic, pch = 16, type = "b", col = "blue")
legend(x = "bottomright", legend = expression(AIC, BIC),
       fill = c("black", "blue"))

Automatic Selection

一个model中有多个系数,其中建立model的方式有 将所有的系数一个一个加入 和 一个一个减去 两种,分为Forward Selection和Backward Elimination.

Forward Selection:

Backward Elimination:

还有一种是combine了forward和backward,Stepwise Selection.

# ------ automatic selection ------------------


# forward, backward, and stepwise selection

# high school and beyond dataset
hsb <- read.csv("HSB2.csv")
hsbm <- hsb[,c(13, 1:6, 10, 7:9)] # math response only

summary(hsbm)

# bounds for model selection
M0 <- lm(math ~ 1, data = hsbm) # minimal model

Mfull <- lm(math ~ ., data= hsbm) # full model

# forward
system.time({
Mfwd <- step(object = M0, # base model
             scope = list(lower = M0, upper = Mfull), # smallest and largest model
             trace = 1, # trace prints out information
             direction = "forward" )
})

# backward
system.time({
Mback <- step(object = Mfull, # base model
              scope = list(lower = M0, upper = Mfull),
              direction = "backward", trace = 1)
})

# stepwise (both directions)
system.time({
Mstep <- step(object = M0,
              scope = list(lower = M0, upper = Mfull),
              direction = "both", trace = 1)
})

Cross-validation

cross validation的思想是把数据分组,一部分作为training set,来fit model,一部分作为test set,用来测试fit的model的误差.

k-fold cross validation是其中的一种:将k-1组作为traning set.

leave-one-out cross validation则除了一个数据外将全部数据作为traning set,再计算其中的误差.

load data and fit models from lec 14:


# high school and beyond dataset
hsb <- read.csv("HSB2.csv")
hsbm <- hsb[,c(13, 1:6, 10, 7:9)] # math response only

summary(hsbm)

# bounds for model selection
M0 <- lm(math ~ 1, data = hsbm) # minimal model

# full model:
# 1. remove all interactions with career, but leave as main effect
# 2. remove the interaction between lang and minor
# 3. add nonlinear effects for continuous variables locus, concept, mot
Mfull <- lm(math ~ (.-career)^2 + career - lang:minor +    
            I(locus^2) + I(concept^2) + I(mot^2),
            data= hsbm)
length(coef(Mfull))
anyNA(coef(Mfull))

df.penalty <- 2 # this is the k penalty
## 2 corresponds to AIC
## log(n) corresponds to BIC

# forward
system.time({
  Mfwd <- step(object = M0, # base model
               scope = list(lower = M0, upper = Mfull), # smallest and largest model
               direction = "forward",
               trace = 0, # trace prints out information
               k = df.penalty
  )
})

# backward
system.time({
  Mback <- step(object = Mfull, # base model
                scope = list(lower = M0, upper = Mfull),
                direction = "backward", trace = 0, k = df.penalty)
})

# stepwise (both directions)
Mstart <- lm(math ~ ., data = hsbm) # starting point model: main effects only
system.time({
  Mstep <- step(object = Mstart,
                scope = list(lower = M0, upper = Mfull),
                direction = "both", trace = 1, k = df.penalty)
})


# compare Mfull to Mstep
M1 <- Mfull
M2 <- Mstep
Mnames <- expression(M[FULL], M[STEP])

# number of cross-validation replications
nreps <- 1e3

ntot <- nrow(hsbm) # total number of observations
ntrain <- 500 # for fitting MLE's
ntest <- ntot-ntrain # for out-of-sample prediction

# storage space
mspe1 <- rep(NA, nreps) # mspe for M1
mspe2 <- rep(NA, nreps) # mspe for M2

system.time({
  for(ii in 1:nreps) {
    if(ii%%100 == 0) message("ii = ", ii)

    train.ind <- sample(ntot, ntrain) # training observations
    # long-form cross-validation
    ## M1.cv <- lm(math ~ read + prog + race + ses + locus + read:prog + prog:ses,
    ##         data = hsbm, subset = train.ind)
    ## M2.cv <- lm(math ~ race + ses + sch + prog + locus + concept +
    #           mot + read + ses:sch + ses:concept + prog:read,
    #         data = hsbm, subset = train.ind)
    # using R functions
    M1.cv <- update(M1, subset = train.ind)
    M2.cv <- update(M2, subset = train.ind)
    # cross-validation residuals
    M1.res <- hsbm$math[-train.ind] - # test observations
      predict(M1.cv, newdata = hsbm[-train.ind,]) # prediction with training data
    M2.res <- hsbm$math[-train.ind] -predict(M2.cv, newdata = hsbm[-train.ind,])
    # mspe for each model
    mspe1[ii] <- mean(M1.res^2)
    mspe2[ii] <- mean(M2.res^2)

  }
})

# compare
par(mfrow = c(1,2))
cex <- 1
boxplot(x = list(mspe1, mspe2), names = Mnames,
        main = "MSPE",
        #ylab = expression(sqrt(bar(SSE)[CV])),
        ylab = expression(MSPE),
        col = c("yellow", "orange"),
        cex = cex, cex.lab = cex, cex.axis = cex, cex.main = cex)
boxplot(x = list(sqrt(mspe1), sqrt(mspe2)), names = Mnames,
        main = "Root MSPE",
        ylab = expression(sqrt(MSPE)),
        ## ylab = expression(SSE[CV]),
        col = c("yellow", "orange"),
        cex = cex, cex.lab = cex, cex.axis = cex, cex.main = cex)



# compare predictions by training set
par(mfrow=c(1,1))
plot(mspe1, mspe2, pch = 16,
     xlab = Mnames[1], ylab = Mnames[2],
     main = "")
abline(a = 0, b = 1, col= "red", lwd = 2)


#--- K-fold cross-validation -----------------------------------------------------

# compare Mfwd to Mstep
M1 <- Mfwd
M2 <- Mstep
Mnames <- expression(M[FWD], M[STEP])

ntot <- nrow(hsbm) # total number of observations

# number of cross-validation replications
Kfolds <- 10

hsbm <- hsbm[sample(ntot),] # permute rows
hsbm$index <- rep(1:Kfolds,each=ntot/Kfolds)

# storage space
mspe1 <- rep(NA, Kfolds) # mspe for M1
mspe2 <- rep(NA, Kfolds) # mspe for M2

system.time({
  for(ii in 1:Kfolds) {

    train.ind <- which(hsbm$index!=ii) # training observations


    # using R functions
    M1.cv <- update(M1, subset = train.ind)
    M2.cv <- update(M2, subset = train.ind)
    # cross-validation residuals
    M1.res <- hsbm$math[-train.ind] - # test observations
      predict(M1.cv, newdata = hsbm[-train.ind,]) # prediction with training data
    M2.res <- hsbm$math[-train.ind] -predict(M2.cv, newdata = hsbm[-train.ind,])
    # mspe for each model
    mspe1[ii] <- mean(M1.res^2)
    mspe2[ii] <- mean(M2.res^2)

  }
})


# compare
par(mfrow = c(1,2))
cex <- 1
boxplot(x = list(mspe1, mspe2), names = Mnames,
        main = "MSPE",
        #ylab = expression(sqrt(bar(SSE)[CV])),
        ylab = expression(MSPE),
        col = c("yellow", "orange"),
        cex = cex, cex.lab = cex, cex.axis = cex, cex.main = cex)
boxplot(x = list(sqrt(mspe1), sqrt(mspe2)), names = Mnames,
        main = "Root MSPE",
        ylab = expression(sqrt(MSPE)),
        ## ylab = expression(SSE[CV]),
        col = c("yellow", "orange"),
        cex = cex, cex.lab = cex, cex.axis = cex, cex.main = cex)


mean(mspe1)
mean(mspe2)



PRESS1 <- resid(M1)/(1-hatvalues(M1))
PRESS2 <- resid(M2)/(1-hatvalues(M2))

# should match if doing LOO-CV
mean(PRESS1^2)
mean(PRESS2^2)

k-fold cross validation stat444加强版:

这里也包括了LOOCV.
一般k-fold有更好的bais和variance trace off,但LOOCV会更快.

# A function to generate the indices of the k-fold sets
kfold <- function(N, k=N, indices=NULL){
  # get the parameters right:
  if (is.null(indices)) {
    # Randomize if the index order is not supplied
    indices <- sample(1:N, N, replace=FALSE)
  } else {
    # else if supplied, force N to match its length
    N <- length(indices)
  }
  # Check that the k value makes sense.
  if (k > N) stop("k must not exceed N")
  #

  # How big is each group?
  gsize <- rep(round(N/k), k)

  # For how many groups do we need odjust the size?
  extra <- N - sum(gsize)

  # Do we have too few in some groups?
  if (extra > 0) {
    for (i in 1:extra) {
      gsize[i] <- gsize[i] +1
    }
  }
  # Or do we have too many in some groups?
  if (extra < 0) {
    for (i in 1:abs(extra)) {
      gsize[i] <- gsize[i] - 1
    }
  }

  running_total <- c(0,cumsum(gsize))

  # Return the list of k groups of indices
  lapply(1:k,
         FUN=function(i) {
           indices[seq(from = 1 + running_total[i],
                       to = running_total[i+1],
                       by = 1)
                   ]
         }
  )
}


# A function to form the k samples
getKfoldSamples <- function (x, y, k, indices=NULL){
  groups <- kfold(length(x), k, indices)
  Ssamples <- lapply(groups,
                     FUN=function(group) {
                       list(x=x[-group], y=y[-group])
                     })
  Tsamples <- lapply(groups,
                     FUN=function(group) {
                       list(x=x[group], y=y[group])
                     })
  list(Ssamples = Ssamples, Tsamples = Tsamples)
}


set.seed(1234)

# Simulating the data
x = seq(1,10,length=50)
y = 5+log(x)*sin(.65*x)-2*cos(x-5)/(sqrt(x)) + rnorm(50,mean=0,sd=1) #+ c(rnorm(22,mean=0,sd=1) , rnorm(28,mean=0,sd=.4))
SimulatedData = data.frame(x,y)
plot(y~x,pch=19 , col=adjustcolor("brown",0.7) , cex.axis=1.5 , cex.lab=1.5)

# For leave one out cross-validation
samples_loocv <-  getKfoldSamples(SimulatedData$x, SimulatedData$y, k=length(SimulatedData$y))
# 10 fold cross-validation
samples_10fold <-  getKfoldSamples(SimulatedData$x, SimulatedData$y, k=10)
# 5 fold cross-validation
samples_5fold <-  getKfoldSamples(SimulatedData$x, SimulatedData$y, k=5)


# the degrees of freedom associated with each
complexity <- c(1:10) # Thiese are the degrees of polynomials to be fitted

# Performing the Cross-Validation
Ssamples <- samples_5fold$Ssamples # change this accorcing to the number of folds
Tsamples <- samples_5fold$Tsamples # change this accorcing to the number of folds
CV.To.Plot = data.frame(Complexity=NA , MSE=NA)
for(i in 1:length(complexity)){
  MSE = c()
  for(j in 1:length(Ssamples)){
    x.temp = Ssamples[[j]]$x
    y.temp = Ssamples[[j]]$y
    model = lm(y.temp~poly(x.temp , complexity[i]))
    pred = predict(model , newdata=data.frame(x.temp=Tsamples[[j]]$x))
    MSE[j] = mean((Tsamples[[j]]$y-pred)^2)
  }
  CV.To.Plot[i,] = c(complexity[i] , mean(MSE))
}

Title.Graph = "5-fold CV" # change this accorcing to the number of folds
plot(CV.To.Plot , pch=19 , col="darkblue" , type="b",
     cex.axis = 1.5 , cex.lab=1.5 , ylab="Overall CV Error")
indx = which.min(CV.To.Plot$MSE)
abline(v=indx ,lty=2 , lwd=2 , col='red')
title(main=Title.Graph)

plot(y~x,pch=19 , col=adjustcolor("brown",0.7) , cex.axis=1.5 , cex.lab=1.5)
curve(5+log(x)*sin(.65*x)-2*cos(x-5)/(sqrt(x)),
      xlim = c(1, 10), col = "black", add = TRUE)
lines(x, predict(lm(y~poly(x,4))), type="l", col="blue", lwd=2)
text(8,7,"Black: True")
text(8,6.5,"Blue: Fitted" , col="blue")
title(main="")

Linear Regression Assumptions:

Linearity
Independence
Normality
Equal variance (homoskedasticity)

code for checking them:


### --------Diagnostics in Practice-----------------
library(car)


# high school and beyond dataset
hsb <- read.csv("HSB2.csv")
hsbm <- hsb[1:150,c(13, 1:6, 10, 7:9)] # math response only

## look at data
summary(hsbm)


## fit model
M1 <- lm(math ~ locus + concept + mot , data = hsbm)

## residuals
res1 <- resid(M1) # raw residuals
stud1 <- res1/(sigma(M1)*sqrt(1-hatvalues(M1))) # studentized residuals

## plot distribution of studentized residuals
hist(stud1,breaks=12,
     probability=TRUE,xlim=c(-4,4),
     xlab="Studentized Residuals",
     main="Distribution of Residuals")
grid <- seq(-3.5,3.5,by=0.05)
lines(x=grid,y=dnorm(grid),col="blue") # add N(0,1) pdf

## qqplot of studentized residuals
qqnorm(stud1)
abline(0,1) # add 45 degree line
# 如果qqplot完全normal就会跟45度线重合.

## plot of residuals vs X
plot(res1~hsbm$locus,
     xlab="Locus.",
     ylab="Residuals",
     main="Residuals vs X")
abline(0,0) ## add horizontal line
plot(res1~hsbm$concept,
     xlab="Concept.",
     ylab="Residuals",
     main="Residuals vs X")
abline(0,0) ## add horizontal line
plot(res1~hsbm$mot,
     xlab="Motiv.",
     ylab="Residuals",
     main="Residuals vs X")
abline(0,0) ## add horizontal line

## partial regression (aka added variable plots)
avPlots(M1)

## plot of studentized residuals vs fitted values
plot(stud1~fitted(M1),
     xlab="Fitted Vals",
     ylab="Studentized Residuals",
     main="Residuals vs Fitted")


## standard residual plots
plot(M1)


##--------Practice Q2 :Simulations------------

## load packages
library(mvtnorm)

## set seed for reproducibility
set.seed(1000)

## set parameters
n <- 300
sig <- 1.5

## generate 4 covariates
X <- data.frame(rmvnorm(n,sigma=matrix(0.5,4,4)+diag(rep(1,4),4,4)))

## generate outcome under different models
y <- rnorm(n,-2+0.8*X$X1,sig) ## linear assoc.
y2 <- rnorm(n,-2+0.8*X$X1^2,sig) ## non-linear assoc.
y3 <- -2+0.8*X$X1+runif(n,-2,2) ## non-gaussian errors
y4 <- rnorm(n,-2+0.8*X$X1,abs((4+X$X1))) ## mean-variance relationship
y5 <- rnorm(n,-2+0.8*X$X1+0.8*X$X1^2+0.4*X$X2+1.2*X$X3,sig) ## MLR

## combine data into data.frame
dat <- data.frame(X,y,y2,y3,y4,y5)


#------Linear Assoc. with Gaussian Errors -------------
## SLR
M1 <- lm(y~X1,data=dat)

## compute  residuals
res1 <- resid(M1) # raw residuals
stud1 <- res1/(sigma(M1)*sqrt(1-hatvalues(M1))) # studentized residuals

## plot distribution of studentized residuals
hist(stud1,breaks=12,
     probability=TRUE,xlim=c(-4,4),
     xlab="Studentized Residuals",
     main="Distribution of Residuals")
grid <- seq(-3.5,3.5,by=0.05)
lines(x=grid,y=dnorm(grid),col="blue") # add N(0,1) pdf

## qqplot of studentized residuals
qqnorm(stud1)
abline(0,1) # add 45 degree line

## plot of residuals vs X
plot(res1~X$X1,
     xlab=expression(X[1]),
     ylab="Residuals",
     main="Residuals vs X")

## plot of studentized residuals vs fitted values
plot(stud1~fitted(M1),
     xlab="Fitted Vals",
     ylab="Studentized Residuals",
     main="Residuals vs Fitted")


#--------Quad. Assoc. and Gaussian Errors-----------------

## SLR (assuming linear assoc.)
M2 <- lm(y2~X1,data=dat)

## compute  residuals
res2 <- resid(M2) # raw residuals
stud2 <- res2/(sigma(M2)*sqrt(1-hatvalues(M2))) # studentized residuals

## plot studentized residuals
hist(stud2,breaks=20,
     probability=TRUE,xlim=c(-4,4),
     xlab="Studentized Residuals",
     main="Distribution of Residuals")
grid <- seq(-3.5,3.5,by=0.05)
lines(x=grid,y=dnorm(grid),col="blue") # add N(0,1) pdf

## qqplot
qqnorm(stud2)
abline(0,1) # add 45 degree line

## plot of residuals vs X
plot(res2~X$X1,
     xlab=expression(X[1]),
     ylab="Residuals",
     main="Residuals vs X")

## plot of studentized residuals vs fitted values
plot(stud2~fitted(M2),
     xlab="Fitted Vals",
     ylab="Studentized Residuals",
     main="Residuals vs Fitted")



#------Non-Gaussian Residuals -------------
## SLR, assuming gaussian errors
M3 <- lm(y3~X1,data=dat)

## compute  residuals
res3 <- resid(M3) # raw residuals
stud3 <- res3/(sigma(M3)*sqrt(1-hatvalues(M3))) # studentized residuals

## plot distribution of studentized residuals
hist(stud3,breaks=12,
     probability=TRUE,xlim=c(-4,4),
     xlab="Studentized Residuals",
     main="Distribution of Residuals")
grid <- seq(-3.5,3.5,by=0.05)
lines(x=grid,y=dnorm(grid),col="blue") # add N(0,1) pdf

## qqplot of studentized residuals
qqnorm(stud3)
abline(0,1) # add 45 degree line

## plot of residuals vs X
plot(res3~X$X1,
     xlab=expression(X[1]),
     ylab="Residuals",
     main="Residuals vs X")

## plot of studentized residuals vs fitted values
plot(stud3~fitted(M3),
     xlab="Fitted Vals",
     ylab="Studentized Residuals",
     main="Residuals vs Fitted")



#------Heteroskedasticity-------------
## SLR
M4 <- lm(y4~X1,data=dat)

## compute  residuals
res4 <- resid(M4) # raw residuals
stud4 <- res4/(sigma(M4)*sqrt(1-hatvalues(M4))) # studentized residuals

## plot distribution of studentized residuals
hist(stud4,breaks=12,
     probability=TRUE,xlim=c(-4,4),
     xlab="Studentized Residuals",
     main="Distribution of Residuals")
grid <- seq(-3.5,3.5,by=0.05)
lines(x=grid,y=dnorm(grid),col="blue") # add N(0,1) pdf

## qqplot of studentized residuals
qqnorm(stud4)
abline(0,1) # add 45 degree line

## plot of residuals vs X
plot(res4~X$X1,
     xlab=expression(X[1]),
     ylab="Residuals",
     main="Residuals vs X")

## plot of studentized residuals vs fitted values
plot(stud4~fitted(M4),
     xlab="Fitted Vals",
     ylab="Studentized Residuals",
     main="Residuals vs Fitted")




#------Linear Assoc. with Gaussian Errors -------------
## SLR
M5 <- lm(y5~X1+X2+X3+X4,data=dat)

## compute  residuals
res5 <- resid(M5) # raw residuals
stud5 <- res5/(sigma(M5)*sqrt(1-hatvalues(M5))) # studentized residuals

## plot distribution of studentized residuals
hist(stud5,breaks=12,
     probability=TRUE,xlim=c(-4,4),
     xlab="Studentized Residuals",
     main="Distribution of Residuals")
grid <- seq(-3.5,3.5,by=0.05)
lines(x=grid,y=dnorm(grid),col="blue") # add N(0,1) pdf

## qqplot of studentized residuals
qqnorm(stud5)
abline(0,1) # add 45 degree line


## plot of studentized residuals vs fitted values
plot(stud5~fitted(M5),
     xlab="Fitted Vals",
     ylab="Studentized Residuals",
     main="Residuals vs Fitted")

## plot of residuals vs X1,X2,X3,X4
plot(res5~X$X1,
     xlab=expression(X[1]),
     ylab="Residuals",
     main="Residuals vs X")
plot(res5~X$X2,
     xlab=expression(X[2]),
     ylab="Residuals",
     main="Residuals vs X")
plot(res5~X$X3,
     xlab=expression(X[3]),
     ylab="Residuals",
     main="Residuals vs X")
plot(res5~X$X4,
     xlab=expression(X[4]),
     ylab="Residuals",
     main="Residuals vs X")


## plot of residuals vs X1,X2,X3,X4
My <- lm(y5~X2+X3+X4,data=dat)
Mx <- lm(X1~X2+X3+X4,data=dat)
plot(My$residuals~Mx$residuals,
     xlab=expression(e[x[1]]),
     ylab=expression(e[y]),
     main="Partial Regression Plot")

avPlots(M5)

在不满足的情况下修改model:

Linearity

Instead include log(xj)
Instead use a quadratic model (xj and xj2)

Independence

Estimates are still unbiased but standard errors are broken
=⇒ replace SEs with robust alternatives (sandwich form, GEE)
Explicitly model the dependence structure • Mixed effects models

Normality

Violations of normality might not be a big deal

E.g. if we have a large sample size
However, Normality is required for valid prediction intervals
Could consider transforming Y
E.g. model log(Y)
This again changes interpretations!
Might not be a problem if we’re interested in prediction
Could consider other regression approaches: GLMs, etc. (not covered in this course)

Equal variance (homoskedasticity)

If our errors are heteroskedastic, we have a few options:

Transform outcome (see above)
Variance stabilizing transform
Weighted Least Squares
Bootstrap (time permitting…)

Weighted Least Squares:

## load packages
library(mvtnorm)

## set seed for reproducibility
set.seed(1000)

## set parameters
n <- 300
sig <- 1.5

## generate 4 covariates
X <- data.frame(rmvnorm(n,sigma=matrix(0.5,4,4)+diag(rep(1,4),4,4)))

## generate outcome under different models
# y <- rnorm(n,-2+0.8*X$X1,sig) ## linear assoc.
y2 <- rnorm(n,-2+0.8*X$X1^2,sig) ## non-linear assoc.
y4 <- rnorm(n,-2+0.8*X$X1,abs((4+X$X1))) ## mean-variance relationship

## combine data into data.frame
dat <- data.frame(X,y,y2,y4)

#--------------------------------
## Linear
Mlin <- lm(y2~X1,data=dat)

## compute  residuals
reslin <- resid(Mlin) # raw residuals
studlin <- reslin/(sigma(Mlin)*sqrt(1-hatvalues(Mlin))) # studentized residuals


## plot of residuals vs X
plot(reslin~X$X1,
     xlab=expression(X[1]),
     ylab="Residuals",
     main="Residuals vs X")

## plot of residuals vs X^2
plot(reslin~I(X$X1^2),
     xlab=expression(X[1]^2),
     ylab="Residuals",
     main="Residuals vs X^2")


#--------------------------------
## Quadratic
Mquad <- lm(y2~X1+I(X1^2),data=dat)

## compute  residuals
resquad <- resid(Mquad) # raw residuals
studquad <- resquad/(sigma(Mquad)*sqrt(1-hatvalues(Mquad))) # studentized residuals

## plot of residuals vs X
plot(resquad~X$X1,
     xlab=expression(X[1]),
     ylab="Residuals",
     main="Residuals vs X")

## plot of residuals vs X^2
plot(resquad~I(X$X1^2),
     xlab=expression(X[1]^2),
     ylab="Residuals",
     main="Residuals vs X^2")



### --------Iteratively Re-Weighted Least Squares--------

#--------------------------------
## OLS
Mols <- lm(y4~X1,data=dat)

## compute  residuals
resols <- resid(Mols) # raw residuals
studols <- resols/(sigma(Mols)*sqrt(1-hatvalues(Mols))) # studentized residuals


## plot of studentized residuals vs fitted values
plot(studols~fitted(Mols),
     xlab="Fitted Vals",
     ylab="Studentized Residuals",
     main="Residuals vs Fitted")

#--------------------------------
## WLS
# step 1
# initial estimates (OLS)
Mcurrent <- lm(y4~X1,data=dat)
Mcurrent$coef

# step 2
# model |e|
abs_resid <- abs(Mcurrent$residuals)
Mresid <- lm(abs_resid~ fitted(Mcurrent))
w <- 1/fitted(Mresid)^2
# fit WLS
Mwls <- lm(y4~X1,weights=w,data=dat)
Mwls$coef

# step 3
Mcurrent <- Mwls
# model |e|
abs_resid <- abs(Mcurrent$residuals)
Mresid <- lm(abs_resid~ fitted(Mcurrent))
w <- 1/fitted(Mresid)^2
# fit WLS
Mwls <- lm(y4~X1,weights=w,data=dat)
Mwls$coef

# step 4
Mcurrent <- Mwls
# model |e|
abs_resid <- abs(Mcurrent$residuals)
Mresid <- lm(abs_resid~ fitted(Mcurrent))
w <- 1/fitted(Mresid)^2
# fit WLS
Mwls <- lm(y4~X1,weights=w,data=dat)
Mwls$coef


### automatically:
sumdiff <- 100 ## initial val
threshold <- 10e-12 ## stopping condition
iter <- 1 ## count iterations
Mcurrent <- lm(y4~X1,data=dat) ## initial OLS fit
while(sumdiff>=threshold){

  # model |e|
  abs_resid <- abs(Mcurrent$residuals)
  Mresid <- lm(abs_resid~ fitted(Mcurrent))
  w <- 1/fitted(Mresid)^2
  # fit WLS
  Mwls <- lm(y4~X1,weights=w,data=dat)

  ## compute differences
  sumdiff <- sum(abs(Mwls$coef-Mcurrent$coef))
  Mcurrent <- Mwls

  ## count iterations
  print(iter)
  iter <- iter+1

}


### --------PQ1: Weighted Least Squares-----------------
library(readr)
hosp <- read_csv("hospitals.csv")

# calculate the weighted regression using matrices
y <- hosp$SurgQual # response
X <- cbind(1, hosp$Difficulty) # covariate matrix
N <- hosp$N # sample sizes

y_W <- y*sqrt(N) # multiply by W^{1/2}
X_W <- X*sqrt(N)# multiply by W^{1/2}
beta.hat.WLS <- c(solve(t(X_W)%*%X_W)%*%t(X_W)%*%y_W) # WLS estimator
beta.hat.WLS

# OLS on pre multiplied data
g_WLS2 <- lm(y_W ~ X_W-1, data = hosp)
g_WLS2$coef

# using the built-in R function
# R expects weights = N, not weights = sqrt(N)
g_WLS <- lm(SurgQual ~ Difficulty, weights = N, data = hosp)
g_WLS$coef

## OLS
g_OLS <- lm(SurgQual ~ Difficulty,  data = hosp)
g_OLS$coef


# plot the data with circle proportional to sample sizes
plot(hosp$Difficulty, hosp$SurgQual,
     xlab = "Case Load Difficulty Index",
     ylab = "Post-Surgery Quality Index",
     cex = sqrt(hosp$N/max(hosp$N))^2*4,
     pch = 16,
     col = "lightgrey")
points(hosp$Difficulty, hosp$SurgQual, pch = 3, col = "black")
abline(g_OLS, col = "blue") # add simple regression line
abline(g_WLS, col = "blue", lty = 2) # add weighted regression line
# add legend to plot
legend(x = "topright",
       legend = c("OLS", "WLS"),
       col = "blue", cex = .8,
       lty = c(1, 2), seg.len = 1)

stat444加强版选择weight: