
Loading the data and libraries

library("mffSM")
library("colorspace")

We want to examine the nitrogen concentration measured at various depths of the peat. Load the data peat below and try to understand the design of the underlying experiment, paying particular attention to the depths used.

peat <- read.csv("http://www.karlin.mff.cuni.cz/~vavraj/nmfm334/data/peat.csv", 
                 header = TRUE, stringsAsFactors = TRUE)
dim(peat)
## [1] 148   3
str(peat)
## 'data.frame':    148 obs. of  3 variables:
##  $ group: Factor w/ 4 levels "CB","CB-VJJ",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ N    : num  0.738 0.847 1.063 1.142 1.272 ...
##  $ depth: int  2 4 6 8 10 12 14 2 4 6 ...
head(peat)
##   group         N depth
## 1   VJJ 0.7383710     2
## 2   VJJ 0.8473675     4
## 3   VJJ 1.0629922     6
## 4   VJJ 1.1419428     8
## 5   VJJ 1.2721831    10
## 6   VJJ 1.4032890    12
summary(peat)
##     group          N              depth       
##  CB    :35   Min.   :0.4989   Min.   : 0.000  
##  CB-VJJ:40   1st Qu.:0.8992   1st Qu.: 4.000  
##  VJJ   :33   Median :1.0343   Median : 8.000  
##  VJJ-CB:40   Mean   :1.0766   Mean   : 7.527  
##              3rd Qu.:1.2440   3rd Qu.:12.000  
##              Max.   :1.6921   Max.   :14.000
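
To see the design at a glance, the groups and depths can be cross-tabulated (a quick check, not part of the original output):

# how many measurements per peat type and depth?
with(peat, table(group, depth))

Consistently with the per-group tables computed below, not every group was measured at every depth, and a few group-depth combinations contain fewer than five observations.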

Visual exploration

A basic scatterplot of the nitrogen concentration against depth:

XLAB <- "Depth [cm]"
YLAB <- "Nitrogen concentration [weight %]"

XLIM <- range(peat[, "depth"])
YLIM <- range(peat[, "N"])

par(mfrow = c(1,1), mar = c(4,4,0.5,0.5))
plot(N ~ depth, data = peat, pch = 21, bg = "lightblue", 
     xlab = XLAB, ylab = YLAB, xlim = XLIM, ylim = YLIM)

But it can be done better… by distinguishing between the different peat types (variable group).

Groups <- levels(peat[, "group"])
print(Groups)
## [1] "CB"     "CB-VJJ" "VJJ"    "VJJ-CB"
PCH <- c(21, 22, 24, 25)
DARK <- rainbow_hcl(4, c = 80, l = 35)
COL <- rainbow_hcl(4, c = 80, l = 50)
BGC <- rainbow_hcl(4, c = 80, l = 70)
names(PCH) <- names(COL) <- names(BGC) <- Groups

par(mfrow = c(2,2), mar = c(4,4,2,1))
for(g in 1:4){
  Group <- Groups[g]
  xx <- subset(peat, group == Group)[, "depth"]
  yy <- subset(peat, group == Group)[, "N"]
  plot(xx, yy, pch = PCH[g], col = COL[g], bg = BGC[g],
       xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM)
}  
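
Alternatively, all four groups could be overlaid in a single panel; a sketch reusing the plotting attributes defined above:

# one panel, groups distinguished by symbol and colour
par(mfrow = c(1,1), mar = c(4,4,0.5,0.5))
plot(N ~ depth, data = peat, type = "n",
     xlab = XLAB, ylab = YLAB, xlim = XLIM, ylim = YLIM)
for (g in seq_along(Groups)) {
  with(subset(peat, group == Groups[g]),
       points(depth, N, pch = PCH[g], col = COL[g], bg = BGC[g]))
}
legend("topleft", legend = Groups, pch = PCH, col = COL, pt.bg = BGC)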

Conditional empirical characteristics by groups

tabs <- list()
# g <- 1 # run for one g
for(g in 1:length(Groups)){
  # data subset containing only the g-th group
  subdata <- subset(peat, group == Groups[g])
  # averages for particular depths
  Mean    <- with(subdata, tapply(N, depth, mean))
  # standard deviations for particular depths
  StdDev  <- with(subdata, tapply(N, depth, sd))
  # numbers of observations for particular depths
  ns      <- with(subdata, tapply(N, depth, length))
  
  # cat("\n",Groups[g],"\n")
  # print(Mean)
  # print(StdDev)
  # print(ns)

  tabs[[g]] <- data.frame(Depth = as.numeric(names(Mean)), Mean = Mean, 
                                              Std.Dev = StdDev, n = ns)
  rownames(tabs[[g]]) <- 1:nrow(tabs[[g]])
}  
names(tabs) <- Groups
print(tabs)
## $CB
##   Depth      Mean    Std.Dev n
## 1     2 0.8504320 0.17437100 5
## 2     4 0.8885866 0.26575319 5
## 3     6 0.7951133 0.07905635 5
## 4     8 0.9122869 0.10257331 5
## 5    10 0.9930867 0.12950782 5
## 6    12 0.9024869 0.14771465 5
## 7    14 0.8694117 0.14202920 5
## 
## $`CB-VJJ`
##   Depth      Mean    Std.Dev n
## 1     0 1.1769177 0.13512002 5
## 2     2 1.1011507 0.05042430 5
## 3     4 1.0153398 0.07849447 5
## 4     6 1.0113715 0.08000427 5
## 5     8 1.0215852 0.15842290 5
## 6    10 0.9036023 0.21677703 5
## 7    12 0.8086419 0.19742942 5
## 8    14 0.7710349 0.10510728 5
## 
## $VJJ
##   Depth      Mean    Std.Dev n
## 1     2 0.9381259 0.19972666 4
## 2     4 0.8817402 0.11941469 4
## 3     6 1.0673126 0.06671083 5
## 4     8 1.2065316 0.09810094 5
## 5    10 1.3061741 0.12734137 5
## 6    12 1.3610624 0.06927207 5
## 7    14 1.4549562 0.09526403 5
## 
## $`VJJ-CB`
##   Depth      Mean    Std.Dev n
## 1     0 0.9132266 0.18371658 5
## 2     2 1.0804183 0.10997845 5
## 3     4 1.0331289 0.11638281 5
## 4     6 1.1380384 0.18806905 5
## 5     8 1.3685547 0.14144116 5
## 6    10 1.4260073 0.05603573 5
## 7    12 1.5131996 0.09093290 5
## 8    14 1.5221417 0.13197782 5
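
The conditional means alone could also be obtained in a single call with aggregate() (a sketch; the loop above additionally collects the standard deviations and counts):

# means of N for every group-depth combination
aggregate(N ~ group + depth, data = peat, FUN = mean)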

Insert the group averages into the original plot

par(mfrow = c(2,2), mar = c(4,4,2,1))
for (g in 1:4){
  Group <- Groups[g]
  xx <- subset(peat, group == Group)[, "depth"]
  yy <- subset(peat, group == Group)[, "N"]
  plot(xx, yy, pch = PCH[g], col = COL[g], bg = BGC[g], 
               xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM)
  lines(tabs[[g]][, "Depth"], tabs[[g]][, "Mean"], type = "b",
        pch = 23, lty = 2, cex = 2, col = COL[g], bg = BGC[g])
}  

Linear regression line for the group “CB-VJJ”

Let’s obtain the least-squares estimates of the regression line parameters with lm:

Group <- "CB-VJJ"  
g <- 2 

fit1 <- lm(N ~ depth, data = peat, subset = (group == Group))
print(fit1)
## 
## Call:
## lm(formula = N ~ depth, data = peat, subset = (group == Group))
## 
## Coefficients:
## (Intercept)        depth  
##     1.16907     -0.02755

The regression line establishes a simple relationship between the nitrogen concentration (Y values) and the depth of the measurement (X values).

Yvalues <- peat$N[peat$group == Group]
Xvalues <- peat$depth[peat$group == Group]

Y <- function(a, b, x){
# a simple line parametrization with the intercept 'a' and the slope 'b'
  return(a + b * x) 
}

par(mfrow = c(1,1), mar = c(4,4,2,1))
# the underlying data
plot(Yvalues ~ Xvalues, pch = PCH[Group], col = COL[Group], bg = BGC[Group], 
     xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM) 
# empirical group means
lines(tabs[[g]][, "Depth"], tabs[[g]][, "Mean"], type = "b",
      pch = 23, lty = 2, cex = 2, col = COL[g], bg = BGC[g])
# regression line given by the LS estimates (for the specific values of the depth)
points(Y(fit1$coeff[1], fit1$coeff[2], Xvalues) ~ Xvalues, 
       pch = 16, col = DARK[g], bg = BGC[g],  cex = 1.5)
# regression line by the 'lm()' function (for the whole real line)
abline(fit1, col = DARK[g], lwd = 2)

The object fit1 contains many interesting quantities. In fact, fit1 is a list whose components are accessible either as fit1[["COMPONENT"]] or as fit1$COMPONENT:

names(fit1)
##  [1] "coefficients"  "residuals"     "effects"       "rank"          "fitted.values" "assign"       
##  [7] "qr"            "df.residual"   "xlevels"       "call"          "terms"         "model"

Estimated coefficients (intercept and slope):

fit1[["coefficients"]]
## (Intercept)       depth 
##  1.16906895 -0.02755192
fit1$coefficients
## (Intercept)       depth 
##  1.16906895 -0.02755192
coef(fit1)                   # hat{beta}
## (Intercept)       depth 
##  1.16906895 -0.02755192

Fitted values:

fit1[["fitted.values"]]
##       109       110       111       112       113       114       115       116       117       118       119 
## 1.1690689 1.1139651 1.0588613 1.0037574 0.9486536 0.8935497 0.8384459 0.7833421 1.1690689 1.1139651 1.0588613 
##       120       121       122       123       124       125       126       127       128       129       130 
## 1.0037574 0.9486536 0.8935497 0.8384459 0.7833421 1.1690689 1.1139651 1.0588613 1.0037574 0.9486536 0.8935497 
##       131       132       133       134       135       136       137       138       139       140       141 
## 0.8384459 0.7833421 1.1690689 1.1139651 1.0588613 1.0037574 0.9486536 0.8935497 0.8384459 0.7833421 1.1690689 
##       142       143       144       145       146       147       148 
## 1.1139651 1.0588613 1.0037574 0.9486536 0.8935497 0.8384459 0.7833421
fitted(fit1)                 # hat{Y}
##       109       110       111       112       113       114       115       116       117       118       119 
## 1.1690689 1.1139651 1.0588613 1.0037574 0.9486536 0.8935497 0.8384459 0.7833421 1.1690689 1.1139651 1.0588613 
##       120       121       122       123       124       125       126       127       128       129       130 
## 1.0037574 0.9486536 0.8935497 0.8384459 0.7833421 1.1690689 1.1139651 1.0588613 1.0037574 0.9486536 0.8935497 
##       131       132       133       134       135       136       137       138       139       140       141 
## 0.8384459 0.7833421 1.1690689 1.1139651 1.0588613 1.0037574 0.9486536 0.8935497 0.8384459 0.7833421 1.1690689 
##       142       143       144       145       146       147       148 
## 1.1139651 1.0588613 1.0037574 0.9486536 0.8935497 0.8384459 0.7833421
Yhat = cbind(1,Xvalues) %*% coef(fit1)
all.equal(as.numeric(Yhat), unname(fitted(fit1)))  
## [1] TRUE
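
The same values are returned by predict() when no new data are supplied:

all.equal(unname(predict(fit1)), unname(fitted(fit1)))   # TRUE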

Residuals:

fit1[["residuals"]]
##          109          110          111          112          113          114          115          116 
## -0.043626454 -0.009146158 -0.113661218 -0.101297614 -0.045700761 -0.184710673 -0.194907119 -0.110233646 
##          117          118          119          120          121          122          123          124 
## -0.150838933 -0.003993299  0.023491705 -0.012196205  0.074187174  0.113268543  0.009912453  0.013481456 
##          125          126          127          128          129          130          131          132 
##  0.047741778 -0.074664484 -0.094067519  0.121601286  0.343741240  0.329024967  0.296564845  0.157087576 
##          133          134          135          136          137          138          139          140 
##  0.213629523  0.062043179 -0.091651399  0.003870787  0.020785509 -0.026919068 -0.143776424 -0.076927619 
##          141          142          143          144          145          146          147          148 
## -0.027662155 -0.038311109  0.058281214  0.026092004 -0.028354961 -0.180401010 -0.116813908 -0.044943502
residuals(fit1)              # U = Y - hat{Y}
##          109          110          111          112          113          114          115          116 
## -0.043626454 -0.009146158 -0.113661218 -0.101297614 -0.045700761 -0.184710673 -0.194907119 -0.110233646 
##          117          118          119          120          121          122          123          124 
## -0.150838933 -0.003993299  0.023491705 -0.012196205  0.074187174  0.113268543  0.009912453  0.013481456 
##          125          126          127          128          129          130          131          132 
##  0.047741778 -0.074664484 -0.094067519  0.121601286  0.343741240  0.329024967  0.296564845  0.157087576 
##          133          134          135          136          137          138          139          140 
##  0.213629523  0.062043179 -0.091651399  0.003870787  0.020785509 -0.026919068 -0.143776424 -0.076927619 
##          141          142          143          144          145          146          147          148 
## -0.027662155 -0.038311109  0.058281214  0.026092004 -0.028354961 -0.180401010 -0.116813908 -0.044943502
(Res = c(Yvalues - Yhat))
##  [1] -0.043626454 -0.009146158 -0.113661218 -0.101297614 -0.045700761 -0.184710673 -0.194907119 -0.110233646
##  [9] -0.150838933 -0.003993299  0.023491705 -0.012196205  0.074187174  0.113268543  0.009912453  0.013481456
## [17]  0.047741778 -0.074664484 -0.094067519  0.121601286  0.343741240  0.329024967  0.296564845  0.157087576
## [25]  0.213629523  0.062043179 -0.091651399  0.003870787  0.020785509 -0.026919068 -0.143776424 -0.076927619
## [33] -0.027662155 -0.038311109  0.058281214  0.026092004 -0.028354961 -0.180401010 -0.116813908 -0.044943502
all.equal(as.numeric(Res),unname(residuals(fit1))) 
## [1] TRUE

Rank and degrees of freedom:

fit1[["rank"]]               # r
## [1] 2
fit1[["df.residual"]]        # n - r
## [1] 38
(r = qr(cbind(1,Xvalues))$rank)
## [1] 2
n = length(Yvalues)
n - r
## [1] 38

Basic inferential quantities can be extracted from the object returned by summary() applied to the lm fit:

summary(fit1)      
## 
## Call:
## lm(formula = N ~ depth, data = peat, subset = (group == Group))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.19491 -0.09226 -0.01956  0.05038  0.34374 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.169069   0.038191  30.611  < 2e-16 ***
## depth       -0.027552   0.004565  -6.036 5.08e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1323 on 38 degrees of freedom
## Multiple R-squared:  0.4895, Adjusted R-squared:  0.476 
## F-statistic: 36.43 on 1 and 38 DF,  p-value: 5.083e-07

The first block of the output is a simple summary of the residuals:

summary(Res)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.19491 -0.09226 -0.01956  0.00000  0.05038  0.34374

Residual sum of squares and residual standard error

(SSe <- deviance(fit1))        # SS_e
## [1] 0.665101
sum((fit1$residuals)^2)        # the same
## [1] 0.665101
sqrt(SSe / (n-r))              # residual standard error
## [1] 0.1322976
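
The residual standard error is also stored directly in the summary object:

summary(fit1)$sigma      # the same value, 0.1322976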

Matrix \(\mathsf{MS}_e \cdot (\mathbb{X}^\top \mathbb{X})^{-1}\)

vcov(fit1)
##               (Intercept)         depth
## (Intercept)  0.0014585549 -0.0001458555
## depth       -0.0001458555  0.0000208365

Standard error terms (square root of the diagonal)

sqrt(diag(vcov(fit1)))
## (Intercept)       depth 
## 0.038191032 0.004564701

Correlation matrix derived from vcov(fit1)

cov2cor(vcov(fit1))
##             (Intercept)    depth
## (Intercept)     1.00000 -0.83666
## depth          -0.83666  1.00000

Confidence intervals for regression coefficients

confint(fit1, level= 0.95)
##                   2.5 %      97.5 %
## (Intercept)  1.09175525  1.24638265
## depth       -0.03679268 -0.01831117
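
These intervals are simply \(\widehat{\beta}_j \pm t_{1-\alpha/2}(n-r) \cdot \mathsf{SE}(\widehat{\beta}_j)\); a manual sketch of the same computation:

est <- coef(fit1)
se  <- sqrt(diag(vcov(fit1)))
tq  <- qt(0.975, df = fit1$df.residual)
cbind(lower = est - tq * se, upper = est + tq * se)   # matches confint(fit1)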

Intercept-only model for the group “CB-VJJ”

What is this all good for? Let us fit another regression model to the same data. What is the main difference between fit1 and fit0?

fit0 <- lm(N ~ 1, data = peat, subset = (group == Group))
summary(fit0)
## 
## Call:
## lm(formula = N ~ 1, data = peat, subset = (group == Group))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33267 -0.11414  0.02298  0.13556  0.40649 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.9762     0.0289   33.78   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1828 on 39 degrees of freedom

This model fits a regression line that is always parallel to the x-axis (thus, the slope is fixed at zero):

Y <- function(a, x){
# a simple line parametrization with the intercept 'a' and zero slope
  return(a + 0 * x) 
}

# the same Y values for each X value... why? 
Y(fit0$coeff[1], Xvalues)
##  [1] 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055
## [11] 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055
## [21] 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055
## [31] 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055
par(mfrow = c(1,1), mar = c(4,4,2,1))
# the underlying data
plot(Yvalues~Xvalues, pch = PCH[Group], col = COL[Group], bg = BGC[Group],
     xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM) 
# empirical group means
lines(tabs[[g]][, "Depth"], tabs[[g]][, "Mean"], type = "b",
      pch = 23, lty = 2, cex = 2, col = COL[g], bg = BGC[g])
# regression line given by the LS estimates (for the specific values of depth)
points(Y(fit0$coeff[1], Xvalues)~Xvalues, 
       pch = 16, col = DARK[g], bg = BGC[2], cex = 1.5)
# regression line by the 'lm()' function (for the whole real line)
abline(fit0, col = DARK[g], lwd = 2)

Do you see any relationship with the following?

mean(Yvalues)           # beta coefficient
## [1] 0.9762055
sd(Yvalues)             # residual standard error
## [1] 0.1827673
sd(Yvalues)/sqrt(n)     # estimated sd of beta
## [1] 0.02889805

Total sum of squares:

deviance(fit0)          # SS_T (= SS_e of fit0)
## [1] 1.302752
sum((Yvalues - mean(Yvalues))^2)
## [1] 1.302752

Overall F-test (once more):

summary(fit1)
## 
## Call:
## lm(formula = N ~ depth, data = peat, subset = (group == Group))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.19491 -0.09226 -0.01956  0.05038  0.34374 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.169069   0.038191  30.611  < 2e-16 ***
## depth       -0.027552   0.004565  -6.036 5.08e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1323 on 38 degrees of freedom
## Multiple R-squared:  0.4895, Adjusted R-squared:  0.476 
## F-statistic: 36.43 on 1 and 38 DF,  p-value: 5.083e-07
anova(fit0, fit1)
## Analysis of Variance Table
## 
## Model 1: N ~ 1
## Model 2: N ~ depth
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1     39 1.3028                                  
## 2     38 0.6651  1   0.63765 36.432 5.083e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
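
The F statistic compares the two residual sums of squares; a manual sketch of the same computation:

# F = ((SS_T - SS_e) / 1) / MS_e
(Fstat <- (deviance(fit0) - deviance(fit1)) / (deviance(fit1) / fit1$df.residual))   # 36.432
pf(Fstat, 1, fit1$df.residual, lower.tail = FALSE)   # p-value 5.083e-07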

Compare with the standard correlation test:

cor.test(Xvalues, Yvalues)
## 
##  Pearson's product-moment correlation
## 
## data:  Xvalues and Yvalues
## t = -6.0359, df = 38, p-value = 5.083e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.8301961 -0.4962622
## sample estimates:
##        cor 
## -0.6996175
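
The t statistic of the correlation test coincides with the t statistic for the slope; a sketch of the underlying identity \(t = r \sqrt{(n-2)/(1-r^2)}\):

r_xy <- cor(Xvalues, Yvalues)
r_xy * sqrt((n - 2) / (1 - r_xy^2))   # -6.0359, as above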

Depth-as-factor model for the group “CB-VJJ”

Compare fit0 and fit1 with the model where each depth level is considered separately:

(fit2 = lm(N ~ -1 + as.factor(depth), data = peat, subset = (group == Group)))
## 
## Call:
## lm(formula = N ~ -1 + as.factor(depth), data = peat, subset = (group == 
##     Group))
## 
## Coefficients:
##  as.factor(depth)0   as.factor(depth)2   as.factor(depth)4   as.factor(depth)6   as.factor(depth)8  
##             1.1769              1.1012              1.0153              1.0114              1.0216  
## as.factor(depth)10  as.factor(depth)12  as.factor(depth)14  
##             0.9036              0.8086              0.7710
par(mfrow = c(1,1), mar = c(4,4,2,1))
# the underlying data
plot(Yvalues~Xvalues,pch = PCH[Group], col = COL[Group], bg = BGC[Group], 
               xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM) 
# empirical group means
lines(tabs[[g]][, "Depth"], tabs[[g]][, "Mean"], type="b",
      pch = 23, cex = 2, col = COL[g], bg = BGC[g])
# regression line given by the LS estimates (for the specific values of depth)
points(subset(peat, group==Group)$depth, fitted(fit2), pch=20, cex=2, col=DARK[g])

Is it possible to judge (just visually) from the third column whether one assumption of the classical linear model is satisfied? Which assumption?

print(tabs[[Group]])
##   Depth      Mean    Std.Dev n
## 1     0 1.1769177 0.13512002 5
## 2     2 1.1011507 0.05042430 5
## 3     4 1.0153398 0.07849447 5
## 4     6 1.0113715 0.08000427 5
## 5     8 1.0215852 0.15842290 5
## 6    10 0.9036023 0.21677703 5
## 7    12 0.8086419 0.19742942 5
## 8    14 0.7710349 0.10510728 5
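
A formal counterpart to the visual comparison of the Std.Dev column could be, for instance, Bartlett's test of equal variances across the depth levels (a sketch, not part of the original material):

bartlett.test(N ~ as.factor(depth), data = peat, subset = (group == Group))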

Numeric computations behind lm

We fit the simple linear regression once again, this time with an additional argument:

fit1 <- lm(N ~ depth, data = peat, subset = (group == Group), x = TRUE)
names(fit1)
##  [1] "coefficients"  "residuals"     "effects"       "rank"          "fitted.values" "assign"       
##  [7] "qr"            "df.residual"   "xlevels"       "call"          "terms"         "model"        
## [13] "x"

The fit1 object additionally contains an element x, the model matrix \(\mathbb{X}\):

X <- fit1[["x"]]
# X <- model.matrix(N ~ depth, peat, subset = (group == Group)) # or directly
print(X)
##     (Intercept) depth
## 109           1     0
## 110           1     2
## 111           1     4
## 112           1     6
## 113           1     8
## 114           1    10
## 115           1    12
## 116           1    14
## 117           1     0
## 118           1     2
## 119           1     4
## 120           1     6
## 121           1     8
## 122           1    10
## 123           1    12
## 124           1    14
## 125           1     0
## 126           1     2
## 127           1     4
## 128           1     6
## 129           1     8
## 130           1    10
## 131           1    12
## 132           1    14
## 133           1     0
## 134           1     2
## 135           1     4
## 136           1     6
## 137           1     8
## 138           1    10
## 139           1    12
## 140           1    14
## 141           1     0
## 142           1     2
## 143           1     4
## 144           1     6
## 145           1     8
## 146           1    10
## 147           1    12
## 148           1    14
## attr(,"assign")
## [1] 0 1

The response \(\mathbf{Y}\) could analogously be obtained with y = TRUE, or taken directly from the data:

y <- subset(peat, group == Group)[, "N"]
print(y)
##  [1] 1.1254425 1.1048190 0.9452000 0.9024598 0.9029528 0.7088391 0.6435388 0.6731084 1.0182300 1.1099718
## [11] 1.0823530 0.9915612 1.0228408 1.0068183 0.8483584 0.7968235 1.2168107 1.0393006 0.9647937 1.1253587
## [21] 1.2923948 1.2225747 1.1350107 0.9404296 1.3826985 1.1760083 0.9672099 1.0076282 0.9694391 0.8666307
## [31] 0.6946695 0.7064144 1.1414068 1.0756540 1.1171425 1.0298494 0.9202986 0.7131487 0.7216320 0.7383986

Model fitting \(\widehat{\mathbf{\beta}} = \left(\mathbb{X}^\top \mathbb{X}\right)^{-1} \mathbb{X}^\top \mathbf{Y}\):

(XtX <- t(X) %*% X)              # What are the elements of XtX?
##             (Intercept) depth
## (Intercept)          40   280
## depth               280  2800
(Xty <- t(X) %*% y)              # What are the elements of Xty?
##                  [,1]
## (Intercept)  39.04822
## depth       250.19393
(b <- solve(XtX, Xty))           # beta
##                    [,1]
## (Intercept)  1.16906895
## depth       -0.02755192
coef(fit1)
## (Intercept)       depth 
##  1.16906895 -0.02755192
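
For completeness, the same estimates can be recovered directly from the QR decomposition stored in the fit (internally, lm never forms \(\mathbb{X}^\top \mathbb{X}\) explicitly):

qr.coef(fit1[["qr"]], y)   # the same hat{beta}, computed via the QR decomposition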

Projection matrix \(\mathbb{H} = \mathbb{X} \left(\mathbb{X}^\top \mathbb{X}\right)^{-1} \mathbb{X}^\top\):

H <- X %*% solve(XtX) %*% t(X)    # What is its purpose?
dim(H)
## [1] 40 40
all.equal(c(H%*%y),unname(fitted(fit1)))  
## [1] TRUE
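
As a projection matrix, \(\mathbb{H}\) is symmetric and idempotent and its trace equals the rank \(r\); quick checks (a sketch):

all.equal(H, t(H))       # symmetric
all.equal(H, H %*% H)    # idempotent
sum(diag(H))             # trace(H) = r = 2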

A numerically efficient way to obtain it (using the QR decomposition):

Q <- qr.Q(fit1[["qr"]])  # Q matrix (orthogonal columns)
H <- qr.Q(fit1[["qr"]]) %*% t(qr.Q(fit1[["qr"]]))
dim(H)
## [1] 40 40
H[1:10, 1:10]     # part of H matrix
##                [,1]         [,2]          [,3]       [,4]       [,5]          [,6]         [,7]          [,8]
##  [1,]  8.333333e-02  0.066666667  5.000000e-02 0.03333333 0.01666667 -6.938894e-18 -0.016666667 -3.333333e-02
##  [2,]  6.666667e-02  0.054761905  4.285714e-02 0.03095238 0.01904762  7.142857e-03 -0.004761905 -1.666667e-02
##  [3,]  5.000000e-02  0.042857143  3.571429e-02 0.02857143 0.02142857  1.428571e-02  0.007142857 -3.469447e-18
##  [4,]  3.333333e-02  0.030952381  2.857143e-02 0.02619048 0.02380952  2.142857e-02  0.019047619  1.666667e-02
##  [5,]  1.666667e-02  0.019047619  2.142857e-02 0.02380952 0.02619048  2.857143e-02  0.030952381  3.333333e-02
##  [6,] -6.938894e-18  0.007142857  1.428571e-02 0.02142857 0.02857143  3.571429e-02  0.042857143  5.000000e-02
##  [7,] -1.666667e-02 -0.004761905  7.142857e-03 0.01904762 0.03095238  4.285714e-02  0.054761905  6.666667e-02
##  [8,] -3.333333e-02 -0.016666667 -3.469447e-18 0.01666667 0.03333333  5.000000e-02  0.066666667  8.333333e-02
##  [9,]  8.333333e-02  0.066666667  5.000000e-02 0.03333333 0.01666667  0.000000e+00 -0.016666667 -3.333333e-02
## [10,]  6.666667e-02  0.054761905  4.285714e-02 0.03095238 0.01904762  7.142857e-03 -0.004761905 -1.666667e-02
##              [,9]        [,10]
##  [1,]  0.08333333  0.066666667
##  [2,]  0.06666667  0.054761905
##  [3,]  0.05000000  0.042857143
##  [4,]  0.03333333  0.030952381
##  [5,]  0.01666667  0.019047619
##  [6,]  0.00000000  0.007142857
##  [7,] -0.01666667 -0.004761905
##  [8,] -0.03333333 -0.016666667
##  [9,]  0.08333333  0.066666667
## [10,]  0.06666667  0.054761905
max(c(abs(X %*% solve(XtX) %*% t(X) - H))) # numerically zero
## [1] 6.938894e-17
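
The diagonal of \(\mathbb{H}\) (the leverages) is also available via the extractor hatvalues():

all.equal(unname(hatvalues(fit1)), unname(diag(H)))   # TRUE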

Fitted values \(\widehat{\mathbf{Y}}\):

yhat  <- H %*% y
yhat2 <- X %*% b
summary(yhat - yhat2)             # Numerically only zeros
##        V1            
##  Min.   :-6.661e-16  
##  1st Qu.:-2.776e-16  
##  Median : 1.110e-16  
##  Mean   : 1.110e-17  
##  3rd Qu.: 2.498e-16  
##  Max.   : 5.551e-16

Residuals \(\mathbf{U}\):

mean(y - yhat)                    # Numerically zero 
## [1] -7.217305e-17

The complement of the projection matrix, \(\mathbb{M} = \mathbb{I} - \mathbb{H}\), is the projection matrix onto the space of residuals:

M <- diag(nrow(H)) - H
all.equal(c(M%*%y), unname(fit1$residuals))
## [1] TRUE

The matrix \(\mathbb{M}\) annihilates the columns of \(\mathbb{X}\), i.e. \(\mathbb{M} \cdot \mathbb{X} = \mathbb{O}\) (the zero matrix):

max(abs(c(M%*%X)))
## [1] 5.329071e-15

Therefore, since \(\mathbf{U} = \mathbb{M} \cdot \mathbf{Y}\) and \(\mathbb{M}\) is symmetric, \[\mathbb{X}^\top \cdot \mathbf{U} = \mathbb{X}^\top \cdot \mathbb{M} \cdot \mathbf{Y} = (\mathbb{M} \cdot \mathbb{X})^\top \cdot \mathbf{Y} = \mathbf{0},\] i.e. the residuals are orthogonal to the columns of \(\mathbb{X}\). Because the vector of all ones \(\mathbf{1}\) is a column of \(\mathbb{X}\), also \(\mathbf{1}^\top \cdot \mathbf{U} = 0\): the sum of the residuals in a linear model with an intercept must equal 0.
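
A quick numerical verification of both claims:

U <- residuals(fit1)
sum(U)           # numerically zero, thanks to the intercept column
c(t(X) %*% U)    # both components numerically zero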

\(\mathsf{SS}_e\) (residual sum of squares):

deviance(fit1)
## [1] 0.665101
(SSe  <- as.numeric(t(y) %*% M %*% y))
## [1] 0.665101
(SSe2 <- as.numeric(crossprod(y - yhat)))  # The same
## [1] 0.665101

Residual mean square \(\mathsf{MS_e}\) (and its square root):

(df <- length(y) - r)
## [1] 38
(s2 <- SSe / df)
## [1] 0.01750266
(s <- sqrt(s2))
## [1] 0.1322976
summary(fit1)
## 
## Call:
## lm(formula = N ~ depth, data = peat, subset = (group == Group), 
##     x = TRUE)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.19491 -0.09226 -0.01956  0.05038  0.34374 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.169069   0.038191  30.611  < 2e-16 ***
## depth       -0.027552   0.004565  -6.036 5.08e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1323 on 38 degrees of freedom
## Multiple R-squared:  0.4895, Adjusted R-squared:  0.476 
## F-statistic: 36.43 on 1 and 38 DF,  p-value: 5.083e-07

What are we calculating below? What is its purpose? Can it be found somewhere in summary(fit1)?

deviance(fit0)
## [1] 1.302752
(SST <- as.numeric(crossprod(y - mean(y))))
## [1] 1.302752
(WhatIsIt <- 1 - SSe / SST)
## [1] 0.4894646

This is the coefficient of determination, the square of the coefficient of multiple correlation. In simple regression it also equals the squared correlation between the response Y and the regressor X:

cor(Xvalues,Yvalues)^2     
## [1] 0.4894646
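
The same value is stored directly in the summary object:

summary(fit1)$r.squared   # Multiple R-squared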

Variances and standard errors of the estimated regression coefficients. Why are they important or useful? Can you find them somewhere in summary(fit1)?

(varb <- s2 * solve(XtX))
##               (Intercept)         depth
## (Intercept)  0.0014585549 -0.0001458555
## depth       -0.0001458555  0.0000208365
(sb1 <- sqrt(varb[1, 1]))
## [1] 0.03819103
(sb2 <- sqrt(varb[2, 2]))
## [1] 0.004564701
summary(fit1)
## 
## Call:
## lm(formula = N ~ depth, data = peat, subset = (group == Group), 
##     x = TRUE)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.19491 -0.09226 -0.01956  0.05038  0.34374 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.169069   0.038191  30.611  < 2e-16 ***
## depth       -0.027552   0.004565  -6.036 5.08e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1323 on 38 degrees of freedom
## Multiple R-squared:  0.4895, Adjusted R-squared:  0.476 
## F-statistic: 36.43 on 1 and 38 DF,  p-value: 5.083e-07

Does the nitrogen concentration depend on the depth significantly?

(statistic <- (b[2] - 0) / sb2)                    
## [1] -6.035865
# critical value
alpha <- 0.05
(K <- qt(1 - alpha/2, df=df))
## [1] 2.024394
# p-value
(Pval <- 2 * pt(-abs(statistic), df = df))  # Is it correct and why?
## [1] 5.083043e-07
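
The whole row of the coefficient table can be extracted from the summary object for comparison:

summary(fit1)$coefficients["depth", ]   # estimate, std. error, t value, p-value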

What is our decision? Is the test from above somewhere here?

summary(fit1)   
## 
## Call:
## lm(formula = N ~ depth, data = peat, subset = (group == Group), 
##     x = TRUE)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.19491 -0.09226 -0.01956  0.05038  0.34374 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.169069   0.038191  30.611  < 2e-16 ***
## depth       -0.027552   0.004565  -6.036 5.08e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1323 on 38 degrees of freedom
## Multiple R-squared:  0.4895, Adjusted R-squared:  0.476 
## F-statistic: 36.43 on 1 and 38 DF,  p-value: 5.083e-07

Individual work

Additional material

In some situations (especially when there are a lot of data points and a standard plot offers no reasonable insight) a jittered version may be used instead. Here the depth is randomly shifted by an amount drawn from a uniform distribution on \([-0.5,+0.5]\).

set.seed(20010911)  
peat[, "jdepth"] <- peat[, "depth"] + runif(nrow(peat), -0.5, 0.5)
summary(peat)
##     group          N              depth            jdepth       
##  CB    :35   Min.   :0.4989   Min.   : 0.000   Min.   :-0.4611  
##  CB-VJJ:40   1st Qu.:0.8992   1st Qu.: 4.000   1st Qu.: 3.8135  
##  VJJ   :33   Median :1.0343   Median : 8.000   Median : 7.8496  
##  VJJ-CB:40   Mean   :1.0766   Mean   : 7.527   Mean   : 7.4887  
##              3rd Qu.:1.2440   3rd Qu.:12.000   3rd Qu.:11.5338  
##              Max.   :1.6921   Max.   :14.000   Max.   :14.4786

Another plot, this time with the jittered depth:

par(mfrow=c(1,1), mar=c(4,4,0.5,0.5))
plot(N ~ jdepth, data = peat, 
     xlab = XLAB, ylab = YLAB, xaxt = "n", 
     pch = PCH[group], col = COL[group], bg = BGC[group])
axis(1, at = seq(0, 14, by = 2))
abline(v = seq(1, 13, by = 2), col = "gray", lty = 5, lwd = 2)
legend(1.3, 1.7, legend = levels(peat[, "group"]), pch = PCH, col = COL,
       pt.bg = BGC, y.intersp = 1.2)

A similar plot using the built-in R function jitter():

par(mfrow=c(1,1), mar=c(4,4,0.5,0.5))
plot(N ~ jitter(depth), data=peat, 
     xlab = XLAB, ylab = YLAB, 
     pch = PCH[group], col = COL[group], bg = BGC[group])