Download this R markdown as: R, Rmd.
library("mffSM")
library("colorspace")
We want to examine a nitrogen concentration measured at various
depths of the peat. Load the data peat
below and try to
understand the design of the underlying experiment. Focus on various
depths in particular.
# Read the peat dataset (nitrogen concentration measured at several
# depths for four peat types) directly from the course web page.
peat <- read.csv("http://www.karlin.mff.cuni.cz/~vavraj/nmfm334/data/peat.csv",
header = TRUE, stringsAsFactors = TRUE)
dim(peat)
## [1] 148 3
str(peat)
## 'data.frame': 148 obs. of 3 variables:
## $ group: Factor w/ 4 levels "CB","CB-VJJ",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ N : num 0.738 0.847 1.063 1.142 1.272 ...
## $ depth: int 2 4 6 8 10 12 14 2 4 6 ...
head(peat)
## group N depth
## 1 VJJ 0.7383710 2
## 2 VJJ 0.8473675 4
## 3 VJJ 1.0629922 6
## 4 VJJ 1.1419428 8
## 5 VJJ 1.2721831 10
## 6 VJJ 1.4032890 12
summary(peat)
## group N depth
## CB :35 Min. :0.4989 Min. : 0.000
## CB-VJJ:40 1st Qu.:0.8992 1st Qu.: 4.000
## VJJ :33 Median :1.0343 Median : 8.000
## VJJ-CB:40 Mean :1.0766 Mean : 7.527
## 3rd Qu.:1.2440 3rd Qu.:12.000
## Max. :1.6921 Max. :14.000
Basic scatterplot for Nitrogen concentration given depth:
# Common axis labels and limits, reused by all later plots so that
# the panels are directly comparable
XLAB <- "Depth [cm]"
YLAB <- "Nitrogen concentration [weight %]"
XLIM <- range(peat[, "depth"])
YLIM <- range(peat[, "N"])
par(mfrow = c(1,1), mar = c(4,4,0.5,0.5))
# All groups pooled together in a single scatterplot
plot(N ~ depth, data = peat, pch = 21, bg = "lightblue",
xlab = XLAB, ylab = YLAB, xlim = XLIM, ylim = YLIM)
But it can be done better… by distinguishing between the different peat types (variable `group`).
# Group labels and per-group plotting attributes (symbols and three
# shades of the same hue per group: dark / medium / light)
Groups <- levels(peat[, "group"])
print(Groups)
## [1] "CB" "CB-VJJ" "VJJ" "VJJ-CB"
PCH <- c(21, 22, 24, 25)
DARK <- rainbow_hcl(4, c = 80, l = 35)
COL <- rainbow_hcl(4, c = 80, l = 50)
BGC <- rainbow_hcl(4, c = 80, l = 70)
names(PCH) <- names(COL) <- names(BGC) <- Groups
# One scatterplot per group, with common axis limits
par(mfrow = c(2,2), mar = c(4,4,2,1))
for (g in seq_along(Groups)) {
  Group <- Groups[g]
  xx <- subset(peat, group == Group)[, "depth"]
  yy <- subset(peat, group == Group)[, "N"]
  plot(xx, yy, pch = PCH[g], col = COL[g], bg = BGC[g],
       xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM)
}
Conditional empirical characteristics by groups
# Conditional empirical characteristics (mean, SD, count of N) computed
# separately for each depth within each group
tabs <- list()
for (g in seq_along(Groups)) {
  # data subset containing only the g-th group
  subdata <- subset(peat, group == Groups[g])
  # averages for particular depths
  Mean <- with(subdata, tapply(N, depth, mean))
  # standard deviations for particular depths
  StdDev <- with(subdata, tapply(N, depth, sd))
  # numbers of observations for particular depths
  ns <- with(subdata, tapply(N, depth, length))
  tabs[[g]] <- data.frame(Depth = as.numeric(names(Mean)), Mean = Mean,
                          Std.Dev = StdDev, n = ns)
  rownames(tabs[[g]]) <- seq_len(nrow(tabs[[g]]))
}
names(tabs) <- Groups
print(tabs)
## $CB
## Depth Mean Std.Dev n
## 1 2 0.8504320 0.17437100 5
## 2 4 0.8885866 0.26575319 5
## 3 6 0.7951133 0.07905635 5
## 4 8 0.9122869 0.10257331 5
## 5 10 0.9930867 0.12950782 5
## 6 12 0.9024869 0.14771465 5
## 7 14 0.8694117 0.14202920 5
##
## $`CB-VJJ`
## Depth Mean Std.Dev n
## 1 0 1.1769177 0.13512002 5
## 2 2 1.1011507 0.05042430 5
## 3 4 1.0153398 0.07849447 5
## 4 6 1.0113715 0.08000427 5
## 5 8 1.0215852 0.15842290 5
## 6 10 0.9036023 0.21677703 5
## 7 12 0.8086419 0.19742942 5
## 8 14 0.7710349 0.10510728 5
##
## $VJJ
## Depth Mean Std.Dev n
## 1 2 0.9381259 0.19972666 4
## 2 4 0.8817402 0.11941469 4
## 3 6 1.0673126 0.06671083 5
## 4 8 1.2065316 0.09810094 5
## 5 10 1.3061741 0.12734137 5
## 6 12 1.3610624 0.06927207 5
## 7 14 1.4549562 0.09526403 5
##
## $`VJJ-CB`
## Depth Mean Std.Dev n
## 1 0 0.9132266 0.18371658 5
## 2 2 1.0804183 0.10997845 5
## 3 4 1.0331289 0.11638281 5
## 4 6 1.1380384 0.18806905 5
## 5 8 1.3685547 0.14144116 5
## 6 10 1.4260073 0.05603573 5
## 7 12 1.5131996 0.09093290 5
## 8 14 1.5221417 0.13197782 5
Insert the group averages into the original plot
# Redraw the per-group scatterplots and overlay the empirical
# depth-specific group averages (diamonds connected by a dashed line)
par(mfrow = c(2,2), mar = c(4,4,2,1))
for (g in seq_along(Groups)) {
  Group <- Groups[g]
  xx <- subset(peat, group == Group)[, "depth"]
  yy <- subset(peat, group == Group)[, "N"]
  plot(xx, yy, pch = PCH[g], col = COL[g], bg = BGC[g],
       xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM)
  lines(tabs[[g]][, "Depth"], tabs[[g]][, "Mean"], type = "b",
        pch = 23, lty = 2, cex = 2, col = COL[g], bg = BGC[g])
}
Let’s fit the least-squares estimates of the regression line parameters with `lm()`:
# Simple linear regression of N on depth, restricted to one group
Group <- "CB-VJJ"
g <- 2 # position of this group within Groups / tabs
fit1 <- lm(N ~ depth, data = peat, subset = (group == Group))
print(fit1)
##
## Call:
## lm(formula = N ~ depth, data = peat, subset = (group == Group))
##
## Coefficients:
## (Intercept) depth
## 1.16907 -0.02755
The regression line establishes a simple relationship between the nitrogen concentration (Y values) and the depth of the measurement (X values).
# Response (Y) and regressor (X) restricted to the chosen group
Yvalues <- peat$N[peat$group == Group]
Xvalues <- peat$depth[peat$group == Group]
Y <- function(a, b, x) {
  # Evaluate the regression line with intercept 'a' and slope 'b'
  # at the point(s) 'x'.
  a + b * x
}
par(mfrow = c(1,1), mar = c(4,4,2,1))
# the underlying data
plot(Yvalues ~ Xvalues, pch = PCH[Group], col = COL[Group], bg = BGC[Group],
xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM)
# empirical group means
lines(tabs[[g]][, "Depth"], tabs[[g]][, "Mean"], type = "b",
pch = 23, lty = 2, cex = 2, col = COL[g], bg = BGC[g])
# regression line given by the LS estimates (for the specific values of the depth)
points(Y(fit1$coeff[1], fit1$coeff[2], Xvalues) ~ Xvalues,
pch = 16, col = DARK[g], bg = BGC[g], cex = 1.5)
# regression line by the 'lm()' function (for the whole real line)
abline(fit1, col = DARK[g], lwd = 2)
Object
fit1
contains many interesting quantities…
fit1
is in fact a list and its components are accessible
either as fit1[["COMPONENT"]]
or as
fit1$COMPONENT
names(fit1)
## [1] "coefficients" "residuals" "effects" "rank" "fitted.values" "assign"
## [7] "qr" "df.residual" "xlevels" "call" "terms" "model"
Estimated coefficients (intercept and slope):
fit1[["coefficients"]]
## (Intercept) depth
## 1.16906895 -0.02755192
fit1$coefficients
## (Intercept) depth
## 1.16906895 -0.02755192
coef(fit1) # hat{beta}
## (Intercept) depth
## 1.16906895 -0.02755192
Fitted values:
fit1[["fitted.values"]]
## 109 110 111 112 113 114 115 116 117 118 119
## 1.1690689 1.1139651 1.0588613 1.0037574 0.9486536 0.8935497 0.8384459 0.7833421 1.1690689 1.1139651 1.0588613
## 120 121 122 123 124 125 126 127 128 129 130
## 1.0037574 0.9486536 0.8935497 0.8384459 0.7833421 1.1690689 1.1139651 1.0588613 1.0037574 0.9486536 0.8935497
## 131 132 133 134 135 136 137 138 139 140 141
## 0.8384459 0.7833421 1.1690689 1.1139651 1.0588613 1.0037574 0.9486536 0.8935497 0.8384459 0.7833421 1.1690689
## 142 143 144 145 146 147 148
## 1.1139651 1.0588613 1.0037574 0.9486536 0.8935497 0.8384459 0.7833421
fitted(fit1) # hat{Y}
## 109 110 111 112 113 114 115 116 117 118 119
## 1.1690689 1.1139651 1.0588613 1.0037574 0.9486536 0.8935497 0.8384459 0.7833421 1.1690689 1.1139651 1.0588613
## 120 121 122 123 124 125 126 127 128 129 130
## 1.0037574 0.9486536 0.8935497 0.8384459 0.7833421 1.1690689 1.1139651 1.0588613 1.0037574 0.9486536 0.8935497
## 131 132 133 134 135 136 137 138 139 140 141
## 0.8384459 0.7833421 1.1690689 1.1139651 1.0588613 1.0037574 0.9486536 0.8935497 0.8384459 0.7833421 1.1690689
## 142 143 144 145 146 147 148
## 1.1139651 1.0588613 1.0037574 0.9486536 0.8935497 0.8384459 0.7833421
# Fitted values computed by hand: hat{Y} = X %*% hat{beta}
Yhat <- cbind(1, Xvalues) %*% coef(fit1)
all.equal(as.numeric(Yhat), unname(fitted(fit1)))
## [1] TRUE
Residuals:
fit1[["residuals"]]
## 109 110 111 112 113 114 115 116
## -0.043626454 -0.009146158 -0.113661218 -0.101297614 -0.045700761 -0.184710673 -0.194907119 -0.110233646
## 117 118 119 120 121 122 123 124
## -0.150838933 -0.003993299 0.023491705 -0.012196205 0.074187174 0.113268543 0.009912453 0.013481456
## 125 126 127 128 129 130 131 132
## 0.047741778 -0.074664484 -0.094067519 0.121601286 0.343741240 0.329024967 0.296564845 0.157087576
## 133 134 135 136 137 138 139 140
## 0.213629523 0.062043179 -0.091651399 0.003870787 0.020785509 -0.026919068 -0.143776424 -0.076927619
## 141 142 143 144 145 146 147 148
## -0.027662155 -0.038311109 0.058281214 0.026092004 -0.028354961 -0.180401010 -0.116813908 -0.044943502
residuals(fit1) # U = Y - hat{Y}
## 109 110 111 112 113 114 115 116
## -0.043626454 -0.009146158 -0.113661218 -0.101297614 -0.045700761 -0.184710673 -0.194907119 -0.110233646
## 117 118 119 120 121 122 123 124
## -0.150838933 -0.003993299 0.023491705 -0.012196205 0.074187174 0.113268543 0.009912453 0.013481456
## 125 126 127 128 129 130 131 132
## 0.047741778 -0.074664484 -0.094067519 0.121601286 0.343741240 0.329024967 0.296564845 0.157087576
## 133 134 135 136 137 138 139 140
## 0.213629523 0.062043179 -0.091651399 0.003870787 0.020785509 -0.026919068 -0.143776424 -0.076927619
## 141 142 143 144 145 146 147 148
## -0.027662155 -0.038311109 0.058281214 0.026092004 -0.028354961 -0.180401010 -0.116813908 -0.044943502
# Residuals computed by hand: U = Y - hat{Y}
(Res <- c(Yvalues - Yhat))
## [1] -0.043626454 -0.009146158 -0.113661218 -0.101297614 -0.045700761 -0.184710673 -0.194907119 -0.110233646
## [9] -0.150838933 -0.003993299 0.023491705 -0.012196205 0.074187174 0.113268543 0.009912453 0.013481456
## [17] 0.047741778 -0.074664484 -0.094067519 0.121601286 0.343741240 0.329024967 0.296564845 0.157087576
## [25] 0.213629523 0.062043179 -0.091651399 0.003870787 0.020785509 -0.026919068 -0.143776424 -0.076927619
## [33] -0.027662155 -0.038311109 0.058281214 0.026092004 -0.028354961 -0.180401010 -0.116813908 -0.044943502
all.equal(as.numeric(Res), unname(residuals(fit1)))
## [1] TRUE
Rank and degrees of freedom:
fit1[["rank"]] # r
## [1] 2
fit1[["df.residual"]] # n - r
## [1] 38
# Rank of the model matrix, obtained via its QR decomposition
(r <- qr(cbind(1, Xvalues))$rank)
## [1] 2
# Residual degrees of freedom: n - r
n <- length(Yvalues)
n - r
## [1] 38
Basic inferential quantities can be extracted from the object
returned by summary
of lm
fit:
summary(fit1)
##
## Call:
## lm(formula = N ~ depth, data = peat, subset = (group == Group))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.19491 -0.09226 -0.01956 0.05038 0.34374
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.169069 0.038191 30.611 < 2e-16 ***
## depth -0.027552 0.004565 -6.036 5.08e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1323 on 38 degrees of freedom
## Multiple R-squared: 0.4895, Adjusted R-squared: 0.476
## F-statistic: 36.43 on 1 and 38 DF, p-value: 5.083e-07
First summary is simple summary of the residuals:
summary(Res)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.19491 -0.09226 -0.01956 0.00000 0.05038 0.34374
Residual sum of squares and residual standard error
(SSe <- deviance(fit1)) # SS_e
## [1] 0.665101
sum((fit1$residuals)^2) # the same
## [1] 0.665101
sqrt(SSe / (n-r)) # residual standard error
## [1] 0.1322976
Matrix \(\mathsf{MS}_e \cdot (\mathbb{X}^\top \mathbb{X})^{-1}\)
vcov(fit1)
## (Intercept) depth
## (Intercept) 0.0014585549 -0.0001458555
## depth -0.0001458555 0.0000208365
Standard error terms (square root of the diagonal)
sqrt(diag(vcov(fit1)))
## (Intercept) depth
## 0.038191032 0.004564701
Correlation matrix derived from vcov(fit1)
cov2cor(vcov(fit1))
## (Intercept) depth
## (Intercept) 1.00000 -0.83666
## depth -0.83666 1.00000
Confidence intervals for regression coefficients
confint(fit1, level= 0.95)
## 2.5 % 97.5 %
## (Intercept) 1.09175525 1.24638265
## depth -0.03679268 -0.01831117
What is it all good for? Another regression line is used to fit the
same data. What is the main difference between fit1
and
fit0
?
# Intercept-only (null) model: a horizontal regression line
fit0 <- lm(N ~ 1, data = peat, subset = (group == Group))
summary(fit0)
##
## Call:
## lm(formula = N ~ 1, data = peat, subset = (group == Group))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33267 -0.11414 0.02298 0.13556 0.40649
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.9762 0.0289 33.78 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1828 on 39 degrees of freedom
The model fits the regression line which is always parallel with the x-axis (thus, the slope is always zero)
Y <- function(a, x) {
  # Intercept-only line: every 'x' is mapped to the constant 'a'
  # (the slope is fixed at zero). Note: this redefines the earlier Y().
  a + 0 * x
}
# the same Y values for each X value... why?
Y(fit0$coeff[1], Xvalues)
## [1] 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055
## [11] 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055
## [21] 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055
## [31] 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055 0.9762055
par(mfrow = c(1,1), mar = c(4,4,2,1))
# the underlying data
plot(Yvalues~Xvalues, pch = PCH[Group], col = COL[Group], bg = BGC[Group],
xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM)
# empirical group means
lines(tabs[[g]][, "Depth"], tabs[[g]][, "Mean"], type = "b",
pch = 23, lty = 2, cex = 2, col = COL[g], bg = BGC[g])
# regression line given by the LS estimates (for the specific values of depth)
points(Y(fit0$coeff[1], Xvalues)~Xvalues,
pch = 16, col = DARK[g], bg = BGC[2], cex = 1.5)
# regression line by the 'lm()' function (for the whole real line)
abline(fit0, col = DARK[g], lwd = 2)
Do you see any relationship with the following?
mean(Yvalues) # beta coefficient
## [1] 0.9762055
sd(Yvalues) # residual standard error
## [1] 0.1827673
sd(Yvalues)/sqrt(n) # estimated sd of beta
## [1] 0.02889805
Total sum of squares:
deviance(fit0) # SS_T (= SS_e of fit0)
## [1] 1.302752
sum((Yvalues - mean(Yvalues))^2)
## [1] 1.302752
Overall F-test (once more):
summary(fit1)
##
## Call:
## lm(formula = N ~ depth, data = peat, subset = (group == Group))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.19491 -0.09226 -0.01956 0.05038 0.34374
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.169069 0.038191 30.611 < 2e-16 ***
## depth -0.027552 0.004565 -6.036 5.08e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1323 on 38 degrees of freedom
## Multiple R-squared: 0.4895, Adjusted R-squared: 0.476
## F-statistic: 36.43 on 1 and 38 DF, p-value: 5.083e-07
anova(fit0, fit1)
## Analysis of Variance Table
##
## Model 1: N ~ 1
## Model 2: N ~ depth
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 39 1.3028
## 2 38 0.6651 1 0.63765 36.432 5.083e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Compare to standard correlation test:
cor.test(Xvalues, Yvalues)
##
## Pearson's product-moment correlation
##
## data: Xvalues and Yvalues
## t = -6.0359, df = 38, p-value = 5.083e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.8301961 -0.4962622
## sample estimates:
## cor
## -0.6996175
Compare fit0
and fit1
with the model where
each depth level is considered separately:
# One mean parameter per depth level (no intercept): a one-way ANOVA fit
(fit2 <- lm(N ~ -1 + as.factor(depth), data = peat, subset = (group == Group)))
##
## Call:
## lm(formula = N ~ -1 + as.factor(depth), data = peat, subset = (group ==
## Group))
##
## Coefficients:
## as.factor(depth)0 as.factor(depth)2 as.factor(depth)4 as.factor(depth)6 as.factor(depth)8
## 1.1769 1.1012 1.0153 1.0114 1.0216
## as.factor(depth)10 as.factor(depth)12 as.factor(depth)14
## 0.9036 0.8086 0.7710
par(mfrow = c(1,1), mar = c(4,4,2,1))
# the underlying data
plot(Yvalues~Xvalues,pch = PCH[Group], col = COL[Group], bg = BGC[Group],
xlab = XLAB, ylab = YLAB, main = Group, xlim = XLIM, ylim = YLIM)
# empirical group means
lines(tabs[[g]][, "Depth"], tabs[[g]][, "Mean"], type="b",
pch = 23, cex = 2, col = COL[g], bg = BGC[g])
# regression line given by the LS estimates (for the specific values of depth)
points(subset(peat, group==Group)$depth, fitted(fit2), pch=20, cex=2, col=DARK[g])
Is it possible to judge (just visually) from the third column whether one assumption of the classical linear model is satisfied? Which assumption?
print(tabs[[Group]])
## Depth Mean Std.Dev n
## 1 0 1.1769177 0.13512002 5
## 2 2 1.1011507 0.05042430 5
## 3 4 1.0153398 0.07849447 5
## 4 6 1.0113715 0.08000427 5
## 5 8 1.0215852 0.15842290 5
## 6 10 0.9036023 0.21677703 5
## 7 12 0.8086419 0.19742942 5
## 8 14 0.7710349 0.10510728 5
We fit the simple linear regression line once again, this time calling `lm()` with an additional argument:
# Refit with x = TRUE so that the model matrix X is stored in the fit
fit1 <- lm(N ~ depth, data = peat, subset = (group == Group), x = TRUE)
names(fit1)
## [1] "coefficients" "residuals" "effects" "rank" "fitted.values" "assign"
## [7] "qr" "df.residual" "xlevels" "call" "terms" "model"
## [13] "x"
The fit1
object additionally contains an element
x
, the model matrix \(\mathbb{X}\):
X <- fit1[["x"]]
# X <- model.matrix(N ~ depth, peat, subset = (group == Group)) # or directly
print(X)
## (Intercept) depth
## 109 1 0
## 110 1 2
## 111 1 4
## 112 1 6
## 113 1 8
## 114 1 10
## 115 1 12
## 116 1 14
## 117 1 0
## 118 1 2
## 119 1 4
## 120 1 6
## 121 1 8
## 122 1 10
## 123 1 12
## 124 1 14
## 125 1 0
## 126 1 2
## 127 1 4
## 128 1 6
## 129 1 8
## 130 1 10
## 131 1 12
## 132 1 14
## 133 1 0
## 134 1 2
## 135 1 4
## 136 1 6
## 137 1 8
## 138 1 10
## 139 1 12
## 140 1 14
## 141 1 0
## 142 1 2
## 143 1 4
## 144 1 6
## 145 1 8
## 146 1 10
## 147 1 12
## 148 1 14
## attr(,"assign")
## [1] 0 1
Response \(\mathbf{Y}\) could be
obtained with y = TRUE
or taken from the data:
y <- subset(peat, group == Group)[, "N"]
print(y)
## [1] 1.1254425 1.1048190 0.9452000 0.9024598 0.9029528 0.7088391 0.6435388 0.6731084 1.0182300 1.1099718
## [11] 1.0823530 0.9915612 1.0228408 1.0068183 0.8483584 0.7968235 1.2168107 1.0393006 0.9647937 1.1253587
## [21] 1.2923948 1.2225747 1.1350107 0.9404296 1.3826985 1.1760083 0.9672099 1.0076282 0.9694391 0.8666307
## [31] 0.6946695 0.7064144 1.1414068 1.0756540 1.1171425 1.0298494 0.9202986 0.7131487 0.7216320 0.7383986
Model fitting \(\widehat{\mathbf{\beta}} = \left(\mathbb{X}^\top \mathbb{X}\right)^{-1} \mathbb{X}^\top \mathbf{Y}\):
(XtX <- t(X) %*% X) # What are the elements of XtX?
## (Intercept) depth
## (Intercept) 40 280
## depth 280 2800
(Xty <- t(X) %*% y) # What are the elements of Xty?
## [,1]
## (Intercept) 39.04822
## depth 250.19393
(b <- solve(XtX, Xty)) # beta
## [,1]
## (Intercept) 1.16906895
## depth -0.02755192
coef(fit1)
## (Intercept) depth
## 1.16906895 -0.02755192
Projection matrix \(\mathbb{H} = \mathbb{X} \left(\mathbb{X}^\top \mathbb{X}\right)^{-1} \mathbb{X}^\top\):
H <- X %*% solve(XtX) %*% t(X) # What is its purpose?
dim(H)
## [1] 40 40
all.equal(c(H%*%y),unname(fitted(fit1)))
## [1] TRUE
Numerically efficient way how to get it (using the QR decomposition):
# Hat matrix via the thin QR factorization X = QR: H = Q %*% t(Q)
Q <- qr.Q(fit1[["qr"]]) # Q matrix (orthogonal columns)
H <- Q %*% t(Q)         # reuse Q instead of recomputing qr.Q() twice
dim(H)
## [1] 40 40
H[1:10, 1:10] # part of H matrix
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 8.333333e-02 0.066666667 5.000000e-02 0.03333333 0.01666667 -6.938894e-18 -0.016666667 -3.333333e-02
## [2,] 6.666667e-02 0.054761905 4.285714e-02 0.03095238 0.01904762 7.142857e-03 -0.004761905 -1.666667e-02
## [3,] 5.000000e-02 0.042857143 3.571429e-02 0.02857143 0.02142857 1.428571e-02 0.007142857 -3.469447e-18
## [4,] 3.333333e-02 0.030952381 2.857143e-02 0.02619048 0.02380952 2.142857e-02 0.019047619 1.666667e-02
## [5,] 1.666667e-02 0.019047619 2.142857e-02 0.02380952 0.02619048 2.857143e-02 0.030952381 3.333333e-02
## [6,] -6.938894e-18 0.007142857 1.428571e-02 0.02142857 0.02857143 3.571429e-02 0.042857143 5.000000e-02
## [7,] -1.666667e-02 -0.004761905 7.142857e-03 0.01904762 0.03095238 4.285714e-02 0.054761905 6.666667e-02
## [8,] -3.333333e-02 -0.016666667 -3.469447e-18 0.01666667 0.03333333 5.000000e-02 0.066666667 8.333333e-02
## [9,] 8.333333e-02 0.066666667 5.000000e-02 0.03333333 0.01666667 0.000000e+00 -0.016666667 -3.333333e-02
## [10,] 6.666667e-02 0.054761905 4.285714e-02 0.03095238 0.01904762 7.142857e-03 -0.004761905 -1.666667e-02
## [,9] [,10]
## [1,] 0.08333333 0.066666667
## [2,] 0.06666667 0.054761905
## [3,] 0.05000000 0.042857143
## [4,] 0.03333333 0.030952381
## [5,] 0.01666667 0.019047619
## [6,] 0.00000000 0.007142857
## [7,] -0.01666667 -0.004761905
## [8,] -0.03333333 -0.016666667
## [9,] 0.08333333 0.066666667
## [10,] 0.06666667 0.054761905
max(c(abs(X %*% solve(XtX) %*% t(X) - H))) # numerically zero
## [1] 6.938894e-17
Fitted values \(\widehat{\mathbf{Y}}\):
yhat <- H %*% y
yhat2 <- X %*% b
summary(yhat - yhat2) # Numerically only zeros
## V1
## Min. :-6.661e-16
## 1st Qu.:-2.776e-16
## Median : 1.110e-16
## Mean : 1.110e-17
## 3rd Qu.: 2.498e-16
## Max. : 5.551e-16
Residuals \(\mathbf{U}\):
mean(y - yhat) # Numerically zero
## [1] -7.217305e-17
Complement of the projection matrix \(\mathbb{M}\) - projection matrix into the space of residuals
M <- diag(nrow(H)) - H
all.equal(c(M%*%y), unname(fit1$residuals))
## [1] TRUE
Residuals are orthogonal to the columns of \(\mathbb{X}\), i.e. \(\mathbb{M} \cdot \mathbb{X} = \mathbb{O}\) (zero matrix):
max(abs(c(M%*%X)))
## [1] 5.329071e-15
Therefore, since \(\mathbf{U} = \mathbb{M} \cdot \mathbf{Y}\), \[\mathbb{X}^\top \cdot \mathbf{U} = \mathbb{X}^\top \cdot \mathbb{M} \cdot \mathbf{Y} = (\mathbb{M} \cdot \mathbb{X})^\top \cdot \mathbf{Y} = \mathbf{0}.\] Because the vector of all ones \(\mathbf{1}\) is a column of \(\mathbb{X}\), also \(\mathbf{1}^\top \cdot \mathbf{U} = 0\). We obtain that the sum of residuals in a linear model with intercept must equal 0. More generally, residuals are orthogonal to the columns of \(\mathbb{X}\).
\(\mathsf{SS}_e\) (residual sum of squares):
deviance(fit1)
## [1] 0.665101
(SSe <- as.numeric(t(y) %*% M %*% y))
## [1] 0.665101
(SSe2 <- as.numeric(crossprod(y - yhat))) # The same
## [1] 0.665101
Residual mean square \(\mathsf{MS_e}\) (and its square root):
(df <- length(y) - r)
## [1] 38
(s2 <- SSe / df)
## [1] 0.01750266
(s <- sqrt(s2))
## [1] 0.1322976
summary(fit1)
##
## Call:
## lm(formula = N ~ depth, data = peat, subset = (group == Group),
## x = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.19491 -0.09226 -0.01956 0.05038 0.34374
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.169069 0.038191 30.611 < 2e-16 ***
## depth -0.027552 0.004565 -6.036 5.08e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1323 on 38 degrees of freedom
## Multiple R-squared: 0.4895, Adjusted R-squared: 0.476
## F-statistic: 36.43 on 1 and 38 DF, p-value: 5.083e-07
What are we going to calculate by this? What is its purpose? Can it
be found somewhere in summary(fit1)
?
deviance(fit0)
## [1] 1.302752
(SST <- as.numeric(crossprod(y - mean(y))))
## [1] 1.302752
(WhatIsIt <- 1 - SSe / SST)
## [1] 0.4894646
Coefficient of determination — the square of the coefficient of multiple correlation. It also equals the squared correlation between the response Y and the regressor X.
cor(Xvalues,Yvalues)^2
## [1] 0.4894646
Variance and standard deviations of the regression coefficients’
estimates. Why is it important or useful? Can you find them somewhere in
summary(fit1)
?
(varb <- s2 * solve(XtX))
## (Intercept) depth
## (Intercept) 0.0014585549 -0.0001458555
## depth -0.0001458555 0.0000208365
(sb1 <- sqrt(varb[1, 1]))
## [1] 0.03819103
(sb2 <- sqrt(varb[2, 2]))
## [1] 0.004564701
summary(fit1)
##
## Call:
## lm(formula = N ~ depth, data = peat, subset = (group == Group),
## x = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.19491 -0.09226 -0.01956 0.05038 0.34374
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.169069 0.038191 30.611 < 2e-16 ***
## depth -0.027552 0.004565 -6.036 5.08e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1323 on 38 degrees of freedom
## Multiple R-squared: 0.4895, Adjusted R-squared: 0.476
## F-statistic: 36.43 on 1 and 38 DF, p-value: 5.083e-07
Does the nitrogen concentration depend on the depth significantly?
# Hand-computed t-test of H0: beta_depth = 0 (two-sided alternative)
(statistic <- (b[2] - 0) / sb2)
## [1] -6.035865
# critical value
alpha <- 0.05
(K <- qt(1 - alpha/2, df=df))
## [1] 2.024394
# p-value
(Pval <- 2 * pt(-abs(statistic), df = df)) # Is it correct and why?
## [1] 5.083043e-07
What is our decision? Is the test from above somewhere here?
summary(fit1)
##
## Call:
## lm(formula = N ~ depth, data = peat, subset = (group == Group),
## x = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.19491 -0.09226 -0.01956 0.05038 0.34374
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.169069 0.038191 30.611 < 2e-16 ***
## depth -0.027552 0.004565 -6.036 5.08e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1323 on 38 degrees of freedom
## Multiple R-squared: 0.4895, Adjusted R-squared: 0.476
## F-statistic: 36.43 on 1 and 38 DF, p-value: 5.083e-07
Distinguish the peat types (variable `group` in `peat`) within a single plot and use `jitter()` to make the plot even more clear, see below. In some situations (especially if there are a lot of data points and no reasonable insight can be gained from a standard plot) a jittered version may be used instead. In this case the depth is randomly shifted by an amount drawn from a uniform distribution on \([-0.5,+0.5]\).
# Fixed seed so that the jittered depths are reproducible
set.seed(20010911)
# Shift each depth by a uniform random amount from [-0.5, 0.5]
peat[, "jdepth"] <- peat[, "depth"] + runif(nrow(peat), -0.5, 0.5)
summary(peat)
## group N depth jdepth
## CB :35 Min. :0.4989 Min. : 0.000 Min. :-0.4611
## CB-VJJ:40 1st Qu.:0.8992 1st Qu.: 4.000 1st Qu.: 3.8135
## VJJ :33 Median :1.0343 Median : 8.000 Median : 7.8496
## VJJ-CB:40 Mean :1.0766 Mean : 7.527 Mean : 7.4887
## 3rd Qu.:1.2440 3rd Qu.:12.000 3rd Qu.:11.5338
## Max. :1.6921 Max. :14.000 Max. :14.4786
Another plot, this time with the jittered depth:
# All groups in one plot, using the manually jittered depth
par(mfrow=c(1,1), mar=c(4,4,0.5,0.5))
plot(N ~ jdepth, data = peat,
xlab = XLAB, ylab = YLAB, xaxt = "n",
pch = PCH[group], col = COL[group], bg = BGC[group])
# Axis ticks at the true depths; gray guide lines between them
axis(1, at = seq(0, 14, by = 2))
abline(v = seq(1, 13, by = 2), col = "gray", lty = 5, lwd = 2)
legend(1.3, 1.7, legend = levels(peat[, "group"]), pch = PCH, col = COL,
pt.bg = BGC, y.intersp = 1.2)
Similar plot using an R function jitter
:
# The same idea using the built-in jitter() function on the fly
par(mfrow=c(1,1), mar=c(4,4,0.5,0.5))
plot(N ~ jitter(depth), data=peat,
xlab = XLAB, ylab = YLAB,
pch = PCH[group], col = COL[group], bg = BGC[group])