Statistics review 7: Correlation and regression¶
R code accompanying the paper
Key learning points¶
- Correlation quantifies the strength of the linear relationship between a pair of variables
- Regression expresses the relationship in the form of an equation
suppressPackageStartupMessages(library(tidyverse))
options(repr.plot.width=4, repr.plot.height=3)
Scatter diagram¶
age <- c(60, 76, 81, 89, 44, 58, 55, 74, 45, 67, 72,
91, 76, 39, 71, 56, 77, 37, 64, 84)
urea <- c(1.099, 1.723, 2.054, 2.262, 1.686, 1.988, 1.131, 1.917, 1.548, 1.386,
2.617, 2.701, 2.054, 1.526, 2.002, 1.526, 1.825, 1.435, 2.460, 1.932)
ae <- data.frame(subject=1:20, age=age, urea=urea)
head(ae)
| subject | age | urea |
|---|---|---|
| 1 | 60 | 1.099 |
| 2 | 76 | 1.723 |
| 3 | 81 | 2.054 |
| 4 | 89 | 2.262 |
| 5 | 44 | 1.686 |
| 6 | 58 | 1.988 |
ggplot(ae, aes(x=age, y=urea)) + geom_point()
Correlation¶
Function to calculate Pearson's correlation coefficient¶
pearson <- function(x, y) {
  # Pearson's r: the covariance of x and y divided by the product of
  # their standard deviations (the 1/(n-1) factors cancel, so sums suffice)
  xbar <- mean(x)
  ybar <- mean(y)
  sum((x - xbar)*(y - ybar))/sqrt(sum((x - xbar)^2)*sum((y - ybar)^2))
}
round(pearson(x=ae$age, y=ae$urea), 2)
Using built-in function¶
round(cor(x=ae$age, y=ae$urea, method = 'pearson'), 2)
Hypothesis test of correlation¶
cor.test(x=ae$age, y=ae$urea)
Pearson's product-moment correlation
data: ae$age and ae$urea
t = 3.3538, df = 18, p-value = 0.003535
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.2447935 0.8338342
sample estimates:
cor
0.6201371
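The t statistic above can be reproduced from r alone, since t = r√(n − 2)/√(1 − r²) on n − 2 degrees of freedom; a quick check:
r <- cor(ae$age, ae$urea)
n <- nrow(ae)
r*sqrt(n - 2)/sqrt(1 - r^2)  # reproduces t = 3.3538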
Confidence interval for the population correlation coefficient¶
This is also reported by the built-in cor.test() function above. The calculation below uses Fisher's z transformation: z = ½ ln((1 + r)/(1 − r)) is approximately normally distributed with standard error 1/√(n − 3), so the interval is constructed on the z scale and then back-transformed to the r scale.
z <- 0.5 * log((1 + 0.62)/(1 - 0.62))  # Fisher's z transform of r = 0.62
se <- 1/sqrt(20 - 3)                   # standard error of z for n = 20
z.ci <- c(z - 1.96*se, z + 1.96*se)    # 95% interval on the z scale
round(z.ci, 2)
- 0.25
- 1.2
# Back-transform the endpoints from the z scale to the r scale
ci <- c((exp(2*z.ci[1]) - 1)/(exp(2*z.ci[1]) + 1),
        (exp(2*z.ci[2]) - 1)/(exp(2*z.ci[2]) + 1))
round(ci, 2)
- 0.24
- 0.83
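The same calculation can be wrapped in a small reusable function; fisher_z_ci is a made-up name for this sketch, not a built-in:
fisher_z_ci <- function(r, n, conf = 0.95) {
  # Sketch: CI for a correlation via Fisher's z transformation
  z <- 0.5*log((1 + r)/(1 - r))     # Fisher's z transform
  se <- 1/sqrt(n - 3)               # standard error on the z scale
  crit <- qnorm(1 - (1 - conf)/2)   # 1.96 for conf = 0.95
  tanh(z + c(-1, 1)*crit*se)        # back-transform to the r scale
}
round(fisher_z_ci(cor(ae$age, ae$urea), n = nrow(ae)), 2)  # agrees with cor.test above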
Misuse of correlation¶
Correlations may arise from a third variable¶
set.seed(1)         # hypothetical seed, added for reproducibility
z <- 3*rnorm(10)    # a lurking third variable
x <- z + rnorm(10)  # x is driven by z
y <- z + rnorm(10)  # y is driven by z; x and y share no direct link
df <- data.frame(x=x, y=y)
ggplot(df, aes(x=x, y=y)) + geom_point() + geom_smooth(method=lm) +
annotate("text", x=-4, y=5, label = paste("r =", round(cor(df$x, df$y), 2)))
Correlations can be misleading if relationships are non-linear¶
head(anscombe)
| x1 | x2 | x3 | x4 | y1 | y2 | y3 | y4 |
|---|---|---|---|---|---|---|---|
| 10 | 10 | 10 | 8 | 8.04 | 9.14 | 7.46 | 6.58 |
| 8 | 8 | 8 | 8 | 6.95 | 8.14 | 6.77 | 5.76 |
| 13 | 13 | 13 | 8 | 7.58 | 8.74 | 12.74 | 7.71 |
| 9 | 9 | 9 | 8 | 8.81 | 8.77 | 7.11 | 8.84 |
| 11 | 11 | 11 | 8 | 8.33 | 9.26 | 7.81 | 8.47 |
| 14 | 14 | 14 | 8 | 9.96 | 8.10 | 8.84 | 7.04 |
g1 <- ggplot(anscombe, aes(x=x1, y=y1)) +
geom_point() + geom_smooth(method=lm) +
annotate("text", x=6, y=9,
label = paste("r =", round(cor(anscombe$x1, anscombe$y1), 2)))
g2 <- ggplot(anscombe, aes(x=x2, y=y2)) +
geom_point() + geom_smooth(method=lm) +
annotate("text", x=6, y=9,
label = paste("r =", round(cor(anscombe$x2, anscombe$y2), 2)))
g3 <- ggplot(anscombe, aes(x=x3, y=y3)) +
geom_point() + geom_smooth(method=lm) +
annotate("text", x=6, y=11,
label = paste("r =", round(cor(anscombe$x3, anscombe$y3), 2)))
g4 <- ggplot(anscombe, aes(x=x4, y=y4)) +
geom_point() + geom_smooth(method=lm) +
annotate("text", x=10, y=11,
label = paste("r =", round(cor(anscombe$x4, anscombe$y4), 2)))
g1
g2
g3
g4
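Despite the very different patterns, all four Anscombe pairs have essentially the same correlation; a compact check:
sapply(1:4, function(i) {
  round(cor(anscombe[[paste0("x", i)]], anscombe[[paste0("y", i)]]), 2)
})  # all four are 0.82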
Correlations may arise from subgroups¶
set.seed(2)              # hypothetical seed, added for reproducibility
m <- rnorm(10, 70, 4)    # male heights
f <- rnorm(10, 65, 3.5)  # female heights
x.m <- rnorm(10, 0)      # x values for males
x.f <- rnorm(10, 6)      # x values for females
df <- data.frame(x=c(x.f, x.m), height=c(f, m),
sex=c(rep("f", 10), rep("m", 10)))
ggplot(df, aes(x=x, y=height)) + geom_point() +
geom_smooth(method=lm) +
annotate("text", x=6, y=72,
label = paste("r =", round(cor(df$x, df$height), 2)))
Regression¶
x <- ae$age
xbar <- mean(x)
y <- ae$urea
ybar <- mean(y)
b <- sum((x - xbar)*(y - ybar))/sum((x - xbar)^2)  # slope
a <- ybar - b*xbar                                 # intercept
round(c(a, b), 4)
- 0.7147
- 0.0172
ggplot(ae, aes(x=age, y=urea)) + geom_point() +
geom_abline(aes(intercept=a, slope=b), color="blue")
Hypothesis tests and confidence intervals¶
fit <- lm(urea ~ age, data=ae)
summary(fit)
Call:
lm(formula = urea ~ age, data = ae)
Residuals:
Min 1Q Median 3Q Max
-0.64509 -0.21403 0.02789 0.16075 0.66703
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.714698 0.346129 2.065 0.05365 .
age 0.017157 0.005116 3.354 0.00354 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3606 on 18 degrees of freedom
Multiple R-squared: 0.3846, Adjusted R-squared: 0.3504
F-statistic: 11.25 on 1 and 18 DF, p-value: 0.003535
The slope's t statistic and p-value agree exactly with those from the correlation test:
cor.test(ae$age, ae$urea)
Pearson's product-moment correlation
data: ae$age and ae$urea
t = 3.3538, df = 18, p-value = 0.003535
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.2447935 0.8338342
sample estimates:
cor
0.6201371
Analysis of variance¶
The anova() function returns the analysis of variance table for the fitted model.
fit.aov <- anova(fit)
fit.aov
| | Df | Sum Sq | Mean Sq | F value | Pr(>F) |
|---|---|---|---|---|---|
| age | 1 | 1.462673 | 1.4626726 | 11.24785 | 0.003535245 |
| Residuals | 18 | 2.340724 | 0.1300402 | NA | NA |
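In simple linear regression the F statistic is the square of the slope's t statistic (3.3538² ≈ 11.248), so the ANOVA test is equivalent to the tests above; a quick check:
summary(fit)$coefficients["age", "t value"]^2  # equals the F value above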
Residuals¶
ae$predicted <- predict(fit)
ae$residuals <- residuals(fit)
ggplot(ae, aes(x = age, y = urea)) +
geom_smooth(method = "lm", se = FALSE, color = "lightgrey") + # Plot regression slope
geom_segment(aes(xend = age, yend = predicted), alpha = .2) + # alpha to fade lines
geom_point() +
geom_point(aes(y = predicted), shape = 1)
Coefficient of determination¶
Regression_sum_of_squares <- fit.aov$'Sum Sq'[1]
Residual_sum_of_squares <- fit.aov$'Sum Sq'[2]
Total_sum_of_squares <- Regression_sum_of_squares + Residual_sum_of_squares
round(c(Regression_sum_of_squares, Residual_sum_of_squares, Total_sum_of_squares), 4)
- 1.4627
- 2.3407
- 3.8034
Explained variance¶
Age accounts for about 38% of the total variation in ln urea: R² is the regression sum of squares expressed as a proportion of the total sum of squares.
r_squared <- Regression_sum_of_squares/Total_sum_of_squares
round(r_squared, 2)
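The built-in model summary reports the same quantity:
round(summary(fit)$r.squared, 2)  # 0.38, matching the manual calculation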
Prediction¶
newdata <- data.frame(age = seq(40, 90, 10))
newdata$urea.pred <- predict(fit, newdata)
newdata
| age | urea.pred |
|---|---|
| 40 | 1.400961 |
| 50 | 1.572526 |
| 60 | 1.744092 |
| 70 | 1.915658 |
| 80 | 2.087223 |
| 90 | 2.258789 |
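predict() can also return interval estimates: interval = "confidence" gives a confidence interval for the mean response at each age, while interval = "prediction" gives the wider interval for a new individual observation.
round(predict(fit, newdata, interval = "confidence"), 3)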
ggplot(ae, aes(x=age, y=urea)) + geom_point() + geom_smooth(method=lm)
names(fit)
- 'coefficients'
- 'residuals'
- 'effects'
- 'rank'
- 'fitted.values'
- 'assign'
- 'qr'
- 'df.residual'
- 'xlevels'
- 'call'
- 'terms'
- 'model'
ggplot(ae, aes(x=predicted, y=residuals)) +
geom_point() + geom_smooth(se=FALSE)
geom_smooth() using method = 'loess'
ggplot(ae, aes(sample=residuals)) + stat_qq()
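A reference line makes departures from normality easier to judge; stat_qq_line() (available in ggplot2 >= 3.0.0) adds one:
ggplot(ae, aes(sample=residuals)) + stat_qq() + stat_qq_line()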
Exercises¶
You may need to install the car package in R in the usual way:
install.packages("car")
suppressPackageStartupMessages(library(car))
head(Davis)
| sex | weight | height | repwt | repht |
|---|---|---|---|---|
| M | 77 | 182 | 77 | 180 |
| F | 58 | 161 | 51 | 159 |
| F | 53 | 161 | 54 | 158 |
| M | 68 | 177 | 70 | 175 |
| F | 59 | 157 | 59 | 155 |
| M | 76 | 170 | 76 | 165 |
1. Find the correlation between weight and height in the Davis data set (males and females together).
2. Test if the correlation is significant.
3. Repeat exercises 1 and 2 for each sex separately.
4. Show separate linear regressions of weight (y-axis) on height (x-axis) for each sex, either on the same plot (using different colors) or on different subplots.
5. What is the predicted weight of a female who is 165 cm tall?