4. Geocentric Models

Normal Distribution

Normal by Addition

(Keep in mind the examples of the soccer field, coin tossing, and stepping right and left on page 72)

library(rethinking)
# the steps of each person are represented by a list of 16 random numbers
# between -1 and 1:
# run 1,000 simulations of stepping left and right, and store the final position
pos <- replicate(1000, sum(runif(16,-1,1)))

# plot the end positions around the halfway line of the soccer field
plot(pos)

# plot the density of the positions
plot(density(pos))

Normal by Multiplication

(See the example on page 74.) Note that the product of growth deviations converges to a Gaussian distribution as long as each effect is small.

growth_small_effect <- replicate(1000, prod(1+runif(12,0,0.1)))
dens(growth_small_effect, norm.comp = TRUE)

growth_big_effect <- replicate(10000, prod(1+runif(12,0,0.5)))
dens(growth_big_effect, norm.comp = TRUE)

Normal by Log-Multiplication

Multiplicative interactions of large deviations converge to a Gaussian distribution when we measure the outcomes on the log scale.

log.big <- replicate(10000, log(prod(1+runif(12,0,0.5))))
dens(log.big, norm.comp = TRUE)

Gaussian model of human height

The data

library(rethinking)
data("Howell1")
d <- Howell1

# explore data
str(d)
'data.frame':   544 obs. of  4 variables:
 $ height: num  152 140 137 157 145 ...
 $ weight: num  47.8 36.5 31.9 53 41.3 ...
 $ age   : num  63 63 65 41 51 35 32 27 19 54 ...
 $ male  : int  1 0 0 1 0 1 0 1 0 1 ...
# data summary
precis(d, hist=FALSE)
              mean         sd      5.5%     94.5%
height 138.2635963 27.6024476 81.108550 165.73500
weight  35.6106176 14.7191782  9.360721  54.50289
age     29.3443934 20.7468882  1.000000  66.13500
male     0.4724265  0.4996986  0.000000   1.00000

The model and prior

Based on domain-specific information, we decide that the range of plausible human heights is \(178 \pm 40\) cm, and that the standard deviation must be positive. The likelihood is \(h_i \sim Normal(\mu, \sigma)\).

Given that the parameters are independent, the prior is:

\(Pr(\mu, \sigma) = Pr(\mu)Pr(\sigma)\)

Where:

\(\mu \sim Normal(178,20)\)

\(\sigma \sim Uniform(0, 50)\)

Let’s plot the priors:

  • Mean

curve(dnorm(x, 178, 20), from=100, to=250)

  • Std. deviation
curve(dunif(x, 0, 50), from=0, to=60)

Let’s simulate heights based on the priors. This is called the prior predictive simulation:

sample_mu <- rnorm(1e4, 178, 20)
sample_sigma <- runif(1e4, 0, 50)

prior_h <- rnorm(1e4, sample_mu, sample_sigma)
dens(prior_h)

So far, the model is defined before it has seen the data. We can change the standard deviation of the \(\mu\) prior to see how sensitive the model is to prior choices that, unlike ours, don’t rely on scientific knowledge.

sample_mu <- rnorm(1e4, 178, 100)
sample_sigma <- runif(1e4, 0, 50)

prior_h <- rnorm(1e4, sample_mu, sample_sigma)
dens(prior_h)

Note how the result doesn’t make sense, with negative and implausibly large heights.

Grid approximation of the posterior distribution

Since we have two parameters, the grid approximation method is not really practical. However, we will try it here, computing the log-likelihood:

# we will compute the approximation using the height data of persons 18 y.o. and older
d2 <- d[d$age >= 18,]


mu.list <- seq(from=150, to=160, length.out=100)
sigma.list <- seq(from=7, to=9, length.out=100)

# Create a Data Frame from All Combinations of Factor Variables
post <- expand.grid(mu=mu.list, sigma=sigma.list)

# compute the log-likelihood
post$LL <- sapply(
  1:nrow(post),
  function(i) sum (
    dnorm(d2$height, post$mu[i], post$sigma[i], log=TRUE)
  )
)

# compute the unnormalized log-posterior: log-likelihood plus log-priors
post$prod <- post$LL + dnorm(post$mu, 178, 20, TRUE) + dunif(post$sigma, 0, 50, TRUE)

# subtract the max before exponentiating to avoid numerical underflow
post$prob <- exp(post$prod - max(post$prod))
contour_xyz(post$mu, post$sigma, post$prob)

image_xyz(post$mu, post$sigma, post$prob)

Sampling from the posterior

# sample row indices in proportion to the posterior probability
sample.rows <- sample(1:nrow(post), size=1e4, replace=TRUE, prob=post$prob)
sample.mu <- post$mu[sample.rows]
sample.sigma <- post$sigma[sample.rows]

# this shows the most plausible combinations of mu and sigma
plot(sample.mu, sample.sigma, cex=0.5, pch=16, col=col.alpha(rangi2, 0.1))

Let’s check the shape of marginal posterior densities:

dens(sample.mu)

dens(sample.sigma)

Note that the density for sigma has a longer right tail.

To see this right skew more clearly, let’s repeat the analysis with a small random sample of 20 heights:

d3 <- sample(d2$height, size = 20)

mu.list <- seq(from=150, to=170, length.out=200)
sigma.list <- seq(from=4, to=20, length.out=200)
post2 <- expand.grid(mu=mu.list, sigma=sigma.list)

# log-likelihood for each mu/sigma combination
post2$LL <- sapply(
  1:nrow(post2),
  function(i) sum(dnorm(d3, mean=post2$mu[i], sd=post2$sigma[i], log=TRUE))
)
post2$prod <- post2$LL + dnorm(post2$mu, 178, 20, TRUE) + dunif(post2$sigma, 0, 50, TRUE)
post2$prob <- exp(post2$prod - max(post2$prod))

sample2.rows <- sample(1:nrow(post2), size=1e4, replace=TRUE, prob=post2$prob)
sample2.mu <- post2$mu[sample2.rows]
sample2.sigma <- post2$sigma[sample2.rows]
plot(sample2.mu, sample2.sigma, cex=0.5, col=col.alpha(rangi2, 0.1),
     xlab="mu", ylab="sigma", pch=16)

dens(sample2.sigma, norm.comp = TRUE)

Finding the posterior with quap

Quadratic approximation is good for making inferences about the shape of the posterior, particularly its peak, which lies at the maximum a posteriori (MAP) estimate.

Let’s first load the data:

# load data
library(rethinking) 
data(Howell1) 
d <- Howell1 
d2 <- d[ d$age >= 18 , ]

Now, we will define our model with code:

\(h_i \sim Normal(\mu, \sigma)\)

\(\mu \sim Normal(178,20)\)

\(\sigma \sim Uniform(0, 50)\)

flist <- alist(
  height ~ dnorm(mu, sigma),
  mu ~ dnorm(178, 20),
  sigma ~ dunif(0, 50)
)

Note: unlike list, alist stores the formulas without evaluating the expressions in the code.
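A minimal illustration of the difference, using plain base R:

# list evaluates its arguments immediately
list(sigma = 1 + 1)$sigma   # returns the value 2

# alist keeps them as unevaluated expressions
alist(sigma = 1 + 1)$sigma  # returns the call 1 + 1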

Now, we fit the model to the data in `d2`:

m4.1 <- quap(flist, data=d2)

Let’s take a glance at the model (i.e. the posterior distribution):

precis(m4.1)
            mean        sd       5.5%      94.5%
mu    154.607026 0.4119947 153.948579 155.265473
sigma   7.731333 0.2913861   7.265642   8.197025

Sampling from a quap

post <- extract.samples(m4.1, n=1e4)
head(post)
        mu    sigma
1 154.8719 8.369672
2 155.0301 8.009201
3 154.9311 7.360071
4 154.3732 7.744091
5 154.7560 7.226322
6 155.2402 7.903876
precis(post, hist=FALSE)
           mean        sd       5.5%      94.5%
mu    154.60282 0.4124339 153.945159 155.261959
sigma   7.72796 0.2908551   7.255576   8.187313

Comparing these values to the output from precis(m4.1), we find they are very close.

plot(post)

Sampling the multivariate posterior w/o rethinking

The function extract.samples runs the following simulation, which samples random vectors from a multivariate Gaussian distribution. This simulation requires computing the variance-covariance matrix:

library(MASS)
post <- mvrnorm(n=1e4, mu=coef(m4.1), Sigma=vcov(m4.1))
plot(post)

Variance-Covariance Matrix

  • It is an essential component in the quap algorithm.

  • It tells us how each parameter relates to every other parameter in the posterior distribution.

  • It can be factored into 2 elements:

    • Vector of variances for the parameters diag(vcov(model))

    • Correlation matrix that tells how changes in one parameter lead to correlated changes in the others cov2cor(vcov(model))
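For instance, a quick sketch applying both pieces to m4.1:

# vector of variances of mu and sigma
diag(vcov(m4.1))

# correlation matrix: how changes in one parameter relate to changes in the others
cov2cor(vcov(m4.1))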

Linear prediction

Using the association between predictor variables and the outcome variable, we want to predict the latter. This is how linear regression works.

Linear model strategy: probabilistic approach

  • We tell the model (golem) the following: “Assume that the predictor variable has a constant and additive relationship to the mean of the outcome. Consider all the lines (formed by the combinations of parameter values) that relate one variable (or more) to the other. Rank all of these lines by plausibility, given these data.”

  • The resulting model is a posterior distribution

In the following example, we want to predict height using weight as a predictor variable. This code plots the data we will use in model fitting:

library(rethinking)
data(Howell1)
d <- Howell1
d2 <- d[d$age >= 18,]

plot(d2$height ~ d2$weight)

We want to use the Gaussian model of height we built in the previous sections, but making the mean height \(\mu_i\) a function of weight, where the weight values are denoted by \(x\). Here is the model:

\(h_i \sim Normal(\mu_i, \sigma)\)

\(\mu_i = \alpha + \beta(x_i - \bar{x})\)

\(\alpha \sim Normal(178, 20)\)

\(\beta \sim Normal(0, 10)\)

\(\sigma \sim Uniform(0,50)\)

Notations:

  • \(\bar{x}\) is the mean of weights

  • \(x_i\) weight at row \(i\)

  • \(\mu_i\) the mean height at row \(i\)

  • \(h_i\) the height at row \(i\)

  • \(\alpha, \beta\) are parameters to learn

Note that all relationships are stochastic except the relationship between the mean height and weight, which is deterministic.

The parameters are made-up devices that help us manipulate \(\mu\). Here is what each parameter does:

  1. \(\alpha\) (intercept): the expected height when \(x_i=\bar{x}\)
  2. \(\beta\) (slope): the rate of change in the expected height when \(x_i\) changes by 1 unit

Priors

  • The unobserved variables are called parameters (\(\alpha, \beta, \sigma\)) and their distributions are called priors.

  • Each combination of parameter values implies a unique line

  • Let’s simulate the prior predictive distribution to see the possible lines

set.seed(2971)
N <- 100 # 100 lines
a <- rnorm(N, 178, 20)
b <- rnorm(N, 0, 10)

# prepare the canvas for plotting
plot(NULL, xlim=range(d2$weight), ylim=c(-100,400), xlab="weight", ylab="height")
abline(h=0, lty=2) # no one is shorter than zero!
abline(h=272, lty=1, lwd=0.5) # the world's tallest person

xbar <- mean(d2$weight)

# simulate the possible lines
for (i in 1:N) curve(a[i] + b[i]*(x-xbar), 
                     from=min(d2$weight),
                     to=max(d2$weight),
                     add=TRUE,
                     col=col.alpha("black", 0.2))

As we can see, not all of these lines plausibly represent the relationship between weight and height for humans. A negative relationship doesn’t make sense in this context.

We want to restrict \(\beta\) to positive values so we only get positive relationships. Therefore, we can define the prior as Log-Normal instead to enforce a positive relationship:

\[ \beta \sim Log-Normal(0,1) \]

b <- rlnorm(1e4, 0, 1)
dens(b, xlim=c(-1,5), adj=0.1)

We can see the distribution is defined only over positive values of \(\beta\).

Now, let’s do the prior predictive simulation again with the new prior:

set.seed(2971)
N <- 100 # 100 lines
a <- rnorm(N, 178, 20)
b <- rlnorm(N, 0, 1) # log-normal prior

# prepare the canvas for plotting
plot(NULL, xlim=range(d2$weight), ylim=c(-100,400), xlab="weight", ylab="height")
abline(h=0, lty=2) # no one is shorter than zero!
abline(h=272, lty=1, lwd=0.5) # the world's tallest person

xbar <- mean(d2$weight)

# simulate the possible lines
for (i in 1:N) curve(a[i] + b[i]*(x-xbar), 
                     from=min(d2$weight),
                     to=max(d2$weight),
                     add=TRUE,
                     col=col.alpha("black", 0.2))

Now, the result is much more sensible!

Finding the posterior distribution

The model is defined now along with the priors. We are now ready to build the posterior approximation using quap

library(rethinking)
data("Howell1")
d <- Howell1
d2 <- d[d$age >= 18,]

xbar <- mean(d2$weight)

# fit the model

m4.3 <- quap(
  alist(
    height ~ dnorm(mu, sigma),
    mu <- a + b * (weight-xbar),
    a ~ dnorm(178, 20),
    b ~ dlnorm(0, 1),
    sigma ~ dunif(0, 50)
  ),
  data=d2
)

To interpret the posterior, we can use either tables or plots. Plots give more information about the posterior. However, let’s first look at the summary table:

precis(m4.3)
             mean         sd        5.5%       94.5%
a     154.6013671 0.27030766 154.1693633 155.0333710
b       0.9032807 0.04192363   0.8362787   0.9702828
sigma   5.0718809 0.19115478   4.7663786   5.3773831

We also need to look at the covariances among the parameters by computing the variance-covariance matrix:

round(vcov(m4.3), 3)
          a     b sigma
a     0.073 0.000 0.000
b     0.000 0.002 0.000
sigma 0.000 0.000 0.037

Plotting the posterior against the data

plot(height~weight, data=d2, col=rangi2)
post <- extract.samples(m4.3)
a_map <- mean(post$a)
b_map <- mean(post$b)
curve(a_map + b_map * (x-xbar), add=TRUE)

post[1:5,]
         a         b    sigma
1 154.5564 0.9000245 4.768499
2 154.4602 0.8474034 5.136055
3 154.7858 0.9440438 5.313512
4 154.4490 0.8806246 4.683555
5 154.4639 0.8397176 5.185017

Uncertainty around the mean

We want to know the uncertainty around the mean of the posterior in order to assess our confidence in the relationship between predictor and outcome, since the line we plotted in the previous step uses only the posterior means of the parameters (the MAP line), which is just the average of the many plausible lines in the posterior.

Here is a sample of possible lines:

post[1:5,]
         a         b    sigma
1 154.5564 0.9000245 4.768499
2 154.4602 0.8474034 5.136055
3 154.7858 0.9440438 5.313512
4 154.4490 0.8806246 4.683555
5 154.4639 0.8397176 5.185017

Let’s see how our confidence about the location of the mean changes with the size of the data. First, we will extract the first 10 cases and re-estimate the model:

N <- 10
dN <- d2[1:N, ]
mN <- quap(
  alist(
    height ~ dnorm(mu, sigma),
    mu <- a+b*(weight - mean(weight)),
    a ~ dnorm(178, 20),
    b ~ dlnorm(0, 1),
    sigma ~ dunif(0, 50)
  ), data=dN
)

Plot 20 of these lines to see what the uncertainty looks like:

post <- extract.samples(mN, n=20)

# plot the 10 sampled cases
plot(dN$weight, dN$height, 
     xlim=range(d2$weight), ylim=range(d2$height),
     col=rangi2, xlab="weight", ylab="height")

mtext(concat("N = ", N))

for(i in 1:20) curve(post$a[i] + post$b[i] * (x-mean(dN$weight)),
                     col=col.alpha("black", 0.3), add=TRUE)

Plotting regression intervals and contours

Let’s find the quadratic posterior distribution of the mean height \(\mu\) when weight is 50 kg. This distribution represents the relative plausibility of different values of the mean:

post <- extract.samples(m4.3)
mu_at_50 <- post$a + post$b * (50-xbar)
dens(mu_at_50, col=rangi2, lwd=2, xlab="mu|weight=50")

Compatibility interval of \(\mu\) at 50 kg is:

PI(mu_at_50, prob = 0.89)
      5%      94% 
158.5839 159.6832 

To do that for all weight values:

mu <- link(m4.3)
str(mu)
 num [1:1000, 1:352] 157 157 157 158 157 ...

The resulting matrix contains 352 columns, each corresponding to one row in the d2 data, and 1000 rows, each representing a sample. Therefore, the matrix contains a distribution of \(\mu\) for each individual in the original data d2.

Let’s plot 100 sampled values of \(\mu\) for each individual:

plot(height ~ weight, d2, type="n")

for(i in 1:100)  points(d2$weight, mu[i,], pch=16, col=col.alpha(rangi2, 0.1))

Each vertical pile of points is the distribution of \(\mu\) for one row.

The plot is kind of messy, so let’s do the same for a smaller set of weight values:

weight.seq <- seq(from=25, to=70, by=1)

mu <- link(m4.3, data=data.frame(weight=weight.seq))

plot(height ~ weight, d2, type="n")

for(i in 1:100)  points(weight.seq, mu[i,], pch=16, col=col.alpha(rangi2, 0.1))

Now, let’s summarize the distribution of mu:

# compute the mean of each column (dimension 2) of the matrix mu
mu.mean <- apply(mu, 2, mean)
mu.PI <- apply(mu, 2, PI, prob=0.89)

# plot the line and the interval
plot(height ~ weight, data=d2, col=col.alpha(rangi2, 0.5))
lines(weight.seq, mu.mean)
shade(mu.PI, weight.seq)

Summary: recipe for generating predictions and intervals from the posterior

  1. Use link to generate posterior distributions of values for \(\mu\)
  2. Use mean or PI to find averages and bounds of \(\mu\) for each value of the predictor variable
  3. Plot the lines and intervals using lines and shade, or plot the distribution of the prediction given the value of the predictor(s)

Prediction intervals

What we’ve done so far is use samples from the posterior to visualize the uncertainty in \(\mu_i\). Now, we want to compute predictions of heights that are distributed according to: \(h_i \sim Normal(\mu_i, \sigma)\)

Let’s simulate heights:

# simulate 1e3 heights per weight value by default
sim.height <- sim(m4.3, data=list(weight=weight.seq))
str(sim.height)
 num [1:1000, 1:46] 146 139 136 135 138 ...

The resulting matrix contains 1000 simulated heights (rows) for 46 weight values (columns). Let’s summarize it:

height.PI <- apply(sim.height, 2, PI, prob=0.89)
height.PI
        [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]     [,8]
5%  128.1853 129.4536 130.1615 131.0184 131.5920 132.3106 133.6148 135.5421
94% 144.4269 145.3213 146.0324 147.5608 148.3459 149.0710 150.5475 151.3630
        [,9]    [,10]    [,11]    [,12]    [,13]    [,14]    [,15]    [,16]
5%  135.8942 136.3520 137.6058 138.6176 139.6518 140.1450 140.8575 141.6930
94% 151.7098 152.9104 153.9036 154.4739 155.3530 156.1087 157.2042 158.1098
       [,17]    [,18]    [,19]    [,20]    [,21]    [,22]    [,23]    [,24]
5%  143.4392 143.9476 144.7856 145.0695 146.4285 147.0674 148.4928 149.2981
94% 158.6873 159.7029 160.9985 161.8457 162.9799 163.1741 164.9025 165.5837
       [,25]    [,26]    [,27]    [,28]    [,29]    [,30]    [,31]    [,32]
5%  150.1580 151.7772 151.4979 152.9998 154.1103 154.4654 155.6601 156.6560
94% 165.9044 166.7547 167.7817 168.9641 169.7019 170.2732 171.7155 171.8713
       [,33]    [,34]    [,35]    [,36]    [,37]    [,38]    [,39]    [,40]
5%  157.6031 158.3161 159.5117 160.4609 160.9952 161.3552 162.1338 163.4090
94% 173.9521 174.8402 175.4895 176.2916 177.2782 178.1800 179.4627 180.3177
       [,41]    [,42]    [,43]    [,44]    [,45]    [,46]
5%  164.5611 165.1676 165.5014 167.6589 168.1717 168.8493
94% 180.4383 181.5567 182.5953 183.3265 184.1630 185.3150

Now, height.PI contains the 89% posterior prediction interval (we could use any interval) of observable heights across the values of weights in weight.seq (i.e. the boundaries of the simulated heights the model expects).

Let’s plot everything:

  1. the average line (the MAP line)
  2. the shaded region of 89% plausible \(\mu\)
  3. the boundaries of the simulated heights the model expects

# plot data points
plot(height ~ weight, d2, col=col.alpha(rangi2, 0.5))

# draw MAP line
lines(weight.seq, mu.mean)

# use border=TRUE because the shaded region does not appear, due to a bug related to the R version
shade(mu.PI, weight.seq,border=TRUE)
shade(height.PI, weight.seq, border = TRUE)

The narrow boundaries that are close to the line are the intervals of \(\mu\). The wider boundary is the region within which the model expects to find 89% of actual heights in the population at each weight.

The roughness around the prediction interval is due to simulation variance. We can reduce it by increasing the number of samples we take from the posterior.

sim.height <- sim(m4.3, data=list(weight=weight.seq), n=1e4)

height.PI <- apply(sim.height, 2, PI, prob=0.89)


# plot data points
plot(height ~ weight, d2, col=col.alpha(rangi2, 0.5))

# draw MAP line
lines(weight.seq, mu.mean)

# use border=TRUE because the shaded region does not appear, due to a bug related to the R version
shade(mu.PI, weight.seq,border=TRUE)
shade(height.PI, weight.seq, border = TRUE)

How sim works

  1. extract samples from the posterior (i.e. parameter values)
  2. use the built-in simulation functions, like rnorm for the Gaussian:
post <- extract.samples(m4.3)

weight.seq <- 25:70
sim.height <- sapply(weight.seq, function(weight) 
  rnorm(
    n=nrow(post),
    mean=post$a + post$b * (weight - xbar),
    sd=post$sigma
  )  
)

And we can summarize it with PI as usual:

height.PI <- apply(sim.height, 2, PI, prob=0.89)

Curves from lines

We can build models that describe the outcome as a curved function of a predictor using linear regression. Here are the common methods:

  1. Polynomial regression
  2. B-Splines

Polynomial regression

The following data appear to follow a curved relationship:

library(rethinking)
data("Howell1")
d <- Howell1
plot(height~weight, d)

We can use the parabolic equation to represent the mean height: \(\mu_i = \alpha + \beta_1 x_i + \beta_2 x_i^2\)

The last parameter \(\beta_2\) measures the curvature of the relationship.

Because polynomial equations involve computing squares or cubes of large numbers, we need to standardize the predictor values in order to avoid numerical errors when computing the estimates. To standardize the weight values we do the following:

\[ x_{std.} = \frac{x - \mu_x}{\sigma_x} \]

This standardized unit is called a z-score. However, we will keep writing \(x\) instead of \(x_{std.}\) in the following sections.
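As a quick sketch, base R’s scale() computes the same z-scores:

# standardize by hand
weight_s <- (d$weight - mean(d$weight)) / sd(d$weight)

# scale() centers and divides by the standard deviation, giving identical values
weight_s_alt <- as.numeric(scale(d$weight))

# both have mean ~0 and sd 1
round(c(mean(weight_s), sd(weight_s)), 3)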

This is the definition of our model:

\(h_i \sim Normal(\mu_i, \sigma)\)

\(\mu_i = \alpha + \beta_1 x_i + \beta_2 x_i^2\)

\(\alpha \sim Normal(178, 20)\)

\(\beta_1 \sim Log-Normal(0, 1)\)

\(\beta_2 \sim Normal(0, 1)\)

\(\sigma \sim Uniform(0, 50)\)

Note that it is okay to have negative values for \(\beta_2\).

Let’s code that and fit the model to our data:

weight_s <- (d$weight - mean(d$weight)) / sd(d$weight)
weight_s2 <- weight_s ^ 2

m4.5 <- quap(
  alist(
    height ~ dnorm(mu, sigma),
    mu <- a + b1*weight_s + b2*weight_s2,
    a ~ dnorm(178, 20),
    b1 ~ dlnorm(0, 1),
    b2 ~ dnorm(0, 1),
    sigma ~ dunif(0, 50)
  ), data=d
)
precis(m4.5)
            mean        sd       5.5%      94.5%
a     146.056799 0.3689795 145.467099 146.646500
b1     21.733299 0.2888903  21.271596  22.195001
b2     -7.803207 0.2741846  -8.241407  -7.365007
sigma   5.774482 0.1764657   5.492455   6.056508

Let’s summarize the prediction and plot it:

weight.seq <- seq(from=-2.2, to=2, length.out=30)
pred_dat <- list(weight_s=weight.seq, weight_s2=weight.seq^2)

# compute predictions of mu for pred_dat as input
mu <- link(m4.5, data=pred_dat)
mu.mean <- apply(mu, 2, mean)
mu.PI <- apply(mu, 2, PI, prob=0.89)

# simulate height values
sim.height <- sim(m4.5, data=pred_dat)
height.PI <- apply(sim.height, 2, PI, prob=0.89)

plot(height ~ weight_s, d, col=col.alpha(rangi2, 0.5))
lines(weight.seq, mu.mean)
shade(mu.PI, weight.seq)
shade(height.PI, weight.seq)

Remember that we are now working with the full data, including both adults and non-adults; that’s why the relationship is not linear as it was with the adults-only data.

Let’s try building a cubic regression on weight:

\(h_i \sim Normal(\mu_i, \sigma)\)

\(\mu_i = \alpha + \beta_1 x_i + \beta_2 x_i^2 + \beta_3 x_i^3\)

\(\alpha \sim Normal(178, 20)\)

\(\beta_1 \sim Log-Normal(0, 1)\)

\(\beta_2 \sim Normal(0, 1)\)

\(\beta_3 \sim Normal(0, 1)\)

\(\sigma \sim Uniform(0,50)\)

weight_s3 <- weight_s ^ 3

m4.6 <- quap(
  alist(
    height ~ dnorm(mu, sigma),
    mu <- a + b1*weight_s + b2*weight_s2 + b3*weight_s3,
    a ~ dnorm(178, 20),
    b1 ~ dlnorm(0, 1),
    b2 ~ dnorm(0, 1),
    b3 ~ dnorm(0, 1),
    sigma ~ dunif(0, 50)
  ), data=d
)
precis(m4.6)
            mean        sd       5.5%      94.5%
a     146.394536 0.3099867 145.899117 146.889955
b1     15.219713 0.4762646  14.458550  15.980875
b2     -6.202616 0.2571579  -6.613604  -5.791628
b3      3.583387 0.2287731   3.217763   3.949010
sigma   4.829882 0.1469421   4.595040   5.064724
weight.seq <- seq(from=-2.2, to=2, length.out=30)
pred_dat_m4.6 <- list(weight_s=weight.seq, weight_s2=weight.seq^2, weight_s3=weight.seq^3)
mu <- link(m4.6, data=pred_dat_m4.6)
mu.mean <- apply(mu, 2, mean)
mu.PI <- apply(mu, 2, PI, prob=0.89)

sim.height <- sim(m4.6, pred_dat_m4.6)
height.PI <- apply(sim.height, 2, PI, prob=0.89)

plot(height ~ weight_s, d, col=col.alpha(rangi2, 0.5))
lines(weight.seq, mu.mean)
shade(mu.PI, weight.seq)
shade(height.PI, weight.seq)

The cubic model is more flexible than the others, and that’s why it fits well. However, we still have these issues with our models:

  1. Having a better fit \(\neq\) Having a better model
  2. All the models we built so far contain no biological information. We haven’t learned any causal relationship so far

The models are good geocentric models, meaning they describe the sample well.

Note that the x-axis contains the standardized weight values. To convert back to the natural scale, we need to remove the default axis and build it explicitly:

plot(height ~ weight_s, d, col=col.alpha(rangi2, 0.5), xaxt="n")
lines(weight.seq, mu.mean)
shade(mu.PI, weight.seq)
shade(height.PI, weight.seq)

at <- c(-2,-1,0,1,2)
# convert z-scores to weight values
labels <- at*sd(d$weight) + mean(d$weight)
axis(side=1, at=at, labels=round(labels, 1))

Splines

B-Spline stands for basis spline. It means that we can build wiggly functions from simpler, less-wiggly components called basis functions.

We will use a dataset covering more than a thousand years of cherry blossom days:

library(rethinking)
data("cherry_blossoms")
d <- cherry_blossoms
precis(d, hist=FALSE)
                  mean          sd      5.5%      94.5%
year       1408.000000 350.8845964 867.77000 1948.23000
doy         104.540508   6.4070362  94.43000  115.00000
temp          6.141886   0.6636479   5.15000    7.29470
temp_upper    7.185151   0.9929206   5.89765    8.90235
temp_lower    5.098941   0.8503496   3.78765    6.37000

The B-Spline model

d2 <- d[complete.cases(d),]
num_knots <- 15
knot_list <- quantile(d2$year, probs=seq(0, 1, length.out=num_knots))
library(splines)
# create B-spline basis matrix 
B <- bs(d2$year, 
        knots=knot_list[-c(1, num_knots)], # -c(1, num_knots) means exclude the 1st and last element
        degree=3,
        intercept=TRUE)
# Create an empty plot with specified axes
{
  plot(NULL, xlim=range(d2$year), ylim=c(0,1), xlab="year", ylab="basis", type="n")
  
  # Plot knots
  for (knot in knot_list) {
      # Add a vertical line for each knot
      abline(v = knot, col = "red", lty = 2, lwd = 2)
  }
  
  # Plot each column in the basis matrix against year
  for (i in 1:ncol(B)) {
      # Add lines for each column
      lines(d2$year, B[, i])
  }
}



Building the model with quap

m4.7 <- quap(
  alist(
    D ~ dnorm(mu, sigma),
    mu <- a + B %*% w, # matrix multiplication
    a ~ dnorm(100, 10),
    w ~ dnorm(0, 1),
    sigma ~ dexp(1)
  ), 
  data=list(D=d2$doy, B=B),
  start=list(w=rep(0, ncol(B)))
)

Let’s look at the posterior means:

precis(m4.7)
17 vector or matrix parameters hidden. Use depth=2 to show them.
            mean        sd       5.5%      94.5%
a     104.705197 0.3330664 104.172893 105.237502
sigma   6.075087 0.1541852   5.828669   6.321504
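As the message above suggests, we can inspect the 17 basis weights as well by increasing the depth:

precis(m4.7, depth=2)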

Let’s plot the posterior predictions:

post <- extract.samples(m4.7)
# posterior mean of each basis weight

w <- apply(post$w, 2, mean)

plot(NULL, xlim=range(d2$year), ylim=c(-4,4),
     xlab="year", ylab="basis * weight")

# plot the basis * weight for each column
for (i in 1:ncol(B)) lines(d2$year, w[i]*B[,i])

# plot knots
for (knot in knot_list) {
    abline(v = knot, col = "red", lty = 2, lwd = 2)
}

# 97% posterior interval for mu at each year
mu <- link(m4.7)
mu.PI <- apply(mu, 2, PI, 0.97)
plot(d2$year, d2$doy, col=col.alpha(rangi2, 0.3), pch=16)
shade(mu.PI, d2$year, col=col.alpha("black", 0.5))
abline(h = mean(d2$doy), col="black")

Practice

E

In the following model: \(y_i \sim Normal(\mu, \sigma)\)

\(\mu \sim Normal(0, 10)\)

\(\sigma \sim Exponential(1)\)

  1. The likelihood is \(L = \prod_i P(y_i | \mu, \sigma)\)
  2. Two parameters
  3. The Bayes theorem for this model is : \[ P(\mu, \sigma | y) \propto \prod_i P(y_i|\mu, \sigma) P(\mu) P(\sigma) \]

M

For the following model:

\(y_i \sim Normal(\mu, \sigma)\)

\(\mu \sim Normal(0, 10)\)

\(\sigma \sim Exponential(1)\)

  1. Simulate the observed y values from the prior
mu.sample <- rnorm(1e3, 0, 10)
sigma.sample <- rexp(1e3, 1)
y.sim <- rnorm(1e3, mu.sample, sigma.sample)

dens(y.sim)

  2. Translate the model into a quap formula
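A possible translation (a sketch; y stands for the observed outcome variable):

m.formula <- alist(
  y ~ dnorm(mu, sigma),
  mu ~ dnorm(0, 10),
  sigma ~ dexp(1)
)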

H

library(rethinking)
library(tidyverse)
data("Howell1")
d <- Howell1

xbar <- mean(d$weight)

model <- quap(
  alist(
    height ~ dnorm(mu, sigma),
    mu  <-  a + b * (weight - xbar),
    a ~ dnorm(178, 20),
    b ~ dlnorm(0, 1),
    sigma ~ dunif(0, 50)
  ), data=d
)

weights <- c(46.95, 43.72, 64.78, 32.59, 54.63)

# `extract.samples`: sample parameter values from the posterior
# `link`: estimate the mean height for each weight
# `sim`: estimate observations, i.e. sample estimated values

heights <- sim(model, data=data.frame(weight=weights))
heights.mean <- apply(heights, 2, mean)
heights.PI <- apply(heights, 2, PI)
result <- tibble(weight=weights,
                 expected_height=heights.mean,
                 low=heights.PI[1,],
                 hi=heights.PI[2,])

result
# A tibble: 5 x 4
  weight expected_height   low    hi
   <dbl>           <dbl> <dbl> <dbl>
1   47.0            158.  143.  173.
2   43.7            153.  138.  168.
3   64.8            190.  175.  204.
4   32.6            133.  118.  148.
5   54.6            172.  157.  187.
ggplot(result, aes(weight, expected_height, ymin=low, ymax=hi)) +
  geom_point(size=0.5) +
  geom_linerange()

  1. Fit the model to data with ages < 18
d2 <- d[d$age < 18, ]

xbar2 <- mean(d2$weight)

model2 <- quap(
  alist(
    height ~ dnorm(mu, sigma),
    mu  <-  a + b * (weight - xbar2),
    a ~ dnorm(xbar2, 20),
    b ~ dlnorm(0, 1),
    sigma ~ dunif(0, 50)
  ), 
  data=d2,
  start=list(a=mean(d2$height), b=3)
)

(a): For every 10 units of increase in weight, how much taller does the model predict a child gets?

precis(model2)
            mean         sd       5.5%      94.5%
a     108.235622 0.60868503 107.262826 109.208418
b       2.716672 0.06831628   2.607489   2.825855
sigma   8.437275 0.43058790   7.749112   9.125437

When the weight equals the mean, the expected height (a) is 108.2 cm. For every change in weight of 10 kg, the height is expected to change by about 27 cm, with an 89% PI of roughly 26 to 28 cm.
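A quick check of that interval from the posterior samples (a sketch):

post2 <- extract.samples(model2)

# implied change in height for a 10 kg change in weight
PI(post2$b * 10, prob=0.89)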

(b): plot the data, the MAP regression line with its 89% PI, and the 89% PI for predicted heights

{
  # plot d2 data
  plot(height ~ weight, data=d2, col=col.alpha(rangi2, 0.5), 
       ylim=c(50, 200), xlim=c(1, 50))
    
  weight.seq <- seq(from=4, to=45, by=1)
  
  # sample values of mean 
  mu <- link(model2, data=data.frame(weight=weight.seq))
  # expected mu and 89% PI
  mu.mean <- apply(mu, 2, mean)
  mu.PI <- apply(mu, 2, PI, prob=0.89)
  
  # plot the line and the PI 
  lines(weight.seq, mu.mean)
  shade(mu.PI, weight.seq)
  
  # simulate predicted heights
  sim.height <- sim(model2, data=list(weight=weight.seq))
  
  # plot the 89% prediction interval for heights
  heights.PI <- apply(sim.height, 2, PI, prob=0.89)
  shade(heights.PI, weight.seq)

}