# ------------------------------
#        ESERCITAZIONE 3
#    Hierarchical Model (RJags)
# ------------------------------

# In order to understand how many resources to allocate for next
# year (rooms, canteens ...) a university needs to make a prediction
# on how many students will pass the admission test next year.
# In order to do so, the university relies on some data collected 
# this year, in particular the score (pass/not pass) of each student
# of each high school.
#
# Assumptions:
# - Number of students applying for the university is random between
#   10 and 30
# - Each student has a probability to pass the test p[s] (s is the index
#   of the school) which is proportional to how good is the high school
#   he comes from
# - It is supposed that p[s] of each school depends on the quality of instruction
#   in the whole country
#
# Goal:
# - Estimate how many resources the university should allocate next year
#   to be confident enough to provide good services for everyone.
# - Study how the number of schools estimated for next year impact on
#   the predictions.
# - Study how the latter changes if we have prior information regarding
#   the quality of instruction
# - Study how the number of schools used for the analysis impacts on
#   the predictions.



library(R2jags)
library(rjags)



###################
# DATA GENERATION #
###################

set.seed(1234)

n.schools <- 20
p.glob.true <- 0.4
a.true <- 4

N <- round(runif(n.schools, 10, 30))
p <- rbeta(n.schools, a.true, ((1-p.glob.true)/p.glob.true)*a.true)

print(mean(p))

p.vec <- c()
p.obs <- c()
school <- c()

for(i in 1:n.schools){
  p.vec <- c(p.vec, rep(p[i], N[i]))
  school <- c(school, rep(i, N[i]))
}

pass <- rbinom(sum(N), 1, p.vec)
df <- data.frame(pass=pass, school=school)

for(i in 1:n.schools){
  p.obs[i] <- mean(df$pass[df$school==i])
}


par(mfrow=c(1,1))
plot(seq(1,n.schools), p, pch=16, col="black", ylim=c(0,1),
     xlab = "schools", ylab = "True Probabilities", main="Data generated")
points(seq(1,n.schools), p.obs, pch=5, col="black")
abline(h=mean(p), lty=2, col="red")
legend("bottom", c("Observed P", "True P"), pch=c(5,16))

print(df)




#######################
# MODEL SPECIFICATION #
#######################

model <- "
     model{
         
         # Priors
         
         p.glob ~ dbeta(a.glob, b.glob)
         a ~ dunif(1,100)
         for(i in 1:n.schools){
           p[i] ~ dbeta(a, ((1-p.glob)/p.glob)*a)
         }
         
         
         # Likelihood
     
         for(i in 1:n.students){
               y[i] ~ dbin(p[school[i]],1)
         }
     }

"
parameters <- c("p", "p.glob", "a")


################################
# TYPES OF PRIOR DISTRIBUTIONS #
################################

# Function to estimate Beta parameters given mean and variance
estBetaParams <- function(mu, var) {
  alpha <- ((1 - mu) / var - 1 / mu) * mu ^ 2
  beta <- alpha * (1 / mu - 1)
  return(params = list(alpha = alpha, beta = beta))
}

a.glob <- c()
b.glob <- c()

mus.glob <- c(0.5, 0.9, 0.1)
vars.glob <- c(0.288^2, 0.05^2, 0.05^2)

for(i in 1:length(mus.glob)){
  BetaPar <- estBetaParams(mus.glob[i], vars.glob[i])
  a.glob[i] <- BetaPar$alpha
  b.glob[i] <- BetaPar$beta
}


xx <- seq(0,1,0.0001)
prior.vague <- dbeta(xx, a.glob[1], b.glob[1])
prior.optim <- dbeta(xx, a.glob[2], b.glob[2])
prior.pessim <- dbeta(xx, a.glob[3], b.glob[3])

par(mfrow=c(1,3))
plot(xx, prior.vague, type="l", main="Vague Prior")
plot(xx, prior.optim, type="l", main="Optimistic Prior")
plot(xx, prior.pessim, type="l", main="Pessimistic Prior")





#######################
# POSTERIOR INFERENCE #
#######################

n.iter <- 10000

dati <- list(y=df$pass, school=df$school, n.schools=length(p),
             n.students=nrow(df), a.glob=a.glob[1], b.glob=b.glob[1])

post.sample <- coda.samples( model= jags.model(file = textConnection(model), 
                                               data = dati), variable.names = parameters, n.iter = n.iter,
                             n.chains = 1, thin =1 )

print(summary(post.sample))




###############
# PREDICTIONS #
###############

ns.list <- c(5,10,20,50)
n.sim.pred <- 1000
perc.amm <- array(NA, dim=c(length(ns.list),n.sim.pred))

for(ns in 1:length(ns.list)){
  n.schools.new <- ns.list[ns]
  for(k in 1:n.sim.pred){
    
    p.glob <- post.sample[[1]][k,n.schools+2]
    a <- median(post.sample[[1]][k,1])
    
    N.new <- round(runif(n.schools.new, 10, 30))
    p.new <- rbeta(n.schools.new, a, ((1-p.glob)/p.glob)*a)
    
    p.pred <- c()
    pass.pred <- c()
    
    for(i in 1:n.schools.new){
      p.pred <- c(p.pred, rep(p.new[i], N.new[i]))
    }
    
    pass.pred <- rbinom(sum(N.new), 1, p.pred)
    perc.amm[ns,k] <- round(mean(pass.pred), 3)
  }
}



# Density Plots
par(mfrow=c(1,1))
colors <- rainbow(length(ns.list))
plot(density(perc.amm[1,]), col=colors[1],  lwd=1.5, ylim=c(0,12), main="Superamento Medio Test")
for(i in 2:length(ns.list)){
  lines(density(perc.amm[i,]), col=colors[i], lwd=1.5)
}

legend("topright", c("5 schools", "10 schools", "20 schools", "50 schools"),
       lty=1, col=colors)



# Summary Table
summary.table <- data.frame(array(NA, dim=c(length(ns.list), 4)))
for(i in 1:length(ns.list)){
  summary.table[i,] <- quantile(perc.amm[i,], c(0.5, 0.8, 0.9, 0.95), names = FALSE)
}

rownames(summary.table) <- c("5 schools", "10 schools", "20 schools", "50 schools")
colnames(summary.table) <- c("Median", "80%", "90%", "95%")

print(summary.table)



Embed on website

To embed this project on your website, copy the following code and paste it into your website's HTML: