set.seed(12345)
library(lhs)
library(ggplot2)
library(dplyr)
library(tidyr)
library(readr)
#set up mogp and ExeterUQ_MOGP (paths will be different for you)
mogp_dir <- "~/MoGP/mogp_emulator-devel"
setwd("~/MoGP/ExeterUQ_MOGP-devel")
source("~/MoGP/ExeterUQ_MOGP-devel/BuildEmulator/BuildEmulator.R")
#get simulator data
setwd("~/My_Files/Covid/uq4covid.github.io/vignettes/")
design <- readRDS("data/EnsembleZero/inputs/design.rds")
parRanges <- readRDS("data/EnsembleZero/inputs/parRanges.rds")
Hprev_12 <- as_tibble(readRDS("~/My_Files/Covid/Initial Calibration/Hprev_12.rds")) # see the EnsembleZero vignette for info on how to get this data
#tidy up
Ward_Lookup <- read_csv("data/EnsembleZero/Ward_Lookup.csv")
names(Ward_Lookup)[11] <- "ward"
Ward_Lookup <- Ward_Lookup[,-c(3:7,10)]
NewOut <- inner_join(Ward_Lookup, Hprev_12, by="ward")
#match to hospital trusts
WD11ToAcuteTrustIncWalesHB <- read_csv("data/EnsembleZero/WD11ToAcuteTrustIncWalesHB.csv")
NewWithHospital <- inner_join(WD11ToAcuteTrustIncWalesHB, NewOut, by=c("WD11CD", "WD11NM"))
TrustData <- group_by(NewWithHospital, trustId, trustName, output, replicate) %>%
summarise(Hprev=mean(Hprev)) %>%
ungroup()
#put in nice format for emulation
EmulateOut <- dplyr::select(TrustData, output, trustId, replicate, Hprev) %>%
  pivot_wider(names_from = trustId, values_from = Hprev) %>%
  dplyr::select(-replicate)
ToEmulate <- inner_join(design, EmulateOut, by="output")
head(ToEmulate)
## # A tibble: 6 x 152
## r_zero incubation_time infectious_time hospital_time critical_time
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.801 0.746 0.952 0.574 0.940
## 2 0.801 0.746 0.952 0.574 0.940
## 3 0.801 0.746 0.952 0.574 0.940
## 4 0.801 0.746 0.952 0.574 0.940
## 5 0.801 0.746 0.952 0.574 0.940
## 6 0.801 0.746 0.952 0.574 0.940
## # ... with 147 more variables: lock_1_restrict <dbl>, lock_2_release <dbl>,
## # pEA <dbl>, pIH <dbl>, pIRprime <dbl>, pHC <dbl>, pHRprime <dbl>, pCR <dbl>,
## # GP_A <dbl>, GP_H <dbl>, GP_C <dbl>, output <chr>, repeats <dbl>,
## # `7A1` <dbl>, `7A2` <dbl>, `7A3` <dbl>, `7A4` <dbl>, `7A5` <dbl>,
## # `7A6` <dbl>, R0B <dbl>, R1F <dbl>, R1H <dbl>, RA2 <dbl>, RA3 <dbl>,
## # RA4 <dbl>, RA7 <dbl>, RA9 <dbl>, RAE <dbl>, RAJ <dbl>, RAL <dbl>,
## # RAP <dbl>, RAS <dbl>, RAX <dbl>, RBD <dbl>, RBK <dbl>, RBL <dbl>,
## # RBN <dbl>, RBT <dbl>, RBZ <dbl>, RC1 <dbl>, RC9 <dbl>, RCB <dbl>,
## # RCD <dbl>, RCF <dbl>, RCU <dbl>, RCX <dbl>, RD1 <dbl>, RD3 <dbl>,
## # RD8 <dbl>, RDD <dbl>, RDE <dbl>, RDU <dbl>, RDZ <dbl>, REF <dbl>,
## # REM <dbl>, RF4 <dbl>, RFF <dbl>, RFR <dbl>, RFS <dbl>, RGN <dbl>,
## # RGP <dbl>, RGR <dbl>, RGT <dbl>, RH5 <dbl>, RH8 <dbl>, RHM <dbl>,
## # RHQ <dbl>, RHU <dbl>, RHW <dbl>, RJ1 <dbl>, RJ2 <dbl>, RJ6 <dbl>,
## # RJ7 <dbl>, RJC <dbl>, RJE <dbl>, RJL <dbl>, RJN <dbl>, RJR <dbl>,
## # RJZ <dbl>, RK5 <dbl>, RK9 <dbl>, RKB <dbl>, RKE <dbl>, RL4 <dbl>,
## # RLQ <dbl>, RLT <dbl>, RM1 <dbl>, RM3 <dbl>, RMC <dbl>, RMP <dbl>,
## # RN3 <dbl>, RN5 <dbl>, RN7 <dbl>, RNA <dbl>, RNN <dbl>, RNQ <dbl>,
## # RNS <dbl>, RNZ <dbl>, RP5 <dbl>, RPA <dbl>, ...
And prepare for emulation
#split into training and testing
data <- ToEmulate[801:4000,]
data_valid <- ToEmulate[1:800,]
#extract Xs and ys
X <- data[,1:16]
X_valid <- data_valid[,1:16]
y <- data[,19:ncol(data)]
y_valid <- data_valid[,19:ncol(data_valid)]
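This split relies on the replicate structure of the ensemble: each unique design point is run 20 times (we use this again when validating later). A quick check, assuming that structure:
#800 validation rows / 20 replicates per design point = 40 unique inputs
nrow(unique(X_valid)) #expected: 40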
#sqrt transform, so that predictions (once squared back) are guaranteed positive
y <- sqrt(y)
y_valid <- sqrt(y_valid)
#and also standardise each output column (using the training means and sds for both sets)
ymean <- apply(y, 2, mean)
ysd <- apply(y, 2, sd)
y <- (y - ymean[col(y)]) / ysd[col(y)]
y_valid <- (y_valid - ymean[col(y_valid)]) / ysd[col(y_valid)]
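For later reference: predictions made on this transformed scale can be pushed back to the original Hprev scale by undoing the standardisation and squaring, which is what guarantees positivity. A minimal sketch (the helper back_to_hprev is introduced here for illustration and isn't used below):
#undo the standardisation for output column j, then undo the sqrt
#(squaring guarantees the back-transformed predictions are non-negative)
back_to_hprev <- function(y_std, j) (y_std * ysd[j] + ymean[j])^2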
##First emulator is quantile kriging (QK)
#We're going to use ExeterUQ_MOGP as the backend for this, so the data needs to be
#further tidied up and rearranged into the correct format for QK
#get location names
tnames <- names(data)[-c(1:18)]
tnames[1:5]
## [1] "7A1" "7A2" "7A3" "7A4" "7A5"
tnames <- paste("trust", tnames, sep = "_") #add "trust" to the beginning of each name
#and ensure no weird characters (strip spaces, hyphens, commas and apostrophes)
tnames <- gsub("[ ,'-]", "", tnames)
#then we use the following function to rearrange the data so that the empirical
#quantiles become an extra input (for a handful of quantile levels)
#code stolen from Danny, which was in turn modified from code of mine
QK_data <- function(X, y){
  full_output <- as.matrix(y)
  final_df <- NULL
  for(i in 1:ncol(full_output)){
    y1 <- full_output[,i]
    quantile_df <- data.frame(X = X, y1 = y1)
    #collapse the replicates at each design point into a few empirical quantiles
    quantile_df <- quantile_df %>% group_by_at(vars(names(quantile_df)[1:ncol(X)])) %>%
      summarise("0.025" = quantile(y1, 0.025), "0.25" = quantile(y1, 0.25),
                "0.5" = quantile(y1, 0.5), "0.75" = quantile(y1, 0.75),
                "0.975" = quantile(y1, 0.975))
    #stack, so that Quantile becomes an input column
    quantile_df <- quantile_df %>% gather(Quantile, y1, "0.025":"0.975")
    names(quantile_df)[ncol(quantile_df)] <- tnames[i]
    if(is.null(final_df)) final_df <- quantile_df
    else final_df <- merge(final_df, quantile_df)
  }
  final_df
}
#then use this function to get the data, and prepare it for ExeterUQ_MOGP
final_df <- QK_data(X, y[,1:3]) #(for speed, let's just consider the first few outputs for now)
Noise <- rnorm(nrow(final_df), 0, 0.4) #ExeterUQ needs artificial noise for building the prior mean function
tData <- cbind(final_df[,1:17], Noise, final_df[,18:ncol(final_df)]) #ExeterUQ input format
tData$Quantile <- as.numeric(tData$Quantile)
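Before fitting, a quick look at the reshaped data confirms the layout (output not shown; an added sanity check using the names constructed above):
#the 16 design inputs plus Quantile are the first 17 columns; each design point appears 5 times
head(tData[, c("Quantile", "Noise", tnames[1])])
table(tData$Quantile) #should be balanced across 0.025, 0.25, 0.5, 0.75, 0.975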
#now build the QK emulators for each output location
MetaEmulatorsAll <- BuildNewEmulators(tData, HowManyEmulators = ncol(tData) - 18,
                                      meanFun = "linear", kernel = "Matern52",
                                      additionalVariables = names(tData)[1:17])
#also extract the nuggets (note: I'm using a slightly older version of mogp where you
#have to do this manually; in newer versions you don't)
nuggets <- rep(NA, 3)
for (i in 1:3){
  nuggets[i] <- MetaEmulatorsAll$mogp$emulators[[i]]$nugget
}
And see how this performs
#And some rudimentary validation
#to validate a QK, we need to sample: using point estimates for a few selected quantiles
#doesn't account for the epistemic uncertainty in the predictions, making the emulator seem worse than it is
M = 1000 #how many samples
X_pred <- unique(X_valid) #only want to predict for each unique point
pred_samples <- list()
for (i in 1:nrow(X_pred)){ #for every prediction point
  #get the emulator inputs (sample quantile levels uniformly and pair each with x):
  quantiles_pred <- runif(M, 0, 1)
  X_pred_sample <- cbind(X_pred[rep(i, M), ], Quantile = quantiles_pred)
  pred <- MetaEmulatorsAll$mogp$predict(data.frame(X_pred_sample)) #make predictions
  samples <- matrix(NA, nrow = M, ncol = 3)
  for (j in 1:3){ #then sample from the predictive normal (once per sample, per output)
    #mogp's "unc" is a predictive variance, so add the nugget and take the sqrt for the sd
    samples[,j] <- rnorm(M, pred$mean[j,], sqrt(pred$unc[j,] + nuggets[j]))
  }
  pred_samples[[i]] <- samples
}
#now obtain validation plots
m_pred5 <- t(sapply(pred_samples, function(x) apply(x, 2, quantile, 0.5))) #get intervals from the sampled distribution
m_pred975 <- t(sapply(pred_samples, function(x) apply(x, 2, quantile, 0.975)))
m_pred025 <- t(sapply(pred_samples, function(x) apply(x, 2, quantile, 0.025)))
m_pred75 <- t(sapply(pred_samples, function(x) apply(x, 2, quantile, 0.75)))
m_pred25 <- t(sapply(pred_samples, function(x) apply(x, 2, quantile, 0.25)))
#we repeat each prediction 20 times, as each validation design point is replicated 20 times
m_pred5 <- matrix(rep(m_pred5, each=20), ncol=ncol(m_pred5))
m_pred975 <- matrix(rep(m_pred975, each=20), ncol=ncol(m_pred975))
m_pred025 <- matrix(rep(m_pred025, each=20), ncol=ncol(m_pred025))
m_pred75 <- matrix(rep(m_pred75, each=20), ncol=ncol(m_pred75))
m_pred25 <- matrix(rep(m_pred25, each=20), ncol=ncol(m_pred25))
#Try Location 1
loc = 1
#within?
within = ((y_valid[,loc]) >= m_pred025[,loc]) & ((y_valid[,loc]) <= m_pred975[,loc])
ggplot()+
geom_errorbar(aes(x=X_valid[,1][[1]], ymin = m_pred025[,loc], ymax =m_pred975[,loc] ))+
geom_errorbar(aes(x=X_valid[,1][[1]], ymin = m_pred25[,loc], ymax =m_pred75[,loc] ))+
geom_point(aes(x=X_valid[within,1][[1]], y = y_valid[within,loc]), colour = "blue")+
geom_point(aes(x=X_valid[!within,1][[1]], y = y_valid[!within,loc]), colour = "red")+
geom_point(aes(x=X_valid[,1][[1]], y = m_pred5[,loc]))+
ylab("y (for site 1)")+
xlab("x_1")
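A one-number summary of interval calibration backs up the plot (an added check: for a well-calibrated emulator these should be close to 0.95 and 0.5 respectively):
mean(within) #empirical coverage of the nominal 95% interval
mean(y_valid[,loc] >= m_pred25[,loc] & y_valid[,loc] <= m_pred75[,loc]) #nominal 50%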
This one sucks: in numerous places it misses the “mean” of the output, and it almost always gets the distribution wrong as well.
#fit hetGP instead
library(hetGP)
fit <- mleHetGP(as.matrix(X), as.matrix(y[,1])) #just use 1 output (for speed, and it's quicker to implement)
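If we wanted more than one output, the simplest (if slow) extension would be an independent hetGP per column; a sketch under that assumption, not run here:
#one independent heteroscedastic GP per output column (illustrative only; slow for many trusts)
fits <- lapply(1:3, function(j) mleHetGP(as.matrix(X), as.matrix(y[, j])))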
And how does this look?
#now do validation
pred <- predict(fit, as.matrix(X_valid))
m_pred5 <- pred$mean
m_pred975 <- pred$mean + 2*sqrt(pred$nugs+pred$sd2)
m_pred025 <- pred$mean - 2*sqrt(pred$nugs+pred$sd2)
#within?
within = ((y_valid[,loc]) >= m_pred025) & ((y_valid[,loc] <= m_pred975))
ggplot()+
geom_errorbar(aes(x=X_valid[,1][[1]], ymin = m_pred025, ymax = m_pred975))+
geom_point(aes(x=X_valid[within,1][[1]], y = y_valid[within,loc]), colour = "blue")+
geom_point(aes(x=X_valid[!within,1][[1]], y = y_valid[!within,loc]), colour = "red")+
geom_point(aes(x=X_valid[,1][[1]], y = m_pred5))+
ylab("y (for site 1)")+
xlab("x_1")
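The empirical coverage makes this concrete (an added one-line check; a calibrated 95% interval should cover roughly 95% of points, and grossly inflated variance pushes this towards 1):
mean(within) #empirical coverage of the nominal 95% interval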
This one also sucks: the variance is grossly overestimated… (It is also not fast to fit.)
So, in desperation, we try a neural network.
final_df <- QK_data(X, y[,1:20]) #more outputs now, as a NN can use the same latent nodes for many outputs
#(note: using all of them is then not fast, so we just use 20 here)
tData <- final_df #ExeterUQ-style input, but no artificial Noise column is needed this time
tData$Quantile <- as.numeric(tData$Quantile)
library(neuralnet) #ah, the wonders of packages: we can build a NN just as mindlessly as we would a linear regression...
formula <- as.formula(paste(paste(colnames(tData)[18:ncol(tData)], collapse = " + "),
                            " ~ ", paste(colnames(tData)[1:17], collapse = " + ")))
nn <- neuralnet(formula, data = tData, hidden = 3, act.fct = "tanh")
And how does this look?
#don't need to sample here, as there's no epistemic uncertainty in the NN's point predictions
#(note: predict.nn matches columns by name, so reuse the training input names from tData)
X_nn <- function(q) setNames(data.frame(X_valid, q), colnames(tData)[1:17])
m_pred5 <- predict(nn, X_nn(0.5)) #median estimate
m_pred975 <- predict(nn, X_nn(0.975))
m_pred025 <- predict(nn, X_nn(0.025))
m_pred75 <- predict(nn, X_nn(0.75))
m_pred25 <- predict(nn, X_nn(0.25))
within = ((y_valid[,loc]) >= m_pred025[,loc]) & ((y_valid[,loc]) <= m_pred975[,loc])
ggplot()+
geom_point(aes(x=X_valid[within,1][[1]], y = y_valid[within,loc]), colour = "blue")+
geom_point(aes(x=X_valid[!within,1][[1]], y = y_valid[!within,loc]), colour = "red")+
geom_point(aes(x=X_valid[,1][[1]], y = m_pred5[,loc]))+
geom_errorbar(aes(x=X_valid[,1][[1]], ymin = m_pred025[,loc], ymax =m_pred975[,loc] ))+
geom_errorbar(aes(x=X_valid[,1][[1]], ymin = m_pred25[,loc], ymax =m_pred75[,loc] )) +
ylab("y (for site 1)")+
xlab("x_1")
And this is terrible (not really surprising without any degree of tuning).
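An obvious next step would be some actual tuning. A hedged sketch of what that might look like with neuralnet (the settings below are arbitrary illustrations, not recommendations, and untested here):
#a wider/deeper network with multiple random restarts (arbitrary, untested settings)
nn_tuned <- neuralnet(formula, data = tData, hidden = c(10, 10),
                      act.fct = "tanh", rep = 3, stepmax = 1e6)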