### R script for Introduction to Bayesian Analysis with R-INLA ### Day 2, ESAC Workshop 2017, Madrid ### ### Instructions ### (1) Step through the completed code with default parameterisations; ### questions in comments are there to check that *you* understand how ### the models are structured ### ### Further info: Dr Ewan Cameron library(INLA) # install.packages("INLA", repos="https://inla.r-inla-download.org/R/stable", dep=TRUE) library(rgeos) # install.packages("rgeos") library(quantreg) # install.packages(c("rgeos","quantreg")) ## Extract a nice clean sample of galaxies to play with from Nair & Abraham (2010) nair.data <- read.csv("NairAbrahamMorphology.cat",sep="\t",head=1) valid <- which(nair.data$z > 0.025 & nair.data$z < 0.035 & nair.data$Mass < 999999 & nair.data$Mass > 9 & nair.data$SFRM > -15 & nair.data$V_disp > 20 & nair.data$RA > 150 & nair.data$RA < 225 & nair.data$DEC > 40 & nair.data$DEC < 60 & nair.data$TType < 99) mass <- nair.data$Mass[valid] mass_err <- nair.data$Mass[valid] sfrm <- nair.data$SFRM[valid] barred <- matrix(as.integer(intToBits(nair.data$Bar[valid])),ncol=length(valid)) barred <- as.integer(barred[2,]>0 | barred[3,]>0 | barred[4,]>0) vdisp <- nair.data$V_disp[valid] vdisperr <- nair.data$V_disp_Err[valid] ra <- nair.data$RA[valid] dec <- nair.data$DEC[valid] type <- nair.data$TType[valid] ## A logistic regression model for the bar fraction: w/ linear predictors only # Simplest possible exploratory analysis: just bin the data and plot! plot(seq(9.125,11.875,by=0.25),hist(mass[barred==1],breaks=seq(9,12,by=0.25),plot=F)$counts/(hist(mass[barred==1],breaks=seq(9,12,by=0.25),plot=F)$counts+hist(mass[barred==0],breaks=seq(9,12,by=0.25),plot=F)$counts),xlab="Mass",ylab="Bar Fraction") plot(seq(-13.25,-8.25,by=0.5),hist(sfrm[barred==1],breaks=seq(-13.5,-8,by=0.5),plot=F)$counts/(hist(sfrm[barred==1],breaks=seq(-13.5,-8,by=0.5),plot=F)$counts+hist(sfrm[barred==0],breaks=seq(-13.5,-8,by=0.5),plot=F)$counts),xlab="SFRM",ylab="Bar Fraction") # Or make a scatter plot with barred galaxies highlighted plot(mass,sfrm) points(mass[barred==1],sfrm[barred==1],col="red",pch=19,cex=0.5) # The simplest possible INLA model: a logistic regression with no random effects data <- data.frame(mass,sfrm,barred) formula <- barred ~ mass + sfrm # this is a standard regression model notation in R: e.g. R's default 'glm' function: the presence of a constant intercept is assumed unless explicitly removed by adding a -1 as barred ~ -1 + mass + sfrm result <- inla(formula,family="binomial",data=data,control.compute= list(dic=TRUE, cpo = TRUE)) # family: tells INLA how the data are connected to the predictor variables (the logistic transform is assumed for binomial data) # control.compute: options in the class of control.XXX are input as lists and allow for additional INLA features: here we ask to compute also the DIC criterion and PIT statistics # for more details: help(inla), help(control.compute) summary(result) ilogit <- function(x) {1/(1+exp(-x))} plot(mass,sfrm,col=hsv(ilogit(result$summary.linear.predictor[,1]))) # the output of an INLA run is contained in lists (i.e, access with $name, use names(result), names(result$summary.fixed), etc.) hist(result$cpo$pit) # why does the PIT look so bad? ( http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0181790 ) ## Binomial regression for bar fraction: w/ linear predictors plus GP term mesh <- inla.mesh.2d(loc.domain=cbind(mass,sfrm),max.edge=c(0.5,5)) plot(mesh) spde <- inla.spde2.pcmatern(mesh,prior.range=c(0.5,0.1),prior.sigma=c(1,0.1)) A <- inla.spde.make.A(mesh,cbind(mass,sfrm)) stk <- inla.stack(data=list(barred=barred),A=list(A,1,1),effects=list(s=1:spde$n.spde,mass=mass,sfrm=sfrm),tag='est') formula <- barred ~ mass + sfrm + f(s,model=spde) result2 <- inla(formula,family="binomial",data=inla.stack.data(stk),control.compute= list(dic=TRUE,cpo=TRUE),control.predictor=list(A=inla.stack.A(stk))) summary(result2) plot(mass,sfrm,col=hsv(ilogit(result2$summary.linear.predictor[,1]))) rect(10,-11.3,11.2,-10) # this looks interesting; hypothesis? # better check whether this model actually performed better: start with the DIC cat("DIC Model 1 = ",result$dic$dic,"\n") cat("DIC Model 2 = ",result2$dic$dic,"\n") # which is preferred? what else might we do next? ## Simple shrinkage on velocity dispersion data <- data.frame(vdisp,vdisperr,id=1:length(vdisp)) formula <- vdisp ~ 1 + f(id,model="iid") # represent the underlying distribution for the true velocity dispersion of each galaxy as an intercept for the mean velocity dispersion plus an iid draw from a zero mean normal of unknown (to be fitted) width result <- inla(formula,data=data,family="gaussian",control.family = list(hyper = list(prec = list(prior = "loggamma",param = c(10000,10000), initial = 1))),scale=(1/vdisperr^2),control.predictor=list(compute=TRUE),control.compute= list(dic=TRUE,cpo=TRUE)) # the observational errors here are introduced by specifying control family to be "gaussian" with the scale (here parameterised as inverse variance; see help(inla)) set by the 'known' observational errors # the default inla model assumes that a rescaling of our 'known' observational errors is allowed but for now e want to use exactly the ones we're given so we set the prior to concentrate on a scaling of one param=c(10000,10000): this is a gamma(shape=10000,rate=10000) prior layout(matrix(c(1,2,3,4),nrow=2)) # may need to resize your plotting window to see all four plots hist(vdisp,breaks=30) lines(seq(1,350),dnorm(seq(1,350),result$summary.fixed$mean,1/sqrt(result$summary.hyperpar$mean[2]))*15000,col="red") plot(vdisperr,result$summary.fitted.values$sd,xlim=c(0,50),ylim=c(0,50)) lines(c(0,50),c(0,50),col="red") plot(vdisp,result$summary.fitted.values$mean-vdisp) lines(c(0,500),c(0,0),col="red") plot(vdisp,vdisperr) # do these diagnostics of the shrinkage make sense to you? is the normal distribution seem to be doing a good job here given the available data and their errors? layout(1) hist(result$cpo$pit) # try the PIT; what does this suggest? how might we try to improve? ## Local density estimation: there are a number of implementations that can lead to essentially the same model structure when using INLA for LGCP problems: this version is based on the guide by Elias Krainski layout(1) plot(ra,dec) n <- length(ra) loc.d <- matrix(NA,nrow=5,ncol=2) # boundary of our sample selection: this is important to specify since the likelihood of the Poisson process is defined over a specific area, not just at the locations of our data points loc.d[1,] <- c(150,40) loc.d[2,] <- c(225,40) loc.d[3,] <- c(225,60) loc.d[4,] <- c(150,60) loc.d[5,] <- c(150,40) mesh <- inla.mesh.2d(loc.domain=loc.d,max.edge=c(2,5),offset=0) plot(mesh) nv <- mesh$n # number of nodes in our mesh inla.mesh.dual <- function(mesh) { # this is an auxiliary function from 'INLA tools' designed to help estimate the area of each Voronoi polygon if (mesh$manifold=='R2') { ce <- t(sapply(1:nrow(mesh$graph$tv), function(i) colMeans(mesh$loc[mesh$graph$tv[i, ], 1:2]))) library(parallel) pls <- mclapply(1:mesh$n, function(i) { p <- unique(Reduce('rbind', lapply(1:3, function(k) { j <- which(mesh$graph$tv[,k]==i) if (length(j)>0) return(rbind(ce[j, , drop=FALSE], cbind(mesh$loc[mesh$graph$tv[j, k], 1] + mesh$loc[mesh$graph$tv[j, c(2:3,1)[k]], 1], mesh$loc[mesh$graph$tv[j, k], 2] + mesh$loc[mesh$graph$tv[j, c(2:3,1)[k]], 2])/2)) else return(ce[j, , drop=FALSE]) }))) j1 <- which(mesh$segm$bnd$idx[,1]==i) j2 <- which(mesh$segm$bnd$idx[,2]==i) if ((length(j1)>0) | (length(j2)>0)) { p <- unique(rbind(mesh$loc[i, 1:2], p, mesh$loc[mesh$segm$bnd$idx[j1, 1], 1:2]/2 + mesh$loc[mesh$segm$bnd$idx[j1, 2], 1:2]/2, mesh$loc[mesh$segm$bnd$idx[j2, 1], 1:2]/2 + mesh$loc[mesh$segm$bnd$idx[j2, 2], 1:2]/2)) yy <- p[,2]-mean(p[,2])/2-mesh$loc[i, 2]/2 xx <- p[,1]-mean(p[,1])/2-mesh$loc[i, 1]/2 } else { yy <- p[,2]-mesh$loc[i, 2] xx <- p[,1]-mesh$loc[i, 1] } Polygon(p[order(atan2(yy,xx)), ]) }) return(SpatialPolygons(lapply(1:mesh$n, function(i) Polygons(list(pls[[i]]), i)))) } else stop("It only works for R2!") } dmesh <- inla.mesh.dual(mesh) # determine a weight for each Voronoi cell based on its area domainSP <- SpatialPolygons(list(Polygons(list(Polygon(loc.d)), '0'))) sum(w <- sapply(1:length(dmesh), function(i) { if (gIntersects(dmesh[i,], domainSP)) return(gArea(gIntersection(dmesh[i,], domainSP))) else return(0) })) spde <- inla.spde2.pcmatern(mesh,prior.range=c(0.5,0.1),prior.sigma=c(1,0.1)) # define the spde component A <- inla.spde.make.A(mesh,cbind(ra,dec)) # project spde to observed data locations y.pp <- rep(0:1, c(nv, n)) # advanced INLA data/formula wrangling to properly constrain field over mesh locations: inside and outside of observing window: further details in Krainski's guide ( http://www.math.ntnu.no/inla/r-inla.org/tutorials/spde/spde-tutorial.pdf ) e.pp <- c(w, rep(0, n)) imat <- Diagonal(nv, rep(1, nv)) A.pp <- rBind(imat, A) stk.pp <- inla.stack(data=list(y=y.pp, e=e.pp),A=list(1,A.pp), tag='pp', effects=list(list(b0=rep(1,nv+n)), list(i=1:nv))) pp.res <- inla(y ~ 0 + b0 + f(i, model=spde), family='poisson', data=inla.stack.data(stk.pp), control.predictor=list(A=inla.stack.A(stk.pp)), E=inla.stack.data(stk.pp)$e) # our likelihood for this model is Poisson, the link function is automatically assumed to be the natural logaritm # Make pretty plot of fitted density mesh.projector <- inla.mesh.projector(mesh, xlim=c(150,225), ylim=c(40,60), dims=c(200,200)) im <- matrix(as.numeric(mesh.projector$proj$A%*%pp.res$summary.random$i$mean),nrow=200) image(seq(150,225,length.out=200),seq(40,60,length.out=200),im,xlab="RA",ylab="DEC",col=rev(rainbow(50))) points(ra,dec) # Does the 'local density' correlate with any galaxy properties? local.density <- as.numeric(A%*%pp.res$summary.random$i$mean) plot(local.density,vdisp) upperquantilefit <- rq(vdisp ~ local.density,0.9) lines(sort(local.density),upperquantilefit$fitted.values[sort.list(local.density)],col="red") summary(upperquantilefit,se="boot") # urgh p~0.005! doesn't look super convincing: maybe we'd better look at a large sample size (more redshift slices and other regions of the sky