DPMGibbsSkewN: Slice Sampling of Dirichlet Process Mixture of skew normal distributions

Description

Slice Sampling of Dirichlet Process Mixture of skew normal distributions

Usage

DPMGibbsSkewN(
  z,
  hyperG0,
  a = 1e-04,
  b = 1e-04,
  N,
  doPlot = TRUE,
  nbclust_init = 30,
  plotevery = N/10,
  diagVar = TRUE,
  use_variance_hyperprior = TRUE,
  verbose = TRUE,
  ...
)

Value

a object of class DPMclust with the following attributes:

mcmc_partitions:: a list of length N. Each element mcmc_partitions[n] is a vector of length n giving the partition of the n observations.
alpha:: a vector of length N. cost[j] is the cost associated to partition c[[j]]
U_SS_list:: a list of length N containing the lists of sufficient statistics for all the mixture components at each MCMC iteration
weights_list:
logposterior_list:: a list of length N containing the logposterior values at each MCMC iterations
data:: the data matrix d x n with d dimensions in rows and n observations in columns
nb_mcmcit:: the number of MCMC iterations
clust_distrib:: the parametric distribution of the mixture component - "skewnorm"
hyperG0:: the prior on the cluster location

Arguments

z: data matrix d x n with d dimensions in rows and n observations in columns.
hyperG0: prior mixing distribution.
a: shape hyperparameter of the Gamma prior on the concentration parameter of the Dirichlet Process. Default is 0.0001.
b: scale hyperparameter of the Gamma prior on the concentration parameter of the Dirichlet Process. Default is 0.0001. If 0, then the concentration is fixed set to a.
N: number of MCMC iterations.
doPlot: logical flag indicating whether to plot MCMC iteration or not. Default to TRUE.
nbclust_init: number of clusters at initialization. Default to 30 (or less if there are less than 30 observations).
plotevery: an integer indicating the interval between plotted iterations when doPlot is TRUE.
diagVar: logical flag indicating whether the variance of a cluster is a diagonal matrix. Default is FALSE (full matrix).
use_variance_hyperprior: logical flag indicating whether a hyperprior is added for the variance parameter. Default is TRUE which decrease the impact of the variance prior on the posterior. FALSE is useful for using an informative prior.
verbose: logical flag indicating whether partition info is written in the console at each MCMC iteration.
...: additional arguments to be passed to plot_DPMsn. Only used if doPlot is TRUE.

Author

Boris Hejblum

References

Hejblum BP, Alkhassim C, Gottardo R, Caron F and Thiebaut R (2019) Sequential Dirichlet Process Mixtures of Multivariate Skew t-distributions for Model-based Clustering of Flow Cytometry Data. The Annals of Applied Statistics, 13(1): 638-660. <doi: 10.1214/18-AOAS1209> <arXiv: 1702.04407> https://arxiv.org/abs/1702.04407 tools:::Rd_expr_doi("10.1214/18-AOAS1209")

Examples

Run this code

rm(list=ls())

#Number of data
n <- 1000
set.seed(123)

d <- 2
ncl <- 4

# Sample data

sdev <- array(dim=c(d,d,ncl))

#xi <- matrix(nrow=d, ncol=ncl, c(-1.5, 1, 1.5, 1, 1.5, -2, -2, -2))
xi <- matrix(nrow=d, ncol=ncl, c(-0.5, 0, 0.5, 0, 0.5, -1, -1, 1))
##xi <- matrix(nrow=d, ncol=ncl, c(-0.3, 0, 0.5, 0.5, 0.5, -1.2, -1, 1))
psi <- matrix(nrow=d, ncol=4, c(0.4, -0.6, 0.8, 0, 0.3, -0.7, -0.3, -1.2))
p <- c(0.2, 0.1, 0.4, 0.3) # frequence des clusters
sdev[, ,1] <- matrix(nrow=d, ncol=d, c(0.3, 0, 0, 0.3))
sdev[, ,2] <- matrix(nrow=d, ncol=d, c(0.1, 0, 0, 0.3))
sdev[, ,3] <- matrix(nrow=d, ncol=d, c(0.3, 0.15, 0.15, 0.3))
sdev[, ,4] <- .3*diag(2)


c <- rep(0,n)
z <- matrix(0, nrow=d, ncol=n)
for(k in 1:n){
 c[k] = which(rmultinom(n=1, size=1, prob=p)!=0)
 z[,k] <- xi[, c[k]] + psi[, c[k]]*abs(rnorm(1)) + sdev[, , c[k]]%*%matrix(rnorm(d, mean = 0,
                                                                       sd = 1), nrow=d, ncol=1)
 #cat(k, "/", n, " observations simulated\n", sep="")
}

# Set parameters of G0
hyperG0 <- list()
hyperG0[["b_xi"]] <- rep(0,d)
hyperG0[["b_psi"]] <- rep(0,d)
hyperG0[["kappa"]] <- 0.0001
hyperG0[["D_xi"]] <- 100
hyperG0[["D_psi"]] <- 100
hyperG0[["nu"]] <- d + 1
hyperG0[["lambda"]] <- diag(d)

 # hyperprior on the Scale parameter of DPM
 a <- 0.0001
 b <- 0.0001

 # do some plots
 doPlot <- TRUE
 nbclust_init <- 30



 ## Data
 ########
 library(ggplot2)
 p <- (ggplot(data.frame("X"=z[1,], "Y"=z[2,]), aes(x=X, y=Y))
       + geom_point()
       + ggtitle("Simple example in 2d data")
       +xlab("D1")
       +ylab("D2")
       +theme_bw())
 p

 c2plot <- factor(c)
 levels(c2plot) <- c("3", "2", "4", "1")
 pp <- (ggplot(data.frame("X"=z[1,], "Y"=z[2,], "Cluster"=as.character(c2plot)))
       + geom_point(aes(x=X, y=Y, colour=Cluster, fill=Cluster))
       + ggtitle("Slightly overlapping skew-normal simulation\n")
       + xlab("D1")
       + ylab("D2")
       + theme_bw()
       + scale_colour_discrete(guide=guide_legend(override.aes = list(size = 6, shape=22))))
 pp


 ## alpha priors plots
 #####################
 prioralpha <- data.frame("alpha"=rgamma(n=5000, shape=a, scale=1/b),
                         "distribution" =factor(rep("prior",5000),
                         levels=c("prior", "posterior")))
 p <- (ggplot(prioralpha, aes(x=alpha))
       + geom_histogram(aes(y=..density..),
                        colour="black", fill="white")
       + geom_density(alpha=.2, fill="red")
       + ggtitle(paste("Prior distribution on alpha: Gamma(", a,
                 ",", b, ")\n", sep=""))
      )
 p


if(interactive()){
 # Gibbs sampler for Dirichlet Process Mixtures
 ##############################################

 MCMCsample_sn <- DPMGibbsSkewN(z, hyperG0, a, b, N=2500,
                                doPlot, nbclust_init, plotevery=200,
                                gg.add=list(theme_bw(),
                                 guides(shape=guide_legend(override.aes = list(fill="grey45")))),
                               diagVar=FALSE)

 s <- summary(MCMCsample_sn, burnin = 2000, thin=10)
 #cluster_est_binder(MCMCsample_sn$mcmc_partitions[1000:1500])
 print(s)
 plot(s)
 #plot_ConvDPM(MCMCsample_sn, from=2)






 # k-means

 plot(x=z[1,], y=z[2,], col=kmeans(t(z), centers=4)$cluster,
      xlab = "d = 1", ylab= "d = 2", main="k-means with K=4 clusters")

 KM <- kmeans(t(z), centers=4)
 KMclust <- factor(KM$cluster)
 levels(KMclust) <- c("2", "4", "1", "3")
 dataKM <- data.frame("X"=z[1,], "Y"=z[2,],
                    "Cluster"=as.character(KMclust))
 dataCenters <- data.frame("X"=KM$centers[,1],
                           "Y"=KM$centers[,2],
                           "Cluster"=c("2", "4", "1", "3"))


 p <- (ggplot(dataKM)
       + geom_point(aes(x=X, y=Y, col=Cluster))
       + geom_point(aes(x=X, y=Y, fill=Cluster, order=Cluster),
                    data=dataCenters, shape=22, size=5)
       + scale_colour_discrete(name="Cluster",
                               guide=guide_legend(override.aes=list(size=6, shape=22)))
       + ggtitle("K-means with K=4 clusters\n")
       + theme_bw()
 )
 p

 postalpha <- data.frame("alpha"=MCMCsample_sn$alpha[501:1000],
                         "distribution" = factor(rep("posterior",1000-500),
                         levels=c("prior", "posterior")))

 postK <- data.frame("K"=sapply(lapply(postalpha$alpha, "["),
                                function(x){sum(x/(x+0:(1000-1)))}))

 p <- (ggplot(postalpha, aes(x=alpha))
       + geom_histogram(aes(y=..density..), binwidth=.1,
                        colour="black", fill="white")
       + geom_density(alpha=.2, fill="blue")
       + ggtitle("Posterior distribution of alpha\n")
       # Ignore NA values for mean
       # Overlay with transparent density plot
       + geom_vline(aes(xintercept=mean(alpha, na.rm=T)),
                    color="red", linetype="dashed", size=1)
     )
 p

 p <- (ggplot(postK, aes(x=K))
       + geom_histogram(aes(y=..density..),
                        colour="black", fill="white")
       + geom_density(alpha=.2, fill="blue")
       + ggtitle("Posterior distribution of predicted K\n")
       # Ignore NA values for mean
       # Overlay with transparent density plot
       + geom_vline(aes(xintercept=mean(K, na.rm=T)),
                    color="red", linetype="dashed", size=1)
       #+ scale_x_continuous(breaks=c(0:6)*2, minor_breaks=c(0:6)*2+1)
       + scale_x_continuous(breaks=c(1:12))
     )
 p

 p <- (ggplot(drop=FALSE, alpha=.6)
       + geom_density(aes(x=alpha, fill=distribution),
                      color=NA, alpha=.6,
                      data=postalpha)
       + geom_density(aes(x=alpha, fill=distribution),
                      color=NA, alpha=.6,
                      data=prioralpha)
       + ggtitle("Prior and posterior distributions of alpha\n")
       + scale_fill_discrete(drop=FALSE)
       + theme_bw()
       + xlim(0,100)
     )
 p

#Skew Normal
n=100000
xi <- 0
d <- 0.995
alpha <- d/sqrt(1-d^2)
z <- rtruncnorm(n,a=0, b=Inf)
e <- rnorm(n, mean = 0, sd = 1)
x <- d*z + sqrt(1-d^2)*e
o <- 1
y <- xi+o*x
nu=1.3
w <- rgamma(n,scale=nu/2, shape=nu/2)
yy <- xi+o*x/w
snd <- data.frame("Y"=y,"YY"=yy)
p <- (ggplot(snd)+geom_density(aes(x=Y), fill="blue", alpha=.2)
     + theme_bw()
     + ylab("Density")
     + ggtitle("Y~SN(0,1,10)\n")
     + xlim(-1,6)
     + ylim(0,0.8)
     )
p

p <- (ggplot(snd)+geom_density(aes(x=YY), fill="blue", alpha=.2)
     + theme_bw()
     + ylab("Density")
     + ggtitle("Y~ST(0,1,10,1.3)\n")
     + xlim(-2,40)
     + ylim(0,0.8)
     )
p

p <- (ggplot(snd)
     + geom_density(aes(x=Y, fill="blue"), alpha=.2)
     + geom_density(aes(x=YY, fill="red"), alpha=.2)
     + theme_bw()
     +theme(legend.text = element_text(size = 13), legend.position="bottom")
     + ylab("Density")
     + xlim(-2,40)
     + ylim(0,0.8)
     + scale_fill_manual(name="", labels=c("Y~SN(0,1,10)       ", "Y~ST(0,1,10,1.3)"),
     guide="legend", values=c("blue", "red"))
     )
p

#Variations
n=100000
xi <- -1
d <- 0.995
alpha <- d/sqrt(1-d^2)
z <- rtruncnorm(n,a=0, b=Inf)
e <- rnorm(n, mean = 0, sd = 1)
x <- d*z + sqrt(1-d^2)*e
snd <- data.frame("X"=x)
p <- (ggplot(snd)+geom_density(aes(x=X), fill="blue", alpha=.2)
     + theme_bw()
     + ylab("Density")
     + ggtitle("X~SN(10)\n")
     + xlim(-1.5,4)
     + ylim(0,1.6)
     )
p

o <- 0.5
y <- xi+o*x
snd <- data.frame("Y"=y)
p <- (ggplot(snd)+geom_density(aes(x=Y), fill="blue", alpha=.2)
     + theme_bw()
     + ylab("Density")
     + ggtitle("Y~SN(-1,1,10)\n")
     + xlim(-1.5,4)
     + ylim(0,1.6)
     )
p




#Simple toy example
###################

n <- 500
set.seed(12345)


d <- 2
ncl <- 4

# Sample data

sdev <- array(dim=c(d,d,ncl))

xi <- matrix(nrow=d, ncol=ncl, c(-1.5, 1, 1.5, 1, 1.5, -2, -2, -2))
psi <- matrix(nrow=d, ncol=4, c(0.4, -0.6, 0.8, 0, 0.3, -0.7, -0.3, -1.2))
p <- c(0.2, 0.1, 0.4, 0.3) # frequence des clusters
sdev[, ,1] <- matrix(nrow=d, ncol=d, c(0.3, 0, 0, 0.3))
sdev[, ,2] <- matrix(nrow=d, ncol=d, c(0.1, 0, 0, 0.3))
sdev[, ,3] <- matrix(nrow=d, ncol=d, c(0.3, 0.15, 0.15, 0.3))
sdev[, ,4] <- .3*diag(2)

#' # Set parameters of G0
hyperG0 <- list()
hyperG0[["b_xi"]] <- rep(0,d)
hyperG0[["b_psi"]] <- rep(0,d)
hyperG0[["kappa"]] <- 0.0001
hyperG0[["D_xi"]] <- 100
hyperG0[["D_psi"]] <- 100
hyperG0[["nu"]] <- d + 1
hyperG0[["lambda"]] <- diag(d)


c <- rep(0,n)
z <- matrix(0, nrow=d, ncol=n)
for(k in 1:n){
 c[k] = which(rmultinom(n=1, size=1, prob=p)!=0)
 z[,k] <- xi[, c[k]] + psi[, c[k]]*abs(rnorm(1)) + sdev[, , c[k]]%*%matrix(rnorm(d, mean = 0,
                                                                        sd = 1), nrow=d, ncol=1)
 cat(k, "/", n, " observations simulated\n", sep="")
}

 MCMCsample_sn_sep <- DPMGibbsSkewN(z, hyperG0, a, b, N=600,
                                    doPlot, nbclust_init, plotevery=100,
                                    gg.add=list(theme_bw(),
                               guides(shape=guide_legend(override.aes = list(fill="grey45")))),
                                    diagVar=TRUE)

 s <- summary(MCMCsample_sn, burnin = 400)

}

Run the code above in your browser using DataLab