Gaussian Mixture Models and Expectation Maximization Algorithm

# clear
rm(list = ls(all=TRUE))

# load
library(tidyverse)
library(mixtools)
library(aricode)

# seed
set.seed(100)

First, let's generate some random data points coming from two different normal distributions. We define the true parameters up front so that we can compare the estimates against them later.

# parameters
mu1 = 5
sd1 = 2
mu2 = 15
sd2 = 2

# example from normal 1
c1=rnorm(
  n=1000,
  mean = mu1, 
  sd = sd1
)

# example from normal 2
c2=rnorm(
  n=1000,
  mean = mu2, 
  sd = sd2
)

# mixed observations
combined=c(
  c1,
  c2
)

# look at the histogram
hist(combined)

Here is an example function implementing the EM algorithm for a mixture of Gaussians in R. Note that this implementation uses hard cluster assignments (each observation is assigned to its single most likely cluster at every iteration) rather than soft responsibilities.

# example EM algorithm implementation (hard cluster assignments)
em.gaussian = function(
  k,
  mean.start,
  sd.start,
  mixed.data,
  max.iter=1000,
  priors=NULL # accepted for completeness, but not used by this hard-assignment version
)
{
  # get initial clustering and new means
  mean.cluster.probs=list()
  for(i in 1:k)
  {
    # get probabilities
    probs=dnorm(
      x=mixed.data,
      mean = mean.start[i],
      sd = sd.start[i]
    )
    mean.cluster.probs[[i]]=probs
  }
  
  # put in table
  mean.cluster.probs.tbl=as.data.frame(
    bind_cols(
      mean.cluster.probs
      )
    )
  
  # change column names
  colnames(mean.cluster.probs.tbl) = seq(
    from=1,
    to=dim(
      mean.cluster.probs.tbl
      )[2]
    )
  
  # get clusters
  cluster.assignment=apply(
    mean.cluster.probs.tbl,
    1,
    which.max
  )
  
  # get new means
  new.means=c()
  new.sds=c()
  for (i in unique(cluster.assignment)) 
  {
    new.means[i]=mean(
      x=mixed.data[cluster.assignment==i],
      na.rm = TRUE
    )
    new.sds[i]=sd(
      x=mixed.data[cluster.assignment==i],
      na.rm = TRUE
    )
  }
  
  # counts of how many times the clusters have not changed
  cc=0
  num.iter=0
  curr.cluster.assignment=cluster.assignment
  while (cc < 5 && num.iter < max.iter) 
  { 
    # get initial clustering and new means
    mean.cluster.probs=list()
    for(i in 1:k)
    { 
      # get probabilities
      probs=dnorm(
        x=mixed.data,
        mean = new.means[i],
        sd = new.sds[i]
      )
      mean.cluster.probs[[i]]=probs
    } 
    
    # put in table
    mean.cluster.probs.tbl=as.data.frame(
      bind_cols(
        mean.cluster.probs
      )
    )
    
    # change column names
    colnames(mean.cluster.probs.tbl) = seq(
      from=1,
      to=dim(
        mean.cluster.probs.tbl
      )[2]
    )
    
    # get clusters
    cluster.assignment=apply(
      mean.cluster.probs.tbl,
      1,
      which.max
    ) 
    
    # get new means
    new.means=c()
    new.sds=c()
    for (i in unique(cluster.assignment)) 
    { 
      new.means[i]=mean(
        x=mixed.data[cluster.assignment==i],
        na.rm = TRUE
      )
      new.sds[i]=sd(
        x=mixed.data[cluster.assignment==i],
        na.rm = TRUE
      )
    }
    
    # check cluster assignments
    # first get length
    total.length=length(cluster.assignment)
    if(sum(curr.cluster.assignment == cluster.assignment) == total.length )
    {
      cc=cc+1
    } else 
    {
      cc=0
    }
    
    # increase the iteration counter
    num.iter=num.iter+1
    
    # save cluster assignments
    curr.cluster.assignment=cluster.assignment
    
  }
  
  # return the values of interest
  return(
    list(
      'cluster.assignment'=cluster.assignment,
      'mu'=new.means,
      'sigma' = new.sds,
      'probability.table'=mean.cluster.probs.tbl,
      'num.iter'=num.iter,
      'Number.of.repeat.cluster.assignments'=cc
    )
  )
} 
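
The function above ignores the priors and makes a hard assignment of every observation at each iteration. For reference, here is a minimal sketch of a standard "soft" E-step that does use the priors to compute responsibilities (the probability that each observation belongs to each cluster). The helper name e.step.soft is purely illustrative and is not used elsewhere in this post.

# hedged sketch: a soft E-step that uses the priors
# gamma[i, j] = P(observation i belongs to cluster j | current parameters)
e.step.soft = function(mixed.data, means, sds, priors)
{
  k = length(means)
  # weighted densities, one column per cluster
  weighted.dens = sapply(
    1:k,
    function(j) priors[j] * dnorm(x = mixed.data, mean = means[j], sd = sds[j])
  )
  # normalize each row so the responsibilities for each observation sum to 1
  weighted.dens / rowSums(weighted.dens)
}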

We can now run the algorithm with the data provided, and see how well it does. Note that we need to provide several parameters as input to the model.

  • k = the number of clusters/underlying distributions. If not known beforehand, several models with different k need to be fit, and the best one can be identified through AIC or BIC (a minimal BIC sketch follows this list). If the wrong k is provided, the model will force the wrong number of underlying distributions and we will get poor results.
  • mean.start = the initial values for the mean parameters (mu) of the underlying Gaussian distributions. These values are updated by the EM algorithm, but a reasonable starting point goes a long way toward good convergence.
  • sd.start = similarly, the initial values for the standard deviations of the underlying Gaussian distributions.
  • mixed.data = the data coming from the mixture of Gaussians.
  • priors = the prior probability that any data point comes from each underlying distribution. In this case we chose 0.5 for both, as we have a good idea that about half of the data points come from each Gaussian. Note that our toy implementation accepts this argument but does not use it in its hard-assignment updates; the mixtools functions below do use it (as lambda).
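
Here is the promised minimal sketch of choosing k via BIC when it is not known beforehand, using mixtools on the combined data from above; the range 2:4 and the name bic.by.k are just illustrative. The BIC uses 3k - 1 free parameters: k means, k standard deviations, and k - 1 mixing weights.

# hedged sketch: pick k by fitting several models and comparing BIC
bic.by.k = sapply(
  2:4,
  function(k.try)
  {
    fit = normalmixEM(x = combined, k = k.try)
    p = 3 * k.try - 1
    -2 * fit$loglik + p * log(length(combined))
  }
)
names(bic.by.k) = paste0("k=", 2:4)
bic.by.k

The k with the smallest BIC would be preferred. Back to the example with the known k = 2: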
# initialize for example
mean.start=c(3,5)
sd.start=c(1,1)
mixed.data=combined
k=2
priors=c(0.5, 0.5)

# example run of my implementation of gaussian EM
result.mine=em.gaussian(
  k=k,
  mean.start=mean.start,
  sd.start=sd.start,
  mixed.data=mixed.data,
  max.iter=1000,
  priors=priors
)

# show pretty plot of the results (x is random jitter around 1 so the points do not overlap)
plot(
  y = mixed.data,
  x = rnorm(
    n=length(mixed.data),
    mean=1,
    sd=0.01
  ),
  col = result.mine$cluster.assignment,
  pch = result.mine$cluster.assignment,
  xlab = '',
  ylab = 'Mixed Data',
  xlim = c(0.7, 1.3)
)

As with many statistical problems in R, Gaussian mixture models can be solved using an existing package. For today's example we will also fit the mixture model using the mixtools package and compare the results to our implementation.

# run package
gm <- normalmixEM(
  x = mixed.data,
  k=2,
  lambda=c(0.5,0.5),
  mu=c(0.3,0.4),
  sigma=c(0.05,0.06)
)
## number of iterations= 21
# get hard cluster assignments from the package's posterior probabilities
final.clusters = apply(
  gm$posterior,
  1,
  which.max
)

# show pretty plot
plot(
  y = mixed.data,
  x = rnorm(
    n=length(mixed.data),
    mean=1,
    sd=0.01
  ),
  col = final.clusters,
  pch = result.mine$cluster.assignment,
  xlab = '',
  ylab = 'Mixed Data',
  xlim = c(0.7, 1.3)
)

# compare the two clusterings with the Adjusted Rand Index (1 = identical up to relabeling)
aricode::ARI(
  c1=final.clusters,
  c2=result.mine$cluster.assignment
)
## [1] 0.998
# show means
print('package')
## [1] "package"
gm$mu
## [1]  5.023375 14.994704
gm$sigma
## [1] 2.051942 1.978351
# show mine
print('mine')
## [1] "mine"
result.mine$mu
## [1]  5.032405 15.019486
result.mine$sigma
## [1] 2.050432 1.941514
# show truth
print('truth')
## [1] "truth"
mu1
## [1] 5
mu2
## [1] 15
sd1
## [1] 2
sd2
## [1] 2
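
It is also worth checking the assignments against the labels actually used to generate the data. This is a minimal sketch, assuming (as in the generation step above) that the first 1000 observations came from the first Gaussian and the next 1000 from the second; cluster numbering is arbitrary, which is why the label-invariant ARI is the right comparison.

# hedged sketch: compare assignments to the true generating labels
true.labels = rep(c(1, 2), each = 1000)
aricode::ARI(
  c1 = true.labels,
  c2 = result.mine$cluster.assignment
)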

Thanks to our decent initialization, and to the fact that the underlying Gaussians were well separated, both our implementation and the package did reasonably well. As an example of what can go wrong, let's fit this same data with the wrong k.

# run package
gm <- normalmixEM(
  x = mixed.data,
  k=3,
  lambda=c(0.3,0.3, 0.4),
  mu=c(0.3,0.4, 0.5),
  sigma=c(0.05,0.06, 0.07)
)
## number of iterations= 51
# show means
print('package')
## [1] "package"
gm$mu
## [1] -0.09894892  5.04092032 14.99162713
gm$sigma
## [1] 0.2253413 2.0251885 1.9814283

As you can see, the EM algorithm estimated an additional Gaussian that doesn't correspond to anything in the data. This poor fit is expected, since we generated the data from only 2 Gaussians. Let's now generate some random data points coming from 3 different distributions and see whether our implementation still works.

# parameters
mu1 = 2
sd1 = 0.5
mu2 = 5
sd2 = 1
mu3 = 9
sd3 = 1

# example from normal 1
c1=rnorm(
  n=1000,
  mean = mu1, 
  sd = sd1
)

# example from normal 2
c2=rnorm(
  n=1000,
  mean = mu2, 
  sd = sd2
)

# example from normal 3
c3=rnorm(
  n=1000,
  mean = mu3, 
  sd = sd3
)

# mixed observations
combined=c(
  c1,
  c2,
  c3
)

# look at the histogram
hist(combined)

Now let's see how the two EM implementations perform. We will provide the correct k.

# initialize for example
mean.start=c(3,5,7)
sd.start=c(1,1,1)
mixed.data=combined
k=3
priors=c(0.4, 0.3, 0.3)

# example run of my implementation of gaussian EM
result.mine=em.gaussian(
  k=k,
  mean.start=mean.start,
  sd.start=sd.start,
  mixed.data=mixed.data,
  max.iter=10000,
  priors=priors
)

# run package
gm <- normalmixEM(
  x = mixed.data,
  k=3,
  lambda=c(0.3,0.3, 0.4),
  mu=c(3,4,5),
  sigma=c(1,1,1)
)
## number of iterations= 42
# show results - our implementation
result.mine$mu
## [1] 1.994175 5.001069 9.021738
result.mine$sigma
## [1] 0.4940448 0.8741949 0.9702257
# mixtools package results
gm$mu
## [1] 1.981255 4.960435 9.008221
gm$sigma
## [1] 0.4922258 0.9670880 0.9991039

Both implementations also did reasonably well with 3 Gaussians, as expected, since the data come from 3 well-separated Gaussians and we gave the correct k to the algorithms. We can also use the EM algorithm to fit a mixture of multivariate Gaussians. We will demonstrate this using the mixtools and MASS packages.

# generate random data from a mixture of 3 multivariate normals
mvnorm.data1 = MASS::mvrnorm(
  n = 500,
  mu = c(1, 10),
  Sigma = matrix(
    data = c(1,0,0,1),
    2,
    2
  )
)
mvnorm.data2 = MASS::mvrnorm(
  n = 500,
  mu = c(5, 20),
  Sigma = matrix(
    data = c(1,0,0,1),
    2,
    2
  )
)
mvnorm.data3 = MASS::mvrnorm(
  n = 500,
  mu = c(5, 25),
  Sigma = matrix(
    data = c(1,0,0,1),
    2,
    2
  )
)
mvnorm.data = rbind(
  mvnorm.data1,
  mvnorm.data2,
  mvnorm.data3
)

# plot data
plot(
  mvnorm.data,
  xlab = 'dimension 1',
  ylab = 'dimension 2',
  main = 'Mixture of 3 Multivariate Gaussians'
)

# estimate using EM. We will let mixtools generate random initialization for mu and sigma.
gm.mv = mvnormalmixEM(
  x = mvnorm.data,
  k=3
)
## number of iterations= 27
# show summary
summary(gm.mv)
## summary of mvnormalmixEM object:
##          comp 1    comp 2
## lambda 0.333333  0.331812
## mu1    0.974596 10.025152
## mu2    5.093102 20.020855
## mu3    5.034380 24.879757
## loglik at estimate:  -5880.702
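
As a quick sanity check (not part of the original output), we could also compare the package's hard assignments to the labels used to generate the multivariate data. This is a minimal sketch, assuming the rows of mvnorm.data are stacked in generation order (500 points per component); again, component numbering is arbitrary, so ARI is used.

# hedged sketch: compare mixtools' multivariate assignments to the generating labels
true.labels.mv = rep(c(1, 2, 3), each = 500)
mv.clusters = apply(
  gm.mv$posterior,
  1,
  which.max
)
aricode::ARI(
  c1 = true.labels.mv,
  c2 = mv.clusters
)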

The mixtools package did a great job solving the multivariate Gaussian mixture, although, once again, the data are well separated, come from the distribution we are assuming, and we supplied the correct k. Remember: good data in means good results out; also, use the right tool for the job.