Lecture 05: Density Estimation and GMMs – CS 189, Fall 2025
In this lecture we will explore the implementation of the Gaussian Mixture Model (GMM) using the Expectation-Maximization (EM) algorithm.
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly import figure_factory as ff
from plotly.subplots import make_subplots
colors = px.colors.qualitative.Plotly
px.defaults.width = 800
from ipywidgets import HBox
import numpy as np
pd.set_option('plotting.backend', 'plotly')
# make the images folder if it doesn't exist
import os
if not os.path.exists("images"):
os.makedirs("images")
### Uncomment for HTML Export
import plotly.io as pio
pio.renderers.default = "notebook_connected"
The Gaussian Distribution
The probability density function of a univariate Gaussian distribution with mean $\mu$ and variance $\sigma^2$ is given by:
$$ f(x) = \frac{1}{\sqrt{2\pi\sigma^2}} e^{-\frac{(x-\mu)^2}{2\sigma^2}} $$
from scipy.stats import norm
mean = 0
variance = .7
x = np.linspace(-4, 4, 100)
p = norm.pdf(x, loc=mean, scale=np.sqrt(variance)) # scale = standard deviation, loc = mean
fig = px.line(x=x, y=p, title=f"Normal Distribution (mean={mean}, variance={variance})",
              labels={"x": "x", "y": "p(x)"}, width=700, height=400)
# fig.write_image("images/standard_normal.pdf", scale=2, height=400, width=700)
fig
Multivariate Normal Distribution
The equation for the multivariate normal is given by: $$ f(\mathbf{x}) = \frac{1}{\sqrt{(2\pi)^D|\Sigma|}} \exp\left(-\frac{1}{2}(\mathbf{x}-\mu)^T\Sigma^{-1}(\mathbf{x}-\mu)\right) $$ where $\mathbf{x}$ is a $D$-dimensional random vector, $\mu$ is the mean vector, and $\Sigma$ is the covariance matrix.
def mv_normal_pdf(X, mu, Sigma):
"""Compute the multivariate normal density at points X."""
d = X.shape[1]
X_centered = X - mu
Sigma_inv = np.linalg.inv(Sigma)
norm_const = 1 / np.sqrt((2 * np.pi) ** d * np.linalg.det(Sigma))
exp_term = np.exp(-0.5 * np.sum(X_centered @ Sigma_inv * X_centered, axis=1))
return norm_const * exp_term
mu = np.array([1, 0])
Sigma = np.array([[3, 0.4], [0.4, 2]])
from scipy.stats import multivariate_normal
normal = multivariate_normal(mean=mu, cov=Sigma)
normal.pdf(np.array([[1, 0.5]]))
np.float64(0.061762753301732566)
mv_normal_pdf(np.array([[1, 0.5]]), mu, Sigma)
array([0.06176275])
def plot_bivariate_normal(mu, Sigma, fig=None):
from scipy.stats import multivariate_normal
normal = multivariate_normal(mean=mu, cov=Sigma)
u = np.linspace(-9, 9, 100)
X = np.array(np.meshgrid(u,u)).reshape(2,-1).T
Z = normal.pdf(X)
if fig is None:
fig = make_subplots(rows=1, cols=2,
specs=[[{'type': 'surface'}, {'type': 'contour'}]],)
fig.add_surface(x=X[:,0].reshape(100,100), y=X[:,1].reshape(100,100),
z=Z.reshape(100,100), colorscale='Viridis',
contours=dict(z=dict(show=True, size=.01, start=0, end=0.3)), row=1, col=1)
fig.add_contour(x=u, y=u, z=Z.reshape(100,100), colorscale='Viridis',
line_smoothing=1.3,
#contours_coloring='lines',
showscale=False,
row=1, col=2
)
fig.update_layout(width=900, height=500)
return fig
mu = np.array([1, 0])
Sigma = np.array([[3, 0.4], [0.4, 2]])
plot_bivariate_normal(mu, Sigma)
The following interactive plot will only work in a Jupyter notebook environment. It allows you to visualize how changing the mean and covariance matrix affects the shape of the bivariate normal distribution.
from ipywidgets import interactive_output, FloatSlider, HBox, VBox, widgets
u = np.linspace(-9, 9, 100)
X = np.array(np.meshgrid(u,u)).reshape(2,-1).T
normal = multivariate_normal(mean=mu, cov=Sigma)
Z = normal.pdf(X)
fig1 = go.FigureWidget()
fig1.add_surface(x=X[:,0].reshape(100,100), y=X[:,1].reshape(100,100),
z=Z.reshape(100,100), colorscale='Viridis',
contours=dict(z=dict(show=True, size=.01, start=0, end=0.3)))
fig1.update_layout(width=600, height=500)
fig2 = go.FigureWidget()
fig2.add_contour(x=u, y=u, z=Z.reshape(100,100), colorscale='Viridis',
line_smoothing=1.3)
fig2.update_layout(width=400, height=500)
mu1 = FloatSlider(min=-5, max=5, step=0.1, value=1, description='mu1')
mu2 = FloatSlider(min=-5, max=5, step=0.1, value=0, description='mu2')
sigma11 = FloatSlider(min=0.1, max=5, step=0.1, value=3, description='sigma11')
sigma22 = FloatSlider(min=0.1, max=5, step=0.1, value=2, description='sigma22')
sigma12 = FloatSlider(min=-3, max=3, step=0.1, value=0.4, description='sigma12')
def update(mu1, mu2, sigma11, sigma22, sigma12):
mu = np.array([mu1, mu2])
Sigma = np.array([[sigma11, sigma12], [sigma12, sigma22]])
normal = multivariate_normal(mean=mu, cov=Sigma)
Z = normal.pdf(X).reshape(100,100)
with fig1.batch_update():
fig1.data[0].z = Z
with fig2.batch_update():
fig2.data[0].z = Z
interactive_output(update, {
'mu1': mu1, 'mu2': mu2,
'sigma11': sigma11, 'sigma22': sigma22, 'sigma12': sigma12
})
HBox([VBox([mu1, mu2, sigma11, sigma22, sigma12]), fig1, fig2],
layout=widgets.Layout(align_items='center'))
Return to Lecture
The Bike Dataset
As with the previous lecture, we will use Professor Gonzalez's bike ride dataset to illustrate the concepts. The dataset contains the speed and length of bike rides taken with different bikes.
bikes = pd.read_csv("speed_length_data.csv")
bikes.head()
| | Speed | Length |
|---|---|---|
| 0 | 9.699485 | 16.345721 |
| 1 | 11.856724 | 23.159855 |
| 2 | 7.608359 | 25.434056 |
| 3 | 14.280750 | 25.867538 |
| 4 | 12.004195 | 7.105725 |
bikes.plot.scatter(x='Speed', y='Length', title='Speed vs Length of Bike Segments',
height=800)
The Gaussian Mixture Model
A Gaussian Mixture Model (GMM) is a probabilistic model that assumes all the data points are generated from a mixture of several Gaussian distributions:
$$ p(x \, \vert \, \pi, \mu, \Sigma) = \sum_{k=1}^{K} \pi_k \, \mathcal{N}(x | \mu_k, \Sigma_k) $$
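Equivalently (a standard reformulation of the same model), we can describe the GMM with a latent component indicator $z_n \in \{1, \dots, K\}$ for each data point:
$$ p(z_n = k) = \pi_k, \qquad p(x_n \,\vert\, z_n = k) = \mathcal{N}(x_n \,\vert\, \mu_k, \Sigma_k), $$
and marginalizing out $z_n$ recovers the mixture density above. This latent-variable view is what ancestor sampling draws from and what the EM algorithm exploits later in this notebook.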
Just as with the K-Means model, we can use the `GaussianMixture` class from `sklearn.mixture` to fit a GMM to our data.
from sklearn.mixture import GaussianMixture
# Create a Gaussian Mixture Model with 4 components
gmm = GaussianMixture(n_components=4, random_state=42, )
# Fit the model to the data
gmm.fit(bikes[['Speed', 'Length']])
# Get the cluster labels
bikes['scikit gmm'] = gmm.predict(bikes[['Speed', 'Length']]).astype(str)
bikes['prob'] = gmm.predict_proba(bikes[['Speed', 'Length']]).max(axis=1)
bikes
| | Speed | Length | scikit gmm | prob |
|---|---|---|---|---|
| 0 | 9.699485 | 16.345721 | 2 | 1.000000 |
| 1 | 11.856724 | 23.159855 | 0 | 0.987510 |
| 2 | 7.608359 | 25.434056 | 0 | 1.000000 |
| 3 | 14.280750 | 25.867538 | 3 | 0.941168 |
| 4 | 12.004195 | 7.105725 | 1 | 1.000000 |
| ... | ... | ... | ... | ... |
| 519 | 12.149287 | 9.877558 | 1 | 0.999998 |
| 520 | 14.464280 | 28.253430 | 3 | 0.999959 |
| 521 | 8.865093 | 12.628237 | 2 | 0.999996 |
| 522 | 11.610448 | 24.479626 | 0 | 0.998426 |
| 523 | 8.460227 | 23.218372 | 0 | 0.999984 |
524 rows × 4 columns
mu = gmm.means_
Sigma = [np.linalg.inv(p) for p in gmm.precisions_]
p = gmm.weights_
def gmm_surface(mu, Sigma, p, u_pts, v_pts):
from scipy.stats import multivariate_normal
u, v = np.meshgrid(u_pts, v_pts)
X_pts = np.array([u.flatten(), v.flatten()]).T
Z = np.zeros(X_pts.shape[0])
for k in range(len(p)):
Z += p[k] * multivariate_normal(mu[k], Sigma[k]).pdf(X_pts)
return go.Contour(x=u_pts, y=v_pts, z=Z.reshape(u.shape),
colorscale='Viridis',
colorbar=dict(x=1.05, y=0.35, len=0.75)
)
num_points = 100
speed_pts = np.linspace(bikes['Speed'].min()-3, bikes['Speed'].max()+3, num_points)
length_pts = np.linspace(bikes['Length'].min()-3, bikes['Length'].max()+3, num_points)
fig = go.Figure()
fig.add_trace(gmm_surface(mu, Sigma, p, speed_pts, length_pts))
fig.update_layout(width=800, height=800)
fig.add_traces(px.scatter(bikes, x='Speed', y='Length', color='scikit gmm').data)
fig = px.scatter(bikes, x='Speed', y='Length', symbol='scikit gmm',
size='prob', color="scikit gmm", title='GMM Clustering',
color_continuous_scale="Viridis_r", size_max=15)
fig.update_layout(width=800, height=800)
Return to Lecture
Ancestor Sampling for the GMM
To draw samples from a GMM, we first sample each component assignment $z_n \sim \text{Categorical}(\pi)$ and then sample $x_n \sim \mathcal{N}(\mu_{z_n}, \sigma_{z_n}^2)$. Sampling the latent variable before the observation it generates is called ancestor sampling, and it is exactly what the next cell does to create a synthetic one-dimensional dataset.
# Ancestor Sampling to create a synthetic dataset
np.random.seed(42)
N = 100
mu = np.array([-1, 2, 5])
pi = np.array([0.2, 0.5, 0.3])
Sigma = np.array([0.2, 0.5, .1])
z = np.random.choice(len(mu), size=N, p=pi)
x = np.random.normal(mu[z], np.sqrt(Sigma[z]))
# log-likelihood of the sampled data under the true mixture:
#   sum_n log sum_k pi_k N(x_n | mu_k, sigma_k^2)
log_likelihood = np.sum(np.log(np.sum(
    pi * norm.pdf(x[:, None], loc=mu, scale=np.sqrt(Sigma)),
    axis=1)))
# Sort for better visualization
ind = z.argsort()
z = z[ind]
x = x[ind]
fig = px.scatter(x=x, y=np.random.rand(N)/20,
title=f'Synthetic Dataset from GMM (Log Likelihood: {log_likelihood:.2f})',
opacity = 0.7,
color=z.astype(str), labels={'color': 'True Cluster'}, height=400)
u = np.linspace(-4, 9, 1000)
df = pd.DataFrame({'x': u})
for k in range(len(mu)):
df[f'p{k}'] = pi[k] * norm.pdf(u, loc=mu[k], scale=np.sqrt(Sigma[k]))
fig.add_traces(px.line(df, x='x', y=df.columns[1:], labels={'y': 'Density'}).data)
fig.update_layout(width=800, height=400)
fig
Return to Lecture
Implementing the GMM using the EM Algorithm
Directly maximizing the log-likelihood function is challenging:
\begin{align*} \log p\left(\mathcal{D} \,\vert\, \pi, \mu, \Sigma \right) & = \log \left( \prod_{n=1}^{N} \sum_{k=1}^{K} \pi_k \, \mathcal{N}(x_n | \mu_k, \Sigma_k) \right) \\ & = \sum_{n=1}^{N} \log \left(\sum_{k=1}^{K} \pi_k \, \mathcal{N}(x_n | \mu_k, \Sigma_k) \right) \end{align*}
because of the summation inside of the logarithm. Instead, we use the Expectation-Maximization (EM) algorithm to iteratively optimize the parameters.
The Initialization Step
A typical way to initialize GMM models is to start with k-means clustering to find the initial means of the Gaussian components.
from sklearn.cluster import KMeans
def initialize_gmm(x, K):
N, D = x.shape
kmeans = KMeans(n_clusters=K)
kmeans.fit(x)
mu = kmeans.cluster_centers_
Sigma = np.array([np.eye(D) for _ in range(K)])
p = np.ones(K) / K
return mu, Sigma, p
mu, Sigma, p = initialize_gmm(bikes[['Speed', 'Length']], 4)
display(mu, Sigma, p)
array([[10.10039658, 23.89181286], [11.93793333, 8.30468598], [ 9.70501333, 15.73336323], [14.92499214, 27.94287697]])
array([[[1., 0.], [0., 1.]], [[1., 0.], [0., 1.]], [[1., 0.], [0., 1.]], [[1., 0.], [0., 1.]]])
array([0.25, 0.25, 0.25, 0.25])
The (E)xpectation Step
In the E-step, we compute the responsibilities, which represent the probability that each data point belongs to each Gaussian component given the current parameters.
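Concretely, for each point $x_n$ and component $k$, the responsibility is the posterior probability of the latent assignment under the current parameters; this is what the `E_step` function below computes (as `p_z_given_x`) and normalizes row by row:
$$ r_{nk} \;=\; p(z_n = k \,\vert\, x_n) \;=\; \frac{\pi_k \, \mathcal{N}(x_n \,\vert\, \mu_k, \Sigma_k)}{\sum_{j=1}^{K} \pi_j \, \mathcal{N}(x_n \,\vert\, \mu_j, \Sigma_j)} $$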
def E_step(x, mu, Sigma, p):
"""E-step of the EM algorithm.
Computes the posterior probabilities of the latent variables (z) given the data.
"""
N, D = x.shape
K = len(p)
assert(Sigma.shape == (K, D, D))
assert(mu.shape == (K,D))
p_z_given_x = np.zeros((N, K))
for k in range(K):
p_z_given_x[:, k] = p[k] * multivariate_normal(mu[k], Sigma[k]).pdf(x)
p_z_given_x /= p_z_given_x.sum(axis=1, keepdims=True) # Normalize to get probabilities
return p_z_given_x
p_z_given_x = E_step(bikes[['Speed', 'Length']], mu, Sigma, p)
p_z_given_x.shape
(524, 4)
p_z_given_x.sum(axis=1) # Each row should sum to 1
array([1., 1., 1., ..., 1., 1., 1.])
The (M)aximization Step
In this step, we update the parameters of the Gaussian components based on the responsibilities computed in the E-step.
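Writing $N_k = \sum_{n=1}^{N} r_{nk}$ for the effective number of points assigned to component $k$, the updates implemented in `M_step` below are the responsibility-weighted maximum likelihood estimates:
$$ \mu_k = \frac{1}{N_k} \sum_{n=1}^{N} r_{nk}\, x_n, \qquad \Sigma_k = \frac{1}{N_k} \sum_{n=1}^{N} r_{nk}\, (x_n - \mu_k)(x_n - \mu_k)^T, \qquad \pi_k = \frac{N_k}{N} $$
The implementation also adds a small multiple of the identity ($10^{-3} I$) to each $\Sigma_k$ to keep the covariance estimates well conditioned.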
def M_step(x, p_z_given_x):
"""M-step of the EM algorithm.
Updates the parameters (mu, sigma, p) based on the posterior probabilities.
"""
N, D = x.shape
N, K = p_z_given_x.shape
mu_new = np.zeros((K, D))
Sigma_new = np.zeros((K, D, D))
p_new = np.zeros(K)
for k in range(K):
N_k = p_z_given_x[:, k].sum()
mu_new[k, :] = p_z_given_x[:, k] @ x / N_k
Sigma_new[k, :, :] = (p_z_given_x[:, k] * (x - mu_new[k, :]).T @ (x - mu_new[k, :])) / N_k
Sigma_new[k, :, :] += 1e-3 * np.eye(D) # Regularization
p_new[k] = N_k / N
return mu_new, Sigma_new, p_new
M_step(bikes[['Speed', 'Length']], p_z_given_x)
(array([[10.09617716, 23.89225105],
        [11.93828458,  8.30572562],
        [ 9.70499521, 15.74671976],
        [14.89141779, 27.92341173]]),
 array([[[ 2.12616243, -0.3292685 ],
         [-0.3292685 ,  1.82506808]],
        [[ 0.71456371,  0.1300181 ],
         [ 0.1300181 ,  2.97990471]],
        [[ 1.24335546,  1.25078618],
         [ 1.25078618,  4.42408655]],
        [[ 1.48193648,  0.11209861],
         [ 0.11209861,  1.70595836]]]),
 array([0.43889036, 0.22708304, 0.22788724, 0.10613936]))
def em_algorithm(x, K, max_iters=100):
    # Initialize with k-means, then perturb the means with noise so we can
    # watch EM recover from a poor starting point.
    mu, Sigma, p = initialize_gmm(x, K)
    mu = mu + np.random.randn(*mu.shape) * 3
    soln_path = [(mu, Sigma, p)]
    for i in range(max_iters):
        p_z_given_x = E_step(x, mu, Sigma, p)
        mu, Sigma, p = M_step(x, p_z_given_x)
        soln_path.append((mu, Sigma, p))
    return mu, Sigma, p, soln_path
mu, Sigma, p, soln_path = em_algorithm(bikes[['Speed', 'Length']].values,
K=4,
max_iters=50)
print("mu", mu)
print("Sigma", Sigma)
mu [[10.07832505 24.01592504]
 [11.97377525  8.28451473]
 [ 9.79348087 16.04456075]
 [14.92540409 28.00936288]]
Sigma [[[ 2.20087175 -0.18005775]
  [-0.18005775  1.50601405]]
 [[ 0.57086189  0.22368929]
  [ 0.22368929  2.95139598]]
 [[ 1.41478244  1.75054955]
  [ 1.75054955  5.96588238]]
 [[ 1.45618894  0.04305385]
  [ 0.04305385  1.47688638]]]
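As a quick sanity check (not part of the lecture code; the helper name `gmm_log_likelihood` is our own), we can evaluate the data log-likelihood at each entry of `soln_path`. We expect it to increase with each EM iteration; the small covariance regularization added in `M_step` means this is not an exact guarantee, but in practice the curve rises and then flattens out.
def gmm_log_likelihood(x, mu, Sigma, p):
    """Evaluate sum_n log sum_k pi_k N(x_n | mu_k, Sigma_k)."""
    densities = np.column_stack([
        p[k] * multivariate_normal(mu[k], Sigma[k]).pdf(x)
        for k in range(len(p))
    ])
    return np.sum(np.log(densities.sum(axis=1)))
lls = [gmm_log_likelihood(bikes[['Speed', 'Length']].values, *params)
       for params in soln_path]
px.line(x=np.arange(len(lls)), y=lls, markers=True,
        labels={'x': 'EM iteration', 'y': 'log-likelihood'},
        title='Log-Likelihood over EM Iterations', width=700, height=400)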
num_points = 100
speed_pts = np.linspace(bikes['Speed'].min()-3, bikes['Speed'].max()+3, num_points)
length_pts = np.linspace(bikes['Length'].min()-3, bikes['Length'].max()+3, num_points)
mu, Sigma, p = soln_path[-1]
fig = go.Figure()
fig.add_trace(gmm_surface(mu, Sigma, p, speed_pts, length_pts))
fig.update_layout(width=800, height=800)
fig.add_traces(px.scatter(bikes, x='Speed', y='Length', color='scikit gmm').data)
fig.add_scatter(x=mu[:,0], y=mu[:,1], mode='markers', marker=dict(color='black', size=10), name='Centers')
from ipywidgets import IntSlider
np.random.seed(42)
mu, Sigma, p, soln_path = em_algorithm(bikes[['Speed', 'Length']].values,
K=4,
max_iters=100)
num_points = 100
speed_pts = np.linspace(bikes['Speed'].min()-3, bikes['Speed'].max()+3, num_points)
length_pts = np.linspace(bikes['Length'].min()-3, bikes['Length'].max()+3, num_points)
mu, Sigma, p = soln_path[0]
fig = go.FigureWidget()
fig.add_trace(gmm_surface(mu, Sigma, p, speed_pts, length_pts))
fig.update_layout(width=800, height=800)
fig.add_traces(px.scatter(bikes, x='Speed', y='Length', color='scikit gmm').data)
fig.add_scatter(x=mu[:,0], y=mu[:,1], mode='markers', marker=dict(color='black', size=10), name='Centers')
def update(step):
mu, Sigma, p = soln_path[step]
with fig.batch_update():
fig.data[0].z = gmm_surface(mu, Sigma, p, speed_pts, length_pts).z
with fig.batch_update():
fig.data[-1].x = mu[:, 0]
fig.data[-1].y = mu[:, 1]
step_slider = IntSlider(min=0, max=len(soln_path)-1, step=1, value=0, description='Step')
interactive_output(update, {'step': step_slider})
VBox([fig, step_slider])
Issue with the MLE of the GMM
The maximum likelihood objective for a GMM is unbounded: the cells below regenerate the synthetic dataset from before and then construct a degenerate set of parameters whose log-likelihood is far larger than that of the true mixture.
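To see why, consider a component centered exactly on a single data point; a sketch of the limit, in the univariate notation used above:
$$ \pi_k \, \mathcal{N}(x_n \,\vert\, \mu_k, \sigma_k^2)\Big|_{\mu_k = x_n} \;=\; \frac{\pi_k}{\sqrt{2\pi\sigma_k^2}} \;\longrightarrow\; \infty \quad \text{as } \sigma_k^2 \to 0, $$
while a broad component keeps every other point's density positive, so the total log-likelihood can be driven arbitrarily high. This is one reason the `M_step` implementation above adds a small regularization term to each covariance estimate (scikit-learn's `GaussianMixture` does the same via its `reg_covar` parameter).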
# Ancestor Sampling to create a synthetic dataset
np.random.seed(42)
N = 100
mu = np.array([-1, 2, 5])
pi = np.array([0.2, 0.5, 0.3])
Sigma = np.array([0.2, 0.5, .1])
z = np.random.choice(len(mu), size=N, p=pi)
x = np.random.normal(mu[z], np.sqrt(Sigma[z]))
# log-likelihood of the sampled data under the true mixture:
#   sum_n log sum_k pi_k N(x_n | mu_k, sigma_k^2)
log_likelihood = np.sum(np.log(np.sum(
    pi * norm.pdf(x[:, None], loc=mu, scale=np.sqrt(Sigma)),
    axis=1)))
# Sort for better visualization
ind = z.argsort()
z = z[ind]
x = x[ind]
fig = px.scatter(x=x, y=np.random.rand(N)/20,
title=f'Synthetic Dataset from GMM (Log Likelihood: {log_likelihood:.2f})',
opacity = 0.7,
color=z.astype(str), labels={'color': 'True Cluster'}, height=400)
u = np.linspace(-4, 9, 1000)
df = pd.DataFrame({'x': u})
for k in range(len(mu)):
df[f'p{k}'] = pi[k] * norm.pdf(u, loc=mu[k], scale=np.sqrt(Sigma[k]))
fig.add_traces(px.line(df, x='x', y=df.columns[1:], labels={'y': 'Density'}).data)
fig.update_layout(width=800, height=400)
fig
mu = np.array([x.min(), x.mean(), x.max()])
Sigma = np.array([1e-100, 10, 1e-100])
pi = np.array([0.3, 0.4, 0.3])
# log-likelihood of the same data under this degenerate mixture
log_likelihood = np.sum(np.log(np.sum(
    pi * norm.pdf(x[:, None], loc=mu, scale=np.sqrt(Sigma)),
    axis=1)))
fig = px.scatter(x=x, y=np.random.rand(N)/20,
title=f'Extreme Values from GMM (Log Likelihood: {log_likelihood:.2f})',
opacity = 0.7,
color=z.astype(str), labels={'color': 'True Cluster'}, height=400)
u = np.linspace(-4, 9, 100)
u = np.append(u, mu)
u.sort()
df = pd.DataFrame({'x': u})
for k in range(len(mu)):
df[f'p{k}'] = pi[k] * norm.pdf(u, loc=mu[k], scale=np.sqrt(Sigma[k]))
fig.add_traces(px.line(df, x='x', y=df.columns[1:], labels={'y': 'Density'}).data)
fig.update_layout(width=800, height=400)
fig.update_layout(yaxis_range=[0, 1])
fig