Lecture 05: Density Estimation and GMMs – CS 189, Fall 2025

In this lecture we will explore the implementation of the Gaussian Mixture Model (GMM) using the Expectation-Maximization (EM) algorithm.

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly import figure_factory as ff
from plotly.subplots import make_subplots
colors = px.colors.qualitative.Plotly
px.defaults.width = 800
from ipywidgets import HBox
import numpy as np
pd.set_option('plotting.backend', 'plotly')
In [2]:
# make the images folder if it doesn't exist
import os
if not os.path.exists("images"):
    os.makedirs("images")
In [3]:
### Uncomment for HTML Export
import plotly.io as pio
pio.renderers.default = "notebook_connected"

The Gaussian Distribution

The probability density function of a univariate Gaussian distribution with mean $\mu$ and variance $\sigma^2$ is given by:

$$ f(x) = \frac{1}{\sqrt{2\pi\sigma^2}} e^{-\frac{(x-\mu)^2}{2\sigma^2}} $$

In [4]:
from scipy.stats import norm
mean = 0 
variance = .7
x = np.linspace(-4, 4, 100)
p = norm.pdf(x, loc=mean, scale=np.sqrt(variance)) # scale = standard deviation, loc = mean
fig = px.line(x=x, y=p, title=f"Normal Distribution (mean={mean}, variance={variance})",
        labels={"x": "x", "y": "p(x)"}, width=700, height=400)
# fig.write_image("images/standard_normal.pdf", scale=2, height=400, width=700)
fig
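
As a quick sanity check, the density can also be computed directly from the formula above. A minimal sketch (the helper name gaussian_pdf is ours, not part of the lecture code); it should agree with scipy's norm.pdf:

def gaussian_pdf(x, mu, sigma2):
    """Univariate Gaussian density, straight from the formula above."""
    return np.exp(-(x - mu) ** 2 / (2 * sigma2)) / np.sqrt(2 * np.pi * sigma2)

# should print True: matches p = norm.pdf(x, loc=mean, scale=np.sqrt(variance))
print(np.allclose(gaussian_pdf(x, mean, variance), p))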

Multivariate Normal Distribution

The equation for the multivariate normal is given by: $$ f(\mathbf{x}) = \frac{1}{\sqrt{(2\pi)^D|\Sigma|}} \exp\left(-\frac{1}{2}(\mathbf{x}-\mu)^T\Sigma^{-1}(\mathbf{x}-\mu)\right) $$ where $\mathbf{x}$ is a $D$-dimensional random vector, $\mu$ is the mean vector, and $\Sigma$ is the covariance matrix.

In [5]:
def mv_normal_pdf(X, mu, Sigma):
    """Compute the multivariate normal density at points X."""
    d = X.shape[1]
    X_centered = X - mu
    Sigma_inv = np.linalg.inv(Sigma)
    norm_const = 1 / np.sqrt((2 * np.pi) ** d * np.linalg.det(Sigma))
    exp_term = np.exp(-0.5 * np.sum(X_centered @ Sigma_inv * X_centered, axis=1))
    return norm_const * exp_term
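
The explicit inverse and determinant above are fine for well-conditioned covariances. As a possible variant (a sketch, not part of the lecture code), the quadratic form can be computed with np.linalg.solve and the log-determinant with np.linalg.slogdet, which behaves better when Sigma is nearly singular:

def mv_normal_pdf_solve(X, mu, Sigma):
    """Same density as mv_normal_pdf, without forming Sigma^{-1} explicitly."""
    d = X.shape[1]
    X_centered = X - mu
    # quadratic form (x - mu)^T Sigma^{-1} (x - mu) via a linear solve
    quad = np.sum(X_centered * np.linalg.solve(Sigma, X_centered.T).T, axis=1)
    _, logdet = np.linalg.slogdet(Sigma)
    return np.exp(-0.5 * (d * np.log(2 * np.pi) + logdet + quad))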
In [6]:
mu = np.array([1, 0])
Sigma = np.array([[3, 0.4], [0.4, 2]])

from scipy.stats import multivariate_normal
normal = multivariate_normal(mean=mu, cov=Sigma)
normal.pdf(np.array([[1, 0.5]]))
Out[6]:
np.float64(0.061762753301732566)
In [7]:
mv_normal_pdf(np.array([[1, 0.5]]), mu, Sigma)
Out[7]:
array([0.06176275])
In [8]:
def plot_bivariate_normal(mu, Sigma, fig=None):
    from scipy.stats import multivariate_normal
    normal = multivariate_normal(mean=mu, cov=Sigma)
    u = np.linspace(-9, 9, 100)
    X = np.array(np.meshgrid(u,u)).reshape(2,-1).T
    Z = normal.pdf(X)
    if fig is None:
        fig = make_subplots(rows=1, cols=2,
                            specs=[[{'type': 'surface'}, {'type': 'contour'}]],)
    fig.add_surface(x=X[:,0].reshape(100,100), y=X[:,1].reshape(100,100), 
                    z=Z.reshape(100,100), colorscale='Viridis',
                    contours=dict(z=dict(show=True, size=.01, start=0, end=0.3)), row=1, col=1)
    fig.add_contour(x=u, y=u, z=Z.reshape(100,100), colorscale='Viridis',
                    line_smoothing=1.3,
                    #contours_coloring='lines',
                    showscale=False,
                    row=1, col=2
                    )
    fig.update_layout(width=900, height=500)
    return fig
In [9]:
mu = np.array([1, 0])
Sigma = np.array([[3, 0.4], [0.4, 2]])
plot_bivariate_normal(mu, Sigma)

The following interactive plot will only work in a Jupyter notebook environment. It allows you to visualize how changing the mean and covariance matrix affects the shape of the bivariate normal distribution.

In [10]:
from ipywidgets import interactive_output, FloatSlider, HBox, VBox, widgets

u = np.linspace(-9, 9, 100)
X = np.array(np.meshgrid(u,u)).reshape(2,-1).T
normal = multivariate_normal(mean=mu, cov=Sigma)
Z = normal.pdf(X)
fig1 = go.FigureWidget()
fig1.add_surface(x=X[:,0].reshape(100,100), y=X[:,1].reshape(100,100), 
                z=Z.reshape(100,100), colorscale='Viridis',
                contours=dict(z=dict(show=True, size=.01, start=0, end=0.3)))
fig1.update_layout(width=600, height=500)
fig2 = go.FigureWidget()
fig2.add_contour(x=u, y=u, z=Z.reshape(100,100), colorscale='Viridis',
                line_smoothing=1.3)
fig2.update_layout(width=400, height=500)

mu1 = FloatSlider(min=-5, max=5, step=0.1, value=1, description='mu1')
mu2 = FloatSlider(min=-5, max=5, step=0.1, value=0, description='mu2')
sigma11 = FloatSlider(min=0.1, max=5, step=0.1, value=3, description='sigma11')
sigma22 = FloatSlider(min=0.1, max=5, step=0.1, value=2, description='sigma22')
sigma12 = FloatSlider(min=-3, max=3, step=0.1, value=0.4, description='sigma12')


def update(mu1, mu2, sigma11, sigma22, sigma12):
    mu = np.array([mu1, mu2])
    Sigma = np.array([[sigma11, sigma12], [sigma12, sigma22]])
    normal = multivariate_normal(mean=mu, cov=Sigma)
    Z = normal.pdf(X).reshape(100,100)
    with fig1.batch_update():
        fig1.data[0].z = Z
    with fig2.batch_update():
        fig2.data[0].z = Z

interactive_output(update, {
    'mu1': mu1, 'mu2': mu2,
    'sigma11': sigma11, 'sigma22': sigma22, 'sigma12': sigma12
})

HBox([VBox([mu1, mu2, sigma11, sigma22, sigma12]), fig1, fig2],  
     layout=widgets.Layout(align_items='center'))
Out[10]:

Return to Lecture

The Bike Dataset

As in the previous lecture, we will use Professor Gonzalez's bike ride dataset to illustrate the concepts. The dataset contains the speed and length of ride segments recorded on different bikes.

In [11]:
bikes = pd.read_csv("speed_length_data.csv")
bikes.head()
Out[11]:
Speed Length
0 9.699485 16.345721
1 11.856724 23.159855
2 7.608359 25.434056
3 14.280750 25.867538
4 12.004195 7.105725
In [12]:
bikes.plot.scatter(x='Speed', y='Length', title='Speed vs Length of Bike Segments', 
                   height=800)

The Gaussian Mixture Model

A Gaussian Mixture Model (GMM) is a probabilistic model that assumes all the data points are generated from a mixture of several Gaussian distributions:

$$ p(x \, \vert \, \pi, \mu, \Sigma) = \sum_{k=1}^{K} \pi_k \, \mathcal{N}(x | \mu_k, \Sigma_k) $$

where the mixture weights $\pi_k$ are nonnegative and sum to one.

Just as with the K-Means model, we can use the GaussianMixture class from sklearn.mixture to fit a GMM to our data.

In [13]:
from sklearn.mixture import GaussianMixture
# Create a Gaussian Mixture Model with 4 components
gmm = GaussianMixture(n_components=4, random_state=42)
# Fit the model to the data
gmm.fit(bikes[['Speed', 'Length']])
# Get the cluster labels
bikes['scikit gmm'] = gmm.predict(bikes[['Speed', 'Length']]).astype(str)
bikes['prob'] = gmm.predict_proba(bikes[['Speed', 'Length']]).max(axis=1)
bikes
Out[13]:
Speed Length scikit gmm prob
0 9.699485 16.345721 2 1.000000
1 11.856724 23.159855 0 0.987510
2 7.608359 25.434056 0 1.000000
3 14.280750 25.867538 3 0.941168
4 12.004195 7.105725 1 1.000000
... ... ... ... ...
519 12.149287 9.877558 1 0.999998
520 14.464280 28.253430 3 0.999959
521 8.865093 12.628237 2 0.999996
522 11.610448 24.479626 0 0.998426
523 8.460227 23.218372 0 0.999984

524 rows × 4 columns

In [14]:
mu = gmm.means_
Sigma = [np.linalg.inv(P) for P in gmm.precisions_]  # equivalently, gmm.covariances_
p = gmm.weights_
In [15]:
def gmm_surface(mu, Sigma, p, u_pts, v_pts):
    from scipy.stats import multivariate_normal
    u, v = np.meshgrid(u_pts, v_pts)
    X_pts = np.array([u.flatten(), v.flatten()]).T
    Z = np.zeros(X_pts.shape[0])
    for k in range(len(p)):
        Z += p[k] * multivariate_normal(mu[k], Sigma[k]).pdf(X_pts)
    return go.Contour(x=u_pts, y=v_pts, z=Z.reshape(u.shape), 
                      colorscale='Viridis',
                      colorbar=dict(x=1.05, y=0.35, len=0.75)
                    )
In [16]:
num_points = 100
speed_pts = np.linspace(bikes['Speed'].min()-3, bikes['Speed'].max()+3, num_points)
length_pts = np.linspace(bikes['Length'].min()-3, bikes['Length'].max()+3, num_points)

fig = go.Figure()
fig.add_trace(gmm_surface(mu, Sigma, p, speed_pts, length_pts))
fig.update_layout(width=800, height=800)
fig.add_traces(px.scatter(bikes, x='Speed', y='Length', color='scikit gmm').data)
In [17]:
fig = px.scatter(bikes, x='Speed', y='Length', symbol='scikit gmm', 
           size='prob', color="scikit gmm", title='GMM Clustering',
           color_continuous_scale="Viridis_r", size_max=15)
fig.update_layout(width=800, height=800)

Return to Lecture

Ancestral Sampling for the GMM
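
Ancestral sampling draws each point by first sampling its component assignment and then sampling from that component:

$$ z_n \sim \mathrm{Categorical}(\pi), \qquad x_n \mid z_n = k \;\sim\; \mathcal{N}(\mu_k, \sigma_k^2) $$

The cells below do exactly this with np.random.choice and np.random.normal.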

In [18]:
# Ancestral sampling to create a synthetic dataset
np.random.seed(42)
N = 100

mu = np.array([-1, 2, 5])          # component means
pi = np.array([0.2, 0.5, 0.3])     # mixture weights
Sigma = np.array([0.2, 0.5, 0.1])  # component variances (univariate case)
In [19]:
z = np.random.choice(len(mu), size=N, p=pi)
x = np.random.normal(mu[z], np.sqrt(Sigma[z]))
In [20]:
log_likelihood = np.sum(np.log(np.sum(
    pi * norm.pdf(x[:, None], loc=mu, scale=np.sqrt(Sigma)),
    axis=1)))
In [21]:
# Sort for better visualization
ind = z.argsort()
z = z[ind]
x = x[ind]

fig = px.scatter(x=x, y=np.random.rand(N)/20,  
                 title=f'Synthetic Dataset from GMM (Log Likelihood: {log_likelihood:.2f})',
                 opacity = 0.7,
                 color=z.astype(str), labels={'color': 'True Cluster'}, height=400)
u = np.linspace(-4, 9, 1000)
df = pd.DataFrame({'x': u})
for k in range(len(mu)):
    df[f'p{k}'] = pi[k] * norm.pdf(u, loc=mu[k], scale=np.sqrt(Sigma[k]))
fig.add_traces(px.line(df, x='x', y=df.columns[1:], labels={'y': 'Density'}).data)
fig.update_layout(width=800, height=400)

fig

Return to Lecture

Implementing the GMM using the EM Algorithm

Directly maximizing the log-likelihood function is challenging:

\begin{align*} \log p\left(\mathcal{D} \,\vert\, \mu, \Sigma \right) & = \log \left( \prod_{n=1}^{N} \sum_{k=1}^{K} \pi_k \, \mathcal{N}(x_n | \mu_k, \Sigma_k) \right) \\ & = \sum_{n=1}^{N} \log \left(\sum_{k=1}^{K} \pi_k \, \mathcal{N}(x_n | \mu_k, \Sigma_k) \right) \\ \end{align*}

because of the summation inside the logarithm: the log cannot be pushed past the sum, so there is no closed-form solution for the maximizing parameters. Instead, we use the Expectation-Maximization (EM) algorithm to iteratively optimize the parameters.
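
Although we cannot maximize this expression in closed form, it is still useful to be able to evaluate it, for example to track EM's progress across iterations. Here is a minimal sketch (the helper name gmm_log_likelihood is ours, not part of the lecture code) that uses the same (mu, Sigma, p) parameterization as the implementation below and works in log space to avoid underflow:

from scipy.stats import multivariate_normal
from scipy.special import logsumexp

def gmm_log_likelihood(x, mu, Sigma, p):
    """log p(D | pi, mu, Sigma) = sum_n log( sum_k pi_k N(x_n | mu_k, Sigma_k) )."""
    N, K = x.shape[0], len(p)
    log_terms = np.zeros((N, K))
    for k in range(K):
        # log(pi_k) + log N(x_n | mu_k, Sigma_k) for every data point
        log_terms[:, k] = np.log(p[k]) + multivariate_normal(mu[k], Sigma[k]).logpdf(x)
    # log-sum-exp over components is numerically safer than summing raw densities
    return logsumexp(log_terms, axis=1).sum()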

The Initialization Step

A typical way to initialize GMM models is to start with k-means clustering to find the initial means of the Gaussian components.

In [22]:
from sklearn.cluster import KMeans
def initialize_gmm(x, K):
    N, D = x.shape
    kmeans = KMeans(n_clusters=K)
    kmeans.fit(x)
    mu = kmeans.cluster_centers_
    Sigma = np.array([np.eye(D) for _ in range(K)])
    p = np.ones(K) / K
    return mu, Sigma, p
In [23]:
mu, Sigma, p = initialize_gmm(bikes[['Speed', 'Length']], 4)
display(mu, Sigma, p)
array([[10.10039658, 23.89181286],
       [11.93793333,  8.30468598],
       [ 9.70501333, 15.73336323],
       [14.92499214, 27.94287697]])
array([[[1., 0.],
        [0., 1.]],

       [[1., 0.],
        [0., 1.]],

       [[1., 0.],
        [0., 1.]],

       [[1., 0.],
        [0., 1.]]])
array([0.25, 0.25, 0.25, 0.25])

The (E)xpectation Step

In the E-step, we compute the responsibilities, which represent the probability that each data point belongs to each Gaussian component given the current parameters.
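
Concretely, for data point $x_n$ and component $k$, Bayes' rule gives the responsibility

$$ r_{nk} = p(z_n = k \mid x_n) = \frac{\pi_k \, \mathcal{N}(x_n \mid \mu_k, \Sigma_k)}{\sum_{j=1}^{K} \pi_j \, \mathcal{N}(x_n \mid \mu_j, \Sigma_j)} $$

The function below computes the numerator for every component and then normalizes across components.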

In [24]:
def E_step(x, mu, Sigma, p):
    """E-step of the EM algorithm.
    Computes the posterior probabilities of the latent variables (z) given the data.
    """
    N, D = x.shape
    K = len(p)
    assert(Sigma.shape == (K, D, D))
    assert(mu.shape == (K,D))        
    p_z_given_x = np.zeros((N, K))
    for k in range(K):
        p_z_given_x[:, k] = p[k] * multivariate_normal(mu[k], Sigma[k]).pdf(x)
    p_z_given_x /= p_z_given_x.sum(axis=1, keepdims=True) # Normalize to get probabilities
    return p_z_given_x  
In [25]:
p_z_given_x = E_step(bikes[['Speed', 'Length']], mu, Sigma, p)
p_z_given_x.shape
Out[25]:
(524, 4)
In [26]:
p_z_given_x.sum(axis=1)  # Each row should sum to 1
Out[26]:
array([1., 1., 1., ..., 1., 1., 1.])

The (M)aximization Step

In this step, we update the parameters of the Gaussian components based on the responsibilities computed in the E-step.
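
With responsibilities $r_{nk}$ and effective counts $N_k = \sum_{n=1}^{N} r_{nk}$, the updates are

\begin{align*} \mu_k &= \frac{1}{N_k} \sum_{n=1}^{N} r_{nk} \, x_n, & \Sigma_k &= \frac{1}{N_k} \sum_{n=1}^{N} r_{nk} \, (x_n - \mu_k)(x_n - \mu_k)^T, & \pi_k &= \frac{N_k}{N}. \end{align*}

The implementation below also adds a small multiple of the identity to each $\Sigma_k$ to keep the covariance estimates well conditioned.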

In [27]:
def M_step(x, p_z_given_x):
    """M-step of the EM algorithm.
    Updates the parameters (mu, Sigma, p) based on the posterior probabilities.
    """
    N, D = x.shape
    N, K = p_z_given_x.shape
    mu_new = np.zeros((K, D))
    Sigma_new = np.zeros((K, D, D))
    p_new = np.zeros(K)
    
    for k in range(K):
        N_k = p_z_given_x[:, k].sum()               # effective number of points in component k
        mu_new[k, :] = p_z_given_x[:, k] @ x / N_k  # responsibility-weighted mean
        # responsibility-weighted covariance (sum of weighted outer products)
        Sigma_new[k, :, :] = (p_z_given_x[:, k] * (x - mu_new[k, :]).T @ (x - mu_new[k, :])) / N_k
        Sigma_new[k, :, :] += 1e-3 * np.eye(D)      # regularization keeps Sigma well conditioned
        p_new[k] = N_k / N

    return mu_new, Sigma_new, p_new
In [28]:
M_step(bikes[['Speed', 'Length']], p_z_given_x)
Out[28]:
(array([[10.09617716, 23.89225105],
        [11.93828458,  8.30572562],
        [ 9.70499521, 15.74671976],
        [14.89141779, 27.92341173]]),
 array([[[ 2.12616243, -0.3292685 ],
         [-0.3292685 ,  1.82506808]],
 
        [[ 0.71456371,  0.1300181 ],
         [ 0.1300181 ,  2.97990471]],
 
        [[ 1.24335546,  1.25078618],
         [ 1.25078618,  4.42408655]],
 
        [[ 1.48193648,  0.11209861],
         [ 0.11209861,  1.70595836]]]),
 array([0.43889036, 0.22708304, 0.22788724, 0.10613936]))
In [29]:
def em_algorithm(x, K, max_iters=100):
    # initialize with k-means centers (perturbed with noise), identity covariances, uniform weights
    mu, Sigma, p = initialize_gmm(x, K)
    mu = mu + np.random.randn(*mu.shape) * 3
    soln_path = [(mu, Sigma, p)]
    for i in range(max_iters):
        p_z_given_x = E_step(x, mu, Sigma, p)
        mu, Sigma, p = M_step(x, p_z_given_x)
        soln_path.append((mu, Sigma, p))
    return mu, Sigma, p, soln_path
In [30]:
mu, Sigma, p, soln_path = em_algorithm(bikes[['Speed', 'Length']].values, 
                                       K=4, 
                                       max_iters=50)
print("mu", mu)
print("Sigma", Sigma)
mu [[10.07832505 24.01592504]
 [11.97377525  8.28451473]
 [ 9.79348087 16.04456075]
 [14.92540409 28.00936288]]
Sigma [[[ 2.20087175 -0.18005775]
  [-0.18005775  1.50601405]]

 [[ 0.57086189  0.22368929]
  [ 0.22368929  2.95139598]]

 [[ 1.41478244  1.75054955]
  [ 1.75054955  5.96588238]]

 [[ 1.45618894  0.04305385]
  [ 0.04305385  1.47688638]]]
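
As a sanity check (assuming the gmm_log_likelihood sketch from earlier has been run), the log-likelihood can be evaluated along soln_path; each EM iteration should improve it, up to the small covariance regularization added in the M-step.

lls = [gmm_log_likelihood(bikes[['Speed', 'Length']].values, m, S, w) for (m, S, w) in soln_path]
print(lls[0], lls[-1])  # the final log-likelihood should be much larger than the initial one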
In [31]:
num_points = 100
speed_pts = np.linspace(bikes['Speed'].min()-3, bikes['Speed'].max()+3, num_points)
length_pts = np.linspace(bikes['Length'].min()-3, bikes['Length'].max()+3, num_points)

mu, Sigma, p = soln_path[-1]
fig = go.Figure()
fig.add_trace(gmm_surface(mu, Sigma, p, speed_pts, length_pts))
fig.update_layout(width=800, height=800)
fig.add_traces(px.scatter(bikes, x='Speed', y='Length', color='scikit gmm').data)
fig.add_scatter(x=mu[:,0], y=mu[:,1], mode='markers', marker=dict(color='black', size=10), name='Centers')
In [32]:
from ipywidgets import IntSlider
np.random.seed(42)
mu, Sigma, p, soln_path = em_algorithm(bikes[['Speed', 'Length']].values, 
                                       K=4, 
                                       max_iters=100)
num_points = 100
speed_pts = np.linspace(bikes['Speed'].min()-3, bikes['Speed'].max()+3, num_points)
length_pts = np.linspace(bikes['Length'].min()-3, bikes['Length'].max()+3, num_points)

mu, Sigma, p = soln_path[0]
fig = go.FigureWidget()
fig.add_trace(gmm_surface(mu, Sigma, p, speed_pts, length_pts))
fig.update_layout(width=800, height=800)
fig.add_traces(px.scatter(bikes, x='Speed', y='Length', color='scikit gmm').data)
fig.add_scatter(x=mu[:,0], y=mu[:,1], mode='markers', marker=dict(color='black', size=10), name='Centers')

def update(step):
    mu, Sigma, p = soln_path[step]
    with fig.batch_update():
        fig.data[0].z = gmm_surface(mu, Sigma, p, speed_pts, length_pts).z
    with fig.batch_update():
        fig.data[-1].x = mu[:, 0]
        fig.data[-1].y = mu[:, 1]
step_slider = IntSlider(min=0, max=len(soln_path)-1, step=1, value=0, description='Step')
interactive_output(update, {'step': step_slider}) 
VBox([fig, step_slider])
Out[32]:

Issue with the MLE of the GMM
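
Maximum likelihood estimation for a GMM is ill-posed: the log-likelihood above is unbounded. To see why, take any single data point $x_n$, set $\mu_k = x_n$ for one component, and let $\sigma_k \to 0$. Then

$$ \pi_k \, \mathcal{N}(x_n \mid \mu_k, \sigma_k^2) = \frac{\pi_k}{\sqrt{2\pi\sigma_k^2}} \longrightarrow \infty $$

while the remaining (broad) components keep every other term in the sum strictly positive, so the total log-likelihood diverges. The cells below construct exactly this kind of degenerate solution on the synthetic dataset.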

In [33]:
# Ancestral sampling to create a synthetic dataset (same draw as above)
np.random.seed(42)
N = 100

mu = np.array([-1, 2, 5])
pi = np.array([0.2, 0.5, 0.3])
Sigma = np.array([0.2, 0.5, .1])

z = np.random.choice(len(mu), size=N, p=pi)
x = np.random.normal(mu[z], np.sqrt(Sigma[z]))

log_likelihood = np.sum(np.log(np.sum(
    pi * norm.pdf(x[:, None], loc=mu, scale=np.sqrt(Sigma)),
    axis=1)))
In [34]:
# Sort for better visualization
ind = z.argsort()
z = z[ind]
x = x[ind]

fig = px.scatter(x=x, y=np.random.rand(N)/20,  
                 title=f'Synthetic Dataset from GMM (Log Likelihood: {log_likelihood:.2f})',
                 opacity = 0.7,
                 color=z.astype(str), labels={'color': 'True Cluster'}, height=400)
u = np.linspace(-4, 9, 1000)
df = pd.DataFrame({'x': u})
for k in range(len(mu)):
    df[f'p{k}'] = pi[k] * norm.pdf(u, loc=mu[k], scale=np.sqrt(Sigma[k]))
fig.add_traces(px.line(df, x='x', y=df.columns[1:], labels={'y': 'Density'}).data)
fig.update_layout(width=800, height=400)

fig
In [35]:
# Degenerate parameter choice: two components sit exactly on data points with (near) zero variance
mu = np.array([x.min(), x.mean(), x.max()])
Sigma = np.array([1e-100, 10, 1e-100])
pi = np.array([0.3, 0.4, 0.3])

log_likelihood = np.sum(np.log(np.sum(
    pi * norm.pdf(x[:, None], loc=mu, scale=np.sqrt(Sigma)),
    axis=1)))

fig = px.scatter(x=x, y=np.random.rand(N)/20,  
                 title=f'Extreme Values from GMM (Log Likelihood: {log_likelihood:.2f})',
                 opacity = 0.7,
                 color=z.astype(str), labels={'color': 'True Cluster'}, height=400)
u = np.linspace(-4, 9, 100)
u = np.append(u, mu)
u.sort()
df = pd.DataFrame({'x': u})
for k in range(len(mu)):
    df[f'p{k}'] = pi[k] * norm.pdf(u, loc=mu[k], scale=np.sqrt(Sigma[k]))
fig.add_traces(px.line(df, x='x', y=df.columns[1:], labels={'y': 'Density'}).data)
fig.update_layout(width=800, height=400)
fig.update_layout(yaxis_range=[0, 1])

fig