Lecture 05: Density Estimation and GMMs – CS 189, Fall 2025
In this lecture we will explore the implementation of the Gaussian Mixture Model (GMM) using the Expectation-Maximization (EM) algorithm.
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly import figure_factory as ff
from plotly.subplots import make_subplots
colors = px.colors.qualitative.Plotly
px.defaults.width = 800
from ipywidgets import HBox
import numpy as np
pd.set_option('plotting.backend', 'plotly')
# make the images folder if it doesn't exist
import os
if not os.path.exists("images"):
os.makedirs("images")
### Uncomment for HTML Export
import plotly.io as pio
pio.renderers.default = "notebook_connected"
The Gaussian Distribution
The probability density function of a univariate Gaussian distribution with mean $\mu$ and variance $\sigma^2$ is given by:
$$ f(x) = \frac{1}{\sqrt{2\pi\sigma^2}} e^{-\frac{(x-\mu)^2}{2\sigma^2}} $$
from scipy.stats import norm
mean = 0
variance = .7
x = np.linspace(-4, 4, 100)
p = norm.pdf(x, loc=mean, scale=np.sqrt(variance)) # scale = standard deviation, loc = mean
fig = px.line(x=x, y=p, title=f"Normal Distribution (mean={mean}, variance={variance})",
              labels={"x": "x", "y": "p(x)"}, width=700, height=400)
# fig.write_image("images/standard_normal.pdf", scale=2, height=400, width=700)
fig
Multivariate Normal Distribution
The equation for the multivariate normal is given by: $$ f(\mathbf{x}) = \frac{1}{\sqrt{(2\pi)^D|\Sigma|}} \exp\left(-\frac{1}{2}(\mathbf{x}-\mu)^T\Sigma^{-1}(\mathbf{x}-\mu)\right) $$ where $\mathbf{x}$ is a $D$-dimensional random vector, $\mu$ is the mean vector, and $\Sigma$ is the covariance matrix.
def mv_normal_pdf(X, mu, Sigma):
"""Compute the multivariate normal density at points X."""
d = X.shape[1]
X_centered = X - mu
Sigma_inv = np.linalg.inv(Sigma)
norm_const = 1 / np.sqrt((2 * np.pi) ** d * np.linalg.det(Sigma))
exp_term = np.exp(-0.5 * np.sum(X_centered @ Sigma_inv * X_centered, axis=1))
return norm_const * exp_term
mu = np.array([1, 0])
Sigma = np.array([[3, 0.4], [0.4, 2]])
from scipy.stats import multivariate_normal
normal = multivariate_normal(mean=mu, cov=Sigma)
normal.pdf(np.array([[1, 0.5]]))
np.float64(0.061762753301732566)
mv_normal_pdf(np.array([[1, 0.5]]), mu, Sigma)
array([0.06176275])
def plot_bivariate_normal(mu, Sigma, fig=None):
from scipy.stats import multivariate_normal
normal = multivariate_normal(mean=mu, cov=Sigma)
u = np.linspace(-9, 9, 100)
X = np.array(np.meshgrid(u,u)).reshape(2,-1).T
Z = normal.pdf(X)
if fig is None:
fig = make_subplots(rows=1, cols=2,
specs=[[{'type': 'surface'}, {'type': 'contour'}]],)
fig.add_surface(x=X[:,0].reshape(100,100), y=X[:,1].reshape(100,100),
z=Z.reshape(100,100), colorscale='Viridis',
contours=dict(z=dict(show=True, size=.01, start=0, end=0.3)), row=1, col=1)
fig.add_contour(x=u, y=u, z=Z.reshape(100,100), colorscale='Viridis',
line_smoothing=1.3,
#contours_coloring='lines',
showscale=False,
row=1, col=2
)
fig.update_layout(width=900, height=500)
return fig
mu = np.array([1, 0])
Sigma = np.array([[3, 0.4], [0.4, 2]])
plot_bivariate_normal(mu, Sigma)
The following interactive plot will only work in a Jupyter notebook environment. It allows you to visualize how changing the mean and covariance matrix affects the shape of the bivariate normal distribution.
from ipywidgets import interactive_output, FloatSlider, HBox, VBox, widgets
u = np.linspace(-9, 9, 100)
X = np.array(np.meshgrid(u,u)).reshape(2,-1).T
normal = multivariate_normal(mean=mu, cov=Sigma)
Z = normal.pdf(X)
fig1 = go.FigureWidget()
fig1.add_surface(x=X[:,0].reshape(100,100), y=X[:,1].reshape(100,100),
z=Z.reshape(100,100), colorscale='Viridis',
contours=dict(z=dict(show=True, size=.01, start=0, end=0.3)))
fig1.update_layout(width=600, height=500)
fig2 = go.FigureWidget()
fig2.add_contour(x=u, y=u, z=Z.reshape(100,100), colorscale='Viridis',
line_smoothing=1.3)
fig2.update_layout(width=400, height=500)
mu1 = FloatSlider(min=-5, max=5, step=0.1, value=1, description='mu1')
mu2 = FloatSlider(min=-5, max=5, step=0.1, value=0, description='mu2')
sigma11 = FloatSlider(min=0.1, max=5, step=0.1, value=3, description='sigma11')
sigma22 = FloatSlider(min=0.1, max=5, step=0.1, value=2, description='sigma22')
sigma12 = FloatSlider(min=-3, max=3, step=0.1, value=0.4, description='sigma12')
def update(mu1, mu2, sigma11, sigma22, sigma12):
mu = np.array([mu1, mu2])
Sigma = np.array([[sigma11, sigma12], [sigma12, sigma22]])
normal = multivariate_normal(mean=mu, cov=Sigma)
Z = normal.pdf(X).reshape(100,100)
with fig1.batch_update():
fig1.data[0].z = Z
with fig2.batch_update():
fig2.data[0].z = Z
interactive_output(update, {
'mu1': mu1, 'mu2': mu2,
'sigma11': sigma11, 'sigma22': sigma22, 'sigma12': sigma12
})
HBox([VBox([mu1, mu2, sigma11, sigma22, sigma12]), fig1, fig2],
layout=widgets.Layout(align_items='center'))
Return to Lecture
The Bike Dataset
As with the previous lecture, we will use Professor Gonzalez's bike ride dataset to illustrate the concepts. The dataset contains the speed and length of bike rides taken with different bikes.
bikes = pd.read_csv("speed_length_data.csv")
bikes.head()
| | Speed | Length |
|---|---|---|
| 0 | 9.699485 | 16.345721 |
| 1 | 11.856724 | 23.159855 |
| 2 | 7.608359 | 25.434056 |
| 3 | 14.280750 | 25.867538 |
| 4 | 12.004195 | 7.105725 |
bikes.plot.scatter(x='Speed', y='Length', title='Speed vs Length of Bike Segments',
height=800)
The Gaussian Mixture Model
A Gaussian Mixture Model (GMM) is a probabilistic model that assumes all the data points are generated from a mixture of several Gaussian distributions:
$$ p(x \, \vert \, \pi, \mu, \Sigma) = \sum_{k=1}^{K} \pi_k \, \mathcal{N}(x | \mu_k, \Sigma_k) $$
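Equivalently (a standard reformulation of the same model), we can describe the GMM with a latent component indicator $z_n \in \{1, \dots, K\}$ for each data point:
$$ p(z_n = k) = \pi_k, \qquad p(x_n \,\vert\, z_n = k) = \mathcal{N}(x_n \,\vert\, \mu_k, \Sigma_k), $$
and marginalizing out $z_n$ recovers the mixture density above. This latent-variable view is what ancestor sampling draws from and what the EM algorithm exploits later in this notebook.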
Just as with the K-Means model, we can use the `GaussianMixture` class from `sklearn.mixture` to fit a GMM to our data.
from sklearn.mixture import GaussianMixture
# Create a Gaussian Mixture Model with 4 components
gmm = GaussianMixture(n_components=4, random_state=42, )
# Fit the model to the data
gmm.fit(bikes[['Speed', 'Length']])
# Get the cluster labels
bikes['scikit gmm'] = gmm.predict(bikes[['Speed', 'Length']]).astype(str)
bikes['prob'] = gmm.predict_proba(bikes[['Speed', 'Length']]).max(axis=1)
bikes
| | Speed | Length | scikit gmm | prob |
|---|---|---|---|---|
| 0 | 9.699485 | 16.345721 | 2 | 1.000000 |
| 1 | 11.856724 | 23.159855 | 0 | 0.987510 |
| 2 | 7.608359 | 25.434056 | 0 | 1.000000 |
| 3 | 14.280750 | 25.867538 | 3 | 0.941168 |
| 4 | 12.004195 | 7.105725 | 1 | 1.000000 |
| ... | ... | ... | ... | ... |
| 519 | 12.149287 | 9.877558 | 1 | 0.999998 |
| 520 | 14.464280 | 28.253430 | 3 | 0.999959 |
| 521 | 8.865093 | 12.628237 | 2 | 0.999996 |
| 522 | 11.610448 | 24.479626 | 0 | 0.998426 |
| 523 | 8.460227 | 23.218372 | 0 | 0.999984 |
524 rows × 4 columns
mu = gmm.means_
Sigma = [np.linalg.inv(p) for p in gmm.precisions_]
p = gmm.weights_
def gmm_surface(mu, Sigma, p, u_pts, v_pts):
from scipy.stats import multivariate_normal
u, v = np.meshgrid(u_pts, v_pts)
X_pts = np.array([u.flatten(), v.flatten()]).T
Z = np.zeros(X_pts.shape[0])
for k in range(len(p)):
Z += p[k] * multivariate_normal(mu[k], Sigma[k]).pdf(X_pts)
return go.Contour(x=u_pts, y=v_pts, z=Z.reshape(u.shape),
colorscale='Viridis',
colorbar=dict(x=1.05, y=0.35, len=0.75)
)
num_points = 100
speed_pts = np.linspace(bikes['Speed'].min()-3, bikes['Speed'].max()+3, num_points)
length_pts = np.linspace(bikes['Length'].min()-3, bikes['Length'].max()+3, num_points)
fig = go.Figure()
fig.add_trace(gmm_surface(mu, Sigma, p, speed_pts, length_pts))
fig.update_layout(width=800, height=800)
fig.add_traces(px.scatter(bikes, x='Speed', y='Length', color='scikit gmm').data)
fig = px.scatter(bikes, x='Speed', y='Length', symbol='scikit gmm',
size='prob', color="scikit gmm", title='GMM Clustering',
color_continuous_scale="Viridis_r", size_max=15)
fig.update_layout(width=800, height=800)
Return to Lecture
Ancestor Sampling for the GMM
To draw samples from a GMM, we first sample each component assignment $z_n \sim \text{Categorical}(\pi)$ and then sample $x_n \sim \mathcal{N}(\mu_{z_n}, \sigma_{z_n}^2)$. Sampling the latent variable before the observation it generates is called ancestor sampling, and it is exactly what the next cell does to create a synthetic one-dimensional dataset.
# Ancestor Sampling to create a synthetic dataset
np.random.seed(42)
N = 100
mu = np.array([-1, 2, 5])
pi = np.array([0.2, 0.5, 0.3])
Sigma = np.array([0.2, 0.5, .1])
z = np.random.choice(len(mu), size=N, p=pi)
x = np.random.normal(mu[z], np.sqrt(Sigma[z]))
# log-likelihood of the sampled data under the true mixture:
#   sum_n log sum_k pi_k N(x_n | mu_k, sigma_k^2)
log_likelihood = np.sum(np.log(np.sum(
    pi * norm.pdf(x[:, None], loc=mu, scale=np.sqrt(Sigma)),
    axis=1)))
# Sort for better visualization
ind = z.argsort()
z = z[ind]
x = x[ind]
fig = px.scatter(x=x, y=np.random.rand(N)/20,
title=f'Synthetic Dataset from GMM (Log Likelihood: {log_likelihood:.2f})',
opacity = 0.7,
color=z.astype(str), labels={'color': 'True Cluster'}, height=400)
u = np.linspace(-4, 9, 1000)
df = pd.DataFrame({'x': u})
for k in range(len(mu)):
df[f'p{k}'] = pi[k] * norm.pdf(u, loc=mu[k], scale=np.sqrt(Sigma[k]))
fig.add_traces(px.line(df, x='x', y=df.columns[1:], labels={'y': 'Density'}).data)
fig.update_layout(width=800, height=400)
fig
Return to Lecture
Implementing the GMM using the EM Algorithm
Directly maximizing the log-likelihood function is challenging:
\begin{align*} \log p\left(\mathcal{D} \,\vert\, \pi, \mu, \Sigma \right) & = \log \left( \prod_{n=1}^{N} \sum_{k=1}^{K} \pi_k \, \mathcal{N}(x_n | \mu_k, \Sigma_k) \right) \\ & = \sum_{n=1}^{N} \log \left(\sum_{k=1}^{K} \pi_k \, \mathcal{N}(x_n | \mu_k, \Sigma_k) \right) \end{align*}
because of the summation inside of the logarithm. Instead, we use the Expectation-Maximization (EM) algorithm to iteratively optimize the parameters.
The Initialization Step
A typical way to initialize GMM models is to start with k-means clustering to find the initial means of the Gaussian components.
from sklearn.cluster import KMeans
def initialize_gmm(x, K):
N, D = x.shape
kmeans = KMeans(n_clusters=K)
kmeans.fit(x)
mu = kmeans.cluster_centers_
Sigma = np.array([np.eye(D) for _ in range(K)])
p = np.ones(K) / K
return mu, Sigma, p
mu, Sigma, p = initialize_gmm(bikes[['Speed', 'Length']], 4)
display(mu, Sigma, p)
array([[10.10039658, 23.89181286], [11.93793333, 8.30468598], [ 9.70501333, 15.73336323], [14.92499214, 27.94287697]])
array([[[1., 0.], [0., 1.]], [[1., 0.], [0., 1.]], [[1., 0.], [0., 1.]], [[1., 0.], [0., 1.]]])
array([0.25, 0.25, 0.25, 0.25])
The (E)xpectation Step
In the E-step, we compute the responsibilities, which represent the probability that each data point belongs to each Gaussian component given the current parameters.
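Concretely, for each point $x_n$ and component $k$, the responsibility is the posterior probability of the latent assignment under the current parameters; this is what the `E_step` function below computes (as `p_z_given_x`) and normalizes row by row:
$$ r_{nk} \;=\; p(z_n = k \,\vert\, x_n) \;=\; \frac{\pi_k \, \mathcal{N}(x_n \,\vert\, \mu_k, \Sigma_k)}{\sum_{j=1}^{K} \pi_j \, \mathcal{N}(x_n \,\vert\, \mu_j, \Sigma_j)} $$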
def E_step(x, mu, Sigma, p):
"""E-step of the EM algorithm.
Computes the posterior probabilities of the latent variables (z) given the data.
"""
N, D = x.shape
K = len(p)
assert(Sigma.shape == (K, D, D))
assert(mu.shape == (K,D))
p_z_given_x = np.zeros((N, K))
for k in range(K):
p_z_given_x[:, k] = p[k] * multivariate_normal(mu[k], Sigma[k]).pdf(x)
p_z_given_x /= p_z_given_x.sum(axis=1, keepdims=True) # Normalize to get probabilities
return p_z_given_x
p_z_given_x = E_step(bikes[['Speed', 'Length']], mu, Sigma, p)
p_z_given_x.shape
(524, 4)
p_z_given_x.sum(axis=1) # Each row should sum to 1
array([1., 1., 1., ..., 1., 1., 1.])
The (M)aximization Step
In this step, we update the parameters of the Gaussian components based on the responsibilities computed in the E-step.
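Writing $N_k = \sum_{n=1}^{N} r_{nk}$ for the effective number of points assigned to component $k$, the updates implemented in `M_step` below are the responsibility-weighted maximum likelihood estimates:
$$ \mu_k = \frac{1}{N_k} \sum_{n=1}^{N} r_{nk}\, x_n, \qquad \Sigma_k = \frac{1}{N_k} \sum_{n=1}^{N} r_{nk}\, (x_n - \mu_k)(x_n - \mu_k)^T, \qquad \pi_k = \frac{N_k}{N} $$
The implementation also adds a small multiple of the identity ($10^{-3} I$) to each $\Sigma_k$ to keep the covariance estimates well conditioned.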
def M_step(x, p_z_given_x):
"""M-step of the EM algorithm.
Updates the parameters (mu, sigma, p) based on the posterior probabilities.
"""
N, D = x.shape
N, K = p_z_given_x.shape
mu_new = np.zeros((K, D))
Sigma_new = np.zeros((K, D, D))
p_new = np.zeros(K)
for k in range(K):
N_k = p_z_given_x[:, k].sum()
mu_new[k, :] = p_z_given_x[:, k] @ x / N_k
Sigma_new[k, :, :] = (p_z_given_x[:, k] * (x - mu_new[k, :]).T @ (x - mu_new[k, :])) / N_k
Sigma_new[k, :, :] += 1e-3 * np.eye(D) # Regularization
p_new[k] = N_k / N
return mu_new, Sigma_new, p_new
M_step(bikes[['Speed', 'Length']], p_z_given_x)
(array([[10.09617716, 23.89225105],
        [11.93828458,  8.30572562],
        [ 9.70499521, 15.74671976],
        [14.89141779, 27.92341173]]),
 array([[[ 2.12616243, -0.3292685 ],
         [-0.3292685 ,  1.82506808]],
        [[ 0.71456371,  0.1300181 ],
         [ 0.1300181 ,  2.97990471]],
        [[ 1.24335546,  1.25078618],
         [ 1.25078618,  4.42408655]],
        [[ 1.48193648,  0.11209861],
         [ 0.11209861,  1.70595836]]]),
 array([0.43889036, 0.22708304, 0.22788724, 0.10613936]))
def em_algorithm(x, K, max_iters=100):
    # Initialize with k-means, then perturb the means with noise so we can
    # watch EM recover from a poor starting point.
    mu, Sigma, p = initialize_gmm(x, K)
    mu = mu + np.random.randn(*mu.shape) * 3
    soln_path = [(mu, Sigma, p)]
    for i in range(max_iters):
        p_z_given_x = E_step(x, mu, Sigma, p)
        mu, Sigma, p = M_step(x, p_z_given_x)
        soln_path.append((mu, Sigma, p))
    return mu, Sigma, p, soln_path
mu, Sigma, p, soln_path = em_algorithm(bikes[['Speed', 'Length']].values,
K=4,
max_iters=50)
print("mu", mu)
print("Sigma", Sigma)
mu [[10.07832505 24.01592504]
 [11.97377525  8.28451473]
 [ 9.79348087 16.04456075]
 [14.92540409 28.00936288]]
Sigma [[[ 2.20087175 -0.18005775]
  [-0.18005775  1.50601405]]
 [[ 0.57086189  0.22368929]
  [ 0.22368929  2.95139598]]
 [[ 1.41478244  1.75054955]
  [ 1.75054955  5.96588238]]
 [[ 1.45618894  0.04305385]
  [ 0.04305385  1.47688638]]]
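As a quick sanity check (not part of the lecture code; the helper name `gmm_log_likelihood` is our own), we can evaluate the data log-likelihood at each entry of `soln_path`. We expect it to increase with each EM iteration; the small covariance regularization added in `M_step` means this is not an exact guarantee, but in practice the curve rises and then flattens out.
def gmm_log_likelihood(x, mu, Sigma, p):
    """Evaluate sum_n log sum_k pi_k N(x_n | mu_k, Sigma_k)."""
    densities = np.column_stack([
        p[k] * multivariate_normal(mu[k], Sigma[k]).pdf(x)
        for k in range(len(p))
    ])
    return np.sum(np.log(densities.sum(axis=1)))
lls = [gmm_log_likelihood(bikes[['Speed', 'Length']].values, *params)
       for params in soln_path]
px.line(x=np.arange(len(lls)), y=lls, markers=True,
        labels={'x': 'EM iteration', 'y': 'log-likelihood'},
        title='Log-Likelihood over EM Iterations', width=700, height=400)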
num_points = 100
speed_pts = np.linspace(bikes['Speed'].min()-3, bikes['Speed'].max()+3, num_points)
length_pts = np.linspace(bikes['Length'].min()-3, bikes['Length'].max()+3, num_points)
mu, Sigma, p = soln_path[-1]
fig = go.Figure()
fig.add_trace(gmm_surface(mu, Sigma, p, speed_pts, length_pts))
fig.update_layout(width=800, height=800)
fig.add_traces(px.scatter(bikes, x='Speed', y='Length', color='scikit gmm').data)
fig.add_scatter(x=mu[:,0], y=mu[:,1], mode='markers', marker=dict(color='black', size=10), name='Centers')
from ipywidgets import IntSlider
np.random.seed(42)
mu, Sigma, p, soln_path = em_algorithm(bikes[['Speed', 'Length']].values,
K=4,
max_iters=100)
num_points = 100
speed_pts = np.linspace(bikes['Speed'].min()-3, bikes['Speed'].max()+3, num_points)
length_pts = np.linspace(bikes['Length'].min()-3, bikes['Length'].max()+3, num_points)
mu, Sigma, p = soln_path[0]
fig = go.FigureWidget()
fig.add_trace(gmm_surface(mu, Sigma, p, speed_pts, length_pts))
fig.update_layout(width=800, height=800)
fig.add_traces(px.scatter(bikes, x='Speed', y='Length', color='scikit gmm').data)
fig.add_scatter(x=mu[:,0], y=mu[:,1], mode='markers', marker=dict(color='black', size=10), name='Centers')
def update(step):
mu, Sigma, p = soln_path[step]
with fig.batch_update():
fig.data[0].z = gmm_surface(mu, Sigma, p, speed_pts, length_pts).z
with fig.batch_update():
fig.data[-1].x = mu[:, 0]
fig.data[-1].y = mu[:, 1]
step_slider = IntSlider(min=0, max=len(soln_path)-1, step=1, value=0, description='Step')
interactive_output(update, {'step': step_slider})
VBox([fig, step_slider])
Issue with the MLE of the GMM
The maximum likelihood objective for a GMM is unbounded: the cells below regenerate the synthetic dataset from before and then construct a degenerate set of parameters whose log-likelihood is far larger than that of the true mixture.
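To see why, consider a component centered exactly on a single data point; a sketch of the limit, in the univariate notation used above:
$$ \pi_k \, \mathcal{N}(x_n \,\vert\, \mu_k, \sigma_k^2)\Big|_{\mu_k = x_n} \;=\; \frac{\pi_k}{\sqrt{2\pi\sigma_k^2}} \;\longrightarrow\; \infty \quad \text{as } \sigma_k^2 \to 0, $$
while a broad component keeps every other point's density positive, so the total log-likelihood can be driven arbitrarily high. This is one reason the `M_step` implementation above adds a small regularization term to each covariance estimate (scikit-learn's `GaussianMixture` does the same via its `reg_covar` parameter).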
# Ancestor Sampling to create a synthetic dataset
np.random.seed(42)
N = 100
mu = np.array([-1, 2, 5])
pi = np.array([0.2, 0.5, 0.3])
Sigma = np.array([0.2, 0.5, .1])
z = np.random.choice(len(mu), size=N, p=pi)
x = np.random.normal(mu[z], np.sqrt(Sigma[z]))
# log-likelihood of the sampled data under the true mixture:
#   sum_n log sum_k pi_k N(x_n | mu_k, sigma_k^2)
log_likelihood = np.sum(np.log(np.sum(
    pi * norm.pdf(x[:, None], loc=mu, scale=np.sqrt(Sigma)),
    axis=1)))
# Sort for better visualization
ind = z.argsort()
z = z[ind]
x = x[ind]
fig = px.scatter(x=x, y=np.random.rand(N)/20,
title=f'Synthetic Dataset from GMM (Log Likelihood: {log_likelihood:.2f})',
opacity = 0.7,
color=z.astype(str), labels={'color': 'True Cluster'}, height=400)
u = np.linspace(-4, 9, 1000)
df = pd.DataFrame({'x': u})
for k in range(len(mu)):
df[f'p{k}'] = pi[k] * norm.pdf(u, loc=mu[k], scale=np.sqrt(Sigma[k]))
fig.add_traces(px.line(df, x='x', y=df.columns[1:], labels={'y': 'Density'}).data)
fig.update_layout(width=800, height=400)
fig
mu = np.array([x.min(), x.mean(), x.max()])
Sigma = np.array([1e-100, 10, 1e-100])
pi = np.array([0.3, 0.4, 0.3])
# log-likelihood of the same data under this degenerate mixture
log_likelihood = np.sum(np.log(np.sum(
    pi * norm.pdf(x[:, None], loc=mu, scale=np.sqrt(Sigma)),
    axis=1)))
fig = px.scatter(x=x, y=np.random.rand(N)/20,
title=f'Extreme Values from GMM (Log Likelihood: {log_likelihood:.2f})',
opacity = 0.7,
color=z.astype(str), labels={'color': 'True Cluster'}, height=400)
u = np.linspace(-4, 9, 100)
u = np.append(u, mu)
u.sort()
df = pd.DataFrame({'x': u})
for k in range(len(mu)):
df[f'p{k}'] = pi[k] * norm.pdf(u, loc=mu[k], scale=np.sqrt(Sigma[k]))
fig.add_traces(px.line(df, x='x', y=df.columns[1:], labels={'y': 'Density'}).data)
fig.update_layout(width=800, height=400)
fig.update_layout(yaxis_range=[0, 1])
fig