Lecture 04: K-Means and Probability – CS 189, Fall 2025
In this lecture notebook, we will explore k-means clustering and some of the basic probability calculations from lecture.
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly import figure_factory as ff
colors = px.colors.qualitative.Plotly
px.defaults.width = 800
from ipywidgets import HBox
import numpy as np
pd.set_option('plotting.backend', 'plotly')
# make the images folder if it doesn't exist
import os
if not os.path.exists("images"):
os.makedirs("images")
# Uncomment for HTML Export
import plotly.io as pio
pio.renderers.default = "notebook_connected"
The Bike Dataset
Here we will apply k-means clustering to the bike dataset to explore the distribution of length and speed of Prof. Gonzalez's bike rides.
bikes = pd.read_csv("speed_length_data.csv")
bikes.head()
| | Speed | Length |
|---|---|---|
| 0 | 9.699485 | 16.345721 |
| 1 | 11.856724 | 23.159855 |
| 2 | 7.608359 | 25.434056 |
| 3 | 14.280750 | 25.867538 |
| 4 | 12.004195 | 7.105725 |
bikes.plot.scatter(x='Speed', y='Length', title='Speed vs Length of Bike Segments',
height=800)
Scikit-Learn K-Means Clustering
We have data for 4 bikes. Let's try k-means clustering using scikit-learn with 4 clusters.
from sklearn.cluster import KMeans
# Create a KMeans model with 4 clusters
kmeans = KMeans(n_clusters=4, random_state=42)
# Fit the model to the data
kmeans.fit(bikes[['Speed', 'Length']])
# Get the cluster labels
bikes['scikit k-means'] = kmeans.predict(bikes[['Speed', 'Length']]).astype(str)
We can visualize the clustering along with the fitted cluster centers:
fig = px.scatter(
bikes, x='Speed', y='Length', color='scikit k-means',
title='K-Means Clustering of Bike Segments',
height=800)
fig.add_scatter(
x=kmeans.cluster_centers_[:,0],
y=kmeans.cluster_centers_[:,1],
mode='markers',
marker=dict(color='black', size=10),
name='Centroids'
)
#fig.write_image("images/bike_kmeans.pdf", scale=2, height=800, width=700)
Return to Lecture
Implementing the K-Means Clustering Algorithm
Lloyd's algorithm for K-means clustering has three key elements: initialization, assignment, and update. We will implement a function for each.
Initialization
Here we use the basic Forgy method of randomly selecting k data points as the initial cluster centers.
def initialize_centers(x, k):
"""Randomly select k unique points from x to use as initial centers."""
ind = np.random.choice(np.arange(x.shape[0]), k, replace=False)
return x[ind]
k = 4
x = bikes[['Speed', 'Length']].to_numpy()
centers = initialize_centers(x, k)
centers
array([[ 8.94348446, 24.05257377],
       [12.7764257 , 18.58972398],
       [13.18110706, 28.41900961],
       [10.5910518 , 15.32649042]])
Assignment
In this step, we assign each data point to the nearest cluster center.
def compute_assignments(x, centers):
    """Assign each point in x to the nearest center."""
    # x[:, np.newaxis] has shape (n, 1, d) and centers has shape (k, d), so
    # broadcasting gives an (n, k, d) array of differences; the norm over
    # axis=2 yields an (n, k) matrix of point-to-center distances.
    distances = np.linalg.norm(x[:, np.newaxis] - centers, axis=2)
    return np.argmin(distances, axis=1)
assignments = compute_assignments(x, centers)
assignments
array([3, 0, 0, 2, 3, 3, 3, 0, 0, 0, 3, 0, 0, 3, 3, 3, 3, 2, 3, 1, 0, 3, 1, 3, 3, 0, 3, 2, 2, 3, 0, 3, 3, 0, 0, 0, 1, 3, 0, 1, 3, 3, 3, 0, 3, 0, 3, 2, 2, 3, 0, 0, 0, 0, 2, 1, 3, 3, 3, 3, 3, 3, 0, 3, 1, 2, 3, 0, 3, 0, 0, 3, 3, 0, 0, 0, 0, 3, 1, 3, 2, 0, 3, 3, 3, 3, 0, 0, 0, 3, 3, 0, 0, 2, 0, 3, 2, 3, 3, 3, 3, 0, 1, 2, 0, 3, 3, 0, 3, 3, 3, 3, 0, 0, 0, 0, 0, 3, 0, 2, 0, 0, 3, 3, 3, 3, 0, 0, 3, 2, 3, 3, 3, 3, 0, 1, 2, 0, 3, 0, 0, 3, 3, 3, 3, 3, 0, 2, 3, 3, 0, 3, 3, 3, 0, 3, 0, 0, 3, 0, 3, 0, 0, 2, 3, 0, 3, 3, 3, 2, 0, 3, 2, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 0, 1, 1, 0, 2, 2, 3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 3, 0, 0, 3, 3, 0, 3, 3, 2, 0, 0, 1, 0, 3, 1, 0, 0, 0, 0, 2, 3, 3, 3, 3, 2, 3, 0, 0, 0, 2, 2, 1, 3, 0, 1, 3, 1, 3, 0, 0, 0, 1, 3, 0, 3, 0, 0, 0, 3, 3, 0, 3, 3, 2, 1, 0, 2, 3, 0, 0, 3, 2, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 1, 3, 3, 2, 3, 3, 2, 3, 2, 3, 3, 0, 1, 3, 2, 0, 0, 0, 3, 3, 2, 2, 0, 0, 0, 2, 1, 0, 3, 3, 3, 3, 0, 0, 0, 3, 3, 3, 3, 2, 0, 0, 3, 0, 0, 2, 0, 3, 1, 3, 0, 3, 3, 3, 3, 0, 0, 3, 3, 1, 3, 3, 3, 3, 0, 0, 3, 1, 0, 2, 0, 0, 3, 3, 0, 3, 2, 0, 2, 3, 0, 3, 2, 1, 3, 0, 0, 3, 3, 3, 3, 3, 0, 3, 3, 0, 3, 0, 1, 3, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 3, 3, 0, 3, 0, 0, 3, 0, 0, 3, 0, 2, 0, 3, 0, 3, 3, 0, 3, 3, 0, 0, 2, 0, 3, 3, 3, 0, 0, 0, 0, 3, 3, 3, 2, 3, 0, 3, 3, 3, 0, 0, 3, 3, 0, 3, 0, 0, 3, 3, 0, 1, 3, 0, 0, 0, 0, 3, 3, 0, 2, 1, 0, 3, 1, 3, 1, 3, 3, 3, 0, 0, 2, 2, 1, 3, 0, 3, 0, 3, 0, 3, 0, 2, 0, 3, 2, 0, 2, 0, 0, 1, 0, 2, 2, 3, 3, 0, 0, 3, 2, 3, 3, 3, 3, 2, 3, 0, 0, 0, 2, 3, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 3, 3, 3, 0, 0, 3, 2, 3, 0, 0])
Update Centers
In this step, we update the cluster centers by computing the mean of all points assigned to each center.
def update_centers(x, assignments, k):
"""Update centers based on the current assignments."""
return np.array([x[assignments == i].mean(axis=0) for i in range(k)])
centers = update_centers(x, assignments, k)
centers
array([[ 9.94615559, 24.01761893],
       [11.40447103, 19.54414142],
       [14.72567631, 27.80903952],
       [10.80705086, 11.42630463]])
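One edge case worth noting: if a center loses all of its points, the list comprehension above takes the mean of an empty array, which yields `nan` (with a NumPy warning). A minimal guard, sketched here as a hypothetical `update_centers_safe` helper, is to keep the previous center for any empty cluster:

def update_centers_safe(x, assignments, centers, k):
    """Like update_centers, but an empty cluster keeps its previous center."""
    new_centers = centers.copy()
    for i in range(k):
        members = x[assignments == i]
        if len(members) > 0:  # only update clusters that still own points
            new_centers[i] = members.mean(axis=0)
    return new_centers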
Lloyd's Algorithm
We put all these pieces together in a loop that continues until the cluster assignments no longer change (or a maximum number of iterations is reached).
def k_means_clustering(x, k, max_iters=100):
    """Run Lloyd's algorithm: initialize, then alternate assignment and update steps."""
    centers = initialize_centers(x, k)
    assignments_old = -np.ones(x.shape[0])  # sentinel: no point is ever assigned cluster -1
    soln_path = [centers]
    for _ in range(max_iters):
        assignments = compute_assignments(x, centers)
        centers = update_centers(x, assignments, k)
        soln_path.append(centers)
        # Converged: the assignments did not change between iterations.
        if np.array_equal(assignments, assignments_old):
            break
        assignments_old = assignments
    return centers, assignments, soln_path
np.random.seed(43)
centers, assignments, soln_path = k_means_clustering(x, k)
len(soln_path)
11
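As a quick sanity check (a minimal sketch), we can compute the k-means objective of our solution and compare it to the `inertia_` of the scikit-learn model fit earlier. The two need not match exactly, since our implementation uses random (Forgy) initialization while scikit-learn defaults to k-means++:

# k-means objective: sum of squared distances from each point to its assigned center.
our_objective = np.sum((x - centers[assignments]) ** 2)
print("our objective:        ", our_objective)
print("scikit-learn inertia_:", kmeans.inertia_)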
The following code visualizes the clustering process at each iteration.
### Construct an animation of the k-means algorithm.
### You do not need to understand the code below for the class.
### It is just for making the animation.
## Prepare a giant table with all the data and centers labeled with the iteration.
pts = []
for i, centers in enumerate(soln_path):
df = bikes[['Speed', 'Length']].copy()
df['Class'] = compute_assignments(x, centers).astype(str)
df2 = pd.DataFrame(centers, columns=['Speed', 'Length'])
df2['Class'] = 'Center'
df_combined = pd.concat([df, df2], ignore_index=True)
# I also need the index of each point in center for the animation
# the index acts as a unique identifier for each point across frames
df_combined.reset_index(inplace=True)
# The iteration number tracks the frame in the animation
df_combined['Iteration'] = i
pts.append(df_combined)
# I stack all the data into one big table.
frames = pd.concat(pts, ignore_index=True)
## Make the animation
fig = px.scatter(frames, x='Speed', y='Length', color='Class',
animation_group='index',
animation_frame='Iteration', title='K-Means Clustering',
width=700, height=800)
## The aspect ratio of the plot is misleading; anchor the y-axis to the
## x-axis so one unit spans the same length on both axes.
fig.update_layout(
    yaxis=dict(scaleanchor="x", scaleratio=1)
)
# fig.write_image("images/bike_kmeans_animation_0.pdf", height=800, width=700)
## Touchup the centers to make them more visible
fig.update_traces(marker=dict(size=12, symbol='x', color='black'),
selector=dict(legendgroup='Center') )
for i, f in enumerate(fig.frames):
for trace in f.data:
if trace.name == 'Center':
trace.update(marker=dict(size=12, symbol='x', color='black'))
# go.Figure(f.data, f.layout).write_image(
# f"images/bike_kmeans_animation_{i+1}.pdf", height=800, width=700)
# fig.write_html("images/bike_kmeans_animation.html",include_plotlyjs='cdn', full_html=True)
fig
Return to Lecture
K-Means on Pixel Data
Let's load an image from a recent bike ride.
from PIL import Image
img = np.array(Image.open("bike2.jpeg"))
print(img.shape)
px.imshow(img)
(1280, 960, 3)
We can think of an image as a collection of pixels, each represented by a color value. In an RGB image, each pixel is represented by three values corresponding to the red, green, and blue color channels, so each pixel is a three-dimensional vector. We can plot these vectors in RGB space.
image_df = pd.DataFrame(img.reshape(-1,3), columns=['R', 'G', 'B'])
image_df['color'] = ("rgb(" +
image_df['R'].astype(str) + "," +
image_df['G'].astype(str) + "," +
image_df['B'].astype(str) + ")")
fig = go.Figure()
small_image_df = image_df.sample(100000, random_state=42)
fig.add_scatter3d(x=small_image_df['R'], y=small_image_df['G'], z=small_image_df['B'],
mode='markers', marker=dict(color=small_image_df['color'], opacity=0.5, size=2))
fig.update_layout(scene=dict(xaxis_title='R', yaxis_title='G', zaxis_title='B'),
width=800, height=800,)
# fig.write_html("images/bike_color_space.html",include_plotlyjs='cdn', full_html=True)
fig
from sklearn.cluster import KMeans
# Apply k-means clustering to the RGB columns
n_clusters = 8
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
image_df['cluster'] = kmeans.fit_predict(image_df[['R', 'G', 'B']])
image_df['cluster'].value_counts()
cluster
5    273889
6    256235
4    180678
1    164461
3    150634
2    124331
7     49735
0     28837
Name: count, dtype: int64
from plotly.subplots import make_subplots
img_kmeans = (
    kmeans.cluster_centers_[image_df['cluster'].values]  # each pixel -> its cluster's center color
    .reshape(img.shape)                                   # restore the original (H, W, 3) shape
)
# make two linked subplots
fig = make_subplots(
rows=1, cols=2,
subplot_titles=("Original Image", "K-Means Compressed Image"),
specs=[[{"type": "xy"}, {"type": "xy"}]]
)
fig.add_trace(px.imshow(img).data[0], row=1, col=1)
fig.add_trace(px.imshow(img_kmeans).data[0], row=1, col=2)
# 1) Make both subplots share identical ranges
# (use half-pixel bounds to match how Image traces are rendered)
H, W = img.shape[:2]
xrange = [-0.5, W - 0.5]
yrange = [H - 0.5, -0.5] # origin at top
for c in (1, 2):
fig.update_xaxes(range=xrange, row=1, col=c)
fig.update_yaxes(range=yrange, row=1, col=c)
# 2) Lock square pixels and prevent domain stretch
fig.update_yaxes(scaleanchor="x", scaleratio=1, constrain="domain", row=1, col=1)
fig.update_yaxes(scaleanchor="x2", scaleratio=1, constrain="domain", row=1, col=2)
# Link panning/zooming between the two images (now safe)
fig.update_xaxes(matches="x")
fig.update_yaxes(matches="y")
# Cosmetic
# fig.write_html("images/bike_kmeans_compression.html",include_plotlyjs='cdn', full_html=True)
fig.update_layout(width=900, height=600, margin=dict(t=50, b=30, l=20, r=20))
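This is why the clustered image is often described as "compressed": each pixel originally stores three 8-bit channels (24 bits), while the clustered version only needs an index into a palette of `n_clusters` colors plus the palette itself. A rough back-of-the-envelope sketch (a storage estimate, not an actual file size):

# Palette compression estimate: 24 bits/pixel vs. log2(k) bits/pixel + the palette.
n_pixels = img.shape[0] * img.shape[1]
bits_original = n_pixels * 24
bits_compressed = n_pixels * np.log2(n_clusters) + n_clusters * 24
print(f"approximate compression ratio: {bits_original / bits_compressed:.1f}x")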
Choosing the Number of Clusters
We can use the k-means objective to evaluate the quality of a clustering for different values of k. The objective (called inertia in scikit-learn) is the sum of squared distances from each point to its assigned cluster center. For a fixed k, a lower objective indicates a tighter clustering; the objective always decreases as k grows, however, so we cannot simply pick the k that minimizes it.
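Concretely, for data points $x_1, \dots, x_n$ and cluster centers $\mu_1, \dots, \mu_k$, the objective is

$$ \sum_{i=1}^{n} \min_{j \in \{1, \dots, k\}} \lVert x_i - \mu_j \rVert^2 $$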
A standard method of choosing k is the "elbow method", where we plot the k-means score against k and look for an "elbow" point where the rate of improvement slows down.
scores = pd.DataFrame(columns=['k'])
scores['k'] = [2, 4, 8, 16, 32, 64, 128, 256]
scores.set_index('k', inplace=True)
# Apply k-means clustering to the RGB columns
from sklearn.cluster import KMeans
for k in scores.index:
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(image_df[['R', 'G', 'B']])
    # scores.loc[k, 'score'] = kmeans.score(image_df[['R', 'G', 'B']])
    scores.loc[k, 'score'] = kmeans.inertia_  # inertia_ is the (positive) objective; score() would return its negative
fig = px.line(
scores,
title="K-Means Objective vs Number of Clusters",
markers=True,
labels={"index": "Number of Clusters (k)",
"value": "K-Means Objective"},
width=700, height=400
)
fig.update_layout(xaxis_type="log", xaxis_exponentformat='power', showlegend=False)
fig.update_layout(margin=dict(t=50, b=30, l=20, r=20))
# fig.write_image("images/kmeans_score_vs_k.pdf", scale=2, height=400, width=700)
fig
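To make the elbow reading slightly more quantitative (a heuristic sketch, not something from lecture), we can look at the fractional drop in the objective between successive values of k; the elbow is roughly where this improvement levels off:

# Fractional improvement in the objective from each k to the next.
improvement = -scores['score'].astype(float).pct_change()
print(improvement)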
Return to Lecture
Wake Word Detector
In lecture, we derived the probability that a wake word was spoken given the detector detected a wake word. We used Bayes' theorem to do this.
\begin{align*}
P(W = 1| D=1) &= \frac{P(D=1|W=1)P(W=1)}{P(D=1)}\\
&= \frac{P(D=1|W=1)P(W=1)}{P(D=1|W=1)P(W=1) + P(D=1|W=0)P(W=0)}
\end{align*}
In the following function, we will implement this calculation.
def wake_word_detector(
p_wake = 0.0001, # P(wake) prior probability of wake word
p_detect_g_wake = 0.99, # P(detect | wake) likelihood of detection given wake
p_detect_g_nowake = 0.001 # P(detect | no wake) likelihood of detection given no wake
):
# P(wake | detect) = P(detect | wake) * P(wake) / P(detect)
p_detect = p_wake * p_detect_g_wake + (1 - p_wake) * p_detect_g_nowake
p_wake_g_detect = p_detect_g_wake * p_wake / p_detect
return p_wake_g_detect
wake_word_detector()
0.09009009009009011
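Plugging in the default values shows why the result is so small: the false positives from the overwhelmingly more common "no wake word" case dominate $P(D=1)$:

$$ P(W = 1| D=1) = \frac{(0.99)(0.0001)}{(0.99)(0.0001) + (0.001)(0.9999)} = \frac{0.000099}{0.0010989} \approx 0.0901 $$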
We want to understand what happens if we vary the recall of the detector. The recall (also called sensitivity) is defined as $P(D=1|W=1)$, the probability that the detector detects a wake word when a wake word was actually spoken. We also vary the false positive rate, defined as $P(D=1|W=0)$, the probability that the detector detects a wake word when no wake word was spoken.
$$ P(W = 1| D=1) = \frac{\text{(Recall)}\,P(W=1)}{\text{(Recall)}\,P(W=1) + \text{(False Positive Rate)}\,P(W=0)} $$
We can see from the above equation that even as recall approaches 1, the probability that a detected wake word was actually spoken can remain low whenever the false positive rate is large relative to the prior probability $P(W=1)$.
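For example, calling the function above with perfect recall and the default false positive rate of 0.001 still gives a posterior of only about 9%, because the prior $P(W=1) = 0.0001$ is so small:

# Even with perfect recall, a 0.1% false positive rate swamps the 0.01% prior.
wake_word_detector(p_detect_g_wake=1.0, p_detect_g_nowake=0.001)  # ~0.0909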
p_detect_g_nowake = np.logspace(-6, -4, 100)
p_wake_g_detect = wake_word_detector(p_detect_g_nowake=p_detect_g_nowake,
p_detect_g_wake=1.0)
fig = px.line(
x=p_detect_g_nowake,
y=p_wake_g_detect,
title="P(Wake | Detect) vs False Positive Rate for Perfect Recall",
labels={
"x": "P(Detect | No Wake) (False Positive Rate)",
"y": "P(Wake | Detect)"
},
log_x=True
)
fig.update_layout( xaxis_exponentformat='power')
#fig.write_image("images/wake_word_detector_fpr.pdf", scale=2, height=500, width=700)
fig
p_detect_g_wake = np.logspace(-0.8, 0, 100)
p_wake_g_detect = wake_word_detector(p_detect_g_wake=p_detect_g_wake,
p_detect_g_nowake=0.0001)
fig = px.line(
x=p_detect_g_wake,
y=p_wake_g_detect,
title="P(Wake | Detect) vs Recall (Sensitivity) for FPR=0.001",
labels={
"x": "P(Detect | Wake) Recall",
"y": "P(Wake | Detect)"
},
log_x=True
)
fig.update_layout( xaxis_exponentformat='power')
#fig.write_image("images/wake_word_detector_recall.pdf", scale=2, height=500, width=700)
fig
Return to Lecture