In [1]:
import plotly.io as pio; pio.renderers.default = "notebook_connected"

Lecture 13: Gradient Descent and SGD – CS 189, Fall 2025

In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly import figure_factory as ff
colors = px.colors.qualitative.Plotly
px.defaults.width = 800
from ipywidgets import HBox
import numpy as np
In [3]:
# make the images folder if it doesn't exist
import os
if not os.path.exists("images"):
    os.makedirs("images")

Plotting Code You Can Safely Ignore (but Need to Run)

This lecture has many complex visualizations, so to keep the rest of the code short I have put the visualization code here. Much of this code is out of scope for the course, but feel free to look through it if you are interested.

In [4]:
def make_plot_grid(figs, rows, cols):
    """Create a grid of figures with Plotly figures."""
    from plotly.subplots import make_subplots
    def get_trace_type(fig):
        for trace in fig.data:
            if trace.type == 'surface':
                return 'surface'  # required for go.Surface
            elif trace.type.startswith('scatter3d') or trace.type.startswith('mesh3d'):
                return 'scene'  # 3D scene
        return 'xy'  # default 2D
    specs = [[{'type': get_trace_type(fig)} for fig in figs[i:i+cols]] for i in range(0, len(figs), cols)]
    fig_grid = make_subplots(rows=rows, cols=cols, specs=specs,
                             subplot_titles=[fig.layout.title.text for fig in figs])
    for i, fig in enumerate(figs):
        fig_grid.add_traces(fig.data, rows=(i//cols) + 1, cols=(i%cols) + 1 )
    return fig_grid
In [5]:
def plot_lr_predictions(w, cancer):
    """Plot predictions of the logistic model."""
    cancer = cancer.copy()
    cancer['logistic_pred'] = np.where(
        logistic_model(w, cancer[['mean radius', 'mean texture']].values) > 0.5,
        'Pred Malignant', 'Pred Benign'
    )
    # Create a scatter plot with logistic predictions
    fig = px.scatter(cancer, x='mean radius', y='mean texture', 
                     symbol='logistic_pred', color='target',
                     symbol_sequence=[ "circle-open", "cross"])
    for (i,t) in enumerate(fig.data): t.legendgroup = str(i)
    # decision boundary
    xs = np.linspace(cancer['mean radius'].min(), cancer['mean radius'].max(), 100)
    decision_boundary = -(w[0] * xs ) / w[1]
    fig.add_scatter(x=xs, y=decision_boundary, mode='lines', 
                    name='Decision Boundary', legendgroup='Decision Boundary',
                    line=dict(color='black', width=2, dash='dash', ))
    # probability surface
    ys = np.linspace(cancer['mean texture'].min(), cancer['mean texture'].max(), 100)
    X, Y = np.meshgrid(xs, ys)
    Z = logistic_model(w, np.c_[X.ravel(), Y.ravel()]).reshape(X.shape)
    fig.add_contour(x=xs, y=ys, z=Z, 
                    colorscale='Matter_r', opacity=0.5,
                    name='Probability Surface', 
                    colorbar=dict(x=1.05, y=0.3, len=0.75))

    fig.update_layout(title=f'w=({w[0]:.2f}, {w[1]:.2f})',
                      xaxis_range=[xs.min(), xs.max()], yaxis_range=[ys.min(), ys.max()],
                      xaxis_title='Mean Radius (scaled)', yaxis_title='Mean Texture (scaled)',
                      width=800, height=600)
    return fig
In [6]:
def plot_loss(w1, w2, error, ncontours=50):
    surf_fig = go.Figure()
    surf_fig.add_surface(z=error, x=w1, y=w2, 
                         colorscale='Viridis_r', opacity=0.7, showscale=False,
                         contours=dict(z=dict(show=True, highlightcolor="white", 
                                          start=error.min(), end=error.max(), 
                                          size=(error.max()-error.min())/ncontours)))
    surf_fig.update_layout(title="Loss Surface")
    contour_fig = go.Figure()
    contour_fig.add_contour(x=w1.flatten(), y=w2.flatten(), z=error.flatten(),
                            colorscale='Viridis_r', opacity=0.7,
                            contours=dict(start=error.min(), end=error.max(), 
                                          size=(error.max()-error.min())/ncontours),
                            colorbar=dict(x=1.05, y=0.35, len=0.75))
    contour_fig.update_layout(title="Loss Contours")
    fig = make_plot_grid([surf_fig, contour_fig], 1, 2).update_layout(height=800)
    fig.update_layout(scene=dict(xaxis_title='w1', yaxis_title='w2', zaxis_title='Loss', aspectmode='cube'))
    fig.update_layout(xaxis_range=[w1.min(), w1.max()], yaxis_range=[w2.min(), w2.max()],
                      xaxis_title='w1', yaxis_title='w2')
    return fig
In [7]:
def plot_gradient(w1, w2, error, dw1, dw2, scale=1.0):
    fig = plot_loss(w1, w2, error)
    fig.add_trace(
        go.Cone(
            x=w1.flatten(), y=w2.flatten(), z=np.zeros_like(error).flatten(),  # Ground plane
            u=dw1.flatten(), v=dw2.flatten(), w=np.zeros_like(error).flatten(),  # No vertical component
            sizeref=2, anchor="tail", showscale=False
        ), 1,1)
    contour_fig = ff.create_quiver(
        x=w1.flatten(), y=w2.flatten(), u=dw1.flatten(), v=dw2.flatten(), 
        line_width=2, line_color="white",
        scale = scale, arrow_scale=.2, showlegend=False)
    fig.add_traces(contour_fig.data, rows=1, cols=2)
    return fig
In [8]:
def add_solution_path(fig, errors, ws):
    s = np.linspace(0, 1, len(ws))
    fig.add_scatter3d(x=ws[:, 0], y=ws[:, 1], z=errors, marker_color=s, marker_size=5,
                    mode='lines+markers', line=dict(color='black', width=2), opacity=0.5,
                    name='Gradient Descent Path', legendgroup='Gradient Descent',
                    row=1, col=1)
    fig.add_scatter(x=ws[:, 0], y=ws[:, 1], marker_color=s,
                    mode='lines+markers', line=dict(color='black', width=2), opacity=0.5,
                    name='Gradient Descent Path', legendgroup='Gradient Descent',
                    showlegend=False,
                    row=1, col=2)
    fig.add_scatter3d(x=[ws[-1, 0]], y=[ws[-1, 1]], z=[errors[-1]],
                       mode='markers', marker=dict(color='red', size=10),
                       name='Final Solution', legendgroup='Final Solution',
                       row=1, col=1)
    fig.add_scatter(x=[ws[-1, 0]], y=[ws[-1, 1]],
                       mode='markers', marker=dict(color='red', size=20),
                       name='Final Solution', legendgroup='Final Solution',
                       showlegend=False,
                       row=1, col=2)
    return fig

Lecture 12 Setup

In the previous lecture we visualized the loss surfaces and derived gradient descent. In this part of the notebook we set up the data, loss, and gradient descent code that we will use to visualize gradient descent and stochastic gradient descent.

The Logistic Regression Problem

In [9]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
cancer_dict = datasets.load_breast_cancer(as_frame=True)
cancer_df = pd.DataFrame(cancer_dict.data, columns=cancer_dict.feature_names)
cancer_df['target'] = cancer_dict.target.astype(str)
cancer_df = cancer_df[['mean radius', 'mean texture', 'target']].dropna()
scaler = StandardScaler()
cancer_df[['mean radius', 'mean texture']] = scaler.fit_transform(cancer_df[['mean radius', 'mean texture']])
print("The dataset:", cancer_df.shape)
display(cancer_df.head())

cancer = dict()
cancer['npts'] = 30
cancer['w1'], cancer['w2'] = np.meshgrid(
    np.linspace(-10, 1, cancer['npts']), 
    np.linspace(-5, 1.3, cancer['npts'])
)
cancer['ws'] = np.stack([cancer['w1'].flatten(), cancer['w2'].flatten()]).T
cancer['x'] = cancer_df[['mean radius', 'mean texture']].values
cancer['t'] = cancer_df['target'].values.astype(float)
The dataset: (569, 3)
   mean radius  mean texture target
0     1.097064     -2.073335      0
1     1.829821     -0.353632      0
2     1.579888      0.456187      0
3    -0.768909      0.253732      0
4     1.750297     -1.151816      0
In [10]:
def sigmoid(z):
    """Sigmoid function."""
    return 1 / (1 + np.exp(-z))

def logistic_model(w, x):
    """Logistic model for binary classification."""
    z = w @ x.T
    return sigmoid(z)

def NLL(w):
    """Negative log-likelihood for logistic regression."""
    x = cancer['x']
    t = cancer['t']
    z = w @ x.T
    # return -np.mean(t * np.log(logistic_model(w, x)) + (1 - t) * np.log(1 - logistic_model(w, x)))
    # more numerically stable version
    # np.mean(np.log(1+ np.exp(z)) - t * z) 
    # stable softplus: log(1+exp(z))
    softplus_z = np.logaddexp(0, z)
    return np.mean(softplus_z - t * z)


def grad_NLL(w, x=cancer['x'], t=cancer['t']):
    """Compute the gradient of the negative log-likelihood."""
    p = logistic_model(w, x)
    grad = np.mean((p - t).reshape(-1, 1) * x, 0)
    return grad
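
To see why the softplus form matters, here is a small numerical check (an illustrative sketch, not part of the lecture code): for large $z$ the naive expression $\log(1 + e^z)$ overflows, while np.logaddexp(0, z) computes the same quantity stably.

# Quick numerical-stability check (illustrative sketch).
z_big = np.array([800.0])
naive = np.log(1 + np.exp(z_big))   # np.exp(800) overflows to inf (with a warning), so this is inf
stable = np.logaddexp(0, z_big)     # log(1 + e^z) ~ z for large z, so this returns 800.0
print("naive:", naive, "stable:", stable)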
In [11]:
cancer['error'] = np.array([NLL(w) 
                for w in cancer['ws']])
cancer['error'] = cancer['error'].reshape(cancer['w1'].shape)
# guesses = np.array([[-1., -1.],
#                     [-2., -4.]])
best_ind = np.argmin(cancer['error'])
cancer['grid_best'] = np.array([cancer['w1'].flatten()[best_ind], 
                                cancer['w2'].flatten()[best_ind]])

Computing the gradient fields and visualizing gradient descent.

In [12]:
(cancer['dw1'], cancer['dw2']) = np.array(
    [grad_NLL(w) for w in cancer['ws']]).T
cancer['dw1'] = cancer['dw1'].reshape(cancer['w1'].shape)
cancer['dw2'] = cancer['dw2'].reshape(cancer['w1'].shape)

The Squared Error Loss on a Non-Linear Model

In [13]:
sine = dict()
np.random.seed(42)
sine['n'] = 200
# Generate some random data
sine['x'] = np.random.rand(sine['n']) * 2.5 * np.pi
sine['x'] = np.sort(sine['x']) # sort for easier plotting
sine['y'] = np.sin(1.1 + 2.5 * sine['x']) + 0.5 * np.random.randn(sine['n'])
sine_df = pd.DataFrame({'x': sine['x'], 'y': sine['y']})
In [14]:
def sine_model(w, x):
    return np.sin(w[0] + x * w[1])

def sine_MSE(w):
    x = sine['x']
    y = sine['y']
    y_hat = sine_model(w, x)
    return np.mean((y - y_hat) ** 2)

def grad_sine_MSE(w, x=sine['x'], y=sine['y']):
    """Compute the gradient of the negative log-likelihood."""
    y_hat = sine_model(w, x)
    grad_w0 = -2 * np.mean((y - y_hat) * np.cos(w[0] + w[1] * x))
    grad_w1 = -2 * np.mean((y - y_hat) * x * np.cos(w[0] + w[1] * x))
    return np.array([grad_w0, grad_w1])
In [15]:
sine['guesses'] = np.array([[0, 2], [2, 3], [0, 3.5]])
sine['xhat'] = np.linspace(sine_df['x'].min(), sine_df['x'].max(), 100)
sine['pred_df'] = pd.DataFrame({'x': sine['xhat']})
for w in sine['guesses']:
    sine['pred_df'][f'yhat(w={w})'] = sine_model(w, sine['xhat'])
sine['pred_df'].head()
Out[15]:
          x  yhat(w=[0. 2.])  yhat(w=[2. 3.])  yhat(w=[0. 3.5])
0  0.043371         0.086632         0.847619          0.151215
1  0.121225         0.240082         0.701797          0.411673
2  0.199080         0.387723         0.517864          0.641752
3  0.276935         0.525982         0.305809          0.824474
4  0.354790         0.651515         0.077147          0.946355
In [16]:
sine['npts'] = 30
sine['w0'], sine['w1'] = np.meshgrid(
    np.linspace(-1.5, 3, sine['npts']), np.linspace(1, 4, sine['npts']))
# combine w1 and w2 into a single tensor
sine['ws'] = np.stack([sine['w0'].flatten(), sine['w1'].flatten()]).T

sine['error'] = np.array([sine_MSE(w) for w in sine['ws']]).reshape(sine['w0'].shape)
In [17]:
ind = np.argmin(sine['error'])
sine['grid_best'] = sine['ws'][ind,:]
sine['grid_best_error'] = sine['error'].flatten()[ind]
print(f"Best weights: {sine['grid_best']}, with error: {sine['grid_best_error']}")
Best weights: [1.29310345 2.44827586], with error: 0.23832770254453578
In [18]:
(sine['dw0'], sine['dw1']) = np.array(
    [grad_sine_MSE(w) for w in sine['ws']]).T
sine['dw0'] = sine['dw0'].reshape(sine['w1'].shape)
sine['dw1'] = sine['dw1'].reshape(sine['w1'].shape)

The Gradient Descent Algorithm

Here we implement the most basic version of gradient descent. This version uses a fixed step size and does not use any fancy tricks like momentum or adaptive step sizes (which we will see soon).
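
For reference, the update rule implemented below, with a fixed learning rate $\eta$, is

$$ w^{(t+1)} = w^{(t)} - \eta \, \nabla E\left(w^{(t)}\right) $$

and we stop after a maximum number of iterations or when the change in the weights falls below a small threshold $\epsilon$.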

In [19]:
def gradient_descent(w_0, gradient, 
    learning_rate=1, nepochs=10, epsilon=1e-6):
    """Basic gradient descent algorithm.
    Args:
        w_0: Initial weights (numpy array).
        gradient: Function to compute the gradient.
        learning_rate: Step size for each iteration.
        nepochs: Maximum number of iterations.
        epsilon: Convergence threshold.
    Returns:
        path: Array of weights at each iteration."""
    w_old = w_0
    path = [w_old]
    for e in range(nepochs):
        w = w_old - learning_rate * gradient(w_old)
        path.append(w)
        if np.linalg.norm(w - w_old) < epsilon: break
        w_old = w
    return np.array(path)

Gradient Descent Applied to Logistic Regression

In [20]:
# w0 = np.array([-10., -5.])
# w0 = np.array([-1., 2.])
w0 = np.array([0., 0])
path = gradient_descent(w0, 
                     grad_NLL,
                     learning_rate=1, 
                     nepochs=100)
errors = [NLL(w) for w in path]
fig = plot_gradient(
    cancer['w1'], cancer['w2'], cancer['error'], 
    cancer['dw1'], cancer['dw2'], scale=2)
add_solution_path(fig, errors, path)

Gradient Descent Applied to Sine Model

In [21]:
# w0 = np.array([1.2, 2.])
# w0 = np.array([.5, 2.5])
w0 = np.array([2, 2])
path = gradient_descent(w0, 
                        grad_sine_MSE,
                        learning_rate=.01, 
                        nepochs=20)
errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
    sine['w0'], sine['w1'], sine['error'], 
    sine['dw0'], sine['dw1'], scale=.1)
add_solution_path(fig, errors, path)

Loss Curves

In practice, we are typically unable to visualize the loss surface. Instead, we can plot the loss value as a function of iteration number. This is called a loss curve. Here we plot the loss curve for gradient descent on our sine model.

In [22]:
def make_loss_curve(path, error_func):
    """Make a loss curve from a path of weights."""
    errors = [error_func(w) for w in path]
    fig = px.line(x=np.arange(len(errors)), y=errors, 
                  labels={'x': 'Iteration (Gradient Steps)', 'y': 'Loss (Error)'})
    fig.update_traces(line_width=4)
    fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
    return fig
In [23]:
w0 = np.array([2.5, 1.9])
path = gradient_descent(w0, 
                        grad_sine_MSE,
                        learning_rate=.05, 
                        nepochs=50)

fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()

errors = [sine_MSE(w) for w in path]
fig = plot_gradient(sine['w0'], sine['w1'], sine['error'], 
    sine['dw0'], sine['dw1'], scale=.1)
add_solution_path(fig, errors, path)
fig.show()
In [24]:
w0 = np.array([-10., -5.])
# w0 = np.array([-1., 2.])
# w0 = np.array([0., 0])
path = gradient_descent(w0, 
                     grad_NLL,
                     learning_rate=10, 
                     nepochs=100)

fig_loss = make_loss_curve(path, NLL)
fig_loss.show()

errors = [NLL(w) for w in path]
fig = plot_gradient(
    cancer['w1'], cancer['w2'], cancer['error'], 
    cancer['dw1'], cancer['dw2'], scale=2)
add_solution_path(fig, errors, path)

Convergence on the Parabola

In [25]:
x = np.linspace(-10, 10, 100)
y = x**2 + 1
fig = px.line(x=x, y=y, labels={'x': 'x', 'y': 'f(x)'})
fig.update_traces(line_width=5)

def grad_f(x):
    return 2 * x
w0 = 2
path = gradient_descent(w0, grad_f, learning_rate=1.1, nepochs=10)
fig.add_scatter(x=path, y=path**2 + 1, mode='markers+lines',
                marker=dict(size=10, color='red'),
                line=dict(color='black', width=2, dash="dash"),
                name='Gradient Descent Path')

The Second-Order Structure

Here we examine the second-order structure of the loss function through a Taylor expansion.

The Hessian of the Logistic Regression Model

\begin{align} \text{H} &= \begin{bmatrix} \frac{\partial^2 E}{\partial w_0^2} & \frac{\partial^2 E}{\partial w_0 \partial w_1} \\ \frac{\partial^2 E}{\partial w_1 \partial w_0} & \frac{\partial^2 E}{\partial w_1^2} \end{bmatrix}\\ \end{align}

We can start with the gradient:

\begin{align*} \frac{\partial L}{\partial w_j} &= \frac{1}{N} \sum_{i=1}^N \left( p(t_i|x_i) - t_i \right) x_{ij} = \frac{1}{N} \sum_{i=1}^N \left( \sigma(w^T x_i) - t_i \right) x_{ij} \end{align*}

Using the chain rule, we can compute the second derivatives of the loss function with respect to the parameters $w_k$: \begin{align*} \frac{\partial}{\partial w_k} \frac{\partial L}{\partial w_j} &= \frac{1}{N} \sum_{i=1}^N \frac{\partial}{\partial w_k}\left( \sigma(w^T x_i) - t_i \right) x_{ij}\\ &= \frac{1}{N} \sum_{i=1}^N x_{ij} \frac{\partial}{\partial w_k}\sigma(w^T x_i) \\ &= \frac{1}{N} \sum_{i=1}^N x_{ij} \sigma(w^T x_i)(1 - \sigma(w^T x_i)) x_{ik} \end{align*}

In [26]:
def hessian_NLL(w):
    """Compute the Hessian of the negative log-likelihood."""
    x = cancer['x']
    p = logistic_model(w, x)
    hessian = (x.T @ np.diag(p * (1 - p)) @ x ) / len(p)
    return hessian
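
As a sanity check on the derivation (an illustrative sketch, not part of the lecture; the test point w_check is arbitrary), we can compare the analytic Hessian against a central finite-difference approximation of grad_NLL.

# Finite-difference check of hessian_NLL (illustrative sketch; w_check is arbitrary).
w_check = np.array([-2.0, -1.0])
eps = 1e-5
H_fd = np.zeros((2, 2))
for k in range(2):
    dw = np.zeros(2)
    dw[k] = eps
    # column k of the Hessian is the derivative of the gradient with respect to w_k
    H_fd[:, k] = (grad_NLL(w_check + dw) - grad_NLL(w_check - dw)) / (2 * eps)
print(np.allclose(H_fd, hessian_NLL(w_check), atol=1e-6))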

The Hessian of the Sine Regression Model

\begin{align} \text{H} &= \begin{bmatrix} \frac{\partial^2 E}{\partial w_0^2} & \frac{\partial^2 E}{\partial w_0 \partial w_1} \\ \frac{\partial^2 E}{\partial w_1 \partial w_0} & \frac{\partial^2 E}{\partial w_1^2} \end{bmatrix}\\ \end{align}

We start with the gradient. Calculating each term we get:

\begin{align*} \frac{\partial E}{\partial w_0} &= -\frac{2}{N} \sum_{n=1}^N \left(y_n - \sin\left(w_0 + w_1 x_n \right)\right) \cos\left(w_0 + w_1 x_n \right) \end{align*}

\begin{align*} \frac{\partial E}{\partial w_1} &= -\frac{2}{N} \sum_{n=1}^N \left(y_n - \sin\left(w_0 + w_1 x_n \right)\right) \cos\left(w_0 + w_1 x_n \right) x_n \end{align*}

Using the product rule and chain rule, we can compute the second derivatives of the loss function with respect to the parameters $w_0$ and $w_1$. \begin{align} \frac{\partial^2 E}{\partial w_0^2} &= \frac{2}{N} \sum_{n=1}^N \left[ \cos^2(w_0 + w_1 x_n) + (y_n - \sin(w_0 + w_1 x_n)) \sin(w_0 + w_1 x_n) \right]\\ \frac{\partial^2 E}{\partial w_1^2} &= \frac{2}{N} \sum_{n=1}^N \left[ \cos^2(w_0 + w_1 x_n) x_n^2 + (y_n - \sin(w_0 + w_1 x_n)) \sin(w_0 + w_1 x_n) x_n^2 \right]\\ \frac{\partial^2 E}{\partial w_0 \partial w_1} &= \frac{2}{N} \sum_{n=1}^N \left[ \cos^2(w_0 + w_1 x_n) x_n + (y_n - \sin(w_0 + w_1 x_n)) \sin(w_0 + w_1 x_n) x_n \right] \end{align}

In [27]:
def hessian_sine_MSE(w):
    """Compute the Hessian of the negative log-likelihood."""
    x = sine['x']
    y = sine['y']
    dw0dw0 = 2 * np.mean(np.cos(w[0] + w[1] * x)**2 + 
                (y - np.sin(w[0] + w[1] * x)) * np.sin(w[0] + w[1] * x))
    dw0dw1 = 2 * np.mean(x * np.cos(w[0] + w[1] * x)**2 +
                x * (y - np.sin(w[0] + w[1] * x)) * np.sin(w[0] + w[1] * x))
    dw1dw1 = 2 * np.mean(x**2 * np.cos(w[0] + w[1] * x)**2 + 
                (x**2) * (y - np.sin(w[0] + w[1] * x)) * np.sin(w[0] + w[1] * x))
    hessian = np.array([[dw0dw0, dw0dw1], [dw0dw1, dw1dw1]])
    return hessian

We can also derive the gradient and Hessian using a symbolic algebra library. This is a powerful technique that allows us to compute the derivatives of a function without having to work them out by hand. We will use the sympy library to do this.

In [28]:
# unfortunately on colab we need to upgrade sympy to get the needed functionality
# !pip install --upgrade sympy
In [29]:
# import sympy as sp
# # define our symbols
# w0, w1, x, y = sp.symbols('w0 w1 x y')
# # Define a symbolic expression for the error
# E = (y - sp.sin(w0 + w1 * x))**2
# # Compute the gradient of E with respect to w0 and w1
# gE = [sp.diff(E, var) for var in (w0, w1)]
# gE
In [30]:
# hessian_E = sp.hessian(E, (w0, w1))
# hessian_E

Using sympy to do everything algebraically, we get the same result.

In [31]:
# n = sp.symbols('n', integer=True)    # number of data points (symbolic length)
# i = sp.Idx('i', n)                   # index variable for summation
# w0, w1 = sp.symbols('w0 w1')         # weights
# x = sp.IndexedBase('x', shape=(n,))  # indexed variable for x data
# y = sp.IndexedBase('y', shape=(n,))  # indexed variable for y data

# E_i = (y[i] - sp.sin(w0 + w1 * x[i]))**2
# E = sp.summation(E_i, (i, 0, n - 1)) / n
# H = sp.hessian(E, (w0, w1))

# Hfun = sp.lambdify((w0, w1, x, y, n), H)
# def hessian_sine_MSE2(w):
#     return np.array(Hfun(w[0], w[1], sine['x'], sine['y'], len(sine['x'])))
In [32]:
# hessian_sine_MSE(np.array([1.1, 2.5]))
In [33]:
# hessian_sine_MSE2(np.array([1.1, 2.5]))

Taylor Expansion of the Loss

The Taylor expansion of a function $f(x)$ around a point $x=a$ is given by:

$$ f(x) = f(a) + \nabla f(a)^T (x - a) + \frac{1}{2} (x - a)^T H(a) (x - a) + \ldots $$
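
In particular, near a local minimum $w^\star$ the gradient is approximately zero, so the quadratic term dominates:

$$ f(w) \approx f(w^\star) + \frac{1}{2} (w - w^\star)^T H(w^\star) (w - w^\star) $$

which is why the second-order approximation around the optimum looks like an elliptical bowl whose shape is determined by the Hessian.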

In [34]:
def taylor_loss(w, w_star, L_star, g_star, H_star):
    delta = w - w_star
    return (L_star + delta @ g_star + 1/2 * delta.T @ H_star @ delta)

Applied to Logistic Regression

In [35]:
# w_star = np.array([-2., -2.])
w_star = cancer['grid_best']
L_star = NLL(w_star)
g_star = grad_NLL(w_star)
H_star = hessian_NLL(w_star)
s,u = np.linalg.eigh(H_star)
print("w_star:", w_star)
print("L_star:", L_star)
print("g_star:", g_star)
print("H_star:\n", H_star)
print("Eigenvalues of H_star:", s)
print("Eigegenvectors of H_star:\n", u)
w_star: [-3.93103448 -0.87241379]
L_star: 0.2757763227300472
g_star: [0.00030902 0.0013768 ]
H_star:
 [[ 0.01454776 -0.00817985]
 [-0.00817985  0.08057159]]
Eigenvalues of H_star: [0.01354943 0.08156991]
Eigenvectors of H_star:
 [[-0.99263445 -0.12114806]
 [-0.12114806  0.99263445]]

Visualizing the Hessian at the optimal point, we see that it is positive definite (both eigenvalues are positive). This is consistent with the fact that the logistic regression loss is convex.

In [36]:
fig = plot_gradient(cancer['w1'], cancer['w2'], 
                    cancer['error'], cancer['dw1'], cancer['dw2'])

cancer['taylor_loss'] = np.array([
    taylor_loss(w, w_star, L_star, g_star, H_star)
for w in cancer['ws']]).reshape(cancer['w1'].shape)

fig.add_surface(z=cancer['taylor_loss'], x=cancer['w1'], y=cancer['w2'], 
                 colorscale='plasma_r', opacity=0.5, showscale=False,
                 contours=dict(z=dict(show=True, highlightcolor="white", 
                                      start=cancer['taylor_loss'].min(), end=cancer['taylor_loss'].max(), 
                                      size=(cancer['taylor_loss'].max()-cancer['taylor_loss'].min())/50)), 
                                      row=1, col=1)
fig.update_layout(scene=dict(zaxis=dict(range=[0, 3])))
# fig.write_image("images/loss_surface_3d_with_gradients.pdf", width=800, height=800)
display(fig)

We can visualize the eigenvectors of the Hessian at the optimal point on the quadratic approximation. The eigenvectors give us the directions of curvature of the loss function. The eigenvalues give us the amount of curvature in each direction.

The contours of the quadratic approximation are ellipses given by the equation: $$ \sum_i \lambda_i \alpha_i^2 = \text{const}, $$ where $\alpha_i$ is the displacement from the optimum along the $i^{th}$ eigenvector. Solving for the extent of the ellipse along the $i^{th}$ eigenvector we get: $$ \alpha_i = \pm \sqrt{\frac{\text{const}}{\lambda_i}} $$ so directions with larger eigenvalues (more curvature) have shorter contour axes, which is how the eigenvectors are scaled in the plot below.

In [37]:
fig = go.Figure()
fig.add_contour(z=cancer['taylor_loss'].flatten(), x=cancer['w1'].flatten(), 
                y=cancer['w2'].flatten(),
                colorscale='viridis_r', opacity=0.5,
                contours=dict(start=cancer['taylor_loss'].min(), end=cancer['taylor_loss'].max(), 
                            size=(cancer['taylor_loss'].max()-cancer['taylor_loss'].min())/50),
                colorbar=dict(x=1.05, y=0.3, len=0.75))
scaling = 0.5
lam, U = np.linalg.eigh(H_star)
scale = scaling / np.sqrt(np.abs(lam))

cx, cy = cancer['grid_best']
for i, (lami, ui, si) in enumerate(zip(lam, U.T, scale), start=1):
    fig.add_scatter(
        x=[cx, cx + si*ui[0]],
        y=[cy, cy + si*ui[1]],
        mode='lines+markers',
        line=dict(width=2),
        name=f'u{i} (λ={lami:.3g})'
    )
    fig.add_scatter(
        x=[cx, cx - si*ui[0]],
        y=[cy, cy - si*ui[1]],
        mode='lines',
        line=dict(width=2, dash='dot'),
        showlegend=False
    )

fig.update_layout(title="Taylor Approximation of Loss",
                  xaxis_title='w1', yaxis_title='w2',
                  height=600, width=1200)
# fig.write_image("images/taylor_approx_loss_contours.pdf", width=1200, height=600)
fig

Applied to Sine Regression

In [38]:
# w_star = np.array([-2., -2.])
sine['w_star'] = sine['grid_best']
sine['L_star'] = sine_MSE(sine['w_star'])
sine['g_star'] = grad_sine_MSE(sine['w_star'])
sine['H_star'] = hessian_sine_MSE(sine['w_star'])
sine['lam'], sine['U'] = np.linalg.eigh(sine['H_star'])
print("w_star:", sine['w_star'])
print("L_star:", sine['L_star'])
print("g_star:", sine['g_star'])
print("H_star:\n", sine['H_star'])
print("Eigenvalues of H_star:", sine['lam'])
print("Eigenvectors of H_star:\n", sine['U'])
w_star: [1.29310345 2.44827586]
L_star: 0.23832770254453578
g_star: [ 0.02639336 -0.10396998]
H_star:
 [[ 0.94921519  3.62350418]
 [ 3.62350418 19.09410494]]
Eigenvalues of H_star: [ 0.25236948 19.79095065]
Eigenvectors of H_star:
 [[-0.98200554  0.18885209]
 [ 0.18885209  0.98200554]]

Visualizing the Hessian at the grid-optimal point, we see that it is positive definite (both eigenvalues are positive), so this point is a local minimum. Unlike the logistic regression loss, however, this loss surface is not convex, and the quadratic approximation is only accurate locally.

In [39]:
fig = plot_gradient(sine['w0'], sine['w1'], 
                    sine['error'], sine['dw0'], sine['dw1'], scale=0.1)

sine['taylor_loss'] = np.array([
    taylor_loss(w, sine['w_star'], sine['L_star'], sine['g_star'], sine['H_star'])
for w in sine['ws']]).reshape(sine['w1'].shape)

fig.add_surface(z=sine['taylor_loss'], x=sine['w0'], y=sine['w1'], 
                 colorscale='plasma_r', opacity=0.5, showscale=False,
                 contours=dict(z=dict(show=True, highlightcolor="white", 
                                      start=sine['taylor_loss'].min(), end=sine['taylor_loss'].max(), 
                                      size=(sine['taylor_loss'].max()-sine['taylor_loss'].min())/50)), 
                                      row=1, col=1)
fig.update_layout(scene=dict(zaxis=dict(range=[0, 3])))
# fig.write_image("images/loss_surface_3d_with_gradients.pdf", width=800, height=800)
display(fig)
In [40]:
fig = go.Figure()
fig.add_contour(z=sine['taylor_loss'].flatten(), x=sine['w0'].flatten(), 
                y=sine['w1'].flatten(),
                colorscale='viridis_r', opacity=0.5,
                contours=dict(start=sine['taylor_loss'].min(), end=sine['taylor_loss'].max(), 
                            size=(sine['taylor_loss'].max()-sine['taylor_loss'].min())/50),
                colorbar=dict(x=1.05, y=0.3, len=0.75))
scaling = 0.5
scale = scaling / np.sqrt(np.abs(sine['lam']))

cx, cy = sine['grid_best']
for i, (lami, ui, si) in enumerate(zip(sine['lam'], sine['U'].T, scale), start=1):
    fig.add_scatter(
        x=[cx, cx + si*ui[0]],
        y=[cy, cy + si*ui[1]],
        mode='lines+markers',
        line=dict(width=2),
        name=f'u{i} (λ={lami:.3g})'
    )
    fig.add_scatter(
        x=[cx, cx - si*ui[0]],
        y=[cy, cy - si*ui[1]],
        mode='lines',
        line=dict(width=2, dash='dot'),
        showlegend=False
    )

fig.update_layout(title="Taylor Approximation of Loss",
                  xaxis_title='w0', yaxis_title='w1',
                  height=600, width=1200)
# fig.write_image("images/taylor_approx_loss_contours.pdf", width=1200, height=600)
fig

Oscillating Loss Functions

Here we visualize what happens when we have a poorly conditioned quadratic loss function. This can happen when the Hessian has very different eigenvalues. In this case, gradient descent can oscillate and take a long time to converge.
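
To make this concrete, for the quadratic $E(w) = w^\top H w$ with diagonal Hessian $H = \mathrm{diag}(h_1, h_2)$ (as used below), the gradient descent update decouples into independent per-coordinate multipliers:

$$ w_i^{(t+1)} = w_i^{(t)} - \eta \cdot 2 h_i w_i^{(t)} = (1 - 2 \eta h_i)\, w_i^{(t)} $$

so coordinate $i$ converges only if $|1 - 2\eta h_i| < 1$, i.e. $\eta < 1/h_i$. With $h_1 = 0.1$, $h_2 = 1$, and $\eta = 1$, the first coordinate contracts by a factor of $0.8$ per step while the second is multiplied by exactly $-1$ and oscillates without ever converging.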

In [41]:
H_bad = np.array([[.1, 0], [0, 1]])
def quad(w):
    """A poorly conditioned quadratic function."""
    return np.sum((w @ H_bad) * w,1)
    
def grad_quad(w):
    """Gradient of a poorly conditioned quadratic function."""
    return np.array(w @ H_bad * 2)
In [42]:
w1,w2 = np.meshgrid(np.linspace(-5, 5, 30), np.linspace(-5, 5, 30))
ws = np.hstack([w1.reshape(-1, 1), w2.reshape(-1, 1)]) 
error =  quad(ws).reshape(w1.shape)
contour = go.Contour(x=w1.flatten(), y=w2.flatten(), z=error.flatten(), colorscale='Viridis_r',
                contours=dict(start=0, end=20, size=.5))
go.Figure(data=contour)

Suppose we start at the point (-4, 0) and use a learning rate of 1. This is an ideal starting point, since the second coordinate is already at its minimum, and we can visualize the path taken by gradient descent on the loss surface.

In [43]:
w0 = np.array([-4., 0])
path = gradient_descent(w0, grad_quad, learning_rate=1, nepochs=50)

fig = go.Figure()
fig.add_trace(contour)
#add arrows to lines
fig.add_scatter(x=path[:, 0], y=path[:, 1],
                mode='lines+markers', line=dict(color='black', width=2), 
                marker=dict(size=10, color='black', symbol= "arrow-bar-up", angleref="previous"),
                name='Gradient Descent Path', legendgroup='Gradient Descent',
                showlegend=False)
fig.update_layout(margin=dict(l=5, r=5, t=5, b=5))
# fig.write_image("images/flat_quadratic.pdf", width=1000, height=500)
display(fig)

What happens if we start at a different point with a learning rate of 1?

In [44]:
w0 = np.array([-4.,-2.])
path = gradient_descent(w0, grad_quad, learning_rate=1, nepochs=50)

fig = go.Figure()
fig.add_trace(contour)
#add arrows to lines
fig.add_scatter(x=path[:, 0], y=path[:, 1],
                mode='lines+markers', line=dict(color='black', width=2), 
                marker=dict(size=10, color='black', symbol= "arrow-bar-up", angleref="previous"),
                name='Gradient Descent Path', legendgroup='Gradient Descent',
                showlegend=False)
fig.update_layout(margin=dict(l=5, r=5, t=5, b=5))
# fig.write_image("images/flat_quadratic.pdf", width=1000, height=500)
display(fig)

Or what if we decrease the learning rate slightly to 0.9?

In [45]:
w0 = np.array([-4.,-2.])
path = gradient_descent(w0, grad_quad, learning_rate=.9, nepochs=50)

fig = go.Figure()
fig.add_trace(contour)
#add arrows to lines
fig.add_scatter(x=path[:, 0], y=path[:, 1],
                mode='lines+markers', line=dict(color='black', width=2), 
                marker=dict(size=10, color='black', symbol= "arrow-bar-up", angleref="previous"),
                name='Gradient Descent Path', legendgroup='Gradient Descent',
                showlegend=False)
fig.update_layout(margin=dict(l=5, r=5, t=5, b=5))
# fig.write_image("images/flat_quadratic.pdf", width=1000, height=500)
display(fig)

Gradient Descent with Momentum

Here we implement gradient descent with momentum. This is a simple modification to the basic gradient descent algorithm that can help with oscillations and speed up convergence.
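
Concretely, the update implemented below keeps a velocity $v$ and takes the step

\begin{align} v^{(t+1)} &= \mu\, v^{(t)} - \eta\, \nabla E\left(w^{(t)}\right) \\ w^{(t+1)} &= w^{(t)} + v^{(t+1)} \end{align}

where $\mu$ is the momentum factor. Setting $\mu = 0$ recovers plain gradient descent.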

In [46]:
def gd_momentum(w_0, gradient, 
    learning_rate=1, nepochs=10, epsilon=1e-6, momentum=0.9):
    """Gradient descent with momentum.
    Args:
        w_0: Initial weights (numpy array).
        gradient: Function to compute the gradient.
        learning_rate: Step size for each iteration.
        nepochs: Maximum number of iterations.
        epsilon: Convergence threshold.
        momentum: Momentum factor.
    Returns:
        path: Array of weights at each iteration."""
    w_old = w_0
    path = [w_old]
    v = np.zeros_like(w_old)
    for e in range(nepochs):
        g = gradient(w_old)
        v = momentum * v - learning_rate * g
        w = w_old + v
        path.append(w)
        if np.linalg.norm(w - w_old) < epsilon: break
        w_old = w
    return np.array(path)

Let's apply gradient descent with momentum to the poorly conditioned quadratic loss function from before. We will start at the point (-4, -2) and use a learning rate of 0.9 and a momentum factor of 0.3. We can visualize the path taken by gradient descent with momentum on the loss surface.

In [47]:
w0 = np.array([-4.,-2.])
path = gd_momentum(w0, grad_quad, 
                   learning_rate=.9, 
                   momentum=0.3, 
                   nepochs=50)

fig = go.Figure()
fig.add_trace(contour)
#add arrows to lines
fig.add_scatter(x=path[:, 0], y=path[:, 1],
                mode='lines+markers', line=dict(color='black', width=2), 
                marker=dict(size=10, color='black', symbol= "arrow-bar-up", angleref="previous"),
                name='Gradient Descent Path', legendgroup='Gradient Descent',
                showlegend=False)
fig.update_layout(margin=dict(l=5, r=5, t=5, b=5))
# fig.write_image("images/flat_quadratic.pdf", width=1000, height=500)
display(fig)

We can also try momentum on our non-convex least squares problem with the non-linear sine model. Try setting momentum to 0 and see what happens.

In [48]:
w0 = np.array([2.2, 2.2])
#w0 = np.array([.5, 2.5])
w0 = np.array([1.5, 2])
path = gd_momentum(w0, 
                   grad_sine_MSE,
                   learning_rate=.1, 
                   momentum=0.9, 
                   nepochs=50)

fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()

errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
    sine['w0'], sine['w1'], sine['error'], 
    sine['dw0'], sine['dw1'], scale=.1)
add_solution_path(fig, errors, path)

Learning Rate Schedules

Linear Schedule

$$ \eta^{(t)} = \eta_0 \left(1 - \frac{t}{T}\right) + \eta_{T} \frac{t}{T} $$

The constant $T$ is the total number of epochs.

Hyperparameters:

  • $\eta_0$: initial learning rate
  • $\eta_T$: final learning rate
In [49]:
def linear_learning_rate(initial_lr, T, end_lr=0):
    """Linearly decaying learning rate.
    Args:
        initial_lr: Initial learning rate.
        T: Total number of iterations.
        end_lr: Final learning rate.
    Returns:
        A function that takes the current iteration and returns the learning rate."""
    return lambda t: initial_lr * (1 - min(1, t / T)) + end_lr * min(1, t / T)
In [50]:
lr = linear_learning_rate(initial_lr=0.1, T=100, end_lr=0.01)
fig = px.line(x=np.arange(100), 
              markers=True,
              y=[lr(epoch) for epoch in range(100)],
              labels={'x': 'Epoch', 'y': 'Learning Rate'})
fig.update_traces(line_width=4)
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
fig

Power-law Schedule

$$ \eta^{(t)} = \eta_0 \left(1 + \frac{t}{s}\right)^{-c} $$

Hyperparameters:

  • $s$: step size (in epochs) of the decay
  • $c$: power of the decay
In [51]:
def power_law_learning_rate(initial_lr, T, s=1, c=1):
    """Power-law decaying learning rate.
    Args:
        initial_lr: Initial learning rate.
        T: Total number of epochs (not used by this schedule).
        s: Scaling factor.
        c: Exponent.
    Returns:
        A function that takes the current epoch and returns the learning rate."""
    return lambda t: initial_lr * (1 + t/s) ** (-c)
In [52]:
lr = power_law_learning_rate(initial_lr=0.1, T=100, s=10, c=1)
fig = px.line(x=np.arange(100), 
              markers=True,
              y=[lr(epoch) for epoch in range(100)],
              labels={'x': 'Epoch', 'y': 'Learning Rate'})
fig.update_traces(line_width=4)
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
fig

Exponential Schedule

$$ \eta^{(t)} = \eta_0 c^{\lfloor t/s \rfloor} $$

Hyperparameters:

  • $s$: step size (in epochs) of the decay
  • $c$: base of the exponential decay
In [53]:
def exponential_learning_rate(initial_lr, decay_rate=0.95, decay_steps=10):
    """Exponential decaying learning rate.
    Args:
        initial_lr: Initial learning rate.
        decay_rate: Decay rate (0 < decay_rate < 1).
        decay_steps: Number of steps before applying decay.
    Returns:
        A function that takes the current step and returns the learning rate."""
    return lambda t: initial_lr * decay_rate**(np.floor(t / decay_steps))
In [54]:
lr = exponential_learning_rate(initial_lr=0.1, 
                               decay_rate=0.90, 
                               decay_steps=10)
fig = px.line(x=np.arange(100), 
              markers=True,
              y=[lr(epoch) for epoch in range(100)],
              labels={'x': 'Epoch', 'y': 'Learning Rate'})
fig.update_traces(line_width=4)
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
fig

Cosine Schedule

$$ \eta^{(t)} = \eta_{\text{min}} + \frac{1}{2}(\eta_{\text{max}} - \eta_{\text{min}}) \left(1 + \cos\left(\frac{t}{T} \pi\right)\right) $$

Hyperparameters:

  • $\eta_{\text{min}}$: minimum learning rate
  • $\eta_{\text{max}}$: maximum learning rate
In [55]:
def cosine_learning_rate(T, max_lr, min_lr=0):
    """Cosine annealing learning rate.
    Args:
        T: Total number of epochs.
        max_lr: Maximum learning rate.
        min_lr: Minimum learning rate.
    Returns:
        A function that takes the current epoch and returns the learning rate."""
    return lambda t: min_lr + 0.5 * (max_lr - min_lr) * (1 + np.cos(np.pi * t / T))
In [56]:
lr = cosine_learning_rate(T=100, max_lr=1.0)
fig = px.line(x=np.arange(100), 
              markers=True,
              y=[lr(epoch) for epoch in range(100)],
              labels={'x': 'Epoch', 'y': 'Learning Rate'})
fig.update_traces(line_width=4)
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
fig

Comparing Schedules

In [57]:
linear_lr = linear_learning_rate(initial_lr=0.1, T=100, end_lr=0.01)
power_lr = power_law_learning_rate(initial_lr=0.1, T=100, s=10, c=1)
exp_lr = exponential_learning_rate(initial_lr=0.1, decay_rate=0.90, decay_steps=5)
cosine_lr = cosine_learning_rate(T=100, max_lr=0.1, min_lr=0.01)
t = np.arange(100)
fig = go.Figure()
fig.add_trace(go.Scatter(x=t, y=[linear_lr(epoch) for epoch in t], 
                         mode='lines+markers', name='Linear', line=dict(width=4)))
fig.add_trace(go.Scatter(x=t, y=[power_lr(epoch) for epoch in t], 
                         mode='lines+markers', name='Power Law', line=dict(width=4)))
fig.add_trace(go.Scatter(x=t, y=[exp_lr(epoch) for epoch in t], 
                         mode='lines+markers', name='Exponential', line=dict(width=4))) 
fig.add_trace(go.Scatter(x=t, y=[cosine_lr(epoch) for epoch in t], 
                         mode='lines+markers', name='Cosine', line=dict(width=4))) 
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20),
                  xaxis_title='Epoch', yaxis_title='Learning Rate', 
                  height=800)

Adam Optimizer
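
Adam combines momentum (an exponential moving average of the gradient) with RMSProp-style scaling (an exponential moving average of the squared gradient). Writing $g^{(t)}$ for the gradient at step $t$, the update implemented below is

\begin{align} s^{(t)} &= \beta_1 s^{(t-1)} + (1 - \beta_1)\, g^{(t)} \\ r^{(t)} &= \beta_2 r^{(t-1)} + (1 - \beta_2)\, g^{(t)} \odot g^{(t)} \\ \hat{s} &= \frac{s^{(t)}}{1 - \beta_1^{t}}, \qquad \hat{r} = \frac{r^{(t)}}{1 - \beta_2^{t}} \\ w^{(t)} &= w^{(t-1)} - \eta^{(t)} \frac{\hat{s}}{\sqrt{\hat{r}} + \epsilon} \end{align}

where all operations are elementwise, the hatted terms correct for the zero initialization of $s$ and $r$, and $\epsilon$ is a small constant that keeps the denominator away from zero.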

In [58]:
def gd_adam(w_0, gradient, 
    learning_rate=lambda t: 0.1, 
    nepochs=100, 
    epsilon=1e-6, 
    beta1=0.9, 
    beta2=0.999):
    """Adam optimizer that combines momentum and RMSProp.
    Args:
        w_0: Initial weights (numpy array).
        gradient: Function to compute the gradient.
        learning_rate: Function mapping the step number to a learning rate.
        nepochs: Maximum number of iterations.
        epsilon: Small constant added to the denominator for numerical stability.
        beta1: Exponential decay rate for the first moment estimate.
        beta2: Exponential decay rate for the second moment estimate.
    Returns:
        path: Array of weights at each iteration."""
    w_old = w_0
    path = [w_old]
    s = np.zeros_like(w_old) # (Momentum)
    r = np.zeros_like(w_old) # (RMSProp)
    for t in range(nepochs):
        g = gradient(w_old)
        # Update biased first moment estimate and second moment estimate
        s = beta1 * s + (1 - beta1) * g
        r = beta2 * r + (1 - beta2) * g**2
        # Bias correction
        s_hat = s / (1 - beta1 ** (t + 1)) 
        r_hat = r / (1 - beta2 ** (t + 1))
        # Weight Update
        w_old = w_old - learning_rate(t) * s_hat / (np.sqrt(r_hat) + epsilon)
        path.append(w_old)
    return np.array(path)
In [59]:
w0 = np.array([-1, 2.5])
#w0 = np.array([.5, 2.5])
#w0 = np.array([1.5, 2])
path = gd_adam(
    w0, 
    grad_sine_MSE,
    learning_rate=lambda t: 0.1, 
    nepochs=100, 
    epsilon=1e-6, 
    beta1=0.9, 
    beta2=0.999)

fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()

errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
    sine['w0'], sine['w1'], sine['error'], 
    sine['dw0'], sine['dw1'], scale=.1)
add_solution_path(fig, errors, path)

Factorized Adam Optimizer

Here we factor the optimizer update out of the gradient descent loop. This allows us to easily switch between different optimization algorithms (below we also define vanilla and momentum optimizers with the same interface) and to reuse the optimizer code for stochastic gradient descent.

In [60]:
class AdamOptimizer:
    def __init__(self, 
                 learning_rate=lambda t: 0.1, 
                 epsilon=1e-6, 
                 beta1=0.9, 
                 beta2=0.999):
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.beta1 = beta1
        self.beta2 = beta2
        self.t = 0

    def step(self, w, g):
        # initialize state variables on first call
        if not hasattr(self, 's'):
            self.s = np.zeros_like(w) # (Momentum)
            self.r = np.zeros_like(w) # (RMSProp)
        # Update biased first moment estimate and second moment estimate
        self.s = self.beta1 * self.s + (1 - self.beta1) * g
        self.r = self.beta2 * self.r + (1 - self.beta2) * g ** 2
        # Bias correction
        s_hat = self.s / (1 - self.beta1 ** (self.t + 1))
        r_hat = self.r / (1 - self.beta2 ** (self.t + 1))
        # Weight Update
        w = w - self.learning_rate(self.t) * s_hat / (np.sqrt(r_hat) + self.epsilon)
        self.t += 1
        return w
In [61]:
class VanillaOptimizer:
    def __init__(self, learning_rate=lambda t: 0.1):
        self.learning_rate = learning_rate
        self.t = 0

    def step(self, w, g):
        w = w - self.learning_rate(self.t) * g
        self.t += 1
        return w
In [62]:
class MomentumUpdate:
    def __init__(self, learning_rate=lambda t: 0.1, momentum=0.9):
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.t = 0

    def step(self, w, g):
        if not hasattr(self, 'v'):
            self.v = np.zeros_like(w)
        self.v = self.momentum * self.v - self.learning_rate(self.t) * g
        self.t += 1
        return w + self.v
In [63]:
def gd(w_0, gradient, optimizer, nepochs = 100, epsilon=1e-8):
    """Generic gradient descent optimization.
    Args:
        w_0: Initial weights (numpy array).
        gradient: Function to compute the gradient.
        optimizer: An optimizer object with a step method.
        nepochs: Maximum number of iterations.
        epsilon: Convergence threshold.
    Returns:
        path: Array of weights at each iteration."""
    w = w_0
    path = [w]
    for t in range(nepochs):
        g = gradient(w)
        w = optimizer.step(w, g)
        path.append(w)
        if np.linalg.norm(w - path[-2]) < epsilon: break
    return np.array(path)
In [64]:
w0 = np.array([-1, 2.5])
#w0 = np.array([.5, 2.5])
#w0 = np.array([1.5, 2])
adam = AdamOptimizer(
    learning_rate=lambda t: 0.1,  
    beta1=0.9, 
    beta2=0.999)
momentum = MomentumUpdate(
    learning_rate=lambda t: 0.1, 
    momentum=0.9)
basic = VanillaOptimizer(learning_rate=lambda t: 0.1)

path = gd(w0, grad_sine_MSE, adam, nepochs=100)
errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
    sine['w0'], sine['w1'], sine['error'], 
    sine['dw0'], sine['dw1'], scale=.1)
fig = add_solution_path(fig, errors, path)
fig.show()

fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()

Stochastic Gradient Descent
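
In stochastic gradient descent (SGD) each update uses the gradient of the loss on a single randomly chosen data point:

$$ w^{(t+1)} = w^{(t)} - \eta^{(t)} \nabla E_{i_t}\left(w^{(t)}\right) $$

where $E_{i_t}$ is the loss on example $i_t$. In the implementation below we shuffle the data once per epoch and then sweep through it, so each epoch performs one update per data point.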

In [65]:
def sgd(w_0, x, t, gradient, optimizer, nepochs = 2, epsilon=1e-8):
    """Generic stochastic gradient descent optimization.
    Args:
        w_0: Initial weights (numpy array).
        gradient: Function to compute the gradient.
        optimizer: An optimizer object with a step method.
        epsilon: Convergence threshold.
    Returns:
        path: Array of weights at each iteration."""
    w = w_0
    path = [w]
    for e in range(nepochs):
        random_indices = np.random.permutation(len(t))
        for i in random_indices:
            g = gradient(w, x[i], t[i])
            w = optimizer.step(w, g)
            path.append(w)
        if np.linalg.norm(w - path[-2]) < epsilon: break
    return np.array(path)
In [66]:
w0 = np.array([-1, 2.5])
#w0 = np.array([.5, 2.5])
#w0 = np.array([1.5, 2])
nepochs = 2
adam = AdamOptimizer(
    learning_rate=lambda t: 0.1, 
    beta1=0.9, 
    beta2=0.999)
momentum = MomentumUpdate(
    learning_rate=lambda t: 0.01, 
    momentum=0.5)
powerlaw = power_law_learning_rate(initial_lr=0.1, T=nepochs*len(sine['y']), s=10, c=1)
vanilla = VanillaOptimizer(learning_rate=powerlaw)

path = sgd(w0, sine['x'], sine['y'], grad_sine_MSE, 
           adam, 
           nepochs=nepochs)


errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
    sine['w0'], sine['w1'], sine['error'], 
    sine['dw0'], sine['dw1'], scale=.1)
fig = add_solution_path(fig, errors, path)
fig.show()

fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()
In [67]:
#w0 = np.array([-10., -5.])
w0 = np.array([-1., 2.])
#w0 = np.array([0., -4])
nepochs = 1
coslr = cosine_learning_rate(T=nepochs*len(cancer['t']), max_lr=0.4, min_lr=0.01)
adam = AdamOptimizer(
    learning_rate=coslr,
    beta1=0.9, 
    beta2=0.999)
momentum = MomentumUpdate(
    learning_rate=coslr, 
    momentum=0.5)
vanilla = VanillaOptimizer(learning_rate=coslr)

path = sgd(w0, cancer['x'], cancer['t'], grad_NLL, 
           adam, 
           nepochs=nepochs)


errors = [NLL(w) for w in path]
fig = plot_gradient(
    cancer['w1'], cancer['w2'], cancer['error'], 
    cancer['dw1'], cancer['dw2'], scale=1)
fig = add_solution_path(fig, errors, path)
fig.show()

fig_loss = make_loss_curve(path, NLL)
fig_loss.show()

Minibatch Stochastic Gradient Descent
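
Mini-batch SGD averages the gradient over a small batch $\mathcal{B}$ of examples for each update:

$$ w^{(t+1)} = w^{(t)} - \eta^{(t)} \frac{1}{|\mathcal{B}|} \sum_{i \in \mathcal{B}} \nabla E_i\left(w^{(t)}\right) $$

This reduces the variance of the gradient estimate relative to single-example SGD while remaining much cheaper than a full-batch gradient step. In the implementation below, the gradient functions already average over whatever batch of data they are given.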

In [68]:
def mb_sgd(w_0, x, t, gradient, optimizer, 
           batch_size=32, nepochs = 2, epsilon=1e-8):
    """Generic stochastic gradient descent optimization.
    Args:
        w_0: Initial weights (numpy array).
        gradient: Function to compute the gradient.
        optimizer: An optimizer object with a step method.
        batch_size: Size of each mini-batch.
        epsilon: Convergence threshold.
    Returns:
        path: Array of weights at each iteration."""
    w = w_0
    path = [w]
    for e in range(nepochs):
        random_indices = np.random.permutation(len(t))
        for start in range(0, len(t), batch_size):
            end = start + batch_size
            batch_indices = random_indices[start:end]
            g = gradient(w, x[batch_indices], t[batch_indices])
            w = optimizer.step(w, g)
            path.append(w)
        if np.linalg.norm(w - path[-2]) < epsilon: break
    return np.array(path)
In [69]:
w0 = np.array([-1, 2.5])
#w0 = np.array([.5, 2.5])
#w0 = np.array([1.5, 2])
nepochs = 10
adam = AdamOptimizer(
    learning_rate=lambda t: 0.1, 
    beta1=0.9, 
    beta2=0.999)
momentum = MomentumUpdate(
    learning_rate=lambda t: 0.01, 
    momentum=0.5)
powerlaw = power_law_learning_rate(initial_lr=0.1, T=nepochs*len(sine['y']), s=10, c=1)
vanilla = VanillaOptimizer(learning_rate=powerlaw)

path = mb_sgd(
    w0, sine['x'], sine['y'], grad_sine_MSE, 
    adam, 
    batch_size=32,
    nepochs=nepochs)

errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
    sine['w0'], sine['w1'], sine['error'], 
    sine['dw0'], sine['dw1'], scale=.1)
fig = add_solution_path(fig, errors, path)
fig.show()

fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()