In [1]:
import plotly.io as pio; pio.renderers.default = "notebook_connected"

Lecture 12: Gradient Descent – CS 189, Fall 2025

In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly import figure_factory as ff
colors = px.colors.qualitative.Plotly
px.defaults.width = 800
from ipywidgets import HBox
import numpy as np
In [3]:
# make the images folder if it doesn't exist
import os
if not os.path.exists("images"):
    os.makedirs("images")

Plotting Code You Can Safely Ignore (but Need to Run)

This lecture has many complex visualizations, and to keep the rest of the code short I have put the visualization code here. Much of this code is out of scope for the course, but feel free to look through it if you are interested.

In [4]:
def make_plot_grid(figs, rows, cols):
    """Create a grid of figures with Plotly figures."""
    from plotly.subplots import make_subplots
    def get_trace_type(fig):
        for trace in fig.data:
            if trace.type == 'surface':
                return 'surface'  # required for go.Surface
            elif trace.type.startswith('scatter3d') or trace.type.startswith('mesh3d'):
                return 'scene'  # 3D scene
        return 'xy'  # default 2D
    specs = [[{'type': get_trace_type(fig)} for fig in figs[i:i+cols]] for i in range(0, len(figs), cols)]
    fig_grid = make_subplots(rows=rows, cols=cols, specs=specs,
                             subplot_titles=[fig.layout.title.text for fig in figs])
    for i, fig in enumerate(figs):
        fig_grid.add_traces(fig.data, rows=(i//cols) + 1, cols=(i%cols) + 1 )
    return fig_grid
In [5]:
def plot_lr_predictions(w, cancer):
    """Plot predictions of the logistic model."""
    cancer = cancer.copy()
    cancer['logistic_pred'] = np.where(
        logistic_model(w, cancer[['mean radius', 'mean texture']].values) > 0.5,
        'Pred Malignant', 'Pred Benign'
    )
    # Create a scatter plot with logistic predictions
    fig = px.scatter(cancer, x='mean radius', y='mean texture', 
                     symbol='logistic_pred', color='target',
                     symbol_sequence=[ "circle-open", "cross"])
    for (i,t) in enumerate(fig.data): t.legendgroup = str(i)
    # decision boundary
    xs = np.linspace(cancer['mean radius'].min(), cancer['mean radius'].max(), 100)
    decision_boundary = -(w[0] * xs ) / w[1]
    fig.add_scatter(x=xs, y=decision_boundary, mode='lines', 
                    name='Decision Boundary', legendgroup='Decision Boundary',
                    line=dict(color='black', width=2, dash='dash', ))
    # probability surface
    ys = np.linspace(cancer['mean texture'].min(), cancer['mean texture'].max(), 100)
    X, Y = np.meshgrid(xs, ys)
    Z = logistic_model(w, np.c_[X.ravel(), Y.ravel()]).reshape(X.shape)
    fig.add_contour(x=xs, y=ys, z=Z, 
                    colorscale='Matter_r', opacity=0.5,
                    name='Probability Surface', 
                    colorbar=dict(x=1.05, y=0.3, len=0.75))

    fig.update_layout(title=f'w=({w[0]:.2f}, {w[1]:.2f})',
                      xaxis_range=[xs.min(), xs.max()], yaxis_range=[ys.min(), ys.max()],
                      xaxis_title='Mean Radius (scaled)', yaxis_title='Mean Texture (scaled)',
                      width=800, height=600)
    return fig
In [6]:
def plot_loss(w1, w2, error, ncontours=50):
    surf_fig = go.Figure()
    surf_fig.add_surface(z=error, x=w1, y=w2, 
                         colorscale='Viridis_r', opacity=0.7, showscale=False,
                         contours=dict(z=dict(show=True, highlightcolor="white", 
                                          start=error.min(), end=error.max(), 
                                          size=(error.max()-error.min())/ncontours)))
    surf_fig.update_layout(title="Loss Surface")
    contour_fig = go.Figure()
    contour_fig.add_contour(x=w1.flatten(), y=w2.flatten(), z=error.flatten(),
                            colorscale='Viridis_r', opacity=0.7,
                            contours=dict(start=error.min(), end=error.max(), 
                                          size=(error.max()-error.min())/ncontours),
                            colorbar=dict(x=1.05, y=0.35, len=0.75))
    contour_fig.update_layout(title="Loss Contours")
    fig = make_plot_grid([surf_fig, contour_fig], 1, 2).update_layout(height=800)
    fig.update_layout(scene=dict(xaxis_title='w1', yaxis_title='w2', zaxis_title='Loss', aspectmode='cube'))
    fig.update_layout(xaxis_range=[w1.min(), w1.max()], yaxis_range=[w2.min(), w2.max()],
                      xaxis_title='w1', yaxis_title='w2')
    return fig
In [7]:
def plot_gradient(w1, w2, error, dw1, dw2, scale=1.0):
    fig = plot_loss(w1, w2, error)
    fig.add_trace(
        go.Cone(
            x=w1.flatten(), y=w2.flatten(), z=np.zeros_like(error).flatten(),  # Ground plane
            u=dw1.flatten(), v=dw2.flatten(), w=np.zeros_like(error).flatten(),  # No vertical component
            sizeref=2, anchor="tail", showscale=False
        ), 1,1)
    contour_fig = ff.create_quiver(
        x=w1.flatten(), y=w2.flatten(), u=dw1.flatten(), v=dw2.flatten(), 
        line_width=2, line_color="white",
        scale = scale, arrow_scale=.2, showlegend=False)
    fig.add_traces(contour_fig.data, rows=1, cols=2)
    return fig
In [8]:
def add_solution_path(fig, errors, ws):
    s = np.linspace(0, 1, len(ws))
    fig.add_scatter3d(x=ws[:, 0], y=ws[:, 1], z=errors, marker_color=s, marker_size=5,
                    mode='lines+markers', line=dict(color='black', width=2), opacity=0.5,
                    name='Gradient Descent Path', legendgroup='Gradient Descent',
                    row=1, col=1)
    fig.add_scatter(x=ws[:, 0], y=ws[:, 1], marker_color=s,
                    mode='lines+markers', line=dict(color='black', width=2), opacity=0.5,
                    name='Gradient Descent Path', legendgroup='Gradient Descent',
                    showlegend=False,
                    row=1, col=2)
    fig.add_scatter3d(x=[ws[-1, 0]], y=[ws[-1, 1]], z=[errors[-1]],
                       mode='markers', marker=dict(color='red', size=10),
                       name='Final Solution', legendgroup='Final Solution',
                       row=1, col=1)
    fig.add_scatter(x=[ws[-1, 0]], y=[ws[-1, 1]],
                       mode='markers', marker=dict(color='red', size=20),
                       name='Final Solution', legendgroup='Final Solution',
                       showlegend=False,
                       row=1, col=2)
    return fig

Optimization Basics

Consider the following optimization problems. For each one: what is the solution, and how would you find it?

Problem 1: Minimize the function $$ \arg \min_{w \in \mathbb{R}}w^2 - 3w + 4 $$

In [9]:
w = np.linspace(-2,7,100)
f = lambda w: w**2 - 3 * w + 4
fig = px.line(x=w, y=f(w), labels={'x': 'w', 'y': 'f(w)'})
fig.update_traces(line_width=5)
# fig.write_image("images/function1.pdf")
fig
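
Since this is a smooth convex function of a single variable, we can find the minimizer by setting the derivative to zero:

$$ \frac{d}{dw}\left(w^2 - 3w + 4\right) = 2w - 3 = 0 \quad\Longrightarrow\quad w^\star = \frac{3}{2}, \qquad f\left(w^\star\right) = \frac{7}{4}. $$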

Problem 2: Minimize the same function, this time with integer constraints $$ \arg \min_{w \in \mathbb{Z}}w^2 - 3w + 4 $$

In [10]:
w = np.arange(-2, 7)
f = lambda w: w**2 - 3 * w + 4
fig = px.scatter(x=w, y=f(w), labels={'x': 'w', 'y': 'f(w)'})
fig.update_traces(marker_size=7)
# fig.write_image("images/function2.pdf",) 
fig
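
Here the unconstrained minimizer $w = 3/2$ is not an integer. Because the function is convex, the best integer must be one of its two integer neighbors: $f(1) = 1 - 3 + 4 = 2$ and $f(2) = 4 - 6 + 4 = 2$, so both $w = 1$ and $w = 2$ are optimal with value $2$.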

Problem 3: Minimize the function \begin{align} \arg \min_{w \in \mathbb{R}} w^4 - 5 w^2 + w + 4 \end{align}

In [11]:
w = np.linspace(-3,3,100)
f = lambda w: w**4 - 5 * w**2 + w + 4
fig = px.line(x=w, y=f(w), labels={'x': 'w', 'y': 'f(w)'})
#increase the line width
fig.update_traces(line_width=5)
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
# fig.write_image("images/function3.pdf")
fig
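
This function is not convex, so setting the derivative to zero gives several critical points and we have to compare them to find the global minimum. As a quick numerical check, we can solve $f'(w) = 4w^3 - 10w + 1 = 0$ with np.roots and evaluate $f$ at the real roots (the smallest value, near $w \approx -1.6$, is the global minimum):

In [ ]:
# find the critical points of f(w) = w^4 - 5 w^2 + w + 4 and compare f at each
f = lambda w: w**4 - 5 * w**2 + w + 4
crit = np.roots([4, 0, -10, 1])      # roots of the derivative f'(w) = 4 w^3 - 10 w + 1
crit = crit[np.isreal(crit)].real    # keep only the real critical points
for w_c in np.sort(crit):
    print(f"w = {w_c:7.4f}, f(w) = {f(w_c):7.4f}")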

Convexity

In this section, we will discuss convex optimization problems and how to solve them.

A function $f$ is convex if, for all points $w_1, w_2$ and all $\theta \in [0, 1]$, $$ f(\theta w_1 + (1-\theta) w_2) \le \theta f(w_1) + (1-\theta) f(w_2). $$ Geometrically, the secant line connecting any two points on the graph of $f$ lies on or above the graph, as illustrated below. For convex functions, any local minimum is also a global minimum.
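
For a twice-differentiable function of one variable, convexity is equivalent to $f''(w) \ge 0$ for all $w$. For Problem 1, $f''(w) = 2 > 0$ everywhere, so the function is convex. For Problem 3, $f''(w) = 12 w^2 - 10$, which is negative for $|w| < \sqrt{5/6}$, so that function is not convex.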

In [12]:
def plot_secant(w, f, x1, x2):
    """Plot the secant line between two points on the function."""
    y1, y2 = f(x1), f(x2)
    fig = px.line(x=w, y=f(w), labels={'x': 'w', 'y': 'f(w)'})
    fig.add_scatter(x=[x1, x2], y=[f(x1), f(x2)], 
                mode='markers+lines', name='Secant Line',
                marker=dict(size=20, color='green'),
                line=dict(color='green', dash="dash"))
    fig.update_traces(line_width=5)
    fig.update_layout(showlegend=False)
    return fig
In [13]:
w = np.linspace(-2,7,100)
f = lambda w: w**2 - 3 * w + 4
fig_convex = plot_secant(w, f, -1, 5)
fig_convex.update_layout(title="Convex Function")
# fig_convex.write_image("images/convex_function.pdf")

w = np.linspace(-3,3,100)
f = lambda w: w**4 - 5 * w**2 + w + 4
fig_nonconvex = plot_secant(w, f, -1, 1)
fig_nonconvex.update_layout(title="Non-convex Function")
# fig_nonconvex.write_image("images/non_convex_function.pdf")

fig = make_plot_grid([fig_convex, fig_nonconvex], 1, 2)
fig.update_layout(showlegend=False)

Understanding the Error Surface

In this part of the lecture we will visualize the error surface for a few problems. This will help us understand why gradient descent works and how it finds the optimal solution.

The Cross Entropy Loss Surface for Logistic Regression

In the previous lecture we studied the Logistic Regression model. The cross entropy loss function is convex. Here we will visualize a simple two dimensional case of the loss surface.

For this demo, we will use the breast cancer dataset, which is available in the datasets module of sklearn. To keep things simple, we will focus on just two features: "mean radius" and "mean texture". We will standardize these features to keep the model simple.

In [14]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
cancer_dict = datasets.load_breast_cancer(as_frame=True)
cancer_df = pd.DataFrame(cancer_dict.data, columns=cancer_dict.feature_names)
cancer_df['target'] = cancer_dict.target.astype(str)
cancer_df = cancer_df[['mean radius', 'mean texture', 'target']].dropna()
scaler = StandardScaler()
cancer_df[['mean radius', 'mean texture']] = scaler.fit_transform(cancer_df[['mean radius', 'mean texture']])
print("The dataset:", cancer_df.shape)
display(cancer_df.head())
px.scatter(cancer_df, x='mean radius', y='mean texture', color='target', opacity=0.7)
The dataset: (569, 3)
mean radius mean texture target
0 1.097064 -2.073335 0
1 1.829821 -0.353632 0
2 1.579888 0.456187 0
3 -0.768909 0.253732 0
4 1.750297 -1.151816 0

Here we see there is likely a simple linear decision boundary that separates the two classes. Let's visualize the decision boundary for a few model parameters.

Logistic Regression Model

Recall that the logistic regression model is given by:

\begin{align} p(t=1|x) = \sigma(w^T x + b) = \frac{1}{1 + e^{-(w^T x + b)}} \end{align}

To be able to plot the loss surface we will drop the bias term and consider just two weights $w_1$ and $w_2$. The resulting model is:

\begin{align} p(t=1|x) = \sigma(w_1 x_1 + w_2 x_2) = \frac{1}{1 + e^{-(w_1 x_1 + w_2 x_2)}} \end{align}

In [15]:
def sigmoid(z):
    """Sigmoid function."""
    return 1 / (1 + np.exp(-z))

def logistic_model(w, x):
    """Logistic model for binary classification."""
    z = w @ x.T
    return sigmoid(z)

The following code plots the decision boundary for a few different values of $w_1$ and $w_2$.

In [16]:
guesses = np.array([[-1., -1.],[-2., -4.]])
figs = [plot_lr_predictions(w, cancer_df) for w in guesses]
figs[1].update_traces(showlegend=False)
fig = make_plot_grid(figs, 1, 2)
# minor fixup of plots 
fig.update_layout(height=600, xaxis_range=figs[0].layout.xaxis.range, 
                  yaxis_range=figs[0].layout.yaxis.range)
fig.update_layout(height=600, xaxis2_range=figs[1].layout.xaxis.range, 
                  yaxis2_range=figs[1].layout.yaxis.range)
fig

Plotting the Cross Entropy Loss Surface

Recall that the average negative log likelihood loss function (cross entropy loss) for logistic regression is given by:

\begin{align} L(w) = -\frac{1}{N} \sum_{i=1}^N \left( t_i \log(p(t_i|x_i)) + (1 - t_i) \log(1 - p(t_i|x_i)) \right) \end{align}

In [17]:
def neg_log_likelihood(w):
    """Negative log-likelihood for logistic regression."""
    x = cancer['x']
    t = cancer['t']
    z = w @ x.T
    # return -np.mean(t * np.log(logistic_model(w, x)) + (1 - t) * np.log(1 - logistic_model(w, x)))
    # more numerically stable version
    # np.mean(np.log(1+ np.exp(z)) - t * z) 
    # stable softplus: log(1+exp(z))
    softplus_z = np.logaddexp(0, z)
    return np.mean(softplus_z - t * z)
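
The stable form used in the code above is equivalent to the cross entropy. Writing $z_i = w^\top x_i$ and using $1 - \sigma(z) = \sigma(-z)$ together with $\log(1 + e^{-z}) = \log(1 + e^{z}) - z$, the per-example loss becomes

\begin{align*} -t_i \log\sigma(z_i) - (1 - t_i)\log\sigma(-z_i) &= t_i \log(1 + e^{-z_i}) + (1 - t_i)\log(1 + e^{z_i}) \\ &= t_i \left(\log(1 + e^{z_i}) - z_i\right) + (1 - t_i)\log(1 + e^{z_i}) \\ &= \log(1 + e^{z_i}) - t_i z_i, \end{align*}

which is exactly the $\text{softplus}(z_i) - t_i z_i$ computed with np.logaddexp(0, z).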

For visualization purposes, I am going to create a dictionary to keep track of all the variables for this example.

In [18]:
cancer = dict()
cancer['npts'] = 30
cancer['w1'], cancer['w2'] = np.meshgrid(
    np.linspace(-10, 1, cancer['npts']), 
    np.linspace(-5, 1.3, cancer['npts'])
)
cancer['ws'] = np.stack([cancer['w1'].flatten(), cancer['w2'].flatten()]).T
cancer['x'] = cancer_df[['mean radius', 'mean texture']].values
cancer['t'] = cancer_df['target'].values.astype(float)
In [19]:
cancer['error'] = np.array([neg_log_likelihood(w) 
                for w in cancer['ws']])
cancer['error'] = cancer['error'].reshape(cancer['w1'].shape)
fig = plot_loss(cancer['w1'], cancer['w2'], cancer['error'])
for i, g in enumerate(guesses):
    fig.add_scatter3d(x=[g[0]], y=[g[1]], z=[neg_log_likelihood(g)],
                      mode='markers', marker=dict(size=10, color=colors[i + 2]),
                      name=f'Weight {g}', legendgroup=str(i), row=1, col=1)
    fig.add_scatter(x=[g[0]], y=[g[1]], mode='markers',
                    marker=dict(size=10, color=colors[i + 2]),
                    name=f'Weight {g}', legendgroup=str(i), showlegend=False,
                    row=1, col=2)
fig

Choosing the Best Parameters

We have computed the negative log likelihood loss surface for the logistic regression model for many parameter values. Here we can simply choose the parameters with the lowest loss value.

In [20]:
best_ind = np.argmin(cancer['error'])
cancer['grid_best'] = np.array([cancer['w1'].flatten()[best_ind], 
                                cancer['w2'].flatten()[best_ind]])

fig = plot_loss(cancer['w1'], cancer['w2'], cancer['error'])
for i, g in enumerate(guesses):
    fig.add_scatter3d(x=[g[0]], y=[g[1]], z=[neg_log_likelihood(g)],
                      mode='markers', marker=dict(size=10, color=colors[i + 2]),
                      name=f'Weight {g}', legendgroup=str(i), row=1, col=1)
    fig.add_scatter(x=[g[0]], y=[g[1]], mode='markers',
                    marker=dict(size=10, color=colors[i + 2]),
                    name=f'Weight {g}', legendgroup=str(i), showlegend=False,
                    row=1, col=2)
fig.add_scatter3d(x=[cancer['grid_best'][0]], y=[cancer['grid_best'][1]], z=[cancer['error'].min()],
    mode='markers', marker=dict(size=10, color='red'),
    name=f"Best Weight [{cancer['grid_best'][0]:0.2f}, {cancer['grid_best'][1]:0.2f}]", 
    legendgroup=f"Best",
    row=1, col=1)
fig.add_scatter(x=[cancer['grid_best'][0]], y=[cancer['grid_best'][1]],
    mode='markers', marker=dict(size=10, color='red'),
    name=f"Best Weight [{cancer['grid_best'][0]:0.2f}, {cancer['grid_best'][1]:0.2f}]", 
    legendgroup=f"Best",
    showlegend=False, row=1, col=2)

fig
In [21]:
plot_lr_predictions(cancer['grid_best'], cancer_df)

The Squared Error Loss on a Non-Linear Model

In the following we will visualize a more complex loss surface. We will use a non-linear model and the squared error loss function.

In [22]:
sine = dict()
np.random.seed(42)
sine['n'] = 200
# Generate some random data
sine['x'] = np.random.rand(sine['n']) * 2.5 * np.pi
sine['x'] = np.sort(sine['x']) # sort for easier plotting
sine['y'] = np.sin(1.1 + 2.5 * sine['x']) + 0.5 * np.random.randn(sine['n'])
sine_df = pd.DataFrame({'x': sine['x'], 'y': sine['y']})

Because we generated this data, we know that the true underlying function (before the added noise) is a sine wave.

$$ \hat{y} = f_w(x) = \sin\left(1.1 + 2.5 x \right) $$

We can visualize the data.

In [23]:
fig = px.scatter(sine_df, x='x', y='y')
fig.update_traces(marker_color='black') 
data_trace = fig.data[0]
# fig.write_image("images/sine_data.pdf", width=800, height=400)
fig

Non-linear Sine Regression Model

Here we will use a model of the form: \begin{align} y = f(x) = \sin(w_0 + w_1 x) \end{align}

In [24]:
def sine_model(w, x):
    return np.sin(w[0] + x * w[1])

Let's try a few different parameter values and see how this model looks on our data. We will try the following three possible models:

In [25]:
sine['guesses'] = np.array([[0, 2], [2, 3], [0, 3.5]])

Here we make predictions at 100 test points for each model and plot the results.

In [26]:
sine['xhat'] = np.linspace(sine_df['x'].min(), sine_df['x'].max(), 100)
sine['pred_df'] = pd.DataFrame({'x': sine['xhat']})
for w in sine['guesses']:
    sine['pred_df'][f'yhat(w={w})'] = sine_model(w, sine['xhat'])
sine['pred_df'].head()
Out[26]:
x yhat(w=[0. 2.]) yhat(w=[2. 3.]) yhat(w=[0. 3.5])
0 0.043371 0.086632 0.847619 0.151215
1 0.121225 0.240082 0.701797 0.411673
2 0.199080 0.387723 0.517864 0.641752
3 0.276935 0.525982 0.305809 0.824474
4 0.354790 0.651515 0.077147 0.946355
In [27]:
fig = go.Figure()
for i, w in enumerate(sine['pred_df'].columns[1:]):
    fig.add_trace(go.Scatter(x=sine['pred_df']['x'], y=sine['pred_df'][w], 
                             mode='lines', name=w, 
                             line=dict(width=4, color=colors[i+2])))
fig.update_traces(line_width=4)
fig.add_trace(data_trace)
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20), 
                  xaxis_title='x', yaxis_title='y')
# fig.write_image("images/sine_with_3_models.pdf", width=800, height=400)
fig

None of the above really match the data. We would like to find a parameterization that is closer to the data. To do this we need a loss function.

Visualizing Squared Loss

To really fit the data we need a measure of how good our model is relative to the data. This is a loss function. For this exercise we will use the average squared loss which is often just called the squared loss.

$$ E(w;\mathcal{D}) = L\left(f_w; \mathcal{D} = \left\{(x_i, y_i)\right\}_{i=1}^n\right) = \frac{1}{n} \sum_{i=1}^n\left(y_i - f_w\left(x_i\right)\right)^2 = \frac{1}{n} \sum_{i=1}^n\left(y_i - \sin\left(w_0 + w_1 x_i \right)\right)^2 $$

In [28]:
def sine_MSE(w):
    x = sine['x']
    y = sine['y']
    y_hat = sine_model(w, x)
    return np.mean((y - y_hat) ** 2)

Here, $w_0$ and $w_1$ are the parameters of our model. We can visualize this loss function as a surface in 3D. To do this we will create a grid of points in the parameter space and evaluate the loss function at each point.

In [29]:
sine['npts'] = 30
sine['w0'], sine['w1'] = np.meshgrid(
    np.linspace(-1.5, 3, sine['npts']), np.linspace(1, 4, sine['npts']))
# combine w0 and w1 into a single array of weight vectors
sine['ws'] = np.stack([sine['w0'].flatten(), sine['w1'].flatten()]).T

sine['error'] = np.array([sine_MSE(w) for w in sine['ws']]).reshape(sine['w0'].shape)

fig = plot_loss(sine['w0'], sine['w1'], sine['error'])
for i, w in enumerate(sine['guesses']):
    fig.add_trace(go.Scatter3d(x=[w[0]], y=[w[1]], z=[sine_MSE(w)],
                               mode='markers', marker=dict(size=5, color=colors[i+2]),
                               name=f'w=({w[0]}, {w[1]})', legendgroup=str(i)),
                  row=1, col=1)
    fig.add_trace(go.Scatter(x=[w[0]], y=[w[1]],
                               mode='markers', marker=dict(size=20, color=colors[i+2]),
                               name=f'w=({w[0]}, {w[1]})', legendgroup=str(i), showlegend=False),
                  row=1, col=2) 
    
fig

Choosing the Best Parameters

Just as before, we can simply choose the parameters with the lowest loss value.

In [30]:
ind = np.argmin(sine['error'])
sine['grid_best'] = sine['ws'][ind,:]
sine['grid_best_error'] = sine['error'].flatten()[ind]
print(f"Best weights: {sine['grid_best']}, with error: {sine['grid_best_error']}")
Best weights: [1.29310345 2.44827586], with error: 0.23832770254453578
In [31]:
fig = plot_loss(sine['w0'], sine['w1'], sine['error'])
for i, w in enumerate(sine['guesses']):
    fig.add_trace(go.Scatter3d(x=[w[0]], y=[w[1]], z=[sine_MSE(w)],
                               mode='markers', marker=dict(size=5, color=colors[i+2]),
                               name=f'w=({w[0]}, {w[1]})', legendgroup=str(i)),
                  row=1, col=1)
    fig.add_trace(go.Scatter(x=[w[0]], y=[w[1]],
                               mode='markers', marker=dict(size=20, color=colors[i+2]),
                               name=f'w=({w[0]}, {w[1]})', legendgroup=str(i), showlegend=False),
                  row=1, col=2) 
fig.add_scatter3d(x=[sine['grid_best'][0]], y=[sine['grid_best'][1]], z=[sine['grid_best_error']],
    mode='markers', marker=dict(size=10, color='red'),
    name=f"Best Weight [{sine['grid_best'][0]:0.2f}, {sine['grid_best'][1]:0.2f}]", legendgroup=f"Best",
    row=1, col=1)
fig.add_scatter(x=[sine['grid_best'][0]], y=[sine['grid_best'][1]],
    mode='markers', marker=dict(size=20, color='red'),
    name=f"Best Weight [{sine['grid_best'][0]:0.2f}, {sine['grid_best'][1]:0.2f}]", legendgroup=f"Best",
    showlegend=False, row=1, col=2)
    
fig

Plotting the best fit model on the data we see that it is a much better fit than our previous guesses.

In [32]:
fig = go.Figure()
for i, w in enumerate(sine['pred_df'].columns[1:]):
    fig.add_trace(go.Scatter(x=sine['pred_df']['x'], y=sine['pred_df'][w], mode='lines', name=w, 
                             line=dict(width=4, color=colors[i+2])))
fig.update_traces(line_width=4)
fig.add_trace(data_trace)
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20), 
                  xaxis_title='x', yaxis_title='y')
sine['pred_df']['grid_best'] = sine_model(sine['grid_best'], sine['xhat'])  
fig.add_scatter(x=sine['pred_df']['x'], y=sine['pred_df']['grid_best'], 
                mode='lines', 
                name=f"Best w=({sine['grid_best'][0]:.2f}, {sine['grid_best'][1]:.2f})",
                line=dict(width=8, color='red')
)
# fig.write_image("images/sine_with_3_models.pdf", width=800, height=400)
fig

Slido Visualization

In [33]:
fig1 = px.scatter(x=cancer['x'][:,0], 
                  y=cancer['t'] + 0.01*np.random.normal(size=cancer['t'].shape), 
                  color=cancer['t'].astype(str), title="Cancer Data (Jittered)")

ws = np.linspace(-20, 3, 100)
nll = np.array([
    np.mean(np.logaddexp(0, w*cancer['x'][:,0]) - cancer['t'] * (w*cancer['x'][:,0]))
    for w in ws])

fig2 = px.line(x=ws, y=nll, labels={'x': 'w', 'y': 'Negative Log-Likelihood'})
ind = np.argmin(nll)
best_w = ws[ind]
fig2.add_vline(x=best_w, line=dict(color='red', dash='dash'), 
               annotation_text=f"Best w={best_w:.2f}", 
               annotation_position="top right")
fig2.add_scatter(x=[best_w], y=[nll[ind]], mode='markers', 
                 marker=dict(color='red', size=10),
                 name="Best w")
xtest = np.linspace(cancer['x'][:,0].min(), cancer['x'][:,0].max(), 100)
fig1.add_scatter(x=xtest, 
                 y=sigmoid(best_w * xtest), 
                 mode='lines', line=dict(color='black', width=4),
                 name=f'Logistic Model (w={best_w:.2f})')
fig = make_plot_grid([fig1, fig2], 1, 2)
fig.update_layout(height=600)
In [34]:
fig1 = px.scatter(x=cancer['x'][:,0], 
                  y=cancer['x'][:,1], title="Cancer Data")
ws = np.linspace(-5, 5, 100)
sqloss = [np.mean((cancer['x'][:,1] - (w * cancer['x'][:,0]))**2) for w in ws]
fig2 = px.line(x=ws, y=sqloss, labels={'x': 'w', 'y': 'Squared Loss'})
ind = np.argmin(sqloss)
best_w = ws[ind] 
xtest = np.linspace(cancer['x'][:,0].min(), cancer['x'][:,0].max(), 100)
fig1.add_scatter(x=xtest, 
                 y=best_w * xtest, 
                 mode='lines', line=dict(color='black', width=4),
                 name=f'Linear Model (w={best_w:.2f})')
fig2.add_vline(x=best_w, line=dict(color='red', dash='dash'), 
               annotation_text=f"Best w={best_w:.2f}", 
               annotation_position="top right")
fig2.add_scatter(x=[best_w], y=[sqloss[ind]], mode='markers', 
                 marker=dict(color='red', size=10),
                 name="Best w")
fig = make_plot_grid([fig1, fig2], 1, 2)
fig.update_layout(height=600)
In [35]:
offset = 3
fig1 = px.scatter(x=cancer['x'][:,0]+offset, 
                  y=cancer['t'] + 0.01*np.random.normal(size=cancer['t'].shape), 
                  color=cancer['t'].astype(str), title="Cancer Data (Jittered)")
#fig1.show()
w1,w2 = np.meshgrid(np.linspace(5,15, 40), np.linspace(-8, 0, 40))
ws = np.stack([w1.flatten(), w2.flatten()]).T
nll = np.array([
    np.mean(np.logaddexp(0, w[1]*(cancer['x'][:,0]+offset) + w[0]) - 
            cancer['t'] * (w[1]*(cancer['x'][:,0]+offset) + w[0]))
    for w in ws]).reshape(w1.shape)

fig2 = plot_loss(w1, w2, nll)
ind = np.argmin(nll)
best_w = ws[ind,:]
fig2.add_scatter3d(x=[best_w[0]], y=[best_w[1]], z=[nll.flatten()[ind]],
                   mode='markers', marker=dict(size=10, color='red'),
                   name=f'Best w=({best_w[0]:.2f}, {best_w[1]:.2f})', legendgroup='Best',
                   row=1, col=1)
fig2.add_scatter(x=[best_w[0]], y=[best_w[1]],
                   mode='markers', marker=dict(size=20, color='red'),
                   name=f'Best w=({best_w[0]:.2f}, {best_w[1]:.2f})', legendgroup='Best',
                   showlegend=False,
                   row=1, col=2)
fig2.show()
xtest = np.linspace(cancer['x'][:,0].min()+offset, cancer['x'][:,0].max()+offset, 100)
fig1.add_scatter(x=xtest, 
                 y=sigmoid(best_w[1] * xtest + best_w[0]), 
                 mode='lines', line=dict(color='black', width=4),
                 name=f'Logistic Model (w=({best_w[0]:.2f}, {best_w[1]:.2f}))')
fig1.update_layout(height=600)

Visualizing the Gradient of the Error Function

In the following, we will visualize the gradient of the error function on top of the error surface.

The Gradient of the Cross Entropy Loss for Logistic Regression

Recall that the logistic regression model is given by (here we drop the bias term for simplicity): $$ p(t=1|x) = \sigma(w^T x) = \frac{1}{1 + e^{-(w^\top x )}} $$ To compute the gradient of the negative log likelihood loss function (cross entropy loss), we need to first define the loss function. To simplify some of the future steps we will use the average negative log likelihood loss function.

\begin{align*} L(w) &= -\frac{1}{N} \sum_{i=1}^N \left( t_i \log(p(t_i|w, x_i)) + (1 - t_i) \log(1 - p(t_i|w, x_i)) \right) \\ L(w) &= -\frac{1}{N} \sum_{i=1}^N \left( t_i \log(\sigma(w^\top x_i)) + (1 - t_i) \log(1 - \sigma(w^\top x_i)) \right) \\ \end{align*}

There is a useful identity that we can use to simplify the computation of the gradient and Hessian. The derivative of the sigmoid function is given by: $$ \frac{\partial}{\partial z} \sigma(z) = \sigma(z)(1 - \sigma(z)) = \sigma(z)\sigma(-z)$$

So we can compute the $j^{th}$ term of the gradient as follows: \begin{align*} \frac{\partial L}{\partial w_j} &= -\frac{1}{N} \sum_{i=1}^N \left( t_i \frac{\partial }{\partial w_j}\log(\sigma(w^T x_i)) + (1 - t_i) \frac{\partial }{\partial w_j}\log(1 - \sigma(w^T x_i)) \right) \\ &= -\frac{1}{N} \sum_{i=1}^N \left( t_i \frac{1}{\sigma(w^T x_i)} \frac{\partial }{\partial w_j}\sigma(w^T x_i) + (1 - t_i) \frac{1}{1 - \sigma(w^T x_i)} \frac{\partial }{\partial w_j}(1 - \sigma(w^T x_i)) \right)\\ &= -\frac{1}{N} \sum_{i=1}^N \left( t_i \frac{1}{\sigma(w^T x_i)} \sigma(w^T x_i)(1 - \sigma(w^T x_i)) x_{ij} + (1 - t_i) \frac{1}{1 - \sigma(w^T x_i)} (-\sigma(w^T x_i)(1 - \sigma(w^T x_i))) x_{ij} \right)\\ &= -\frac{1}{N} \sum_{i=1}^N \left( t_i (1 - \sigma(w^T x_i)) x_{ij} - (1 - t_i) \sigma(w^T x_i) x_{ij} \right)\\ &= -\frac{1}{N} \sum_{i=1}^N \left( t_i x_{ij} - \sigma(w^T x_i) x_{ij} \right)\\ &= \frac{1}{N} \sum_{i=1}^N \left(\sigma(w^T x_i) - t_i \right) x_{ij} \end{align*}

In [36]:
def grad_NLL(w):
    """Compute the gradient of the negative log-likelihood."""
    p = logistic_model(w, cancer['x'])
    grad = np.mean((p - cancer['t']).reshape(-1, 1) * cancer['x'], 0)
    return grad

grad_NLL(np.array([-1, 2]))
Out[36]:
array([0.30254033, 0.44824536])
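
Whenever we derive a gradient by hand, it is worth comparing it to a finite-difference approximation of the loss. Here is a minimal check at the same point (the helper numerical_grad is defined only for this comparison); the two results should agree to several decimal places.

In [ ]:
def numerical_grad(f, w, h=1e-5):
    """Central finite-difference approximation of the gradient of f at w."""
    g = np.zeros_like(w, dtype=float)
    for j in range(len(w)):
        e = np.zeros_like(w, dtype=float)
        e[j] = h
        g[j] = (f(w + e) - f(w - e)) / (2 * h)
    return g

w_check = np.array([-1., 2.])
print("analytic: ", grad_NLL(w_check))
print("numerical:", numerical_grad(neg_log_likelihood, w_check))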

We can now visualize the gradient of the loss function for this more complex model.

In [37]:
(cancer['dw1'], cancer['dw2']) = np.array(
    [grad_NLL(w) for w in cancer['ws']]).T
cancer['dw1'] = cancer['dw1'].reshape(cancer['w1'].shape)
cancer['dw2'] = cancer['dw2'].reshape(cancer['w1'].shape)
# fig.write_image("images/loss_surface_3d_with_gradients.pdf", width=800, height=800)
fig = plot_gradient(cancer['w1'], cancer['w2'], cancer['error'], cancer['dw1'], cancer['dw2'], scale=2)
fig

The Gradient of the Squared Error for our Sine Model

Recall that our sine model is given by: \begin{align} y = f(x) = \sin(w_0 + w_1 x) \end{align} and our squared loss function is given by:

$$ E(w;\mathcal{D}) = L\left(f_w; \mathcal{D} = \left\{(x_n, y_n) \right\}_{n=1}^N\right) = \frac{1}{N} \sum_{n=1}^N\left(y_n - f_w\left(x_n\right)\right)^2 = \frac{1}{N} \sum_{n=1}^N\left(y_n - \sin\left(w_0 + w_1 x_n \right)\right)^2 $$

Taking the gradient we get:

\begin{align*} \nabla E(w;\mathcal{D}) = \begin{bmatrix} \frac{\partial E}{\partial w_0} \\ \frac{\partial E}{\partial w_1} \end{bmatrix} \end{align*}

Calculating each term we get:

\begin{align*} \frac{\partial E}{\partial w_0} &= \frac{1}{N} \sum_{n=1}^N 2\left(y_n - \sin\left(w_0 + w_1 x_n \right)\right) \left(-\cos\left(w_0 + w_1 x_n \right)\right) \\ &= -\frac{2}{N} \sum_{n=1}^N \left(y_n - \sin\left(w_0 + w_1 x_n \right)\right) \cos\left(w_0 + w_1 x_n \right) \end{align*}

\begin{align*} \frac{\partial E}{\partial w_1} &= \frac{1}{N} \sum_{n=1}^N 2\left(y_n - \sin\left(w_0 + w_1 x_n \right)\right) \left(-\cos\left(w_0 + w_1 x_n \right)\right) x_n \\ &= -\frac{2}{N} \sum_{n=1}^N \left(y_n - \sin\left(w_0 + w_1 x_n \right)\right) \cos\left(w_0 + w_1 x_n \right) x_n \end{align*}

In [38]:
def grad_sine_MSE(w):
    """Compute the gradient of the negative log-likelihood."""
    x = sine['x']
    y = sine['y']
    y_hat = sine_model(w, x)
    grad_w0 = -2 * np.mean((y - y_hat) * np.cos(w[0] + w[1] * x))
    grad_w1 = -2 * np.mean((y - y_hat) * x * np.cos(w[0] + w[1] * x))
    return np.array([grad_w0, grad_w1])

grad_sine_MSE(np.array([0, 2]))
Out[38]:
array([-0.18866281,  0.66526642])
In [39]:
(sine['dw0'], sine['dw1']) = np.array(
    [grad_sine_MSE(w) for w in sine['ws']]).T
sine['dw0'] = sine['dw0'].reshape(sine['w1'].shape)
sine['dw1'] = sine['dw1'].reshape(sine['w1'].shape)
fig = plot_gradient(sine['w0'], sine['w1'], sine['error'], sine['dw0'], sine['dw1'], scale=0.1)
fig

The Gradient Descent Algorithm

Here we implement the most basic version of gradient descent. This version uses a fixed step size and does not use any fancy tricks like momentum or adaptive step sizes (which we will see soon).
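
The update rule is simple: starting from an initial guess $w^{(0)}$, we repeatedly step in the direction of the negative gradient,

$$ w^{(k+1)} = w^{(k)} - \eta \, \nabla E\left(w^{(k)}\right), $$

where $\eta$ is the learning rate (step size). We stop after a fixed number of iterations or when the change in $w$ falls below a small threshold, which is exactly what the implementation below does.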

In [40]:
def gradient_descent(w_0, gradient, 
    learning_rate=1, nepochs=10, epsilon=1e-6):
    """Basic gradient descent algorithm.
    Args:
        w_0: Initial weights (numpy array).
        gradient: Function to compute the gradient.
        learning_rate: Step size for each iteration.
        nepochs: Maximum number of iterations.
        epsilon: Convergence threshold.
    Returns:
        path: Array of weights at each iteration."""
    w_old = w_0
    path = [w_old]
    for e in range(nepochs):
        w = w_old - learning_rate * gradient(w_old)
        path.append(w)
        if np.linalg.norm(w - w_old) < epsilon: break
        w_old = w
    return np.array(path)

Gradient Descent Applied to Logistic Regression

In [41]:
# w0 = np.array([-10., -5.])
# w0 = np.array([-1., 2.])
w0 = np.array([0., 0])
path = gradient_descent(w0, 
                     grad_NLL,
                     learning_rate=10, 
                     nepochs=100)
errors = [neg_log_likelihood(w) for w in path]
fig = plot_gradient(
    cancer['w1'], cancer['w2'], cancer['error'], 
    cancer['dw1'], cancer['dw2'], scale=2)
add_solution_path(fig, errors, path)

Gradient Descent Applied to Sine Model

In [42]:
w0 = np.array([1.2, 2.])
w0 = np.array([.5, 2.5])
w0 = np.array([1, 3])
path = gradient_descent(w0, 
                        grad_sine_MSE,
                        learning_rate=.2, 
                        nepochs=20)
errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
    sine['w0'], sine['w1'], sine['error'], 
    sine['dw0'], sine['dw1'], scale=.1)
add_solution_path(fig, errors, path)

Loss Curves

In practice, we are typically unable to visualize the loss surface. Instead, we can plot the loss value as a function of iteration number. This is called a loss curve. Here we plot the loss curve for gradient descent on our sine model.

In [43]:
def make_loss_curve(path, error_func):
    """Make a loss curve from a path of weights."""
    errors = [error_func(w) for w in path]
    fig = px.line(x=np.arange(len(errors)), y=errors, 
                  labels={'x': 'Iteration (Gradient Steps)', 'y': 'Loss (Error)'})
    fig.update_traces(line_width=4)
    fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
    return fig
In [44]:
w0 = np.array([2.5, 1.9])
path = gradient_descent(w0, 
                        grad_sine_MSE,
                        learning_rate=.1, 
                        nepochs=50)

fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()

errors = [sine_MSE(w) for w in path]
fig = plot_gradient(sine['w0'], sine['w1'], sine['error'], 
    sine['dw0'], sine['dw1'], scale=.1)
add_solution_path(fig, errors, path)
fig.show()
In [45]:
w0 = np.array([-10., -5.])
# w0 = np.array([-1., 2.])
# w0 = np.array([0., 0])
path = gradient_descent(w0, 
                     grad_NLL,
                     learning_rate=10, 
                     nepochs=100)

fig_loss = make_loss_curve(path, neg_log_likelihood)
fig_loss.show()

errors = [neg_log_likelihood(w) for w in path]
fig = plot_gradient(
    cancer['w1'], cancer['w2'], cancer['error'], 
    cancer['dw1'], cancer['dw2'], scale=2)
add_solution_path(fig, errors, path)

Slido Visualization

In [46]:
x = np.linspace(-10, 10, 100)
y = x**2 + 1
fig = px.line(x=x, y=y, labels={'x': 'x', 'y': 'f(x)'})
fig.update_traces(line_width=5)

def grad_f(x):
    return 2 * x
w0 = 2
path = gradient_descent(w0, grad_f, learning_rate=1.08, nepochs=10)
fig.add_scatter(x=path, y=path**2 + 1, mode='markers+lines',
                marker=dict(size=10, color='red'),
                line=dict(color='black', width=2, dash="dash"),
                name='Gradient Descent Path')

The Second Order Structure

Here we examine the second order structure of the loss function through a Taylor expansion.

The Hessian of the Logistic Regression Model

\begin{align} \text{H} &= \begin{bmatrix} \frac{\partial^2 E}{\partial w_0^2} & \frac{\partial^2 E}{\partial w_0 \partial w_1} \\ \frac{\partial^2 E}{\partial w_1 \partial w_0} & \frac{\partial^2 E}{\partial w_1^2} \end{bmatrix}\\ \end{align}

We can start with the gradient:

\begin{align*} \frac{\partial L}{\partial w_j} &= \frac{1}{N} \sum_{i=1}^N \left( p(t_i|x_i) - t_i \right) x_{ij} = \frac{1}{N} \sum_{i=1}^N \left( \sigma(w^T x_i) - t_i \right) x_{ij} \end{align*}

Using the chain rule, we can compute the second derivatives of the loss function with respect to the parameters $w_j$ and $w_k$: \begin{align*} \frac{\partial^2 L}{\partial w_k \partial w_j} &= \frac{\partial }{\partial w_k}\left(\frac{1}{N} \sum_{i=1}^N \left( \sigma(w^T x_i) - t_i \right) x_{ij}\right)\\ &= \frac{1}{N} \sum_{i=1}^N x_{ij} \frac{\partial }{\partial w_k}\sigma(w^T x_i) \\ &= \frac{1}{N} \sum_{i=1}^N x_{ij} \sigma(w^T x_i)(1 - \sigma(w^T x_i)) x_{ik} \end{align*}

In [47]:
def hessian_NLL(w):
    """Compute the Hessian of the negative log-likelihood."""
    x = cancer['x']
    p = logistic_model(w, x)
    hessian = (x.T @ np.diag(p * (1 - p)) @ x ) / len(p)
    return hessian
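
Notice that this Hessian has the form $H = \frac{1}{N} X^\top S X$ where $S$ is the diagonal matrix with entries $\sigma(w^\top x_i)(1 - \sigma(w^\top x_i)) > 0$. For any vector $v$,

$$ v^\top H v = \frac{1}{N} \sum_{i=1}^N \sigma(w^\top x_i)\left(1 - \sigma(w^\top x_i)\right) \left(x_i^\top v\right)^2 \ge 0, $$

so the Hessian is positive semi-definite at every $w$. This is another way to see that the cross entropy loss is convex.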

The Hessian of the Sine Regression Model

\begin{align} \text{H} &= \begin{bmatrix} \frac{\partial^2 E}{\partial w_0^2} & \frac{\partial^2 E}{\partial w_0 \partial w_1} \\ \frac{\partial^2 E}{\partial w_1 \partial w_0} & \frac{\partial^2 E}{\partial w_1^2} \end{bmatrix}\\ \end{align}

We can start with the gradient: Calculating each term we get:

\begin{align*} \frac{\partial E}{\partial w_0} &= -\frac{2}{N} \sum_{n=1}^N \left(y_n - \sin\left(w_0 + w_1 x_n \right)\right) \cos\left(w_0 + w_1 x_n \right) \end{align*}

\begin{align*} \frac{\partial E}{\partial w_1} &= -\frac{2}{N} \sum_{n=1}^N \left(y_n - \sin\left(w_0 + w_1 x_n \right)\right) \cos\left(w_0 + w_1 x_n \right) x_n \end{align*}

Using the product rule and chain rule, we can compute the second derivatives of the loss function with respect to the parameters $w_0$ and $w_1$. \begin{align} \frac{\partial^2 E}{\partial w_0^2} &= \frac{2}{N} \sum_{n=1}^N \left[ \cos^2(w_0 + w_1 x_n) + (y_n - \sin(w_0 + w_1 x_n)) \sin(w_0 + w_1 x_n) \right]\\ \frac{\partial^2 E}{\partial w_1^2} &= \frac{2}{N} \sum_{n=1}^N \left[ \cos^2(w_0 + w_1 x_n) x_n^2 + (y_n - \sin(w_0 + w_1 x_n)) \sin(w_0 + w_1 x_n) x_n^2 \right]\\ \frac{\partial^2 E}{\partial w_0 \partial w_1} &= \frac{2}{N} \sum_{n=1}^N \left[ \cos^2(w_0 + w_1 x_n) x_n + (y_n - \sin(w_0 + w_1 x_n)) \sin(w_0 + w_1 x_n) x_n \right] \end{align}

In [48]:
def hessian_sine_MSE(w):
    """Compute the Hessian of the negative log-likelihood."""
    x = sine['x']
    y = sine['y']
    dw0dw0 = 2 * np.mean(np.cos(w[0] + w[1] * x)**2 + 
                (y - np.sin(w[0] + w[1] * x)) * np.sin(w[0] + w[1] * x))
    dw0dw1 = 2 * np.mean(x * np.cos(w[0] + w[1] * x)**2 +
                x * (y - np.sin(w[0] + w[1] * x)) * np.sin(w[0] + w[1] * x))
    dw1dw1 = 2 * np.mean(x**2 * np.cos(w[0] + w[1] * x)**2 + 
                (x**2) * (y - np.sin(w[0] + w[1] * x)) * np.sin(w[0] + w[1] * x))
    hessian = np.array([[dw0dw0, dw0dw1], [dw0dw1, dw1dw1]])
    return hessian

We can also derive the gradient and Hessian using a symbolic algebra library. This is a powerful technique that allows us to compute the derivatives of a function without having to derive them by hand. We will use the sympy library to do this.

In [49]:
# unfortunately on colab we need to upgrade sympy to get the needed functionality
# !pip install --upgrade sympy
In [50]:
# import sympy as sp
# # define our symbols
# w0, w1, x, y = sp.symbols('w0 w1 x y')
# # Define a symbolic expression for the error
# E = (y - sp.sin(w0 + w1 * x))**2
# # Compute the gradient of E with respect to w0 and w1
# gE = [sp.diff(E, var) for var in (w0, w1)]
# gE
In [51]:
# hessian_E = sp.hessian(E, (w0, w1))
# hessian_E

Using sympy to do everything algebraically we get the same result.

In [52]:
# n = sp.symbols('n', integer=True)    # number of data points (symbolic length)
# i = sp.Idx('i', n)                   # index variable for summation
# w0, w1 = sp.symbols('w0 w1')         # weights
# x = sp.IndexedBase('x', shape=(n,))  # indexed variable for x data
# y = sp.IndexedBase('y', shape=(n,))  # indexed variable for y data

# E_i = (y[i] - sp.sin(w0 + w1 * x[i]))**2
# E = sp.summation(E_i, (i, 0, n - 1)) / n
# H = sp.hessian(E, (w0, w1))

# Hfun = sp.lambdify((w0, w1, x, y, n), H)
# def hessian_sine_MSE2(w):
#     return np.array(Hfun(w[0], w[1], sine['x'], sine['y'], len(sine['x'])))
In [53]:
# hessian_sine_MSE(np.array([1.1, 2.5]))
In [54]:
# hessian_sine_MSE2(np.array([1.1, 2.5]))

Taylor Expansion of the Loss

The Taylor expansion of a function $f(x)$ around a point $x=a$ is given by:

$$ f(x) = f(a) + \nabla f(a)^T (x - a) + \frac{1}{2} (x - a)^T H(a) (x - a) + \ldots $$
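
At a (local) minimum $w^\star$ the gradient term is (approximately) zero, so near $w^\star$ the loss looks like a quadratic bowl whose shape is determined entirely by the Hessian:

$$ f(w) \approx f(w^\star) + \frac{1}{2} (w - w^\star)^T H(w^\star) (w - w^\star). $$

This is the quadratic approximation we construct and visualize below.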

In [55]:
def taylor_loss(w, w_star, L_star, g_star, H_star):
    delta = w - w_star
    return (L_star + delta @ g_star + 1/2 * delta.T @ H_star @ delta)

Applied to Logistic Regression

In [56]:
# w_star = np.array([-2., -2.])
w_star = cancer['grid_best']
L_star = neg_log_likelihood(w_star)
g_star = grad_NLL(w_star)
H_star = hessian_NLL(w_star)
s,u = np.linalg.eigh(H_star)
print("w_star:", w_star)
print("L_star:", L_star)
print("g_star:", g_star)
print("H_star:\n", H_star)
print("Eigenvalues of H_star:", s)
print("Eigegenvectors of H_star:\n", u)
w_star: [-3.93103448 -0.87241379]
L_star: 0.2757763227300472
g_star: [0.00030902 0.0013768 ]
H_star:
 [[ 0.01454776 -0.00817985]
 [-0.00817985  0.08057159]]
Eigenvalues of H_star: [0.01354943 0.08156991]
Eigenvectors of H_star:
 [[-0.99263445 -0.12114806]
 [-0.12114806  0.99263445]]

Visualizing the Hessian at the optimal point we see that it is positive definite (both eigenvalues are positive). This is consistent with the fact that the loss function is convex.

In [57]:
fig = plot_gradient(cancer['w1'], cancer['w2'], 
                    cancer['error'], cancer['dw1'], cancer['dw2'])

cancer['taylor_loss'] = np.array([
    taylor_loss(w, w_star, L_star, g_star, H_star)
for w in cancer['ws']]).reshape(cancer['w1'].shape)

fig.add_surface(z=cancer['taylor_loss'], x=cancer['w1'], y=cancer['w2'], 
                 colorscale='plasma_r', opacity=0.5, showscale=False,
                 contours=dict(z=dict(show=True, highlightcolor="white", 
                                      start=cancer['taylor_loss'].min(), end=cancer['taylor_loss'].max(), 
                                      size=(cancer['taylor_loss'].max()-cancer['taylor_loss'].min())/50)), 
                                      row=1, col=1)
fig.update_layout(scene=dict(zaxis=dict(range=[0, 3])))
# fig.write_image("images/loss_surface_3d_with_gradients.pdf", width=800, height=800)
display(fig)

We can visualize the eigenvectors of the Hessian at the optimal point on the quadratic approximation. The eigenvectors give us the directions of curvature of the loss function. The eigenvalues give us the amount of curvature in each direction.

The contours of the quadratic approximation are ellipses given by the equation: $$ \sum_i \lambda_i \alpha_i^2 = \text{const}, $$ where $\alpha_i$ is the displacement from the optimum along the $i^{th}$ eigenvector. Solving for the extent along the $i^{th}$ eigenvector (with the other coordinates set to zero) gives: $$ \alpha_i = \pm \sqrt{\frac{\text{const}}{\lambda_i}}, $$ so high-curvature directions (large $\lambda_i$) correspond to the short axes of the ellipse and low-curvature directions to the long axes.

In [58]:
fig = go.Figure()
fig.add_contour(z=cancer['taylor_loss'].flatten(), x=cancer['w1'].flatten(), 
                y=cancer['w2'].flatten(),
                colorscale='viridis_r', opacity=0.5,
                contours=dict(start=cancer['taylor_loss'].min(), end=cancer['taylor_loss'].max(), 
                            size=(cancer['taylor_loss'].max()-cancer['taylor_loss'].min())/50),
                colorbar=dict(x=1.05, y=0.3, len=0.75))
scaling = 0.5
lam, U = np.linalg.eigh(H_star)
scale = scaling / np.sqrt(np.abs(lam))

cx, cy = cancer['grid_best']
for i, (lami, ui, si) in enumerate(zip(lam, U.T, scale), start=1):
    fig.add_scatter(
        x=[cx, cx + si*ui[0]],
        y=[cy, cy + si*ui[1]],
        mode='lines+markers',
        line=dict(width=2),
        name=f'u{i} (λ={lami:.3g})'
    )
    fig.add_scatter(
        x=[cx, cx - si*ui[0]],
        y=[cy, cy - si*ui[1]],
        mode='lines',
        line=dict(width=2, dash='dot'),
        showlegend=False
    )

fig.update_layout(title="Taylor Approximation of Loss",
                  xaxis_title='w1', yaxis_title='w2',
                  height=600, width=1200)
# fig.write_image("images/taylor_approx_loss_contours.pdf", width=1200, height=600)
fig

Applied to Sine Regression

In [59]:
# w_star = np.array([-2., -2.])
sine['w_star'] = sine['grid_best']
sine['L_star'] = sine_MSE(sine['w_star'])
sine['g_star'] = grad_sine_MSE(sine['w_star'])
sine['H_star'] = hessian_sine_MSE(sine['w_star'])
sine['lam'], sine['U'] = np.linalg.eigh(sine['H_star'])
print("w_star:", sine['w_star'])
print("L_star:", sine['L_star'])
print("g_star:", sine['g_star'])
print("H_star:\n", sine['H_star'])
print("Eigenvalues of H_star:", sine['lam'])
print("Eigenvectors of H_star:\n", sine['U'])
w_star: [1.29310345 2.44827586]
L_star: 0.23832770254453578
g_star: [ 0.02639336 -0.10396998]
H_star:
 [[ 0.94921519  3.62350418]
 [ 3.62350418 19.09410494]]
Eigenvalues of H_star: [ 0.25236948 19.79095065]
Eigenvectors of H_star:
 [[-0.98200554  0.18885209]
 [ 0.18885209  0.98200554]]

Examining the Hessian at this point, we see that it is positive definite (both eigenvalues are positive), as we expect at a local minimum. Note that, unlike the logistic regression loss, this squared error loss surface is not convex overall, so the positive definiteness here only describes the curvature near this particular minimum.

In [60]:
fig = plot_gradient(sine['w0'], sine['w1'], 
                    sine['error'], sine['dw0'], sine['dw1'], scale=0.1)

sine['taylor_loss'] = np.array([
    taylor_loss(w, sine['w_star'], sine['L_star'], sine['g_star'], sine['H_star'])
for w in sine['ws']]).reshape(sine['w1'].shape)

fig.add_surface(z=sine['taylor_loss'], x=sine['w0'], y=sine['w1'], 
                 colorscale='plasma_r', opacity=0.5, showscale=False,
                 contours=dict(z=dict(show=True, highlightcolor="white", 
                                      start=sine['taylor_loss'].min(), end=sine['taylor_loss'].max(), 
                                      size=(sine['taylor_loss'].max()-sine['taylor_loss'].min())/50)), 
                                      row=1, col=1)
fig.update_layout(scene=dict(zaxis=dict(range=[0, 3])))
# fig.write_image("images/loss_surface_3d_with_gradients.pdf", width=800, height=800)
display(fig)
In [61]:
fig = go.Figure()
fig.add_contour(z=sine['taylor_loss'].flatten(), x=sine['w0'].flatten(), 
                y=sine['w1'].flatten(),
                colorscale='viridis_r', opacity=0.5,
                contours=dict(start=sine['taylor_loss'].min(), end=sine['taylor_loss'].max(), 
                            size=(sine['taylor_loss'].max()-sine['taylor_loss'].min())/50),
                colorbar=dict(x=1.05, y=0.3, len=0.75))
scaling = 0.5
scale = scaling / np.sqrt(np.abs(sine['lam']))

cx, cy = sine['grid_best']
for i, (lami, ui, si) in enumerate(zip(sine['lam'], sine['U'].T, scale), start=1):
    fig.add_scatter(
        x=[cx, cx + si*ui[0]],
        y=[cy, cy + si*ui[1]],
        mode='lines+markers',
        line=dict(width=2),
        name=f'u{i} (λ={lami:.3g})'
    )
    fig.add_scatter(
        x=[cx, cx - si*ui[0]],
        y=[cy, cy - si*ui[1]],
        mode='lines',
        line=dict(width=2, dash='dot'),
        showlegend=False
    )

fig.update_layout(title="Taylor Approximation of Loss",
                  xaxis_title='w0', yaxis_title='w1',
                  height=600, width=1200)
# fig.write_image("images/taylor_approx_loss_contours.pdf", width=1200, height=600)
fig

Oscillating Loss Functions

Here we visualize what happens when we have a poorly conditioned quadratic loss function. This can happen when the Hessian has very different eigenvalues. In this case, gradient descent can oscillate and take a long time to converge.
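
To make this precise, for the quadratic loss $E(w) = w^\top H w$ used below the gradient is $\nabla E(w) = 2 H w$, so a gradient step with learning rate $\eta$ is

$$ w^{(k+1)} = w^{(k)} - 2 \eta H w^{(k)} = (I - 2 \eta H)\, w^{(k)}. $$

Along an eigenvector of $H$ with eigenvalue $\lambda$, the corresponding coordinate is multiplied by $(1 - 2\eta\lambda)$ at every step, so convergence requires $|1 - 2\eta\lambda| < 1$ for every eigenvalue. With $H = \mathrm{diag}(0.1, 1)$ and $\eta = 1$ these factors are $0.8$ and $-1$: the flat direction contracts slowly while the steep direction flips sign forever, which is exactly what the experiments below show.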

In [62]:
H_bad = np.array([[.1, 0], [0, 1]])
def quad(w):
    """A poorly conditioned quadratic function."""
    return np.sum((w @ H_bad) * w,1)
    
def grad_quad(w):
    """Gradient of a poorly conditioned quadratic function."""
    return np.array(w @ H_bad * 2)
In [63]:
w1,w2 = np.meshgrid(np.linspace(-5, 5, 30), np.linspace(-5, 5, 30))
ws = np.hstack([w1.reshape(-1, 1), w2.reshape(-1, 1)]) 
error =  quad(ws).reshape(w1.shape)
contour = go.Contour(x=w1.flatten(), y=w2.flatten(), z=error.flatten(), colorscale='Viridis_r',
                contours=dict(start=0, end=20, size=.5))
go.Figure(data=contour)

Suppose we start at the point (-4, 0) and use a learning rate of 1. This is a favorable starting point, because the coordinate along the high-curvature direction is already zero, and we can visualize the path taken by gradient descent on the loss surface.

In [64]:
w0 = np.array([-4., 0])
path = gradient_descent(w0, grad_quad, learning_rate=1, nepochs=50)

fig = go.Figure()
fig.add_trace(contour)
#add arrows to lines
fig.add_scatter(x=path[:, 0], y=path[:, 1],
                mode='lines+markers', line=dict(color='black', width=2), 
                marker=dict(size=10, color='black', symbol= "arrow-bar-up", angleref="previous"),
                name='Gradient Descent Path', legendgroup='Gradient Descent',
                showlegend=False)
fig.update_layout(margin=dict(l=5, r=5, t=5, b=5))
# fig.write_image("images/flat_quadratic.pdf", width=1000, height=500)
display(fig)

What happens if we start at a different point, still with a learning rate of 1?

In [65]:
w0 = np.array([-4.,-2.])
path = gradient_descent(w0, grad_quad, learning_rate=1, nepochs=50)

fig = go.Figure()
fig.add_trace(contour)
#add arrows to lines
fig.add_scatter(x=path[:, 0], y=path[:, 1],
                mode='lines+markers', line=dict(color='black', width=2), 
                marker=dict(size=10, color='black', symbol= "arrow-bar-up", angleref="previous"),
                name='Gradient Descent Path', legendgroup='Gradient Descent',
                showlegend=False)
fig.update_layout(margin=dict(l=5, r=5, t=5, b=5))
# fig.write_image("images/flat_quadratic.pdf", width=1000, height=500)
display(fig)

Or what if we decrease the learning rate slightly, to 0.9?

In [66]:
w0 = np.array([-4.,-2.])
path = gradient_descent(w0, grad_quad, learning_rate=.9, nepochs=50)

fig = go.Figure()
fig.add_trace(contour)
#add arrows to lines
fig.add_scatter(x=path[:, 0], y=path[:, 1],
                mode='lines+markers', line=dict(color='black', width=2), 
                marker=dict(size=10, color='black', symbol= "arrow-bar-up", angleref="previous"),
                name='Gradient Descent Path', legendgroup='Gradient Descent',
                showlegend=False)
fig.update_layout(margin=dict(l=5, r=5, t=5, b=5))
# fig.write_image("images/flat_quadratic.pdf", width=1000, height=500)
display(fig)

Gradient Descent with Momentum

Here we implement gradient descent with momentum. This is a simple modification to the basic gradient descent algorithm that can help with oscillations and speed up convergence.
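
Concretely, the update implemented below maintains a velocity vector $v$ that accumulates an exponentially decaying average of past gradients:

\begin{align} v^{(k+1)} &= \mu \, v^{(k)} - \eta \, \nabla E\left(w^{(k)}\right) \\ w^{(k+1)} &= w^{(k)} + v^{(k+1)} \end{align}

where $\mu$ is the momentum factor and $\eta$ is the learning rate. Setting $\mu = 0$ recovers ordinary gradient descent.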

In [67]:
def gd_momentum(w_0, gradient, 
    learning_rate=1, nepochs=10, epsilon=1e-6, momentum=0.9):
    """Gradient descent with momentum.
    Args:
        w_0: Initial weights (numpy array).
        gradient: Function to compute the gradient.
        learning_rate: Step size for each iteration.
        nepochs: Maximum number of iterations.
        epsilon: Convergence threshold.
        momentum: Momentum factor.
    Returns:
        path: Array of weights at each iteration."""
    w_old = w_0
    path = [w_old]
    v = np.zeros_like(w_old)
    for e in range(nepochs):
        g = gradient(w_old)
        v = momentum * v - learning_rate * g
        w = w_old + v
        path.append(w)
        if np.linalg.norm(w - w_old) < epsilon: break
        w_old = w
    return np.array(path)

Let's apply gradient descent with momentum to the poorly conditioned quadratic loss function from before. We will start at the point (-4,-2) and use a learning rate of .9 and a momentum factor of .3. We can visualize the path taken by gradient descent with momentum on the loss surface.

In [68]:
w0 = np.array([-4.,-2.])
path = gd_momentum(w0, grad_quad, 
                   learning_rate=.9, 
                   momentum=0.3, 
                   nepochs=50)

fig = go.Figure()
fig.add_trace(contour)
#add arrows to lines
fig.add_scatter(x=path[:, 0], y=path[:, 1],
                mode='lines+markers', line=dict(color='black', width=2), 
                marker=dict(size=10, color='black', symbol= "arrow-bar-up", angleref="previous"),
                name='Gradient Descent Path', legendgroup='Gradient Descent',
                showlegend=False)
fig.update_layout(margin=dict(l=5, r=5, t=5, b=5))
# fig.write_image("images/flat_quadratic.pdf", width=1000, height=500)
display(fig)

Try setting momentum to 0 and see what happens.

In [69]:
w0 = np.array([2.2, 2.2])
#w0 = np.array([.5, 2.5])
# w0 = np.array([1, 3])
path = gd_momentum(w0, 
                   grad_sine_MSE,
                   learning_rate=.1, 
                   momentum=0.9, 
                   nepochs=50)
errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
    sine['w0'], sine['w1'], sine['error'], 
    sine['dw0'], sine['dw1'], scale=.1)
add_solution_path(fig, errors, path)
In [ ]: