import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error as mse
from scipy import stats
import plotly.express as px
import sklearn.linear_model as lm

import warnings
warnings.filterwarnings("ignore")

import plotly.io as pio
pio.renderers.default = "notebook_connected"

from sklearn.datasets import load_breast_cancer

# Load the dataset
data_dict = load_breast_cancer()
data = pd.DataFrame(data_dict['data'], columns=data_dict['feature_names'])
data['malignant'] = (data_dict['target'] == 0)  # 1 for malignant, 0 for benign

# Display the first few rows
data.head()

# Split the data into training and testing sets
X = data[['mean radius']].to_numpy()
y = data['malignant'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))

Training set size: 455
Testing set size: 114

toy_df = pd.DataFrame({
        "x": [-4, -2, -0.5, 1, 3, 5],
        "y": [0, 0, 1, 0, 1, 1]
})
toy_df["str_y"] = toy_df["y"].astype(str)
toy_df.sort_values("x")

fig = px.scatter(toy_df, x="x", y="y", color="str_y", width=800)
fig.update_traces(marker_size=20)
fig.write_image("toy_data.png")

def sigmoid(z):
    return 1/(1+np.e**-z)

def ss_error_on_toy_data(theta):
    p_hat = sigmoid(toy_df['x'] * theta)
    return 1/2.0 * np.sum((toy_df['y'] - p_hat)**2)

w_error = pd.DataFrame({"w": np.linspace(-10, 10, 100)})
w_error["Error"] = w_error["w"].apply(ss_error_on_toy_data)
fig = px.line(w_error, x="w", y="Error", width=800,
    title="Error on Toy Classification Data")
fig.update_traces(line=dict(width=4))
fig.show()
fig.write_image("toy_data_error.png")

# Set the initial guess as w = 0
from scipy.optimize import minimize

best_w = minimize(ss_error_on_toy_data, x0 = 0)["x"][0]
best_w

np.float64(0.5446897669010056)

non_optimal_w = minimize(ss_error_on_toy_data, x0 = -5)["x"][0]
non_optimal_w

np.float64(-10.858380927026204)

fig = px.scatter(toy_df, x="x", y="y", color="str_y", width=800)
xs = np.linspace(-10, 10, 100)
fig.add_trace(go.Scatter(
    x=xs, y=sigmoid(xs * best_w), 
    mode="lines", line_color="black", 
    name=f"LR Model: w = {best_w:.2f}"))
fig.update_traces(line=dict(width=4))
fig.update_traces(marker_size=20)
fig.show()
fig.write_image("toy_data_with_model_optimal.png")

fig = px.scatter(toy_df, x="x", y="y", color="str_y", width=800)
xs = np.linspace(-10, 10, 100)
fig.add_trace(go.Scatter(
    x=xs, y=sigmoid(xs * non_optimal_w), 
    mode="lines", line_color="black", 
    name=f"LR Model: w = {best_w:.2f}"))
fig.update_traces(line=dict(width=4))
fig.update_traces(marker_size=20)
fig.show()
fig.write_image("toy_data_with_model_non_optimal.png")

p_hat_error = pd.DataFrame({"p": np.arange(0.001, 0.999, 0.01)}) 
p_hat_error["Squared Error"] = 1/2.0* (1 - p_hat_error["p"])**2
fig = px.line(p_hat_error, x="p", y="Squared Error", width=800,
        title="Squared Loss for One Individual when y=1")
fig.update_traces(line=dict(width=4))
fig.show()
fig.write_image("squared_loss_y1.png")

p_hat_error["Neg Log Error"] = -np.log(p_hat_error["p"])

fig = px.line(p_hat_error.melt(id_vars="p", value_name="Error"), 
        x="p", y="Error", color="variable", width=800,
        title="Error Comparison for One Observation when y = 1")
fig.update_traces(line=dict(width=4))
fig.show()
fig.write_image("loss_comparison_y1.png")

p_hat_error = pd.DataFrame({"p": np.arange(0.001, 0.999, 0.01)}) 
p_hat_error["Squared Error"] = 1/2.0 * (1 - (1-p_hat_error["p"]))**2
p_hat_error["Neg Log Error"] = -np.log(1 - p_hat_error["p"])
fig = px.line(p_hat_error.melt(id_vars="p", value_name="Error"), 
        x="p", y="Error", color="variable", width=800,
        title="Error Comparison for One Observation when y = 0")
fig.update_traces(line=dict(width=4))
fig.show()
fig.write_image("loss_comparison_y0.png")

def cross_entropy(y, p):
    return - y * np.log(p) - (1 - y) * np.log(1 - p)
def mean_cross_entropy_on_toy_data(w):
    p = sigmoid(toy_df["x"] * w)
    return np.mean(cross_entropy(toy_df["y"], p))

w_error["Cross-Entropy"] = w_error["w"].apply(mean_cross_entropy_on_toy_data).dropna()
fig = px.line(w_error, x="w", y="Cross-Entropy", width=800,
           title="Cross-Entropy on Toy Classification Data")
fig.update_xaxes(range=[w_error["w"].min(), 4])
fig.update_traces(line=dict(width=4))
fig.show()

toy_model = lm.LogisticRegression(C=10)

# We fit to two data points: (-1, 0) and (1, 1).
toy_model.fit([[-1], [1]], [0,1])

# Generate estimated probabilities across a range of x-values.
xtest = np.linspace(-5, 5, 1000)[:, np.newaxis]
p = toy_model.predict_proba(xtest)[:,1]

fig = px.scatter(toy_df, x="x", y="y", 
         color="str_y", symbol="str_y", 
         symbol_sequence=["circle", "cross"],
         title=f"LR Fit (slope = {toy_model.coef_[0][0]}, intercept = {toy_model.intercept_[0]})",
         render_mode="svg")
fig.update_traces(marker=dict(size=15))
fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.add_scatter(x=np.ravel(xtest), y=p, mode="lines", name="LR Model with C=10", 
                line_color="black", opacity=0.5)

toy_model = lm.LogisticRegression(C=1000)

# We fit to two data points: (-1, 0) and (1, 1).
toy_model.fit([[-1], [1]], [0,1])

# Generate estimated probabilities across a range of x-values.
xtest = np.linspace(-5, 5, 1000)[:, np.newaxis]
p = toy_model.predict_proba(xtest)[:,1]

fig = px.scatter(toy_df, x="x", y="y", 
         color="str_y", symbol="str_y", 
         symbol_sequence=["circle", "cross"],
         title=f"LR Fit (slope = {toy_model.coef_[0][0]}, intercept = {toy_model.intercept_[0]})",
         render_mode="svg")
fig.update_traces(marker=dict(size=15))
fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.add_scatter(x=np.ravel(xtest), y=p, mode="lines", name="LR Model with C=1000", 
                line_color="black", opacity=0.5)

model = lm.LogisticRegression()
model.fit(X_train, y_train)

print("Slope:", model.coef_[0][0])
print("Intercept:", model.intercept_[0])

Slope: 0.9488287268826906
Intercept: -14.039681312756318

# Preview the first 10 rows
model.predict_proba(X_train)[:10]

array([[0.99581772, 0.00418228],
       [0.0025449 , 0.9974551 ],
       [0.99520834, 0.00479166],
       [0.98082253, 0.01917747],
       [0.98775229, 0.01224771],
       [0.56062812, 0.43937188],
       [0.59074823, 0.40925177],
       [0.96062939, 0.03937061],
       [0.9180791 , 0.0819209 ],
       [0.00790344, 0.99209656]])

model.classes_

array([False,  True])

# Obtain P(Y=1|x) from the output.
p = model.predict_proba(X_train)[:, 1]

# Apply decision rule: predict Class 1 if P(Y=1|x) >= 0.5.
(p >= 0.5).astype(int)

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0])

# .predict will automatically apply a 0.5 threshold for a logistic regression model.
classes = model.predict(X_train).astype(int)

classes

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0])

# Convert X_train to a DataFrame for compatibility
X_train_df = pd.DataFrame(X_train, columns=["mean radius"])
X_train_df["Predicted Class"] = pd.Categorical(model.predict(X_train))

test_points = pd.DataFrame({"mean radius": np.linspace(5, 30, 100)})
test_points["Predicted Prob"] = model.predict_proba(test_points[["mean radius"]])[:, 1]

fig = px.scatter(X_train_df, x="mean radius", y=y_train.astype(int), color="Predicted Class", opacity=0.6)

# Add the logistic regression model predictions
fig.add_trace(go.Scatter(x=test_points["mean radius"], y=test_points["Predicted Prob"], 
                         mode="lines", name="Logistic Regression Model", 
                         line_color="black", line_width=5, line_dash="dash"))
fig.add_vline(x = -model.intercept_[0]/model.coef_[0][0], line_dash="dash", 
              line_color="black",
              annotation_text="Decision Boundary", 
              annotation_position="right")

fig = px.scatter(X_train_df, x="mean radius", y=np.zeros(len(X_train_df)),
                 symbol="Predicted Class", symbol_sequence=[ "circle-open", "cross"], 
                 color="Predicted Class", height=300, opacity=0.6)
# fig.update_traces(marker_symbol='line-ns-open')
fig.update_traces(marker_size=8)
fig.update_layout(
    yaxis=dict(showticklabels=False, showgrid=False, zeroline=False, title=""),
)

decision_boundary =  -model.intercept_[0]/model.coef_[0][0]
fig.add_vline(x = decision_boundary, line_dash="dash", 
              line_color="black",
              annotation_text="Decision Boundary", 
              annotation_position="top right")

def accuracy(X, Y):
    return np.mean(model.predict(X) == Y)

print(model.predict(X_train)[:5].astype(int))
print(y_train[:5].astype(int))
accuracy(X_train, y_train)

[0 1 0 0 0]
[0 1 0 0 0]

np.float64(0.8703296703296703)

model.score(X_train, y_train)

0.8703296703296703

from sklearn.metrics import confusion_matrix

# Be careful – confusion_matrix takes in y_true as the first parameter and y_pred as the second.
# Don't mix these up!
cm = confusion_matrix(y_train, model.predict(X_train))
cm

array([[266,  20],
       [ 39, 130]])

fig = px.imshow(cm, x=["0", "1"], y=["0", "1"],
          labels=dict(x="Predicted", y="Actual"), 
          text_auto=True, 
          color_continuous_scale="Blues", 
          width=400, height=400)
fig.update_xaxes(side="top")

Y_hat = model.predict(X_train)
tp = np.sum((Y_hat == 1) & (y_train == 1))
tn = np.sum((Y_hat == 0) & (y_train == 0))

fp = np.sum((Y_hat == 1) & (y_train == 0))
fn = np.sum((Y_hat == 0) & (y_train == 1))


print("True Positives: ", tp)
print("True Negatives: ", tn)
print("False Positives:", fp)
print("False Negatives:", fn)

True Positives:  130
True Negatives:  266
False Positives: 20
False Negatives: 39

precision = tp / (tp + fp)
precision

np.float64(0.8666666666666667)

recall = tp / (tp + fn)
recall

np.float64(0.7692307692307693)

fpr = fp/(fp + tn)
fpr

np.float64(0.06993006993006994)

tpr = tp/(tp + fn)
tpr

np.float64(0.7692307692307693)

def plot_predictions(threshold = 0.5):
    # Convert X_train to a DataFrame for compatibility
    X_train_df = pd.DataFrame(X_train, columns=["mean radius"])
    X_train_df["Predicted Class"] = model.predict_proba(X_train)[:, 1] >= threshold

    X_train_df["Predicted Class"] = pd.Categorical(X_train_df["Predicted Class"].astype(int))
    fig = px.scatter(X_train_df, 
            x="mean radius", y=y_train.astype(int), color="Predicted Class", 
            title=f"Logistic Regression Predictions (Threshold = {threshold})")
    # Add the logistic regression model predictions
    # Make the data points for the LR model curve
    test_points = pd.DataFrame({"mean radius": np.linspace(5, 30, 100)})
    test_points["Predicted Prob"] = model.predict_proba(test_points)[:, 1]
    fig.add_trace(go.Scatter(x=test_points["mean radius"], y=test_points["Predicted Prob"], 
                            mode="lines", name="Logistic Regression Model", 
                            line_color="black", line_width=5, line_dash="dash"))
    decision_boundary = (-np.log(1/threshold - 1) - model.intercept_[0])/model.coef_[0][0]
    fig.add_vline(x = decision_boundary, line_dash="dash", line_color="black",
                  annotation_text="Decision Boundary", annotation_position="right")
    return fig

plot_predictions(0.5)

plot_predictions(0.25)

plot_predictions(0.75)

# Define performance metrics dependent on the threshold value.
def predict_threshold(model, X, T): 
    prob_one = model.predict_proba(X)[:, 1]
    return (prob_one >= T).astype(int)

def accuracy_threshold(X, Y, T):
    return np.mean(predict_threshold(model, X, T) == Y)

def precision_threshold(X, Y, T):
    Y_hat = predict_threshold(model, X, T)
    denominator = np.sum(Y_hat == 1)
    if denominator == 0:
        denominator = np.nan
    return np.sum((Y_hat == 1) & (Y == 1)) / denominator
    
def recall_threshold(X, Y, T):
    Y_hat = predict_threshold(model, X, T)
    return np.sum((Y_hat == 1) & (Y == 1)) / np.sum(Y == 1)

def tpr_threshold(X, Y, T): # Same as recall
    Y_hat = predict_threshold(model, X, T)
    return np.sum((Y_hat == 1) & (Y == 1)) / np.sum(Y == 1)

def fpr_threshold(X, Y, T):
    Y_hat = predict_threshold(model, X, T)
    return np.sum((Y_hat == 1) & (Y == 0)) / np.sum(Y == 0)

metrics = pd.DataFrame()
metrics["Threshold"] = np.linspace(0, 1, 1000)
metrics["Accuracy"] = [accuracy_threshold(X_train, y_train, t) for t in metrics["Threshold"]]
metrics["Precision"] = [precision_threshold(X_train, y_train, t) for t in metrics["Threshold"]]
metrics["Recall"] = [recall_threshold(X_train, y_train, t) for t in metrics["Threshold"]]
metrics.head()

fig = px.line(metrics, 
        x="Threshold", y="Accuracy",
        title="Accuracy vs. Threshold",
        render_mode="svg", width=600, height=600)

fig.add_scatter(x=[metrics.loc[metrics['Accuracy'].idxmax(), 'Threshold']], y=[metrics.loc[metrics['Accuracy'].idxmax(), 'Accuracy']], 
                mode='markers', marker=dict(size=10, color='red'),
                name=f"Accuracy Max {metrics.loc[metrics['Accuracy'].idxmax(), 'Accuracy']:.5f}",)
fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.show()

# The threshold that maximizes accuracy.
metrics.sort_values("Accuracy", ascending=False).head()

fig = px.line(metrics, 
        x="Threshold", y=["Accuracy", "Precision", "Recall"],
        title="Performance Metrics vs. Threshold",
        render_mode="svg", height=600, width=600)
fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.show()

fig = px.line(metrics, x="Recall", y="Precision",
        title="Precision vs. Recall",
        width=600, height=600,
        render_mode="svg")
fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.show()

metrics["F1"] = (2 * metrics["Precision"] * metrics["Recall"] 
                     / (metrics["Precision"] + metrics["Recall"]))
ind = metrics['F1'].idxmax()
metrics.loc[ind,:]

Threshold    0.548549
Accuracy     0.879121
Precision    0.919118
Recall       0.739645
F1           0.819672
Name: 548, dtype: float64

fig = px.line(metrics, x="Threshold", y="F1",
              title="Finding F1 Score Maximum",
              render_mode="svg",
              height=600, width=600)

fig.add_scatter(x=[metrics.loc[ind, 'Threshold']], y=[metrics.loc[ind, 'F1']], 
                mode='markers', marker=dict(size=10, color='red'),
                name=f"F1 Max {metrics.loc[ind, 'Threshold']:.5f}",)

fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.show()

fig = px.line(metrics, x="Recall", y="Precision",
              title="Precision vs. Recall", width=600, height=600,
              render_mode="svg")
fig.add_scatter(x=[metrics.loc[ind, 'Recall']], y=[metrics.loc[ind, 'Precision']], 
                mode='markers', marker=dict(size=10, color='red'),
                name=f"F1 Max {metrics.loc[ind, 'Threshold']:.5f}")
fig.update_layout(legend=dict(x=.5, y=.1))
fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.show()

metrics["TPR"] = [tpr_threshold(X_train, y_train, t) for t in metrics["Threshold"]]
metrics["FPR"] = [fpr_threshold(X_train, y_train, t) for t in metrics["Threshold"]]

fig = px.line(metrics, x="Threshold", y=["TPR", "FPR", "Accuracy"],
        render_mode="svg", width=600, height=600)
fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.show()

fig = px.line(metrics, x="FPR", y="TPR", title="ROC Curve", 
        width=600, height=600,
        render_mode="svg")
fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.show()

fig = px.line(metrics, x="FPR", y="TPR", title="ROC Curve", 
              width=600, height=600,
              render_mode="svg")
fig.add_scatter(x=[0,0,1], y=[0,1,1], mode='lines', 
                line_dash='dash', line_color='black',
                name="Perfect Classifier")
# move the legend inside the plot
fig.update_layout(legend=dict(x=.5, y=.1))
fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.show()

	mean radius	mean texture	mean perimeter	mean area	mean smoothness	mean compactness	mean concavity	mean concave points	mean symmetry	mean fractal dimension	...	worst texture	worst perimeter	worst area	worst smoothness	worst compactness	worst concavity	worst concave points	worst symmetry	worst fractal dimension	malignant
0	17.99	10.38	122.80	1001.0	0.11840	0.27760	0.3001	0.14710	0.2419	0.07871	...	17.33	184.60	2019.0	0.1622	0.6656	0.7119	0.2654	0.4601	0.11890	True
1	20.57	17.77	132.90	1326.0	0.08474	0.07864	0.0869	0.07017	0.1812	0.05667	...	23.41	158.80	1956.0	0.1238	0.1866	0.2416	0.1860	0.2750	0.08902	True
2	19.69	21.25	130.00	1203.0	0.10960	0.15990	0.1974	0.12790	0.2069	0.05999	...	25.53	152.50	1709.0	0.1444	0.4245	0.4504	0.2430	0.3613	0.08758	True
3	11.42	20.38	77.58	386.1	0.14250	0.28390	0.2414	0.10520	0.2597	0.09744	...	26.50	98.87	567.7	0.2098	0.8663	0.6869	0.2575	0.6638	0.17300	True
4	20.29	14.34	135.10	1297.0	0.10030	0.13280	0.1980	0.10430	0.1809	0.05883	...	16.67	152.20	1575.0	0.1374	0.2050	0.4000	0.1625	0.2364	0.07678	True

	x	y	str_y
0	-4.0	0	0
1	-2.0	0	0
2	-0.5	1	1
3	1.0	0	0
4	3.0	1	1
5	5.0	1	1

	Threshold	Accuracy	Precision	Recall
0	0.000000	0.371429	0.371429	1.0
1	0.001001	0.371429	0.371429	1.0
2	0.002002	0.380220	0.374723	1.0
3	0.003003	0.391209	0.378924	1.0
4	0.004004	0.402198	0.383220	1.0

Lecture 11: Logistic Regression – CS 189, Fall 2025

Build a Logistic Regression Model¶

Decision Boundaries¶

🎯 Performance Metrics¶

Confusion matrix¶

Precision and Recall¶

Precision and Recall¶

True and False Positive Rates¶

🎛️ Adjusting the Classification Threshold¶

Thresholds and Performance Metrics¶

Precision-Recall Curves¶

ROC Curves¶

	Threshold	Accuracy	Precision	Recall
556	0.556557	0.879121	0.919118	0.739645
552	0.552553	0.879121	0.919118	0.739645
559	0.559560	0.879121	0.919118	0.739645
558	0.558559	0.879121	0.919118	0.739645
557	0.557558	0.879121	0.919118	0.739645