Lecture 18 – CS 189, Fall 2025
In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import torchvision as tv
import torchvision.transforms as transforms
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import random as rn
from torchinfo import summary
from types import SimpleNamespace
from torchvision.utils import make_grid
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
style.use('fivethirtyeight')
sns.set(style='whitegrid', color_codes=True)
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
Using device: cpu
Building CNN from Scratch in PyTorch¶
In [2]:
mnist_tf = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])
In [3]:
# Load MNIST dataset
x_train_mnist = tv.datasets.MNIST(root='./data', train=True,
download=True, transform=mnist_tf)
x_test_mnist = tv.datasets.MNIST(root='./data', train=False,
download=True, transform=mnist_tf)
image_size = x_train_mnist[0][0].shape[1]
print(f"MNIST training data size: {len(x_train_mnist)} of size {image_size}x{image_size}")
print(f"MNIST test data size: {len(x_test_mnist)} of size {image_size}x{image_size}")
MNIST training data size: 60000 of size 28x28 MNIST test data size: 10000 of size 28x28
In [4]:
train_loader_mnist = DataLoader(x_train_mnist, batch_size=128, shuffle=True)
test_loader_mnist = DataLoader(x_test_mnist, batch_size=256, shuffle=False)
In [5]:
imgs = [x_train_mnist[i] for i in range(16)]
labels = [i for i in range(16)]
fig, axes = plt.subplots(4, 4, figsize=(4, 4))
for ax, img, lab in zip(axes.flatten(), imgs, labels):
ax.imshow(img[0].squeeze().numpy(), cmap='gray')
ax.set_title(str(lab))
ax.axis('off')
plt.tight_layout()
plt.show()
In [6]:
class SmallCNN(nn.Module):
def __init__(self):
super().__init__()
self.features = nn.Sequential(
nn.Conv2d(1, 8, 3, padding=1),
nn.ReLU(),
nn.MaxPool2d(2),
nn.Conv2d(8, 64, 3, padding=1),
nn.ReLU(),
nn.MaxPool2d(2),
)
self.classifier = nn.Sequential(
nn.Flatten(),
nn.Linear(64*7*7, 256), # Adjusted based on the output size of the features block
nn.ReLU(),
nn.Dropout(0.25),
nn.Linear(256, 10)
)
def forward(self, x):
x = self.features(x)
return self.classifier(x)
model = SmallCNN().to(device)
summary(model, input_size=(1, 1, 28, 28),
col_names=("input_size","output_size","num_params","kernel_size"),
depth=4)
Out[6]:
============================================================================================================================================ Layer (type:depth-idx) Input Shape Output Shape Param # Kernel Shape ============================================================================================================================================ SmallCNN [1, 1, 28, 28] [1, 10] -- -- ├─Sequential: 1-1 [1, 1, 28, 28] [1, 64, 7, 7] -- -- │ └─Conv2d: 2-1 [1, 1, 28, 28] [1, 8, 28, 28] 80 [3, 3] │ └─ReLU: 2-2 [1, 8, 28, 28] [1, 8, 28, 28] -- -- │ └─MaxPool2d: 2-3 [1, 8, 28, 28] [1, 8, 14, 14] -- 2 │ └─Conv2d: 2-4 [1, 8, 14, 14] [1, 64, 14, 14] 4,672 [3, 3] │ └─ReLU: 2-5 [1, 64, 14, 14] [1, 64, 14, 14] -- -- │ └─MaxPool2d: 2-6 [1, 64, 14, 14] [1, 64, 7, 7] -- 2 ├─Sequential: 1-2 [1, 64, 7, 7] [1, 10] -- -- │ └─Flatten: 2-7 [1, 64, 7, 7] [1, 3136] -- -- │ └─Linear: 2-8 [1, 3136] [1, 256] 803,072 -- │ └─ReLU: 2-9 [1, 256] [1, 256] -- -- │ └─Dropout: 2-10 [1, 256] [1, 256] -- -- │ └─Linear: 2-11 [1, 256] [1, 10] 2,570 -- ============================================================================================================================================ Total params: 810,394 Trainable params: 810,394 Non-trainable params: 0 Total mult-adds (M): 1.78 ============================================================================================================================================ Input size (MB): 0.00 Forward/backward pass size (MB): 0.15 Params size (MB): 3.24 Estimated Total Size (MB): 3.40 ============================================================================================================================================
In [7]:
def train_one_epoch(model, loader, opt, loss_fn):
model.train()
total, correct, running_loss = 0, 0, 0.0
for xb, yb in loader:
xb, yb = xb.to(device), yb.to(device)
opt.zero_grad()
logits = model(xb)
loss = loss_fn(logits, yb)
loss.backward()
opt.step()
running_loss += loss.item()*xb.size(0)
preds = logits.argmax(dim=1)
correct += (preds==yb).sum().item()
total += xb.size(0)
return running_loss/total, correct/total
@torch.no_grad()
def evaluate(model, loader, loss_fn):
model.eval()
total, correct, running_loss = 0, 0, 0.0
for xb, yb in loader:
xb, yb = xb.to(device), yb.to(device)
logits = model(xb)
loss = loss_fn(logits, yb)
running_loss += loss.item()*xb.size(0)
preds = logits.argmax(dim=1)
correct += (preds==yb).sum().item()
total += xb.size(0)
return running_loss/total, correct/total
history = {'epoch': [], 'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}
if tv is not None:
model = SmallCNN().to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()
EPOCHS = 4
for epoch in range(1, EPOCHS+1):
tl, ta = train_one_epoch(model, train_loader_mnist, opt, loss_fn)
vl, va = evaluate(model, test_loader_mnist, loss_fn)
history['epoch'].append(epoch)
history['train_loss'].append(tl); history['val_loss'].append(vl)
history['train_acc'].append(ta); history['val_acc'].append(va)
print(f'E{epoch}: train_loss={tl:.4f} val_loss={vl:.4f} train_acc={ta:.3f} val_acc={va:.3f}')
else:
print("torchvision not available. Skipping training.")
E1: train_loss=0.2034 val_loss=0.0485 train_acc=0.938 val_acc=0.985
E2: train_loss=0.0591 val_loss=0.0455 train_acc=0.982 val_acc=0.984
E3: train_loss=0.0412 val_loss=0.0330 train_acc=0.987 val_acc=0.988
E4: train_loss=0.0313 val_loss=0.0282 train_acc=0.990 val_acc=0.990
In [8]:
if history and 'train_loss' in history and 'val_loss' in history:
plt.figure(); plt.plot(history['epoch'], history['train_loss']); plt.plot(history['epoch'], history['val_loss']); plt.legend(['train','val']); plt.title('Loss'); plt.xlabel('epoch'); plt.show()
if history and 'train_acc' in history and 'val_acc' in history:
plt.figure(); plt.plot(history['epoch'], history['train_acc']); plt.plot(history['epoch'], history['val_acc']); plt.legend(['train','val']); plt.title('Accuracy'); plt.xlabel('epoch'); plt.show()
In [9]:
if tv is not None:
model.eval()
all_true, all_pred = [], []
with torch.no_grad():
for xb, yb in test_loader_mnist:
xb, yb = xb.to(device), yb.to(device)
logits = model(xb)
preds = logits.argmax(dim=1)
all_pred.extend(preds.cpu().numpy())
all_true.extend(yb.cpu().numpy())
# Calculate accuracy
all_true = np.array(all_true)
all_pred = np.array(all_pred)
accuracy = (all_true == all_pred).sum() / len(all_true)
print(f"Accuracy on test data: {accuracy:.4f}")
cm = confusion_matrix(all_true, all_pred, labels=list(range(10)))
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=list(range(10)), yticklabels=list(range(10)))
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()
else:
print("torchvision not available. Cannot evaluate predictions.")
Accuracy on test data: 0.9903
AlexNet¶
In [10]:
alex_w = tv.models.AlexNet_Weights.IMAGENET1K_V1
MODELS = {
"alexnet": SimpleNamespace(
ctor=lambda: tv.models.alexnet(weights=alex_w).to(device).eval(),
weights=alex_w,
act_layers=["features.0", "features.3"], # conv blocks
maxact_layer="features.10",
arch="alexnet"
),
}
In [11]:
def load_image(path, weights):
img = Image.open(path).convert('RGB')
return weights.transforms()(img).unsqueeze(0).to(device)
def show(tensor, title=None):
if tensor.ndim == 4:
grid = make_grid(tensor, nrow=int(np.ceil(np.sqrt(tensor.size(0)))))
arr = grid.permute(1,2,0).detach().cpu().numpy()
else:
arr = tensor.permute(1,2,0).detach().cpu().numpy()
plt.figure(figsize=(6,6))
plt.imshow(np.clip(arr, 0, 1))
plt.axis('off')
if title: plt.title(title)
plt.show()
# Robust per-item normalization (avoids tuple-dim min/max issues)
def _norm_per_item(t):
# t shape: [N, ...]
if hasattr(torch, "amin"):
tmin = torch.amin(t, dim=tuple(range(1, t.ndim)), keepdim=True)
tmax = torch.amax(t, dim=tuple(range(1, t.ndim)), keepdim=True)
else:
flat = t.view(t.size(0), -1)
tmin = flat.min(dim=1, keepdim=True)[0].view(-1, *([1]*(t.ndim-1)))
tmax = flat.max(dim=1, keepdim=True)[0].view(-1, *([1]*(t.ndim-1)))
return (t - tmin) / (tmax - tmin + 1e-8)
# Helper: resolve "features.23" dotted path to a module
def resolve_module(root, name):
mod = root
for part in name.split('.'):
if part.isdigit():
mod = mod[int(part)]
else:
mod = getattr(mod, part)
return mod
def first_conv_module(model):
for m in model.modules():
if isinstance(m, nn.Conv2d):
return m
raise RuntimeError("No Conv2d found.")
In [12]:
def visualize_first_layer_filters(model, max_filters=64, label="model"):
conv1 = None
# Try common entry points first
for attr in ["conv1", "features"]:
if hasattr(model, attr):
m = getattr(model, attr)
if isinstance(m, nn.Conv2d):
conv1 = m
break
# If Sequential, first layer likely Conv2d
if isinstance(m, nn.Sequential):
for x in m:
if isinstance(x, nn.Conv2d):
conv1 = x; break
if conv1 is not None: break
if conv1 is None:
conv1 = first_conv_module(model)
w = conv1.weight.data.clone().cpu() # [out, in, k, k]
w = _norm_per_item(w)
show(w[:max_filters], title=f"{label}: first-layer conv filters")
In [13]:
def visualize_activations(model, img_tensor, layer_names, label="model"):
feats, hooks = {}, []
def hook(name):
return lambda m, i, o: feats.setdefault(name, o.detach().cpu())
# Register hooks
for name in layer_names:
try:
module = resolve_module(model, name)
hooks.append(module.register_forward_hook(hook(name)))
except Exception as e:
print(f"[warn] could not hook '{name}': {e}")
with torch.no_grad():
_ = model(img_tensor)
for h in hooks: h.remove()
for name, feat in feats.items():
fmap = feat[0]
if fmap.ndim != 3:
print(f"[info] {label}:{name} is non-spatial (shape {feat.shape}), skipping grid")
continue
C = min(64, fmap.size(0))
fm = fmap[:C]
fm = _norm_per_item(fm.unsqueeze(1)).squeeze(1)
show(fm.unsqueeze(1), title=f"{label}: activations @ {name}")
In [14]:
@torch.no_grad()
def predict_probs(model, x):
logits = model(x)
return F.softmax(logits, dim=1)
def _resize_heat_with_torch(heat, H, W):
t = torch.from_numpy(heat)[None, None]
t = F.interpolate(t.float(), size=(H, W), mode="bilinear", align_corners=False)
return t[0,0].numpy()
def occlusion_heatmap(model, img_tensor, idx_to_label=None, target_class=None, patch=32, stride=16, baseline=0.0, label="model"):
model.eval()
x = img_tensor.clone()
probs = predict_probs(model, x)[0]
if target_class is None:
target_class = probs.argmax().item()
base_p = probs[target_class].item()
_, _, H, W = x.shape
heat = np.zeros(((H - patch)//stride + 1, (W - patch)//stride + 1), dtype=np.float32)
for i, y in enumerate(range(0, H - patch + 1, stride)):
for j, z in enumerate(range(0, W - patch + 1, stride)):
x_ = x.clone()
x_[:,:, y:y+patch, z:z+patch] = baseline
p = predict_probs(model, x_)[0, target_class].item()
heat[i, j] = base_p - p
heat_resized = _resize_heat_with_torch(heat, H, W)
# quick unnormalize for show (using Imagenet stats)
im = x[0].detach().cpu()
im = (im * torch.tensor([0.229,0.224,0.225])[:,None,None] + torch.tensor([0.485,0.456,0.406])[:,None,None]).permute(1,2,0).numpy()
plt.figure(figsize=(6,6)); plt.imshow(np.clip(im,0,1)); plt.imshow(heat_resized, alpha=0.5); plt.axis('off')
if idx_to_label:
tname = idx_to_label[target_class]
else:
tname = str(target_class)
plt.title(f"{label}: occlusion (target='{tname}', base p={base_p:.3f})")
plt.show()
return heat_resized
In [15]:
def saliency_map(model, img_tensor, target_class=None, label="model"):
model.eval()
x = img_tensor.clone().requires_grad_(True)
logits = model(x)
if target_class is None:
target_class = logits.argmax(dim=1).item()
loss = logits[0, target_class]
model.zero_grad()
loss.backward()
g = x.grad.detach()[0] # [3,H,W]
sal = g.abs().max(dim=0)[0] # [H,W]
sal = (sal - sal.min())/(sal.max()-sal.min()+1e-8)
plt.figure(figsize=(6,6)); plt.imshow(sal.cpu(), cmap='gray'); plt.axis('off'); plt.title(f"{label}: saliency")
plt.show()
return sal
class GuidedBackpropReLU(nn.Module):
def forward(self, x):
self.saved = x
return F.relu(x)
def backward_hook(self, module, grad_in, grad_out):
positive_grad = torch.clamp(grad_out[0], min=0.0)
positive_mask = (self.saved > 0).float()
return (positive_grad * positive_mask,)
def guided_backprop(model_ctor, weights, img_tensor, target_class=None, label="model"):
# Create a fresh copy to freely patch ReLUs
gb_model = model_ctor().to(device).eval()
# Swap all ReLUs
relus = []
for name, module in gb_model.named_modules():
if isinstance(module, nn.ReLU):
relu = GuidedBackpropReLU()
relus.append(relu)
parent = gb_model
*parents, leaf = name.split('.')
for p in parents:
parent = getattr(parent, p)
setattr(parent, leaf, relu)
x = img_tensor.clone().requires_grad_(True)
logits = gb_model(x)
if target_class is None:
target_class = logits.argmax(dim=1).item()
loss = logits[0, target_class]
gb_model.zero_grad()
hooks = [relu.register_full_backward_hook(relu.backward_hook) for relu in relus]
loss.backward()
for h in hooks: h.remove()
g = x.grad.detach()[0]
g = (g - g.min())/(g.max()-g.min()+1e-8)
g = g.permute(1,2,0).cpu().numpy()
plt.figure(figsize=(6,6)); plt.imshow(g); plt.axis('off'); plt.title(f"{label}: guided backprop")
plt.show()
return g
In [16]:
class FeatExtractor(nn.Module):
"""Return a fixed-dim feature vector (penultimate-ish) for each arch."""
def __init__(self, model, arch):
super().__init__()
self.arch = arch
self.model = model
if arch == "resnet":
# body up to layer4 GAP
self.body = nn.Sequential(
model.conv1, model.bn1, model.relu, model.maxpool,
model.layer1, model.layer2, model.layer3, model.layer4,
nn.AdaptiveAvgPool2d((1,1))
)
self.out_dim = model.fc.in_features
elif arch == "vgg" or arch == "alexnet":
self.features = model.features
self.pool = nn.AdaptiveAvgPool2d((7,7)) # match VGG/Alex input to classifier
# classifier: take everything except final Linear
self.prefix = nn.Sequential(*list(model.classifier.children())[:-1])
# out_dim is the in_features of final Linear
last_linear = list(model.classifier.children())[-1]
self.out_dim = last_linear.in_features
else:
raise ValueError("Unknown arch")
def forward(self, x):
if self.arch == "resnet":
x = self.body(x).flatten(1)
return x
else:
x = self.features(x)
x = self.pool(x)
x = torch.flatten(x, 1)
x = self.prefix(x)
return x
In [17]:
def max_activating_images(model, dataset, layer_name, topk=16, label="model"):
target = resolve_module(model, layer_name)
acts = []
imgs_cache = []
def fhook(m, i, o):
if o.ndim == 4:
a = o.detach().cpu().mean(dim=(2,3)) # GAP over H,W → [B, C]
else:
a = o.detach().cpu()
acts.append(a)
h = target.register_forward_hook(fhook)
loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=False, num_workers=2)
with torch.no_grad():
for xb, yb in loader:
imgs_cache.append(xb)
_ = model(xb.to(device))
h.remove()
A = torch.cat(acts, 0).numpy() # [N, C]
imgs_cache = torch.cat(imgs_cache, 0)
# Choose an arbitrary channel to inspect (customize this)
channel = min(5, A.shape[1]-1)
idxs = np.argsort(-A[:, channel])[:topk]
grid = imgs_cache[idxs]
# unnormalize for viewing (ImageNet stats)
grid = grid*torch.tensor([0.229,0.224,0.225])[None,:,None,None] + torch.tensor([0.485,0.456,0.406])[None,:,None,None]
grid = grid.clamp(0,1)
show(grid, title=f"{label}: top-{topk} images for channel {channel} @ {layer_name}")
In [18]:
img_path = "pishi.png"
from PIL import Image
models = {}
for name, cfg in MODELS.items():
m = cfg.ctor()
models[name] = SimpleNamespace(
model=m, weights=cfg.weights, act_layers=cfg.act_layers,
maxact_layer=cfg.maxact_layer, arch=cfg.arch,
idx_to_label=cfg.weights.meta.get("categories", None)
)
# Ensure the input image is resized to 224x224
images = {name: load_image(img_path, cfg.weights) for name, cfg in models.items()}
for name, img in images.items():
assert img.shape[-2:] == (224, 224), f"Image for model {name} is not resized to 224x224"
# 1) First-layer filters comparison
for name, cfg in models.items():
visualize_first_layer_filters(cfg.model, max_filters=64, label=name)
# 2) Activation maps at key layers
for name, cfg in models.items():
visualize_activations(cfg.model, images[name], cfg.act_layers, label=name)
# 3) Occlusion sensitivity (same target class per model by default)
for name, cfg in models.items():
_ = occlusion_heatmap(cfg.model, images[name], idx_to_label=cfg.idx_to_label, patch=32, stride=16, label=name)
# 4) Saliency and Guided Backprop
for name, cfg in models.items():
_ = saliency_map(cfg.model, images[name], label=name)
_ = guided_backprop(MODELS[name].ctor, cfg.weights, images[name], label=name)
Transfer Learning on CIFAR Data¶
In [19]:
# Download and load training data
cifar_tf = transforms.Compose([
transforms.ToTensor(),
])
data_cifar100 = tv.datasets.CIFAR100(root='./data', train=True,
download=True, transform=cifar_tf)
X_cifar = data_cifar100.data # numpy array (50000, 32, 32, 3)
y_cifar = np.array(data_cifar100.targets) # numpy array (50000,)
print('X shape: ', X_cifar.shape)
print('y shape:', y_cifar.shape)
X shape: (50000, 32, 32, 3) y shape: (50000,)
In [20]:
fig, ax = plt.subplots(5, 4)
fig.set_size_inches(15, 15)
for i in range(5):
for j in range(4):
l = rn.randint(0, len(y_cifar))
ax[i, j].imshow(X_cifar[l])
ax[i, j].set_title('Label: ' + str(y_cifar[l]))
ax[i, j].grid(False)
ax[i, j].set_xticks([])
ax[i, j].set_yticks([])
plt.tight_layout()
plt.show()
In [21]:
X_cifar = X_cifar / 255.0
# Convert to PyTorch tensors and reshape to (N, C, H, W) format
X_cifar_tensor = torch.FloatTensor(X_cifar).permute(0, 3, 1, 2) # (50000, 3, 32, 32)
y_cifar_tensor = torch.LongTensor(y_cifar)
print('X_tensor shape: ', X_cifar_tensor.shape)
print('y_tensor shape:', y_cifar_tensor.shape)
X_tensor shape: torch.Size([50000, 3, 32, 32]) y_tensor shape: torch.Size([50000])
In [22]:
# Create dataset and split into train/val
dataset_cifar = TensorDataset(X_cifar_tensor, y_cifar_tensor)
train_size_cifar_100 = int(0.9 * len(dataset_cifar))
val_size_cifar = len(dataset_cifar) - train_size_cifar_100
train_dataset_cifar, val_dataset_cifar = random_split(dataset_cifar, [train_size_cifar_100, val_size_cifar])
In [23]:
# Create data loaders
batch_size = 200
train_loader_cifar = DataLoader(train_dataset_cifar, batch_size=batch_size, shuffle=True)
val_loader_cifar = DataLoader(val_dataset_cifar, batch_size=batch_size, shuffle=False)
In [24]:
class CNN_Demo(nn.Module):
def __init__(self, num_classes=100):
super(CNN_Demo, self).__init__()
# First convolutional block
self.conv1 = nn.Conv2d(in_channels=3, out_channels=16,
kernel_size=3, padding=1) # 'same' padding
self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
self.bn1 = nn.BatchNorm2d(16)
# Second convolutional block
self.conv2 = nn.Conv2d(in_channels=16, out_channels=32,
kernel_size=3, padding=1)
self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
# Fully connected layers
self.fc1 = nn.Linear(32 * 8 * 8, 200) # After 2 pooling layers: 32->16->8
self.fc2 = nn.Linear(200, num_classes)
def forward(self, x):
# First conv block
x = F.relu(self.conv1(x))
x = self.pool1(x)
x = self.bn1(x)
# Second conv block
x = F.relu(self.conv2(x))
x = self.pool2(x)
# Flatten
x = x.view(x.size(0), -1)
# Fully connected layers
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
In [25]:
# Create model instance
model = CNN_Demo(num_classes=100).to(device)
In [26]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adagrad(model.parameters(), lr=0.001)
epochs = 2
summary(model, input_size=(1, 3, 32, 32),
col_names=("input_size","output_size","num_params","kernel_size"),
depth=4)
Out[26]:
============================================================================================================================================ Layer (type:depth-idx) Input Shape Output Shape Param # Kernel Shape ============================================================================================================================================ CNN_Demo [1, 3, 32, 32] [1, 100] -- -- ├─Conv2d: 1-1 [1, 3, 32, 32] [1, 16, 32, 32] 448 [3, 3] ├─MaxPool2d: 1-2 [1, 16, 32, 32] [1, 16, 16, 16] -- 2 ├─BatchNorm2d: 1-3 [1, 16, 16, 16] [1, 16, 16, 16] 32 -- ├─Conv2d: 1-4 [1, 16, 16, 16] [1, 32, 16, 16] 4,640 [3, 3] ├─MaxPool2d: 1-5 [1, 32, 16, 16] [1, 32, 8, 8] -- 2 ├─Linear: 1-6 [1, 2048] [1, 200] 409,800 -- ├─Linear: 1-7 [1, 200] [1, 100] 20,100 -- ============================================================================================================================================ Total params: 435,020 Trainable params: 435,020 Non-trainable params: 0 Total mult-adds (M): 2.08 ============================================================================================================================================ Input size (MB): 0.01 Forward/backward pass size (MB): 0.23 Params size (MB): 1.74 Estimated Total Size (MB): 1.98 ============================================================================================================================================
In [27]:
# Training loop
history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
for epoch in range(epochs):
# Training phase
model.train()
train_loss = 0.0
train_correct = 0
train_total = 0
for inputs, labels in train_loader_cifar:
inputs, labels = inputs.to(device), labels.to(device)
# Zero the parameter gradients
optimizer.zero_grad()
# Forward pass
outputs = model(inputs)
loss = criterion(outputs, labels)
# Backward pass and optimize
loss.backward()
optimizer.step()
# Statistics
train_loss += loss.item() * inputs.size(0)
_, predicted = torch.max(outputs.data, 1)
train_total += labels.size(0)
train_correct += (predicted == labels).sum().item()
train_loss = train_loss / train_total
train_acc = train_correct / train_total
# Validation phase
model.eval()
val_loss = 0.0
val_correct = 0
val_total = 0
with torch.no_grad():
for inputs, labels in val_loader_cifar:
inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs)
loss = criterion(outputs, labels)
val_loss += loss.item() * inputs.size(0)
_, predicted = torch.max(outputs.data, 1)
val_total += labels.size(0)
val_correct += (predicted == labels).sum().item()
val_loss = val_loss / val_total
val_acc = val_correct / val_total
# Store history
history['train_loss'].append(train_loss)
history['train_acc'].append(train_acc)
history['val_loss'].append(val_loss)
history['val_acc'].append(val_acc)
print(f'Epoch [{epoch+1}/{epochs}] - '
f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, '
f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
Epoch [1/2] - Train Loss: 3.9809, Train Acc: 0.1101, Val Loss: 3.7447, Val Acc: 0.1572
Epoch [2/2] - Train Loss: 3.5960, Train Acc: 0.1730, Val Loss: 3.5809, Val Acc: 0.1738
In [28]:
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history['train_loss'], label='train')
plt.plot(history['val_loss'], label='val')
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history['train_acc'], label='train')
plt.plot(history['val_acc'], label='val')
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend()
plt.tight_layout()
plt.show()
In [29]:
from torchvision.models import vgg16, VGG16_Weights
# Load pretrained VGG16
base_model = vgg16(weights=VGG16_Weights.IMAGENET1K_V1)
# VGG16 expects 224x224 images, but CIFAR-100 is 32x32
# We'll modify the architecture to work with 32x32 images
class VGG16Transfer(nn.Module):
def __init__(self, num_classes=100):
super(VGG16Transfer, self).__init__()
# Use VGG16 features (convolutional layers)
vgg = vgg16(weights=VGG16_Weights.IMAGENET1K_V1)
self.features = vgg.features
# Modify classifier for CIFAR-100
# After VGG features with 32x32 input, we get 1x1x512
self.flatten = nn.Flatten()
self.fc1 = nn.Linear(512, 200)
self.relu = nn.ReLU()
self.fc2 = nn.Linear(200, num_classes)
def forward(self, x):
x = self.features(x)
x = self.flatten(x)
x = self.relu(self.fc1(x))
x = self.fc2(x)
return x
In [30]:
# Create transfer learning model
transfer_model = VGG16Transfer(num_classes=100).to(device)
# Freeze the feature extraction layers
for param in transfer_model.features.parameters():
param.requires_grad = False
# Only train the classifier layers
optimizer_transfer = optim.Adagrad(transfer_model.parameters(), lr=0.001)
criterion_transfer = nn.CrossEntropyLoss()
In [31]:
epochs_transfer = 2
history_transfer = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
for epoch in range(epochs_transfer):
# Training phase
transfer_model.train()
train_loss = 0.0
train_correct = 0
train_total = 0
for inputs, labels in train_loader_cifar:
inputs, labels = inputs.to(device), labels.to(device)
optimizer_transfer.zero_grad()
outputs = transfer_model(inputs)
loss = criterion_transfer(outputs, labels)
loss.backward()
optimizer_transfer.step()
train_loss += loss.item() * inputs.size(0)
_, predicted = torch.max(outputs.data, 1)
train_total += labels.size(0)
train_correct += (predicted == labels).sum().item()
train_loss = train_loss / train_total
train_acc = train_correct / train_total
# Validation phase
transfer_model.eval()
val_loss = 0.0
val_correct = 0
val_total = 0
with torch.no_grad():
for inputs, labels in val_loader_cifar:
inputs, labels = inputs.to(device), labels.to(device)
outputs = transfer_model(inputs)
loss = criterion_transfer(outputs, labels)
val_loss += loss.item() * inputs.size(0)
_, predicted = torch.max(outputs.data, 1)
val_total += labels.size(0)
val_correct += (predicted == labels).sum().item()
val_loss = val_loss / val_total
val_acc = val_correct / val_total
history_transfer['train_loss'].append(train_loss)
history_transfer['train_acc'].append(train_acc)
history_transfer['val_loss'].append(val_loss)
history_transfer['val_acc'].append(val_acc)
print(f'Epoch [{epoch+1}/{epochs_transfer}] - '
f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, '
f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
Epoch [1/2] - Train Loss: 4.0653, Train Acc: 0.1482, Val Loss: 3.7265, Val Acc: 0.2076
Epoch [2/2] - Train Loss: 3.5416, Train Acc: 0.2287, Val Loss: 3.4285, Val Acc: 0.2386
In [32]:
# Plot transfer learning results
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history_transfer['train_loss'], label='train')
plt.plot(history_transfer['val_loss'], label='val')
plt.title('Transfer Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history_transfer['train_acc'], label='train')
plt.plot(history_transfer['val_acc'], label='val')
plt.title('Transfer Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend()
plt.tight_layout()
plt.show()
In [33]:
# Select a random image
img_idx = rn.randint(0, len(X_cifar_tensor))
img = X_cifar_tensor[img_idx:img_idx+1].to(device)
# Display original image
plt.imshow(X_cifar[img_idx])
plt.grid(False)
plt.title('Original Image')
plt.show()
# Hook to capture intermediate layer outputs
activation = {}
def get_activation(name):
def hook(model, input, output):
activation[name] = output.detach()
return hook
# Register hooks for convolutional layers
transfer_model.features[2].register_forward_hook(get_activation('conv1'))
transfer_model.features[7].register_forward_hook(get_activation('conv2'))
transfer_model.features[12].register_forward_hook(get_activation('conv3'))
transfer_model.features[19].register_forward_hook(get_activation('conv4'))
transfer_model.features[26].register_forward_hook(get_activation('conv5'))
# Forward pass
transfer_model.eval()
with torch.no_grad():
_ = transfer_model(img)
# Visualize feature maps
layer_names = ['conv1', 'conv2', 'conv3', 'conv4', 'conv5']
for layer_name in layer_names:
if layer_name in activation:
feature_maps = activation[layer_name].cpu().numpy()[0] # (num_channels, H, W)
num_filters = min(64, feature_maps.shape[0])
fig, axes = plt.subplots(8, 8, figsize=(10, 10))
fig.suptitle(f'Feature Maps from {layer_name}', fontsize=16)
for i in range(8):
for j in range(8):
idx = i * 8 + j
if idx < num_filters:
axes[i, j].imshow(feature_maps[idx], cmap='viridis')
axes[i, j].set_title(f'K{idx}', fontsize=8)
axes[i, j].axis('off')
plt.tight_layout()
plt.show()
In [34]:
# Load CIFAR-10 dataset
cifar10_tf = transforms.Compose([
transforms.ToTensor(),
])
# Download and load training data
data_cifar_10 = tv.datasets.CIFAR10(root='./data', train=True,
download=True, transform=cifar10_tf)
X_cifar_10 = data_cifar_10.data # numpy array (50000, 32, 32, 3)
y_cifar_10 = np.array(data_cifar_10.targets) # numpy array (50000,)
print("shape of X:", X_cifar_10.shape)
print("shape of y:", y_cifar_10.shape)
shape of X: (50000, 32, 32, 3) shape of y: (50000,)
In [35]:
# Normalize all entries to the interval [0, 1]
X_cifar_10 = X_cifar_10 / 255.0
# Convert to PyTorch tensors and reshape to (N, C, H, W) format
X_cifar_10_tensor = torch.FloatTensor(X_cifar_10).permute(0, 3, 1, 2) # (50000, 3, 32, 32)
y_cifar_10_tensor = torch.LongTensor(y_cifar_10)
print("X_tensor shape:", X_cifar_10_tensor.shape)
print("y_tensor shape:", y_cifar_10_tensor.shape)
X_tensor shape: torch.Size([50000, 3, 32, 32]) y_tensor shape: torch.Size([50000])
In [36]:
# Create dataset and split into train/val
dataset_cifar_10 = TensorDataset(X_cifar_10_tensor, y_cifar_10_tensor)
train_size_cifar_10 = int(0.8 * len(dataset_cifar_10))
val_size_cifar_10 = len(dataset_cifar_10) - train_size_cifar_10
train_dataset_cifar_10, val_dataset_cifar_10 = random_split(dataset_cifar_10, [train_size_cifar_10, val_size_cifar_10])
# Create data loaders
batch_size = 100
train_loader = DataLoader(train_dataset_cifar_10, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset_cifar_10, batch_size=batch_size, shuffle=False)
In [37]:
from torchvision.models import vgg16, VGG16_Weights
# Load pretrained VGG16
base_model = vgg16(weights=VGG16_Weights.IMAGENET1K_V1)
# Display the summary of the VGG16 model with the correct input size
summary(base_model, input_size=(1, 3, 224, 224), col_names=("input_size","output_size","num_params","kernel_size"))
Out[37]:
============================================================================================================================================ Layer (type:depth-idx) Input Shape Output Shape Param # Kernel Shape ============================================================================================================================================ VGG [1, 3, 224, 224] [1, 1000] -- -- ├─Sequential: 1-1 [1, 3, 224, 224] [1, 512, 7, 7] -- -- │ └─Conv2d: 2-1 [1, 3, 224, 224] [1, 64, 224, 224] 1,792 [3, 3] │ └─ReLU: 2-2 [1, 64, 224, 224] [1, 64, 224, 224] -- -- │ └─Conv2d: 2-3 [1, 64, 224, 224] [1, 64, 224, 224] 36,928 [3, 3] │ └─ReLU: 2-4 [1, 64, 224, 224] [1, 64, 224, 224] -- -- │ └─MaxPool2d: 2-5 [1, 64, 224, 224] [1, 64, 112, 112] -- 2 │ └─Conv2d: 2-6 [1, 64, 112, 112] [1, 128, 112, 112] 73,856 [3, 3] │ └─ReLU: 2-7 [1, 128, 112, 112] [1, 128, 112, 112] -- -- │ └─Conv2d: 2-8 [1, 128, 112, 112] [1, 128, 112, 112] 147,584 [3, 3] │ └─ReLU: 2-9 [1, 128, 112, 112] [1, 128, 112, 112] -- -- │ └─MaxPool2d: 2-10 [1, 128, 112, 112] [1, 128, 56, 56] -- 2 │ └─Conv2d: 2-11 [1, 128, 56, 56] [1, 256, 56, 56] 295,168 [3, 3] │ └─ReLU: 2-12 [1, 256, 56, 56] [1, 256, 56, 56] -- -- │ └─Conv2d: 2-13 [1, 256, 56, 56] [1, 256, 56, 56] 590,080 [3, 3] │ └─ReLU: 2-14 [1, 256, 56, 56] [1, 256, 56, 56] -- -- │ └─Conv2d: 2-15 [1, 256, 56, 56] [1, 256, 56, 56] 590,080 [3, 3] │ └─ReLU: 2-16 [1, 256, 56, 56] [1, 256, 56, 56] -- -- │ └─MaxPool2d: 2-17 [1, 256, 56, 56] [1, 256, 28, 28] -- 2 │ └─Conv2d: 2-18 [1, 256, 28, 28] [1, 512, 28, 28] 1,180,160 [3, 3] │ └─ReLU: 2-19 [1, 512, 28, 28] [1, 512, 28, 28] -- -- │ └─Conv2d: 2-20 [1, 512, 28, 28] [1, 512, 28, 28] 2,359,808 [3, 3] │ └─ReLU: 2-21 [1, 512, 28, 28] [1, 512, 28, 28] -- -- │ └─Conv2d: 2-22 [1, 512, 28, 28] [1, 512, 28, 28] 2,359,808 [3, 3] │ └─ReLU: 2-23 [1, 512, 28, 28] [1, 512, 28, 28] -- -- │ └─MaxPool2d: 2-24 [1, 512, 28, 28] [1, 512, 14, 14] -- 2 │ └─Conv2d: 2-25 [1, 512, 14, 14] [1, 512, 14, 14] 2,359,808 [3, 3] │ └─ReLU: 2-26 [1, 512, 14, 14] [1, 512, 14, 14] -- -- │ └─Conv2d: 2-27 [1, 512, 14, 14] [1, 512, 14, 14] 2,359,808 [3, 3] │ └─ReLU: 2-28 [1, 512, 14, 14] [1, 512, 14, 14] -- -- │ └─Conv2d: 2-29 [1, 512, 14, 14] [1, 512, 14, 14] 2,359,808 [3, 3] │ └─ReLU: 2-30 [1, 512, 14, 14] [1, 512, 14, 14] -- -- │ └─MaxPool2d: 2-31 [1, 512, 14, 14] [1, 512, 7, 7] -- 2 ├─AdaptiveAvgPool2d: 1-2 [1, 512, 7, 7] [1, 512, 7, 7] -- -- ├─Sequential: 1-3 [1, 25088] [1, 1000] -- -- │ └─Linear: 2-32 [1, 25088] [1, 4096] 102,764,544 -- │ └─ReLU: 2-33 [1, 4096] [1, 4096] -- -- │ └─Dropout: 2-34 [1, 4096] [1, 4096] -- -- │ └─Linear: 2-35 [1, 4096] [1, 4096] 16,781,312 -- │ └─ReLU: 2-36 [1, 4096] [1, 4096] -- -- │ └─Dropout: 2-37 [1, 4096] [1, 4096] -- -- │ └─Linear: 2-38 [1, 4096] [1, 1000] 4,097,000 -- ============================================================================================================================================ Total params: 138,357,544 Trainable params: 138,357,544 Non-trainable params: 0 Total mult-adds (G): 15.48 ============================================================================================================================================ Input size (MB): 0.60 Forward/backward pass size (MB): 108.45 Params size (MB): 553.43 Estimated Total Size (MB): 662.49 ============================================================================================================================================
In [38]:
class VGG16Classifier(nn.Module):
def __init__(self, num_classes=10, freeze_features=True):
super(VGG16Classifier, self).__init__()
# Load pretrained VGG16
vgg = vgg16(weights=VGG16_Weights.IMAGENET1K_V1)
# Use VGG16 features (convolutional layers)
self.features = vgg.features
# Freeze the pretrained weights if specified
if freeze_features:
for param in self.features.parameters():
param.requires_grad = False
# Custom classifier for CIFAR10
# After features with 32x32 input, we get 1x1x512
self.classifier = nn.Sequential(
nn.Flatten(),
nn.Linear(512, 60),
nn.ReLU(inplace=True),
nn.Linear(60, num_classes)
)
def forward(self, x):
x = self.features(x)
x = self.classifier(x)
return x
In [39]:
# Create model instance
model = VGG16Classifier(num_classes=10, freeze_features=True).to(device)
In [40]:
summary(base_model, input_size=(1, 3, 224, 224), col_names=("input_size","output_size","num_params","kernel_size"))
Out[40]:
============================================================================================================================================ Layer (type:depth-idx) Input Shape Output Shape Param # Kernel Shape ============================================================================================================================================ VGG [1, 3, 224, 224] [1, 1000] -- -- ├─Sequential: 1-1 [1, 3, 224, 224] [1, 512, 7, 7] -- -- │ └─Conv2d: 2-1 [1, 3, 224, 224] [1, 64, 224, 224] 1,792 [3, 3] │ └─ReLU: 2-2 [1, 64, 224, 224] [1, 64, 224, 224] -- -- │ └─Conv2d: 2-3 [1, 64, 224, 224] [1, 64, 224, 224] 36,928 [3, 3] │ └─ReLU: 2-4 [1, 64, 224, 224] [1, 64, 224, 224] -- -- │ └─MaxPool2d: 2-5 [1, 64, 224, 224] [1, 64, 112, 112] -- 2 │ └─Conv2d: 2-6 [1, 64, 112, 112] [1, 128, 112, 112] 73,856 [3, 3] │ └─ReLU: 2-7 [1, 128, 112, 112] [1, 128, 112, 112] -- -- │ └─Conv2d: 2-8 [1, 128, 112, 112] [1, 128, 112, 112] 147,584 [3, 3] │ └─ReLU: 2-9 [1, 128, 112, 112] [1, 128, 112, 112] -- -- │ └─MaxPool2d: 2-10 [1, 128, 112, 112] [1, 128, 56, 56] -- 2 │ └─Conv2d: 2-11 [1, 128, 56, 56] [1, 256, 56, 56] 295,168 [3, 3] │ └─ReLU: 2-12 [1, 256, 56, 56] [1, 256, 56, 56] -- -- │ └─Conv2d: 2-13 [1, 256, 56, 56] [1, 256, 56, 56] 590,080 [3, 3] │ └─ReLU: 2-14 [1, 256, 56, 56] [1, 256, 56, 56] -- -- │ └─Conv2d: 2-15 [1, 256, 56, 56] [1, 256, 56, 56] 590,080 [3, 3] │ └─ReLU: 2-16 [1, 256, 56, 56] [1, 256, 56, 56] -- -- │ └─MaxPool2d: 2-17 [1, 256, 56, 56] [1, 256, 28, 28] -- 2 │ └─Conv2d: 2-18 [1, 256, 28, 28] [1, 512, 28, 28] 1,180,160 [3, 3] │ └─ReLU: 2-19 [1, 512, 28, 28] [1, 512, 28, 28] -- -- │ └─Conv2d: 2-20 [1, 512, 28, 28] [1, 512, 28, 28] 2,359,808 [3, 3] │ └─ReLU: 2-21 [1, 512, 28, 28] [1, 512, 28, 28] -- -- │ └─Conv2d: 2-22 [1, 512, 28, 28] [1, 512, 28, 28] 2,359,808 [3, 3] │ └─ReLU: 2-23 [1, 512, 28, 28] [1, 512, 28, 28] -- -- │ └─MaxPool2d: 2-24 [1, 512, 28, 28] [1, 512, 14, 14] -- 2 │ └─Conv2d: 2-25 [1, 512, 14, 14] [1, 512, 14, 14] 2,359,808 [3, 3] │ └─ReLU: 2-26 [1, 512, 14, 14] [1, 512, 14, 14] -- -- │ └─Conv2d: 2-27 [1, 512, 14, 14] [1, 512, 14, 14] 2,359,808 [3, 3] │ └─ReLU: 2-28 [1, 512, 14, 14] [1, 512, 14, 14] -- -- │ └─Conv2d: 2-29 [1, 512, 14, 14] [1, 512, 14, 14] 2,359,808 [3, 3] │ └─ReLU: 2-30 [1, 512, 14, 14] [1, 512, 14, 14] -- -- │ └─MaxPool2d: 2-31 [1, 512, 14, 14] [1, 512, 7, 7] -- 2 ├─AdaptiveAvgPool2d: 1-2 [1, 512, 7, 7] [1, 512, 7, 7] -- -- ├─Sequential: 1-3 [1, 25088] [1, 1000] -- -- │ └─Linear: 2-32 [1, 25088] [1, 4096] 102,764,544 -- │ └─ReLU: 2-33 [1, 4096] [1, 4096] -- -- │ └─Dropout: 2-34 [1, 4096] [1, 4096] -- -- │ └─Linear: 2-35 [1, 4096] [1, 4096] 16,781,312 -- │ └─ReLU: 2-36 [1, 4096] [1, 4096] -- -- │ └─Dropout: 2-37 [1, 4096] [1, 4096] -- -- │ └─Linear: 2-38 [1, 4096] [1, 1000] 4,097,000 -- ============================================================================================================================================ Total params: 138,357,544 Trainable params: 138,357,544 Non-trainable params: 0 Total mult-adds (G): 15.48 ============================================================================================================================================ Input size (MB): 0.60 Forward/backward pass size (MB): 108.45 Params size (MB): 553.43 Estimated Total Size (MB): 662.49 ============================================================================================================================================
In [41]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
In [42]:
# VGG16 preprocessing - normalize with ImageNet mean and std
imagenet_mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(device)
imagenet_std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(device)
def preprocess_vgg(x):
"""Preprocess input for VGG16 (ImageNet normalization)"""
return (x - imagenet_mean) / imagenet_std
In [43]:
def train_model(model, train_loader, val_loader, criterion, optimizer,
epochs=2, preprocess_fn=None):
"""Train the model and return history"""
history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
for epoch in range(epochs):
# Training phase
model.train()
train_loss = 0.0
train_correct = 0
train_total = 0
for inputs, labels in train_loader:
inputs, labels = inputs.to(device), labels.to(device)
# Apply preprocessing if provided
if preprocess_fn:
inputs = preprocess_fn(inputs)
# Zero the parameter gradients
optimizer.zero_grad()
# Forward pass
outputs = model(inputs)
loss = criterion(outputs, labels)
# Backward pass and optimize
loss.backward()
optimizer.step()
# Statistics
train_loss += loss.item() * inputs.size(0)
_, predicted = torch.max(outputs.data, 1)
train_total += labels.size(0)
train_correct += (predicted == labels).sum().item()
train_loss = train_loss / train_total
train_acc = train_correct / train_total
# Validation phase
model.eval()
val_loss = 0.0
val_correct = 0
val_total = 0
with torch.no_grad():
for inputs, labels in val_loader:
inputs, labels = inputs.to(device), labels.to(device)
# Apply preprocessing if provided
if preprocess_fn:
inputs = preprocess_fn(inputs)
outputs = model(inputs)
loss = criterion(outputs, labels)
val_loss += loss.item() * inputs.size(0)
_, predicted = torch.max(outputs.data, 1)
val_total += labels.size(0)
val_correct += (predicted == labels).sum().item()
val_loss = val_loss / val_total
val_acc = val_correct / val_total
# Store history
history['train_loss'].append(train_loss)
history['train_acc'].append(train_acc)
history['val_loss'].append(val_loss)
history['val_acc'].append(val_acc)
print(f'Epoch [{epoch+1}/{epochs}] - '
f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, '
f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
return history
# Train the model
print("\nTraining initial model...")
model_info = train_model(model, train_loader, val_loader, criterion,
optimizer, epochs=2, preprocess_fn=preprocess_vgg)
def plot_losses(history):
"""Plot training and validation loss"""
plt.figure(figsize=(10, 4))
plt.plot(history['train_loss'], label='Train')
plt.plot(history['val_loss'], label='Val')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.title('Model Loss')
plt.legend()
plt.grid(True)
plt.show()
def plot_accuracies(history):
"""Plot training and validation accuracy"""
plt.figure(figsize=(10, 4))
plt.plot(history['train_acc'], label='Train')
plt.plot(history['val_acc'], label='Val')
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.grid(True)
plt.show()
# Plot losses and accuracies
plot_losses(model_info)
plot_accuracies(model_info)
Training initial model...
Epoch [1/2] - Train Loss: 1.0801, Train Acc: 0.6266, Val Loss: 0.9473, Val Acc: 0.6694
Epoch [2/2] - Train Loss: 0.8961, Train Acc: 0.6856, Val Loss: 0.9253, Val Acc: 0.6754
Denoising Autoencoder¶
In [44]:
# Denormalize the MNIST images back to [0, 1]
x_train_mnist_np = np.array([x[0].numpy() for x in x_train_mnist]) * 0.3081 + 0.1307
x_test_mnist_np = np.array([x[0].numpy() for x in x_test_mnist]) * 0.3081 + 0.1307
# Generate corrupted MNIST images by adding noise with normal distribution
noise = np.random.normal(loc=0.0, scale=0.5, size=x_train_mnist_np.shape)
x_train_noisy = x_train_mnist_np + noise
noise = np.random.normal(loc=0.0, scale=0.5, size=x_test_mnist_np.shape)
x_test_noisy = x_test_mnist_np + noise
# Clip values to [0, 1]
x_train_noisy = np.clip(x_train_noisy, 0., 1.)
x_test_noisy = np.clip(x_test_noisy, 0., 1.)
# Display the first 25 corrupted and original images
rows, cols = 5, 5
num = rows * cols
imgs = np.concatenate([x_train_mnist_np[:num], x_train_noisy[:num]])
imgs = imgs.reshape((rows * 2, cols, image_size, image_size))
imgs = np.vstack(np.split(imgs, rows, axis=1))
imgs = imgs.reshape((rows * 2, -1, image_size, image_size))
imgs = np.vstack([np.hstack(i) for i in imgs])
plt.figure(figsize=(10, 10))
plt.axis('off')
plt.title('Original images: top rows, Corrupted Input: bottom rows')
plt.imshow(imgs, interpolation='none', cmap='gray')
plt.savefig('original_vs_noisy.png', bbox_inches='tight', dpi=150)
plt.show()
In [45]:
# Convert to PyTorch tensors with shape (N, C, H, W)
x_train_tensor = torch.from_numpy(x_train_mnist_np).float()
x_train_noisy_tensor = torch.from_numpy(x_train_noisy).float()
x_test_tensor = torch.from_numpy(x_test_mnist_np).float()
x_test_noisy_tensor = torch.from_numpy(x_test_noisy).float()
In [46]:
# Create datasets and dataloaders
train_dataset = TensorDataset(x_train_noisy_tensor, x_train_tensor)
test_dataset = TensorDataset(x_test_noisy_tensor, x_test_tensor)
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
In [47]:
# Network parameters
input_shape = (1, image_size, image_size) # (C, H, W) format for PyTorch
kernel_size = 3
latent_dim = 16
# Encoder/Decoder number of CNN layers and filters per layer
layer_filters = [32, 64]
In [48]:
class Encoder(nn.Module):
def __init__(self, latent_dim=16, layer_filters=[32, 64], kernel_size=3):
super(Encoder, self).__init__()
self.latent_dim = latent_dim
# Stack of Conv2D blocks
layers = []
in_channels = 1
for filters in layer_filters:
layers.append(nn.Conv2d(in_channels, filters, kernel_size,
stride=2, padding=1))
layers.append(nn.ReLU(inplace=True))
in_channels = filters
self.conv_layers = nn.Sequential(*layers)
# Calculate the flattened size after convolutions
# After each stride=2 conv with padding=1: size -> (size + 1) // 2
# 28 -> 14 -> 7
self.flatten_size = layer_filters[-1] * 7 * 7
# Latent vector layer
self.fc = nn.Linear(self.flatten_size, latent_dim)
def forward(self, x):
x = self.conv_layers(x)
# Get shape before flattening (needed for decoder)
self.shape_before_flatten = x.shape
x = x.view(x.size(0), -1) # Flatten
latent = self.fc(x)
return latent
# Instantiate Encoder
encoder = Encoder(latent_dim=latent_dim, layer_filters=layer_filters,
kernel_size=kernel_size).to(device)
print("Encoder Architecture:")
# Ensure input_shape matches the expected dimensions (batch_size, channels, height, width)
input_shape = (1, 1, 28, 28) # Batch size of 1, 1 channel, 28x28 image
summary(encoder, input_size=(1, 1, 28, 28), col_names=("input_size", "output_size", "num_params"))
Encoder Architecture:
Out[48]:
=================================================================================================================== Layer (type:depth-idx) Input Shape Output Shape Param # =================================================================================================================== Encoder [1, 1, 28, 28] [1, 16] -- ├─Sequential: 1-1 [1, 1, 28, 28] [1, 64, 7, 7] -- │ └─Conv2d: 2-1 [1, 1, 28, 28] [1, 32, 14, 14] 320 │ └─ReLU: 2-2 [1, 32, 14, 14] [1, 32, 14, 14] -- │ └─Conv2d: 2-3 [1, 32, 14, 14] [1, 64, 7, 7] 18,496 │ └─ReLU: 2-4 [1, 64, 7, 7] [1, 64, 7, 7] -- ├─Linear: 1-2 [1, 3136] [1, 16] 50,192 =================================================================================================================== Total params: 69,008 Trainable params: 69,008 Non-trainable params: 0 Total mult-adds (M): 1.02 =================================================================================================================== Input size (MB): 0.00 Forward/backward pass size (MB): 0.08 Params size (MB): 0.28 Estimated Total Size (MB): 0.35 ===================================================================================================================
In [49]:
class Decoder(nn.Module):
def __init__(self, latent_dim=16, layer_filters=[32, 64], kernel_size=3):
super(Decoder, self).__init__()
# Calculate the size after encoder
# 28 -> 14 -> 7 (shape: [batch, 64, 7, 7])
self.shape_h = 7
self.shape_w = 7
self.shape_c = layer_filters[-1]
# Dense layer to reshape from latent vector
self.fc = nn.Linear(latent_dim, self.shape_c * self.shape_h * self.shape_w)
# Stack of Transposed Conv2D blocks (reverse order)
layers = []
reversed_filters = layer_filters[::-1]
for i in range(len(reversed_filters)):
in_channels = reversed_filters[i]
out_channels = reversed_filters[i+1] if i+1 < len(reversed_filters) else 1
if i < len(reversed_filters) - 1:
# Intermediate layers with ReLU
layers.append(nn.ConvTranspose2d(in_channels, out_channels,
kernel_size, stride=2,
padding=1, output_padding=1))
layers.append(nn.ReLU(inplace=True))
else:
# Last transposed conv to get back to original size
layers.append(nn.ConvTranspose2d(in_channels, out_channels,
kernel_size, stride=2,
padding=1, output_padding=1))
# Final conv to ensure exact output size and add sigmoid
layers.append(nn.Conv2d(1, 1, kernel_size, padding=1))
layers.append(nn.Sigmoid())
self.deconv_layers = nn.Sequential(*layers)
def forward(self, latent):
x = self.fc(latent)
x = x.view(-1, self.shape_c, self.shape_h, self.shape_w)
x = self.deconv_layers(x)
return x
# Instantiate Decoder
decoder = Decoder(latent_dim=latent_dim, layer_filters=layer_filters,
kernel_size=kernel_size).to(device)
print("\nDecoder Architecture:")
summary(decoder, input_size=(latent_dim,), col_names=("input_size", "output_size", "num_params"))
Decoder Architecture:
Out[49]:
=================================================================================================================== Layer (type:depth-idx) Input Shape Output Shape Param # =================================================================================================================== Decoder [16] [1, 1, 28, 28] -- ├─Linear: 1-1 [16] [3136] 53,312 ├─Sequential: 1-2 [1, 64, 7, 7] [1, 1, 28, 28] -- │ └─ConvTranspose2d: 2-1 [1, 64, 7, 7] [1, 32, 14, 14] 18,464 │ └─ReLU: 2-2 [1, 32, 14, 14] [1, 32, 14, 14] -- │ └─ConvTranspose2d: 2-3 [1, 32, 14, 14] [1, 1, 28, 28] 289 │ └─Conv2d: 2-4 [1, 1, 28, 28] [1, 1, 28, 28] 10 │ └─Sigmoid: 2-5 [1, 1, 28, 28] [1, 1, 28, 28] -- =================================================================================================================== Total params: 72,075 Trainable params: 72,075 Non-trainable params: 0 Total mult-adds (M): 171.04 =================================================================================================================== Input size (MB): 0.00 Forward/backward pass size (MB): 0.09 Params size (MB): 0.29 Estimated Total Size (MB): 0.38 ===================================================================================================================
In [50]:
class Autoencoder(nn.Module):
def __init__(self, encoder, decoder):
super(Autoencoder, self).__init__()
self.encoder = encoder
self.decoder = decoder
def forward(self, x):
latent = self.encoder(x)
reconstructed = self.decoder(latent)
return reconstructed
# Instantiate Autoencoder
autoencoder = Autoencoder(encoder, decoder).to(device)
print("\nComplete Autoencoder:")
summary(autoencoder, input_size=(1, 1, 28, 28), col_names=("input_size", "output_size", "num_params"))
Complete Autoencoder:
Out[50]:
=================================================================================================================== Layer (type:depth-idx) Input Shape Output Shape Param # =================================================================================================================== Autoencoder [1, 1, 28, 28] [1, 1, 28, 28] -- ├─Encoder: 1-1 [1, 1, 28, 28] [1, 16] -- │ └─Sequential: 2-1 [1, 1, 28, 28] [1, 64, 7, 7] -- │ │ └─Conv2d: 3-1 [1, 1, 28, 28] [1, 32, 14, 14] 320 │ │ └─ReLU: 3-2 [1, 32, 14, 14] [1, 32, 14, 14] -- │ │ └─Conv2d: 3-3 [1, 32, 14, 14] [1, 64, 7, 7] 18,496 │ │ └─ReLU: 3-4 [1, 64, 7, 7] [1, 64, 7, 7] -- │ └─Linear: 2-2 [1, 3136] [1, 16] 50,192 ├─Decoder: 1-2 [1, 16] [1, 1, 28, 28] -- │ └─Linear: 2-3 [1, 16] [1, 3136] 53,312 │ └─Sequential: 2-4 [1, 64, 7, 7] [1, 1, 28, 28] -- │ │ └─ConvTranspose2d: 3-5 [1, 64, 7, 7] [1, 32, 14, 14] 18,464 │ │ └─ReLU: 3-6 [1, 32, 14, 14] [1, 32, 14, 14] -- │ │ └─ConvTranspose2d: 3-7 [1, 32, 14, 14] [1, 1, 28, 28] 289 │ │ └─Conv2d: 3-8 [1, 1, 28, 28] [1, 1, 28, 28] 10 │ │ └─Sigmoid: 3-9 [1, 1, 28, 28] [1, 1, 28, 28] -- =================================================================================================================== Total params: 141,083 Trainable params: 141,083 Non-trainable params: 0 Total mult-adds (M): 4.93 =================================================================================================================== Input size (MB): 0.00 Forward/backward pass size (MB): 0.16 Params size (MB): 0.56 Estimated Total Size (MB): 0.73 ===================================================================================================================
In [51]:
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters())
# Training function
def train_autoencoder(model, train_loader, test_loader, criterion, optimizer, epochs=2):
"""Train the autoencoder"""
history = {'train_loss': [], 'val_loss': []}
for epoch in range(epochs):
# Training phase
model.train()
train_loss = 0.0
for noisy_imgs, clean_imgs in train_loader:
noisy_imgs = noisy_imgs.to(device)
clean_imgs = clean_imgs.to(device)
# Zero gradients
optimizer.zero_grad()
# Forward pass
outputs = model(noisy_imgs)
loss = criterion(outputs, clean_imgs)
# Backward pass
loss.backward()
optimizer.step()
train_loss += loss.item() * noisy_imgs.size(0)
train_loss = train_loss / len(train_loader.dataset)
# Validation phase
model.eval()
val_loss = 0.0
with torch.no_grad():
for noisy_imgs, clean_imgs in test_loader:
noisy_imgs = noisy_imgs.to(device)
clean_imgs = clean_imgs.to(device)
outputs = model(noisy_imgs)
loss = criterion(outputs, clean_imgs)
val_loss += loss.item() * noisy_imgs.size(0)
val_loss = val_loss / len(test_loader.dataset)
# Store history
history['train_loss'].append(train_loss)
history['val_loss'].append(val_loss)
print(f'Epoch [{epoch+1}/{epochs}] - '
f'Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}')
return history
# Train the autoencoder
print("\nTraining autoencoder...")
history = train_autoencoder(autoencoder, train_loader, test_loader,
criterion, optimizer, epochs=2)
# Plot training history
plt.figure(figsize=(10, 4))
plt.plot(history['train_loss'], label='Train Loss')
plt.plot(history['val_loss'], label='Val Loss')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.title('Training History')
plt.legend()
plt.grid(True)
plt.savefig('training_history.png', bbox_inches='tight', dpi=150)
plt.show()
Training autoencoder...
Epoch [1/2] - Train Loss: 0.044972, Val Loss: 0.025023
Epoch [2/2] - Train Loss: 0.023074, Val Loss: 0.020574
In [52]:
# Predict the Autoencoder output from corrupted test images
autoencoder.eval()
with torch.no_grad():
# Get predictions for first batch
x_test_noisy_batch = x_test_noisy_tensor.to(device)
x_decoded = autoencoder(x_test_noisy_batch).cpu()
# Convert back to numpy for visualization
x_decoded = x_decoded.numpy() # Already has the correct dimensions
# Display the first 25 images: original, corrupted, and denoised
rows, cols = 5, 5
num = rows * cols
imgs = np.concatenate([x_test_mnist_np[:num], x_test_noisy[:num], x_decoded[:num]])
imgs = imgs.reshape((rows * 3, cols, image_size, image_size))
imgs = np.vstack(np.split(imgs, rows, axis=1))
imgs = imgs.reshape((rows * 3, -1, image_size, image_size))
imgs = np.vstack([np.hstack(i) for i in imgs])
imgs = (imgs * 255).astype(np.uint8)
plt.figure(figsize=(10, 15))
plt.axis('off')
plt.title('Original images: top rows, '
'Corrupted Input: middle rows, '
'Denoised Output: bottom rows')
plt.imshow(imgs, interpolation='none', cmap='gray')
plt.savefig('denoising_results.png', bbox_inches='tight', dpi=150)
plt.show()
In [ ]: