Lecture 18 – CS 189, Fall 2025

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import torchvision as tv
import torchvision.transforms as transforms
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import random as rn
from torchinfo import summary
from types import SimpleNamespace
from torchvision.utils import make_grid
from sklearn.manifold import TSNE

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

style.use('fivethirtyeight')
sns.set(style='whitegrid', color_codes=True)

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
Using device: cpu

Building a CNN from Scratch in PyTorch

In [2]:
mnist_tf = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])
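
The constants 0.1307 and 0.3081 are the per-pixel mean and standard deviation of the MNIST training set. As a sketch (assuming a fresh copy of the data loaded with only ToTensor), they could be recomputed like this:

raw_mnist = tv.datasets.MNIST(root='./data', train=True, download=True,
                              transform=transforms.ToTensor())
pixels = torch.stack([img for img, _ in raw_mnist])  # shape (60000, 1, 28, 28), values in [0, 1]
print(pixels.mean().item(), pixels.std().item())     # approximately 0.1307 and 0.3081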
In [3]:
# Load MNIST dataset
x_train_mnist = tv.datasets.MNIST(root='./data', train=True, 
                                          download=True, transform=mnist_tf)
x_test_mnist = tv.datasets.MNIST(root='./data', train=False, 
                                         download=True, transform=mnist_tf)

image_size = x_train_mnist[0][0].shape[1]


print(f"MNIST training data size: {len(x_train_mnist)} of size {image_size}x{image_size}")
print(f"MNIST test data size: {len(x_test_mnist)} of size {image_size}x{image_size}")
MNIST training data size: 60000 of size 28x28
MNIST test data size: 10000 of size 28x28
In [4]:
train_loader_mnist = DataLoader(x_train_mnist, batch_size=128, shuffle=True)
test_loader_mnist  = DataLoader(x_test_mnist, batch_size=256, shuffle=False)
In [5]:
imgs = [x_train_mnist[i] for i in range(16)]  
labels = [x_train_mnist[i][1] for i in range(16)]  # actual digit labels
fig, axes = plt.subplots(4, 4, figsize=(4, 4))
for ax, img, lab in zip(axes.flatten(), imgs, labels):
    ax.imshow(img[0].squeeze().numpy(), cmap='gray') 
    ax.set_title(str(lab))
    ax.axis('off')
plt.tight_layout()
plt.show()
[Figure: 4x4 grid of the first 16 MNIST training digits with their labels]
In [6]:
class SmallCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 8, 3, padding=1),  
            nn.ReLU(),
            nn.MaxPool2d(2),                 
            nn.Conv2d(8, 64, 3, padding=1), 
            nn.ReLU(),
            nn.MaxPool2d(2),               
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64*7*7, 256),  # Adjusted based on the output size of the features block
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(256, 10)
        )
    def forward(self, x):
        x = self.features(x)
        return self.classifier(x)

model = SmallCNN().to(device)
summary(model, input_size=(1, 1, 28, 28),
        col_names=("input_size","output_size","num_params","kernel_size"),
        depth=4)
Out[6]:
============================================================================================================================================
Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Kernel Shape
============================================================================================================================================
SmallCNN                                 [1, 1, 28, 28]            [1, 10]                   --                        --
├─Sequential: 1-1                        [1, 1, 28, 28]            [1, 64, 7, 7]             --                        --
│    └─Conv2d: 2-1                       [1, 1, 28, 28]            [1, 8, 28, 28]            80                        [3, 3]
│    └─ReLU: 2-2                         [1, 8, 28, 28]            [1, 8, 28, 28]            --                        --
│    └─MaxPool2d: 2-3                    [1, 8, 28, 28]            [1, 8, 14, 14]            --                        2
│    └─Conv2d: 2-4                       [1, 8, 14, 14]            [1, 64, 14, 14]           4,672                     [3, 3]
│    └─ReLU: 2-5                         [1, 64, 14, 14]           [1, 64, 14, 14]           --                        --
│    └─MaxPool2d: 2-6                    [1, 64, 14, 14]           [1, 64, 7, 7]             --                        2
├─Sequential: 1-2                        [1, 64, 7, 7]             [1, 10]                   --                        --
│    └─Flatten: 2-7                      [1, 64, 7, 7]             [1, 3136]                 --                        --
│    └─Linear: 2-8                       [1, 3136]                 [1, 256]                  803,072                   --
│    └─ReLU: 2-9                         [1, 256]                  [1, 256]                  --                        --
│    └─Dropout: 2-10                     [1, 256]                  [1, 256]                  --                        --
│    └─Linear: 2-11                      [1, 256]                  [1, 10]                   2,570                     --
============================================================================================================================================
Total params: 810,394
Trainable params: 810,394
Non-trainable params: 0
Total mult-adds (M): 1.78
============================================================================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 0.15
Params size (MB): 3.24
Estimated Total Size (MB): 3.40
============================================================================================================================================
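
The 64*7*7 = 3136 input size of the first Linear layer follows from the two 2x2 max-pools halving the 28x28 input twice (28 -> 14 -> 7) while the second conv produces 64 channels, as the summary above confirms. A quick sanity check (a sketch, using the model instance defined above):

with torch.no_grad():
    feat = model.features(torch.zeros(1, 1, 28, 28, device=device))
print(feat.shape)                 # torch.Size([1, 64, 7, 7])
print(feat.flatten(1).shape[1])   # 3136 = 64 * 7 * 7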
In [7]:
def train_one_epoch(model, loader, opt, loss_fn):
    model.train()
    total, correct, running_loss = 0, 0, 0.0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        opt.zero_grad()
        logits = model(xb)
        loss = loss_fn(logits, yb)
        loss.backward()
        opt.step()
        running_loss += loss.item()*xb.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds==yb).sum().item()
        total += xb.size(0)
    return running_loss/total, correct/total

@torch.no_grad()
def evaluate(model, loader, loss_fn):
    model.eval()
    total, correct, running_loss = 0, 0, 0.0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        logits = model(xb)
        loss = loss_fn(logits, yb)
        running_loss += loss.item()*xb.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds==yb).sum().item()
        total += xb.size(0)
    return running_loss/total, correct/total

history = {'epoch': [], 'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}
if tv is not None:
    model = SmallCNN().to(device)
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()

    EPOCHS = 4 
    for epoch in range(1, EPOCHS+1):
        tl, ta = train_one_epoch(model, train_loader_mnist, opt, loss_fn)
        vl, va = evaluate(model, test_loader_mnist, loss_fn)
        history['epoch'].append(epoch)
        history['train_loss'].append(tl); history['val_loss'].append(vl)
        history['train_acc'].append(ta);  history['val_acc'].append(va)
        print(f'E{epoch}: train_loss={tl:.4f} val_loss={vl:.4f} train_acc={ta:.3f} val_acc={va:.3f}')
else:
    print("torchvision not available. Skipping training.")
E1: train_loss=0.2034 val_loss=0.0485 train_acc=0.938 val_acc=0.985
E2: train_loss=0.0591 val_loss=0.0455 train_acc=0.982 val_acc=0.984
E3: train_loss=0.0412 val_loss=0.0330 train_acc=0.987 val_acc=0.988
E4: train_loss=0.0313 val_loss=0.0282 train_acc=0.990 val_acc=0.990
In [8]:
if history and 'train_loss' in history and 'val_loss' in history:
    plt.figure(); plt.plot(history['epoch'], history['train_loss']); plt.plot(history['epoch'], history['val_loss']); plt.legend(['train','val']); plt.title('Loss'); plt.xlabel('epoch'); plt.show()
if history and 'train_acc' in history and 'val_acc' in history:
    plt.figure(); plt.plot(history['epoch'], history['train_acc']); plt.plot(history['epoch'], history['val_acc']); plt.legend(['train','val']); plt.title('Accuracy'); plt.xlabel('epoch'); plt.show()
[Figure: training vs. validation loss per epoch]
[Figure: training vs. validation accuracy per epoch]
In [9]:
if tv is not None:
    model.eval()
    all_true, all_pred = [], []
    with torch.no_grad():
        for xb, yb in test_loader_mnist:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            preds = logits.argmax(dim=1)
            all_pred.extend(preds.cpu().numpy())
            all_true.extend(yb.cpu().numpy())

    # Calculate accuracy
    all_true = np.array(all_true)
    all_pred = np.array(all_pred)
    accuracy = (all_true == all_pred).sum() / len(all_true)
    print(f"Accuracy on test data: {accuracy:.4f}")

    cm = confusion_matrix(all_true, all_pred, labels=list(range(10)))
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=list(range(10)), yticklabels=list(range(10)))
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.show()
else:
    print("torchvision not available. Cannot evaluate predictions.")
Accuracy on test data: 0.9903
[Figure: 10x10 confusion matrix heatmap for MNIST test predictions]

AlexNet

In [10]:
alex_w   = tv.models.AlexNet_Weights.IMAGENET1K_V1

MODELS = {
    "alexnet": SimpleNamespace(
        ctor=lambda: tv.models.alexnet(weights=alex_w).to(device).eval(),
        weights=alex_w,
        act_layers=["features.0", "features.3"],  # conv blocks
        maxact_layer="features.10",
        arch="alexnet"
    ),
}
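
The weights enum bundles the matching preprocessing pipeline and the ImageNet class names; load_image and occlusion_heatmap below rely on both. A small sketch of what it exposes:

preprocess = alex_w.transforms()        # resize/center-crop to 224x224 + ImageNet normalization
categories = alex_w.meta["categories"]  # list of 1000 ImageNet class names
print(preprocess)
print(len(categories), categories[:3])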
In [11]:
from PIL import Image

def load_image(path, weights):
    img = Image.open(path).convert('RGB')
    return weights.transforms()(img).unsqueeze(0).to(device)

def show(tensor, title=None):
    if tensor.ndim == 4:
        grid = make_grid(tensor, nrow=int(np.ceil(np.sqrt(tensor.size(0)))))
        arr = grid.permute(1,2,0).detach().cpu().numpy()
    else:
        arr = tensor.permute(1,2,0).detach().cpu().numpy()
    plt.figure(figsize=(6,6))
    plt.imshow(np.clip(arr, 0, 1))
    plt.axis('off')
    if title: plt.title(title)
    plt.show()

# Robust per-item normalization (avoids tuple-dim min/max issues)
def _norm_per_item(t):
    # t shape: [N, ...]
    if hasattr(torch, "amin"):
        tmin = torch.amin(t, dim=tuple(range(1, t.ndim)), keepdim=True)
        tmax = torch.amax(t, dim=tuple(range(1, t.ndim)), keepdim=True)
    else:
        flat = t.view(t.size(0), -1)
        tmin = flat.min(dim=1, keepdim=True)[0].view(-1, *([1]*(t.ndim-1)))
        tmax = flat.max(dim=1, keepdim=True)[0].view(-1, *([1]*(t.ndim-1)))
    return (t - tmin) / (tmax - tmin + 1e-8)

# Helper: resolve "features.23" dotted path to a module
def resolve_module(root, name):
    mod = root
    for part in name.split('.'):
        if part.isdigit():
            mod = mod[int(part)]
        else:
            mod = getattr(mod, part)
    return mod

def first_conv_module(model):
    for m in model.modules():
        if isinstance(m, nn.Conv2d):
            return m
    raise RuntimeError("No Conv2d found.")
In [12]:
def visualize_first_layer_filters(model, max_filters=64, label="model"):
    conv1 = None
    # Try common entry points first
    for attr in ["conv1", "features"]:
        if hasattr(model, attr):
            m = getattr(model, attr)
            if isinstance(m, nn.Conv2d):
                conv1 = m
                break
            # If Sequential, first layer likely Conv2d
            if isinstance(m, nn.Sequential):
                for x in m:
                    if isinstance(x, nn.Conv2d):
                        conv1 = x; break
        if conv1 is not None: break
    if conv1 is None:
        conv1 = first_conv_module(model)

    w = conv1.weight.data.clone().cpu()  # [out, in, k, k]
    w = _norm_per_item(w)
    show(w[:max_filters], title=f"{label}: first-layer conv filters")
In [13]:
def visualize_activations(model, img_tensor, layer_names, label="model"):
    feats, hooks = {}, []
    def hook(name): 
        return lambda m, i, o: feats.setdefault(name, o.detach().cpu())
    # Register hooks
    for name in layer_names:
        try:
            module = resolve_module(model, name)
            hooks.append(module.register_forward_hook(hook(name)))
        except Exception as e:
            print(f"[warn] could not hook '{name}': {e}")

    with torch.no_grad():
        _ = model(img_tensor)
    for h in hooks: h.remove()

    for name, feat in feats.items():
        fmap = feat[0]                    
        if fmap.ndim != 3:
            print(f"[info] {label}:{name} is non-spatial (shape {feat.shape}), skipping grid")
            continue
        C = min(64, fmap.size(0))
        fm = fmap[:C]
        fm = _norm_per_item(fm.unsqueeze(1)).squeeze(1)
        show(fm.unsqueeze(1), title=f"{label}: activations @ {name}")
In [14]:
@torch.no_grad()
def predict_probs(model, x):
    logits = model(x)
    return F.softmax(logits, dim=1)

def _resize_heat_with_torch(heat, H, W):
    t = torch.from_numpy(heat)[None, None]
    t = F.interpolate(t.float(), size=(H, W), mode="bilinear", align_corners=False)
    return t[0,0].numpy()

def occlusion_heatmap(model, img_tensor, idx_to_label=None, target_class=None, patch=32, stride=16, baseline=0.0, label="model"):
    model.eval()
    x = img_tensor.clone()
    probs = predict_probs(model, x)[0]
    if target_class is None:
        target_class = probs.argmax().item()
    base_p = probs[target_class].item()

    _, _, H, W = x.shape
    heat = np.zeros(((H - patch)//stride + 1, (W - patch)//stride + 1), dtype=np.float32)

    for i, y in enumerate(range(0, H - patch + 1, stride)):
        for j, z in enumerate(range(0, W - patch + 1, stride)):
            x_ = x.clone()
            x_[:,:, y:y+patch, z:z+patch] = baseline
            p = predict_probs(model, x_)[0, target_class].item()
            heat[i, j] = base_p - p

    heat_resized = _resize_heat_with_torch(heat, H, W)
    # quick unnormalize for show (using Imagenet stats)
    im = x[0].detach().cpu()
    im = (im * torch.tensor([0.229,0.224,0.225])[:,None,None] + torch.tensor([0.485,0.456,0.406])[:,None,None]).permute(1,2,0).numpy()
    plt.figure(figsize=(6,6)); plt.imshow(np.clip(im,0,1)); plt.imshow(heat_resized, alpha=0.5); plt.axis('off')
    if idx_to_label:
        tname = idx_to_label[target_class]
    else:
        tname = str(target_class)
    plt.title(f"{label}: occlusion (target='{tname}', base p={base_p:.3f})")
    plt.show()
    return heat_resized
In [15]:
def saliency_map(model, img_tensor, target_class=None, label="model"):
    model.eval()
    x = img_tensor.clone().requires_grad_(True)
    logits = model(x)
    if target_class is None:
        target_class = logits.argmax(dim=1).item()
    loss = logits[0, target_class]
    model.zero_grad()
    loss.backward()
    g = x.grad.detach()[0]               # [3,H,W]
    sal = g.abs().max(dim=0)[0]          # [H,W]
    sal = (sal - sal.min())/(sal.max()-sal.min()+1e-8)
    plt.figure(figsize=(6,6)); plt.imshow(sal.cpu(), cmap='gray'); plt.axis('off'); plt.title(f"{label}: saliency")
    plt.show()
    return sal

class GuidedBackpropReLU(nn.Module):
    def forward(self, x):
        self.saved = x
        return F.relu(x)
    def backward_hook(self, module, grad_in, grad_out):
        positive_grad = torch.clamp(grad_out[0], min=0.0)
        positive_mask = (self.saved > 0).float()
        return (positive_grad * positive_mask,)

def guided_backprop(model_ctor, weights, img_tensor, target_class=None, label="model"):
    # Create a fresh copy to freely patch ReLUs
    gb_model = model_ctor().to(device).eval()
    # Swap all ReLUs
    relus = []
    for name, module in gb_model.named_modules():
        if isinstance(module, nn.ReLU):
            relu = GuidedBackpropReLU()
            relus.append(relu)
            parent = gb_model
            *parents, leaf = name.split('.')
            for p in parents:
                parent = getattr(parent, p)
            setattr(parent, leaf, relu)
    x = img_tensor.clone().requires_grad_(True)
    logits = gb_model(x)
    if target_class is None:
        target_class = logits.argmax(dim=1).item()
    loss = logits[0, target_class]
    gb_model.zero_grad()
    hooks = [relu.register_full_backward_hook(relu.backward_hook) for relu in relus]
    loss.backward()
    for h in hooks: h.remove()

    g = x.grad.detach()[0]
    g = (g - g.min())/(g.max()-g.min()+1e-8)
    g = g.permute(1,2,0).cpu().numpy()
    plt.figure(figsize=(6,6)); plt.imshow(g); plt.axis('off'); plt.title(f"{label}: guided backprop")
    plt.show()
    return g
In [16]:
class FeatExtractor(nn.Module):
    """Return a fixed-dim feature vector (penultimate-ish) for each arch."""
    def __init__(self, model, arch):
        super().__init__()
        self.arch = arch
        self.model = model
        if arch == "resnet":
            # body up to layer4 GAP
            self.body = nn.Sequential(
                model.conv1, model.bn1, model.relu, model.maxpool,
                model.layer1, model.layer2, model.layer3, model.layer4,
                nn.AdaptiveAvgPool2d((1,1))
            )
            self.out_dim = model.fc.in_features
        elif arch == "vgg" or arch == "alexnet":
            self.features = model.features
            self.pool = nn.AdaptiveAvgPool2d((7,7))  # match VGG/Alex input to classifier
            # classifier: take everything except final Linear
            self.prefix = nn.Sequential(*list(model.classifier.children())[:-1])
            # out_dim is the in_features of final Linear
            last_linear = list(model.classifier.children())[-1]
            self.out_dim = last_linear.in_features
        else:
            raise ValueError("Unknown arch")
    def forward(self, x):
        if self.arch == "resnet":
            x = self.body(x).flatten(1)
            return x
        else:
            x = self.features(x)
            x = self.pool(x)
            x = torch.flatten(x, 1)
            x = self.prefix(x)
            return x
In [17]:
def max_activating_images(model, dataset, layer_name, topk=16, label="model"):
    target = resolve_module(model, layer_name)
    acts = []
    imgs_cache = []
    def fhook(m, i, o):
        if o.ndim == 4:
            a = o.detach().cpu().mean(dim=(2,3))  # GAP over H,W → [B, C]
        else:
            a = o.detach().cpu()
        acts.append(a)
    h = target.register_forward_hook(fhook)
    loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=False, num_workers=2)
    with torch.no_grad():
        for xb, yb in loader:
            imgs_cache.append(xb)
            _ = model(xb.to(device))
    h.remove()
    A = torch.cat(acts, 0).numpy()      # [N, C]
    imgs_cache = torch.cat(imgs_cache, 0)
    # Choose an arbitrary channel to inspect (customize this)
    channel = min(5, A.shape[1]-1)
    idxs = np.argsort(-A[:, channel])[:topk]
    grid = imgs_cache[idxs]
    # unnormalize for viewing (ImageNet stats)
    grid = grid*torch.tensor([0.229,0.224,0.225])[None,:,None,None] + torch.tensor([0.485,0.456,0.406])[None,:,None,None]
    grid = grid.clamp(0,1)
    show(grid, title=f"{label}: top-{topk} images for channel {channel} @ {layer_name}")
In [18]:
img_path = "pishi.png" 
from PIL import Image

models = {}
for name, cfg in MODELS.items():
    m = cfg.ctor()
    models[name] = SimpleNamespace(
        model=m, weights=cfg.weights, act_layers=cfg.act_layers,
        maxact_layer=cfg.maxact_layer, arch=cfg.arch,
        idx_to_label=cfg.weights.meta.get("categories", None)
    )

# Ensure the input image is resized to 224x224
images = {name: load_image(img_path, cfg.weights) for name, cfg in models.items()}
for name, img in images.items():
    assert img.shape[-2:] == (224, 224), f"Image for model {name} is not resized to 224x224"

# 1) First-layer filters comparison
for name, cfg in models.items():
    visualize_first_layer_filters(cfg.model, max_filters=64, label=name)

# 2) Activation maps at key layers
for name, cfg in models.items():
    visualize_activations(cfg.model, images[name], cfg.act_layers, label=name)

# 3) Occlusion sensitivity (same target class per model by default)
for name, cfg in models.items():
    _ = occlusion_heatmap(cfg.model, images[name], idx_to_label=cfg.idx_to_label, patch=32, stride=16, label=name)

# 4) Saliency and Guided Backprop
for name, cfg in models.items():
    _ = saliency_map(cfg.model, images[name], label=name)
    _ = guided_backprop(MODELS[name].ctor, cfg.weights, images[name], label=name)
[Figure: alexnet first-layer conv filters]
[Figure: alexnet activations @ features.0]
[Figure: alexnet activations @ features.3]
[Figure: alexnet occlusion heatmap overlay]
[Figure: alexnet saliency map]
[Figure: alexnet guided backprop]

Transfer Learning on CIFAR Data

In [19]:
# Download and load training data
cifar_tf = transforms.Compose([
    transforms.ToTensor(),
])

data_cifar100 = tv.datasets.CIFAR100(root='./data', train=True,
                                        download=True, transform=cifar_tf)

X_cifar = data_cifar100.data  # numpy array (50000, 32, 32, 3)
y_cifar = np.array(data_cifar100.targets)  # numpy array (50000,)

print('X shape: ', X_cifar.shape)
print('y shape:', y_cifar.shape)
X shape:  (50000, 32, 32, 3)
y shape: (50000,)
In [20]:
fig, ax = plt.subplots(5, 4)
fig.set_size_inches(15, 15)

for i in range(5):
    for j in range(4):
        l = rn.randint(0, len(y_cifar) - 1)
        ax[i, j].imshow(X_cifar[l])
        ax[i, j].set_title('Label: ' + str(y_cifar[l]))
        ax[i, j].grid(False)
        ax[i, j].set_xticks([])
        ax[i, j].set_yticks([])

plt.tight_layout()
plt.show()
[Figure: 5x4 grid of random CIFAR-100 training images with their integer labels]
In [21]:
X_cifar = X_cifar / 255.0

# Convert to PyTorch tensors and reshape to (N, C, H, W) format
X_cifar_tensor = torch.FloatTensor(X_cifar).permute(0, 3, 1, 2)  # (50000, 3, 32, 32)
y_cifar_tensor = torch.LongTensor(y_cifar)

print('X_tensor shape: ', X_cifar_tensor.shape)
print('y_tensor shape:', y_cifar_tensor.shape)
X_tensor shape:  torch.Size([50000, 3, 32, 32])
y_tensor shape: torch.Size([50000])
In [22]:
# Create dataset and split into train/val
dataset_cifar = TensorDataset(X_cifar_tensor, y_cifar_tensor)
train_size_cifar_100 = int(0.9 * len(dataset_cifar))
val_size_cifar = len(dataset_cifar) - train_size_cifar_100
train_dataset_cifar, val_dataset_cifar = random_split(dataset_cifar, [train_size_cifar_100, val_size_cifar])
In [23]:
# Create data loaders
batch_size = 200
train_loader_cifar = DataLoader(train_dataset_cifar, batch_size=batch_size, shuffle=True)
val_loader_cifar = DataLoader(val_dataset_cifar, batch_size=batch_size, shuffle=False)
In [24]:
class CNN_Demo(nn.Module):
    def __init__(self, num_classes=100):
        super(CNN_Demo, self).__init__()
        
        # First convolutional block
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, 
                              kernel_size=3, padding=1)  # 'same' padding
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        
        # Second convolutional block
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, 
                              kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        
        # Fully connected layers
        self.fc1 = nn.Linear(32 * 8 * 8, 200)  # After 2 pooling layers: 32->16->8
        self.fc2 = nn.Linear(200, num_classes)
        
    def forward(self, x):
        # First conv block
        x = F.relu(self.conv1(x))
        x = self.pool1(x)
        x = self.bn1(x)
        
        # Second conv block
        x = F.relu(self.conv2(x))
        x = self.pool2(x)
        
        # Flatten
        x = x.view(x.size(0), -1)
        
        # Fully connected layers
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x
In [25]:
# Create model instance
model = CNN_Demo(num_classes=100).to(device)
In [26]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adagrad(model.parameters(), lr=0.001)

epochs = 2

summary(model, input_size=(1, 3, 32, 32),
        col_names=("input_size","output_size","num_params","kernel_size"),
        depth=4)
Out[26]:
============================================================================================================================================
Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Kernel Shape
============================================================================================================================================
CNN_Demo                                 [1, 3, 32, 32]            [1, 100]                  --                        --
├─Conv2d: 1-1                            [1, 3, 32, 32]            [1, 16, 32, 32]           448                       [3, 3]
├─MaxPool2d: 1-2                         [1, 16, 32, 32]           [1, 16, 16, 16]           --                        2
├─BatchNorm2d: 1-3                       [1, 16, 16, 16]           [1, 16, 16, 16]           32                        --
├─Conv2d: 1-4                            [1, 16, 16, 16]           [1, 32, 16, 16]           4,640                     [3, 3]
├─MaxPool2d: 1-5                         [1, 32, 16, 16]           [1, 32, 8, 8]             --                        2
├─Linear: 1-6                            [1, 2048]                 [1, 200]                  409,800                   --
├─Linear: 1-7                            [1, 200]                  [1, 100]                  20,100                    --
============================================================================================================================================
Total params: 435,020
Trainable params: 435,020
Non-trainable params: 0
Total mult-adds (M): 2.08
============================================================================================================================================
Input size (MB): 0.01
Forward/backward pass size (MB): 0.23
Params size (MB): 1.74
Estimated Total Size (MB): 1.98
============================================================================================================================================
In [27]:
# Training loop
history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

for epoch in range(epochs):
    # Training phase
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    
    for inputs, labels in train_loader_cifar:
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        # Statistics
        train_loss += loss.item() * inputs.size(0)
        _, predicted = torch.max(outputs.data, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()
    
    train_loss = train_loss / train_total
    train_acc = train_correct / train_total
    
    # Validation phase
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    
    with torch.no_grad():
        for inputs, labels in val_loader_cifar:
            inputs, labels = inputs.to(device), labels.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            val_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs.data, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()
    
    val_loss = val_loss / val_total
    val_acc = val_correct / val_total
    
    # Store history
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)
    
    print(f'Epoch [{epoch+1}/{epochs}] - '
          f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, '
          f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
Epoch [1/2] - Train Loss: 3.9809, Train Acc: 0.1101, Val Loss: 3.7447, Val Acc: 0.1572
Epoch [2/2] - Train Loss: 3.5960, Train Acc: 0.1730, Val Loss: 3.5809, Val Acc: 0.1738
In [28]:
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history['train_loss'], label='train')
plt.plot(history['val_loss'], label='val')
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history['train_acc'], label='train')
plt.plot(history['val_acc'], label='val')
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend()

plt.tight_layout()
plt.show()
[Figure: CNN_Demo training/validation loss and accuracy curves]
In [29]:
from torchvision.models import vgg16, VGG16_Weights

# Load pretrained VGG16
base_model = vgg16(weights=VGG16_Weights.IMAGENET1K_V1)

# VGG16 expects 224x224 images, but CIFAR-100 is 32x32
# We'll modify the architecture to work with 32x32 images

class VGG16Transfer(nn.Module):
    def __init__(self, num_classes=100):
        super(VGG16Transfer, self).__init__()
        
        # Use VGG16 features (convolutional layers)
        vgg = vgg16(weights=VGG16_Weights.IMAGENET1K_V1)
        self.features = vgg.features
        
        # Modify classifier for CIFAR-100
        # After VGG features with 32x32 input, we get 1x1x512
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(512, 200)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(200, num_classes)
        
    def forward(self, x):
        x = self.features(x)
        x = self.flatten(x)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x
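
The 512-input fc1 above assumes that VGG16's five stride-2 max-pools collapse a 32x32 image to a 1x1 spatial map with 512 channels (32 -> 16 -> 8 -> 4 -> 2 -> 1). A quick check on the pretrained backbone (a sketch):

with torch.no_grad():
    out = base_model.features(torch.zeros(1, 3, 32, 32))
print(out.shape)   # torch.Size([1, 512, 1, 1]) -> 512 features after flattening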
In [30]:
# Create transfer learning model
transfer_model = VGG16Transfer(num_classes=100).to(device)

# Freeze the feature extraction layers
for param in transfer_model.features.parameters():
    param.requires_grad = False

# Only train the classifier layers (the frozen features receive no gradients)
optimizer_transfer = optim.Adagrad(
    filter(lambda p: p.requires_grad, transfer_model.parameters()), lr=0.001)
criterion_transfer = nn.CrossEntropyLoss()
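
To confirm that the freeze took effect, a hedged sketch that counts trainable versus total parameters; only the two new Linear layers should require gradients:

n_total = sum(p.numel() for p in transfer_model.parameters())
n_train = sum(p.numel() for p in transfer_model.parameters() if p.requires_grad)
print(f'trainable: {n_train:,} / {n_total:,}')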
In [31]:
epochs_transfer = 2
history_transfer = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

for epoch in range(epochs_transfer):
    # Training phase
    transfer_model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    
    for inputs, labels in train_loader_cifar:
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer_transfer.zero_grad()
        outputs = transfer_model(inputs)
        loss = criterion_transfer(outputs, labels)
        loss.backward()
        optimizer_transfer.step()
        
        train_loss += loss.item() * inputs.size(0)
        _, predicted = torch.max(outputs.data, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()
    
    train_loss = train_loss / train_total
    train_acc = train_correct / train_total
    
    # Validation phase
    transfer_model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    
    with torch.no_grad():
        for inputs, labels in val_loader_cifar:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = transfer_model(inputs)
            loss = criterion_transfer(outputs, labels)
            
            val_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs.data, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()
    
    val_loss = val_loss / val_total
    val_acc = val_correct / val_total
    
    history_transfer['train_loss'].append(train_loss)
    history_transfer['train_acc'].append(train_acc)
    history_transfer['val_loss'].append(val_loss)
    history_transfer['val_acc'].append(val_acc)
    
    print(f'Epoch [{epoch+1}/{epochs_transfer}] - '
          f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, '
          f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
Epoch [1/2] - Train Loss: 4.0653, Train Acc: 0.1482, Val Loss: 3.7265, Val Acc: 0.2076
Epoch [2/2] - Train Loss: 3.5416, Train Acc: 0.2287, Val Loss: 3.4285, Val Acc: 0.2386
In [32]:
# Plot transfer learning results
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history_transfer['train_loss'], label='train')
plt.plot(history_transfer['val_loss'], label='val')
plt.title('Transfer Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history_transfer['train_acc'], label='train')
plt.plot(history_transfer['val_acc'], label='val')
plt.title('Transfer Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend()

plt.tight_layout()
plt.show()
[Figure: transfer model training/validation loss and accuracy curves]
In [33]:
# Select a random image
img_idx = rn.randint(0, len(X_cifar_tensor) - 1)
img = X_cifar_tensor[img_idx:img_idx+1].to(device)

# Display original image
plt.imshow(X_cifar[img_idx])
plt.grid(False)
plt.title('Original Image')
plt.show()

# Hook to capture intermediate layer outputs
activation = {}

def get_activation(name):
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook

# Register hooks for convolutional layers
transfer_model.features[2].register_forward_hook(get_activation('conv1'))
transfer_model.features[7].register_forward_hook(get_activation('conv2'))
transfer_model.features[12].register_forward_hook(get_activation('conv3'))
transfer_model.features[19].register_forward_hook(get_activation('conv4'))
transfer_model.features[26].register_forward_hook(get_activation('conv5'))

# Forward pass
transfer_model.eval()
with torch.no_grad():
    _ = transfer_model(img)

# Visualize feature maps
layer_names = ['conv1', 'conv2', 'conv3', 'conv4', 'conv5']

for layer_name in layer_names:
    if layer_name in activation:
        feature_maps = activation[layer_name].cpu().numpy()[0]  # (num_channels, H, W)
        num_filters = min(64, feature_maps.shape[0])
        
        fig, axes = plt.subplots(8, 8, figsize=(10, 10))
        fig.suptitle(f'Feature Maps from {layer_name}', fontsize=16)
        
        for i in range(8):
            for j in range(8):
                idx = i * 8 + j
                if idx < num_filters:
                    axes[i, j].imshow(feature_maps[idx], cmap='viridis')
                    axes[i, j].set_title(f'K{idx}', fontsize=8)
                axes[i, j].axis('off')
        
        plt.tight_layout()
        plt.show()
[Figure: original CIFAR-100 image]
[Figure: feature maps from conv1]
[Figure: feature maps from conv2]
[Figure: feature maps from conv3]
[Figure: feature maps from conv4]
[Figure: feature maps from conv5]
In [34]:
# Load CIFAR-10 dataset
cifar10_tf = transforms.Compose([
    transforms.ToTensor(),
])

# Download and load training data
data_cifar_10 = tv.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=cifar10_tf)

X_cifar_10 = data_cifar_10.data  # numpy array (50000, 32, 32, 3)
y_cifar_10 = np.array(data_cifar_10.targets)  # numpy array (50000,)

print("shape of X:", X_cifar_10.shape)
print("shape of y:", y_cifar_10.shape)
shape of X: (50000, 32, 32, 3)
shape of y: (50000,)
In [35]:
# Normalize all entries to the interval [0, 1]
X_cifar_10 = X_cifar_10 / 255.0

# Convert to PyTorch tensors and reshape to (N, C, H, W) format
X_cifar_10_tensor = torch.FloatTensor(X_cifar_10).permute(0, 3, 1, 2)  # (50000, 3, 32, 32)
y_cifar_10_tensor = torch.LongTensor(y_cifar_10)

print("X_tensor shape:", X_cifar_10_tensor.shape)
print("y_tensor shape:", y_cifar_10_tensor.shape)
X_tensor shape: torch.Size([50000, 3, 32, 32])
y_tensor shape: torch.Size([50000])
In [36]:
# Create dataset and split into train/val
dataset_cifar_10 = TensorDataset(X_cifar_10_tensor, y_cifar_10_tensor)
train_size_cifar_10 = int(0.8 * len(dataset_cifar_10))
val_size_cifar_10 = len(dataset_cifar_10) - train_size_cifar_10
train_dataset_cifar_10, val_dataset_cifar_10 = random_split(dataset_cifar_10, [train_size_cifar_10, val_size_cifar_10])

# Create data loaders
batch_size = 100
train_loader = DataLoader(train_dataset_cifar_10, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset_cifar_10, batch_size=batch_size, shuffle=False)
In [37]:
from torchvision.models import vgg16, VGG16_Weights

# Load pretrained VGG16
base_model = vgg16(weights=VGG16_Weights.IMAGENET1K_V1)

# Display the summary of the VGG16 model with the correct input size
summary(base_model, input_size=(1, 3, 224, 224), col_names=("input_size","output_size","num_params","kernel_size"))
Out[37]:
============================================================================================================================================
Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Kernel Shape
============================================================================================================================================
VGG                                      [1, 3, 224, 224]          [1, 1000]                 --                        --
├─Sequential: 1-1                        [1, 3, 224, 224]          [1, 512, 7, 7]            --                        --
│    └─Conv2d: 2-1                       [1, 3, 224, 224]          [1, 64, 224, 224]         1,792                     [3, 3]
│    └─ReLU: 2-2                         [1, 64, 224, 224]         [1, 64, 224, 224]         --                        --
│    └─Conv2d: 2-3                       [1, 64, 224, 224]         [1, 64, 224, 224]         36,928                    [3, 3]
│    └─ReLU: 2-4                         [1, 64, 224, 224]         [1, 64, 224, 224]         --                        --
│    └─MaxPool2d: 2-5                    [1, 64, 224, 224]         [1, 64, 112, 112]         --                        2
│    └─Conv2d: 2-6                       [1, 64, 112, 112]         [1, 128, 112, 112]        73,856                    [3, 3]
│    └─ReLU: 2-7                         [1, 128, 112, 112]        [1, 128, 112, 112]        --                        --
│    └─Conv2d: 2-8                       [1, 128, 112, 112]        [1, 128, 112, 112]        147,584                   [3, 3]
│    └─ReLU: 2-9                         [1, 128, 112, 112]        [1, 128, 112, 112]        --                        --
│    └─MaxPool2d: 2-10                   [1, 128, 112, 112]        [1, 128, 56, 56]          --                        2
│    └─Conv2d: 2-11                      [1, 128, 56, 56]          [1, 256, 56, 56]          295,168                   [3, 3]
│    └─ReLU: 2-12                        [1, 256, 56, 56]          [1, 256, 56, 56]          --                        --
│    └─Conv2d: 2-13                      [1, 256, 56, 56]          [1, 256, 56, 56]          590,080                   [3, 3]
│    └─ReLU: 2-14                        [1, 256, 56, 56]          [1, 256, 56, 56]          --                        --
│    └─Conv2d: 2-15                      [1, 256, 56, 56]          [1, 256, 56, 56]          590,080                   [3, 3]
│    └─ReLU: 2-16                        [1, 256, 56, 56]          [1, 256, 56, 56]          --                        --
│    └─MaxPool2d: 2-17                   [1, 256, 56, 56]          [1, 256, 28, 28]          --                        2
│    └─Conv2d: 2-18                      [1, 256, 28, 28]          [1, 512, 28, 28]          1,180,160                 [3, 3]
│    └─ReLU: 2-19                        [1, 512, 28, 28]          [1, 512, 28, 28]          --                        --
│    └─Conv2d: 2-20                      [1, 512, 28, 28]          [1, 512, 28, 28]          2,359,808                 [3, 3]
│    └─ReLU: 2-21                        [1, 512, 28, 28]          [1, 512, 28, 28]          --                        --
│    └─Conv2d: 2-22                      [1, 512, 28, 28]          [1, 512, 28, 28]          2,359,808                 [3, 3]
│    └─ReLU: 2-23                        [1, 512, 28, 28]          [1, 512, 28, 28]          --                        --
│    └─MaxPool2d: 2-24                   [1, 512, 28, 28]          [1, 512, 14, 14]          --                        2
│    └─Conv2d: 2-25                      [1, 512, 14, 14]          [1, 512, 14, 14]          2,359,808                 [3, 3]
│    └─ReLU: 2-26                        [1, 512, 14, 14]          [1, 512, 14, 14]          --                        --
│    └─Conv2d: 2-27                      [1, 512, 14, 14]          [1, 512, 14, 14]          2,359,808                 [3, 3]
│    └─ReLU: 2-28                        [1, 512, 14, 14]          [1, 512, 14, 14]          --                        --
│    └─Conv2d: 2-29                      [1, 512, 14, 14]          [1, 512, 14, 14]          2,359,808                 [3, 3]
│    └─ReLU: 2-30                        [1, 512, 14, 14]          [1, 512, 14, 14]          --                        --
│    └─MaxPool2d: 2-31                   [1, 512, 14, 14]          [1, 512, 7, 7]            --                        2
├─AdaptiveAvgPool2d: 1-2                 [1, 512, 7, 7]            [1, 512, 7, 7]            --                        --
├─Sequential: 1-3                        [1, 25088]                [1, 1000]                 --                        --
│    └─Linear: 2-32                      [1, 25088]                [1, 4096]                 102,764,544               --
│    └─ReLU: 2-33                        [1, 4096]                 [1, 4096]                 --                        --
│    └─Dropout: 2-34                     [1, 4096]                 [1, 4096]                 --                        --
│    └─Linear: 2-35                      [1, 4096]                 [1, 4096]                 16,781,312                --
│    └─ReLU: 2-36                        [1, 4096]                 [1, 4096]                 --                        --
│    └─Dropout: 2-37                     [1, 4096]                 [1, 4096]                 --                        --
│    └─Linear: 2-38                      [1, 4096]                 [1, 1000]                 4,097,000                 --
============================================================================================================================================
Total params: 138,357,544
Trainable params: 138,357,544
Non-trainable params: 0
Total mult-adds (G): 15.48
============================================================================================================================================
Input size (MB): 0.60
Forward/backward pass size (MB): 108.45
Params size (MB): 553.43
Estimated Total Size (MB): 662.49
============================================================================================================================================
In [38]:
class VGG16Classifier(nn.Module):
    def __init__(self, num_classes=10, freeze_features=True):
        super(VGG16Classifier, self).__init__()
        
        # Load pretrained VGG16
        vgg = vgg16(weights=VGG16_Weights.IMAGENET1K_V1)
        
        # Use VGG16 features (convolutional layers)
        self.features = vgg.features
        
        # Freeze the pretrained weights if specified
        if freeze_features:
            for param in self.features.parameters():
                param.requires_grad = False
        
        # Custom classifier for CIFAR10
        # After features with 32x32 input, we get 1x1x512
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(512, 60),
            nn.ReLU(inplace=True),
            nn.Linear(60, num_classes)
        )
        
    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x
In [39]:
# Create model instance
model = VGG16Classifier(num_classes=10, freeze_features=True).to(device)
In [40]:
summary(base_model, input_size=(1, 3, 224, 224), col_names=("input_size","output_size","num_params","kernel_size"))
Out[40]:
============================================================================================================================================
Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Kernel Shape
============================================================================================================================================
VGG                                      [1, 3, 224, 224]          [1, 1000]                 --                        --
├─Sequential: 1-1                        [1, 3, 224, 224]          [1, 512, 7, 7]            --                        --
│    └─Conv2d: 2-1                       [1, 3, 224, 224]          [1, 64, 224, 224]         1,792                     [3, 3]
│    └─ReLU: 2-2                         [1, 64, 224, 224]         [1, 64, 224, 224]         --                        --
│    └─Conv2d: 2-3                       [1, 64, 224, 224]         [1, 64, 224, 224]         36,928                    [3, 3]
│    └─ReLU: 2-4                         [1, 64, 224, 224]         [1, 64, 224, 224]         --                        --
│    └─MaxPool2d: 2-5                    [1, 64, 224, 224]         [1, 64, 112, 112]         --                        2
│    └─Conv2d: 2-6                       [1, 64, 112, 112]         [1, 128, 112, 112]        73,856                    [3, 3]
│    └─ReLU: 2-7                         [1, 128, 112, 112]        [1, 128, 112, 112]        --                        --
│    └─Conv2d: 2-8                       [1, 128, 112, 112]        [1, 128, 112, 112]        147,584                   [3, 3]
│    └─ReLU: 2-9                         [1, 128, 112, 112]        [1, 128, 112, 112]        --                        --
│    └─MaxPool2d: 2-10                   [1, 128, 112, 112]        [1, 128, 56, 56]          --                        2
│    └─Conv2d: 2-11                      [1, 128, 56, 56]          [1, 256, 56, 56]          295,168                   [3, 3]
│    └─ReLU: 2-12                        [1, 256, 56, 56]          [1, 256, 56, 56]          --                        --
│    └─Conv2d: 2-13                      [1, 256, 56, 56]          [1, 256, 56, 56]          590,080                   [3, 3]
│    └─ReLU: 2-14                        [1, 256, 56, 56]          [1, 256, 56, 56]          --                        --
│    └─Conv2d: 2-15                      [1, 256, 56, 56]          [1, 256, 56, 56]          590,080                   [3, 3]
│    └─ReLU: 2-16                        [1, 256, 56, 56]          [1, 256, 56, 56]          --                        --
│    └─MaxPool2d: 2-17                   [1, 256, 56, 56]          [1, 256, 28, 28]          --                        2
│    └─Conv2d: 2-18                      [1, 256, 28, 28]          [1, 512, 28, 28]          1,180,160                 [3, 3]
│    └─ReLU: 2-19                        [1, 512, 28, 28]          [1, 512, 28, 28]          --                        --
│    └─Conv2d: 2-20                      [1, 512, 28, 28]          [1, 512, 28, 28]          2,359,808                 [3, 3]
│    └─ReLU: 2-21                        [1, 512, 28, 28]          [1, 512, 28, 28]          --                        --
│    └─Conv2d: 2-22                      [1, 512, 28, 28]          [1, 512, 28, 28]          2,359,808                 [3, 3]
│    └─ReLU: 2-23                        [1, 512, 28, 28]          [1, 512, 28, 28]          --                        --
│    └─MaxPool2d: 2-24                   [1, 512, 28, 28]          [1, 512, 14, 14]          --                        2
│    └─Conv2d: 2-25                      [1, 512, 14, 14]          [1, 512, 14, 14]          2,359,808                 [3, 3]
│    └─ReLU: 2-26                        [1, 512, 14, 14]          [1, 512, 14, 14]          --                        --
│    └─Conv2d: 2-27                      [1, 512, 14, 14]          [1, 512, 14, 14]          2,359,808                 [3, 3]
│    └─ReLU: 2-28                        [1, 512, 14, 14]          [1, 512, 14, 14]          --                        --
│    └─Conv2d: 2-29                      [1, 512, 14, 14]          [1, 512, 14, 14]          2,359,808                 [3, 3]
│    └─ReLU: 2-30                        [1, 512, 14, 14]          [1, 512, 14, 14]          --                        --
│    └─MaxPool2d: 2-31                   [1, 512, 14, 14]          [1, 512, 7, 7]            --                        2
├─AdaptiveAvgPool2d: 1-2                 [1, 512, 7, 7]            [1, 512, 7, 7]            --                        --
├─Sequential: 1-3                        [1, 25088]                [1, 1000]                 --                        --
│    └─Linear: 2-32                      [1, 25088]                [1, 4096]                 102,764,544               --
│    └─ReLU: 2-33                        [1, 4096]                 [1, 4096]                 --                        --
│    └─Dropout: 2-34                     [1, 4096]                 [1, 4096]                 --                        --
│    └─Linear: 2-35                      [1, 4096]                 [1, 4096]                 16,781,312                --
│    └─ReLU: 2-36                        [1, 4096]                 [1, 4096]                 --                        --
│    └─Dropout: 2-37                     [1, 4096]                 [1, 4096]                 --                        --
│    └─Linear: 2-38                      [1, 4096]                 [1, 1000]                 4,097,000                 --
============================================================================================================================================
Total params: 138,357,544
Trainable params: 138,357,544
Non-trainable params: 0
Total mult-adds (G): 15.48
============================================================================================================================================
Input size (MB): 0.60
Forward/backward pass size (MB): 108.45
Params size (MB): 553.43
Estimated Total Size (MB): 662.49
============================================================================================================================================
In [41]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
In [42]:
# VGG16 preprocessing - normalize with ImageNet mean and std
imagenet_mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(device)
imagenet_std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(device)

def preprocess_vgg(x):
    """Preprocess input for VGG16 (ImageNet normalization)"""
    return (x - imagenet_mean) / imagenet_std
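
preprocess_vgg is just the standard ImageNet channel-wise normalization applied to an already-batched tensor; an equivalent formulation with torchvision's Normalize (a sketch) would be:

imagenet_norm = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
x = torch.rand(4, 3, 32, 32, device=device)
assert torch.allclose(preprocess_vgg(x), imagenet_norm(x), atol=1e-6)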
In [43]:
def train_model(model, train_loader, val_loader, criterion, optimizer, 
                epochs=2, preprocess_fn=None):
    """Train the model and return history"""
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
    
    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            # Apply preprocessing if provided
            if preprocess_fn:
                inputs = preprocess_fn(inputs)
            
            # Zero the parameter gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            
            # Statistics
            train_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs.data, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()
        
        train_loss = train_loss / train_total
        train_acc = train_correct / train_total
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                
                # Apply preprocessing if provided
                if preprocess_fn:
                    inputs = preprocess_fn(inputs)
                
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                
                val_loss += loss.item() * inputs.size(0)
                _, predicted = torch.max(outputs.data, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()
        
        val_loss = val_loss / val_total
        val_acc = val_correct / val_total
        
        # Store history
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        
        print(f'Epoch [{epoch+1}/{epochs}] - '
              f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, '
              f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
    
    return history

# Train the model
print("\nTraining initial model...")
model_info = train_model(model, train_loader, val_loader, criterion, 
                         optimizer, epochs=2, preprocess_fn=preprocess_vgg)

def plot_losses(history):
    """Plot training and validation loss"""
    plt.figure(figsize=(10, 4))
    plt.plot(history['train_loss'], label='Train')
    plt.plot(history['val_loss'], label='Val')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.title('Model Loss')
    plt.legend()
    plt.grid(True)
    plt.show()

def plot_accuracies(history):
    """Plot training and validation accuracy"""
    plt.figure(figsize=(10, 4))
    plt.plot(history['train_acc'], label='Train')
    plt.plot(history['val_acc'], label='Val')
    plt.title('Model Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()
    plt.grid(True)
    plt.show()

# Plot losses and accuracies
plot_losses(model_info)
plot_accuracies(model_info)
Training initial model...
Epoch [1/2] - Train Loss: 1.0801, Train Acc: 0.6266, Val Loss: 0.9473, Val Acc: 0.6694
Epoch [2/2] - Train Loss: 0.8961, Train Acc: 0.6856, Val Loss: 0.9253, Val Acc: 0.6754
[Figure: VGG16 transfer model loss per epoch]
[Figure: VGG16 transfer model accuracy per epoch]

Denoising Autoencoder

In [44]:
# Denormalize the MNIST images back to [0, 1]
x_train_mnist_np = np.array([x[0].numpy() for x in x_train_mnist]) * 0.3081 + 0.1307
x_test_mnist_np = np.array([x[0].numpy() for x in x_test_mnist]) * 0.3081 + 0.1307

# Generate corrupted MNIST images by adding noise with normal distribution
noise = np.random.normal(loc=0.0, scale=0.5, size=x_train_mnist_np.shape)
x_train_noisy = x_train_mnist_np + noise
noise = np.random.normal(loc=0.0, scale=0.5, size=x_test_mnist_np.shape)
x_test_noisy = x_test_mnist_np + noise

# Clip values to [0, 1]
x_train_noisy = np.clip(x_train_noisy, 0., 1.)
x_test_noisy = np.clip(x_test_noisy, 0., 1.)

# Display the first 25 corrupted and original images
rows, cols = 5, 5
num = rows * cols
imgs = np.concatenate([x_train_mnist_np[:num], x_train_noisy[:num]])
imgs = imgs.reshape((rows * 2, cols, image_size, image_size))
imgs = np.vstack(np.split(imgs, rows, axis=1))
imgs = imgs.reshape((rows * 2, -1, image_size, image_size))
imgs = np.vstack([np.hstack(i) for i in imgs])

plt.figure(figsize=(10, 10))
plt.axis('off')
plt.title('Pairs of rows: original images on top, corrupted inputs below')
plt.imshow(imgs, interpolation='none', cmap='gray')
plt.savefig('original_vs_noisy.png', bbox_inches='tight', dpi=150)
plt.show()
[Figure: original MNIST digits paired with their noise-corrupted versions]
In [45]:
# Convert to PyTorch tensors with shape (N, C, H, W)
x_train_tensor       = torch.from_numpy(x_train_mnist_np).float()
x_train_noisy_tensor = torch.from_numpy(x_train_noisy).float()
x_test_tensor        = torch.from_numpy(x_test_mnist_np).float()
x_test_noisy_tensor  = torch.from_numpy(x_test_noisy).float()
In [46]:
# Create datasets and dataloaders
train_dataset = TensorDataset(x_train_noisy_tensor, x_train_tensor)
test_dataset = TensorDataset(x_test_noisy_tensor, x_test_tensor)

batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
In [47]:
# Network parameters
input_shape = (1, image_size, image_size)  # (C, H, W) format for PyTorch
kernel_size = 3
latent_dim = 16
# Encoder/Decoder number of CNN layers and filters per layer
layer_filters = [32, 64]
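The 28 → 14 → 7 spatial sizes assumed in the encoder and decoder below follow from the usual convolution output formula, out = ⌊(n + 2·padding − kernel) / stride⌋ + 1; a quick check:

def conv_out(n, kernel=3, stride=2, padding=1):
    # output size of a 2D convolution along one spatial dimension
    return (n + 2 * padding - kernel) // stride + 1

print(conv_out(28), conv_out(conv_out(28)))   # 14 7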
In [48]:
class Encoder(nn.Module):
    def __init__(self, latent_dim=16, layer_filters=[32, 64], kernel_size=3):
        super(Encoder, self).__init__()
        
        self.latent_dim = latent_dim
        
        # Stack of Conv2D blocks
        layers = []
        in_channels = 1
        for filters in layer_filters:
            layers.append(nn.Conv2d(in_channels, filters, kernel_size, 
                                   stride=2, padding=1))
            layers.append(nn.ReLU(inplace=True))
            in_channels = filters
        
        self.conv_layers = nn.Sequential(*layers)
        
        # Calculate the flattened size after convolutions
        # After each stride=2 conv with padding=1: size -> (size + 1) // 2
        # 28 -> 14 -> 7
        self.flatten_size = layer_filters[-1] * 7 * 7
        
        # Latent vector layer
        self.fc = nn.Linear(self.flatten_size, latent_dim)
        
    def forward(self, x):
        x = self.conv_layers(x)
        # Record the pre-flatten shape (kept for reference; the decoder below hard-codes 64x7x7)
        self.shape_before_flatten = x.shape
        x = x.view(x.size(0), -1)  # Flatten
        latent = self.fc(x)
        return latent

# Instantiate Encoder
encoder = Encoder(latent_dim=latent_dim, layer_filters=layer_filters, 
                 kernel_size=kernel_size).to(device)

print("Encoder Architecture:")
# torchinfo expects (batch_size, channels, height, width); a batch size of 1 is enough for the summary
summary(encoder, input_size=(1, 1, 28, 28), col_names=("input_size", "output_size", "num_params"))
Encoder Architecture:
Out[48]:
===================================================================================================================
Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
===================================================================================================================
Encoder                                  [1, 1, 28, 28]            [1, 16]                   --
├─Sequential: 1-1                        [1, 1, 28, 28]            [1, 64, 7, 7]             --
│    └─Conv2d: 2-1                       [1, 1, 28, 28]            [1, 32, 14, 14]           320
│    └─ReLU: 2-2                         [1, 32, 14, 14]           [1, 32, 14, 14]           --
│    └─Conv2d: 2-3                       [1, 32, 14, 14]           [1, 64, 7, 7]             18,496
│    └─ReLU: 2-4                         [1, 64, 7, 7]             [1, 64, 7, 7]             --
├─Linear: 1-2                            [1, 3136]                 [1, 16]                   50,192
===================================================================================================================
Total params: 69,008
Trainable params: 69,008
Non-trainable params: 0
Total mult-adds (M): 1.02
===================================================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 0.08
Params size (MB): 0.28
Estimated Total Size (MB): 0.35
===================================================================================================================
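A quick sanity check that the encoder maps a batch of images to a batch of 16-dimensional codes:

with torch.no_grad():
    z = encoder(torch.randn(4, 1, 28, 28).to(device))
print(z.shape)   # expected: torch.Size([4, 16])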
In [49]:
class Decoder(nn.Module):
    def __init__(self, latent_dim=16, layer_filters=[32, 64], kernel_size=3):
        super(Decoder, self).__init__()
        
        # Calculate the size after encoder
        # 28 -> 14 -> 7 (shape: [batch, 64, 7, 7])
        self.shape_h = 7
        self.shape_w = 7
        self.shape_c = layer_filters[-1]
        
        # Dense layer to reshape from latent vector
        self.fc = nn.Linear(latent_dim, self.shape_c * self.shape_h * self.shape_w)
        
        # Stack of Transposed Conv2D blocks (reverse order)
        layers = []
        reversed_filters = layer_filters[::-1]
        
        for i in range(len(reversed_filters)):
            in_channels = reversed_filters[i]
            out_channels = reversed_filters[i+1] if i+1 < len(reversed_filters) else 1
            
            if i < len(reversed_filters) - 1:
                # Intermediate layers with ReLU
                layers.append(nn.ConvTranspose2d(in_channels, out_channels, 
                                                kernel_size, stride=2, 
                                                padding=1, output_padding=1))
                layers.append(nn.ReLU(inplace=True))
            else:
                # Last transposed conv to get back to original size
                layers.append(nn.ConvTranspose2d(in_channels, out_channels, 
                                                kernel_size, stride=2, 
                                                padding=1, output_padding=1))
        
        # Final 3x3 conv (padding=1 keeps the 28x28 size) to refine the output, then sigmoid to map pixels into [0, 1]
        layers.append(nn.Conv2d(1, 1, kernel_size, padding=1))
        layers.append(nn.Sigmoid())
        
        self.deconv_layers = nn.Sequential(*layers)
        
    def forward(self, latent):
        x = self.fc(latent)
        x = x.view(-1, self.shape_c, self.shape_h, self.shape_w)
        x = self.deconv_layers(x)
        return x

# Instantiate Decoder
decoder = Decoder(latent_dim=latent_dim, layer_filters=layer_filters, 
                 kernel_size=kernel_size).to(device)

print("\nDecoder Architecture:")
summary(decoder, input_size=(latent_dim,), col_names=("input_size", "output_size", "num_params"))
Decoder Architecture:
Out[49]:
===================================================================================================================
Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
===================================================================================================================
Decoder                                  [16]                      [1, 1, 28, 28]            --
├─Linear: 1-1                            [16]                      [3136]                    53,312
├─Sequential: 1-2                        [1, 64, 7, 7]             [1, 1, 28, 28]            --
│    └─ConvTranspose2d: 2-1              [1, 64, 7, 7]             [1, 32, 14, 14]           18,464
│    └─ReLU: 2-2                         [1, 32, 14, 14]           [1, 32, 14, 14]           --
│    └─ConvTranspose2d: 2-3              [1, 32, 14, 14]           [1, 1, 28, 28]            289
│    └─Conv2d: 2-4                       [1, 1, 28, 28]            [1, 1, 28, 28]            10
│    └─Sigmoid: 2-5                      [1, 1, 28, 28]            [1, 1, 28, 28]            --
===================================================================================================================
Total params: 72,075
Trainable params: 72,075
Non-trainable params: 0
Total mult-adds (M): 171.04
===================================================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 0.09
Params size (MB): 0.29
Estimated Total Size (MB): 0.38
===================================================================================================================
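And the corresponding check for the decoder: random latent vectors should decode to 28×28 images whose pixels lie in [0, 1] thanks to the final sigmoid:

with torch.no_grad():
    recon = decoder(torch.randn(4, latent_dim).to(device))
print(recon.shape, recon.min().item() >= 0, recon.max().item() <= 1)
# expected: torch.Size([4, 1, 28, 28]) True True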
In [50]:
class Autoencoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(Autoencoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, x):
        latent = self.encoder(x)
        reconstructed = self.decoder(latent)
        return reconstructed

# Instantiate Autoencoder
autoencoder = Autoencoder(encoder, decoder).to(device)

print("\nComplete Autoencoder:")
summary(autoencoder, input_size=(1, 1, 28, 28), col_names=("input_size", "output_size", "num_params"))
Complete Autoencoder:
Out[50]:
===================================================================================================================
Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
===================================================================================================================
Autoencoder                              [1, 1, 28, 28]            [1, 1, 28, 28]            --
├─Encoder: 1-1                           [1, 1, 28, 28]            [1, 16]                   --
│    └─Sequential: 2-1                   [1, 1, 28, 28]            [1, 64, 7, 7]             --
│    │    └─Conv2d: 3-1                  [1, 1, 28, 28]            [1, 32, 14, 14]           320
│    │    └─ReLU: 3-2                    [1, 32, 14, 14]           [1, 32, 14, 14]           --
│    │    └─Conv2d: 3-3                  [1, 32, 14, 14]           [1, 64, 7, 7]             18,496
│    │    └─ReLU: 3-4                    [1, 64, 7, 7]             [1, 64, 7, 7]             --
│    └─Linear: 2-2                       [1, 3136]                 [1, 16]                   50,192
├─Decoder: 1-2                           [1, 16]                   [1, 1, 28, 28]            --
│    └─Linear: 2-3                       [1, 16]                   [1, 3136]                 53,312
│    └─Sequential: 2-4                   [1, 64, 7, 7]             [1, 1, 28, 28]            --
│    │    └─ConvTranspose2d: 3-5         [1, 64, 7, 7]             [1, 32, 14, 14]           18,464
│    │    └─ReLU: 3-6                    [1, 32, 14, 14]           [1, 32, 14, 14]           --
│    │    └─ConvTranspose2d: 3-7         [1, 32, 14, 14]           [1, 1, 28, 28]            289
│    │    └─Conv2d: 3-8                  [1, 1, 28, 28]            [1, 1, 28, 28]            10
│    │    └─Sigmoid: 3-9                 [1, 1, 28, 28]            [1, 1, 28, 28]            --
===================================================================================================================
Total params: 141,083
Trainable params: 141,083
Non-trainable params: 0
Total mult-adds (M): 4.93
===================================================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 0.16
Params size (MB): 0.56
Estimated Total Size (MB): 0.73
===================================================================================================================
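The autoencoder simply composes the two modules, so its parameter count should equal the sum of the encoder's and decoder's (69,008 + 72,075 = 141,083, matching the summaries above):

n_enc = sum(p.numel() for p in encoder.parameters())
n_dec = sum(p.numel() for p in decoder.parameters())
n_ae  = sum(p.numel() for p in autoencoder.parameters())
print(n_enc, n_dec, n_ae, n_enc + n_dec == n_ae)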
In [51]:
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters())

# Training function
def train_autoencoder(model, train_loader, test_loader, criterion, optimizer, epochs=2):
    """Train the autoencoder"""
    history = {'train_loss': [], 'val_loss': []}
    
    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        
        for noisy_imgs, clean_imgs in train_loader:
            noisy_imgs = noisy_imgs.to(device)
            clean_imgs = clean_imgs.to(device)
            
            # Zero gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(noisy_imgs)
            loss = criterion(outputs, clean_imgs)
            
            # Backward pass
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item() * noisy_imgs.size(0)
        
        train_loss = train_loss / len(train_loader.dataset)
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        
        with torch.no_grad():
            for noisy_imgs, clean_imgs in test_loader:
                noisy_imgs = noisy_imgs.to(device)
                clean_imgs = clean_imgs.to(device)
                
                outputs = model(noisy_imgs)
                loss = criterion(outputs, clean_imgs)
                
                val_loss += loss.item() * noisy_imgs.size(0)
        
        val_loss = val_loss / len(test_loader.dataset)
        
        # Store history
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        
        print(f'Epoch [{epoch+1}/{epochs}] - '
              f'Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}')
    
    return history

# Train the autoencoder
print("\nTraining autoencoder...")
history = train_autoencoder(autoencoder, train_loader, test_loader, 
                           criterion, optimizer, epochs=2)

# Plot training history
plt.figure(figsize=(10, 4))
plt.plot(history['train_loss'], label='Train Loss')
plt.plot(history['val_loss'], label='Val Loss')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.title('Training History')
plt.legend()
plt.grid(True)
plt.savefig('training_history.png', bbox_inches='tight', dpi=150)
plt.show()
Training autoencoder...
Epoch [1/2] - Train Loss: 0.044972, Val Loss: 0.025023
Epoch [2/2] - Train Loss: 0.023074, Val Loss: 0.020574
[Figure: autoencoder training and validation MSE loss per epoch]
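Because the pixel values lie in [0, 1], the validation MSE can also be read as a rough peak signal-to-noise ratio, PSNR = 10·log10(1 / MSE); a small sketch using the history returned above:

final_mse = history['val_loss'][-1]
psnr = 10 * np.log10(1.0 / final_mse)   # peak signal is 1 since pixels are in [0, 1]
print(f'Approximate reconstruction PSNR: {psnr:.2f} dB')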
In [52]:
# Predict the Autoencoder output from corrupted test images
autoencoder.eval()
with torch.no_grad():
    # Run the entire noisy test set through the autoencoder in one pass
    x_test_noisy_batch = x_test_noisy_tensor.to(device)
    x_decoded = autoencoder(x_test_noisy_batch).cpu()

# Convert back to numpy for visualization
x_decoded = x_decoded.numpy()  # shape (N, 1, 28, 28), same layout as the inputs

# Display the first 25 images: original, corrupted, and denoised
rows, cols = 5, 5
num = rows * cols
imgs = np.concatenate([x_test_mnist_np[:num], x_test_noisy[:num], x_decoded[:num]])
imgs = imgs.reshape((rows * 3, cols, image_size, image_size))
imgs = np.vstack(np.split(imgs, rows, axis=1))
imgs = imgs.reshape((rows * 3, -1, image_size, image_size))
imgs = np.vstack([np.hstack(i) for i in imgs])
imgs = (imgs * 255).astype(np.uint8)

plt.figure(figsize=(10, 15))
plt.axis('off')
plt.title('Rows repeat in groups of three: original image, '
          'corrupted input, '
          'denoised output')
plt.imshow(imgs, interpolation='none', cmap='gray')
plt.savefig('denoising_results.png', bbox_inches='tight', dpi=150)
plt.show()
[Figure: original, noise-corrupted, and denoised test digits]
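Since the trained encoder compresses every digit to a 16-dimensional code, those codes can also be inspected directly. A minimal sketch using the TSNE import from the top of the notebook; the subset size (2000) and the t-SNE settings are arbitrary choices:

n = 2000
autoencoder.eval()
with torch.no_grad():
    latents = encoder(x_test_tensor[:n].to(device)).cpu().numpy()
digit_labels = np.array([y for _, y in x_test_mnist])[:n]

codes_2d = TSNE(n_components=2, init='pca', random_state=0).fit_transform(latents)

plt.figure(figsize=(8, 8))
sc = plt.scatter(codes_2d[:, 0], codes_2d[:, 1], c=digit_labels, cmap='tab10', s=8)
plt.colorbar(sc, label='digit')
plt.title('t-SNE of 16-d latent codes (clean test digits)')
plt.show()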