[{"content":"","date":"30 May 2026","externalUrl":null,"permalink":"/","section":"","summary":"","title":"","type":"page"},{"content":"","date":"30 May 2026","externalUrl":null,"permalink":"/categories/","section":"Categories","summary":"","title":"Categories","type":"categories"},{"content":" Setup # import torch import torch.nn as nn import torch.nn.functional as F from torch.utils.data import DataLoader, Subset import torchvision import torchvision.transforms as transforms from torch.cuda.amp import GradScaler, autocast import matplotlib.pyplot as plt import numpy as np from sklearn.metrics import classification_report, confusion_matrix import random import time random.seed(42) np.random.seed(42) torch.manual_seed(42) torch.cuda.manual_seed_all(42) torch.backends.cudnn.benchmark = True device = torch.device(\u0026#34;cuda\u0026#34; if torch.cuda.is_available() else \u0026#34;cpu\u0026#34;) print(f\u0026#34;Device: {device}\u0026#34;) if device.type == \u0026#34;cuda\u0026#34;: print(f\u0026#34;GPU: {torch.cuda.get_device_name(0)}\u0026#34;) vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9 print(f\u0026#34;VRAM: {vram_gb:.1f} GB\u0026#34;) Device: cuda GPU: NVIDIA GeForce RTX 3090 VRAM: 25.3 GB Load Oxford-IIIT Pet # The Oxford-IIIT Pet dataset has 37 cat and dog breeds with roughly 200 images per class at 200-500px resolution. We cap to 6 classes for a manageable but challenging multi-class problem, resize to 224×224 so pretrained models work at their native resolution, and use ImageNet normalization. 
We split the trainval portion into 80% training and 20% validation.\nNUM_CLASSES = 6 IMG_SIZE = 224 DATA_DIR = \u0026#34;/home/migue/.cache\u0026#34; BATCH = 64 imagenet_mean = (0.485, 0.456, 0.406) imagenet_std = (0.229, 0.224, 0.225) transform_train = transforms.Compose([ transforms.Resize((IMG_SIZE, IMG_SIZE)), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize(imagenet_mean, imagenet_std), ]) transform_test = transforms.Compose([ transforms.Resize((IMG_SIZE, IMG_SIZE)), transforms.ToTensor(), transforms.Normalize(imagenet_mean, imagenet_std), ]) # Load full trainval split, then filter to first 6 classes full_dataset = torchvision.datasets.OxfordIIITPet( root=DATA_DIR, split=\u0026#34;trainval\u0026#34;, target_types=\u0026#34;category\u0026#34;, download=True, transform=transform_train, ) # Collect labels within the dataset all_labels = [full_dataset[i][1] for i in range(len(full_dataset))] keep_idx = [i for i, lbl in enumerate(all_labels) if lbl \u0026lt; NUM_CLASSES] filtered = Subset(full_dataset, keep_idx) # Build test set with test split, same class filter test_full = torchvision.datasets.OxfordIIITPet( root=DATA_DIR, split=\u0026#34;test\u0026#34;, target_types=\u0026#34;category\u0026#34;, download=True, transform=transform_test, ) test_labels = [test_full[i][1] for i in range(len(test_full))] test_idx = [i for i, lbl in enumerate(test_labels) if lbl \u0026lt; NUM_CLASSES] test_set = Subset(test_full, test_idx) # Train/val split indices = list(range(len(filtered))) random.shuffle(indices) split = int(0.8 * len(indices)) train_set = Subset(filtered, indices[:split]) val_set = Subset(filtered, indices[split:]) train_loader = DataLoader(train_set, batch_size=BATCH, shuffle=True, pin_memory=True, num_workers=4) val_loader = DataLoader(val_set, batch_size=BATCH, shuffle=False, pin_memory=True, num_workers=4) test_loader = DataLoader(test_set, batch_size=BATCH, shuffle=False, pin_memory=True, num_workers=4) classes = 
(\u0026#39;Abyssinian\u0026#39;, \u0026#39;american_bulldog\u0026#39;, \u0026#39;american_pit_bull\u0026#39;, \u0026#39;basset_hound\u0026#39;, \u0026#39;beagle\u0026#39;, \u0026#39;Bengal\u0026#39;) print(f\u0026#34;Train: {len(train_set)} Val: {len(val_set)} Test: {len(test_set)}\u0026#34;) print(f\u0026#34;Classes: {\u0026#39;, \u0026#39;.join(classes)}\u0026#34;) Train: 480 Val: 120 Test: 598 Classes: Abyssinian, american_bulldog, american_pit_bull, basset_hound, beagle, Bengal Explore the Data # 224×224 RGB images at full resolution. The Oxford-IIIT Pet dataset provides real-world photographs with varied backgrounds, poses, and lighting, a much harder benchmark than CIFAR-10.\ntemp_set = torchvision.datasets.OxfordIIITPet( root=DATA_DIR, split=\u0026#34;trainval\u0026#34;, target_types=\u0026#34;category\u0026#34;, download=False, transform=transforms.Compose([ transforms.Resize((IMG_SIZE, IMG_SIZE)), transforms.ToTensor(), ]), ) temp_labels = [temp_set[i][1] for i in range(len(temp_set))] temp_idx = [i for i, lbl in enumerate(temp_labels) if lbl \u0026lt; NUM_CLASSES] temp = Subset(temp_set, temp_idx) temp_loader = DataLoader(temp, batch_size=16, shuffle=True) images, labels = next(iter(temp_loader)) fig, axes = plt.subplots(2, 8, figsize=(12, 5)) for ax, img, lbl in zip(axes.flat, images, labels): ax.imshow(img.permute(1, 2, 0)) ax.set_title(classes[lbl], fontsize=8) ax.axis(\u0026#34;off\u0026#34;) plt.tight_layout() plt.show() train_targets = [filtered[i][1] for i in indices[:split]] counts = [train_targets.count(i) for i in range(NUM_CLASSES)] fig, ax = plt.subplots(figsize=(8, 4)) bars = ax.bar(classes, counts) ax.bar_label(bars, fontsize=9) ax.set_ylabel(\u0026#34;Samples\u0026#34;) ax.set_title(\u0026#34;Training Set Class Distribution\u0026#34;) plt.xticks(rotation=30, ha=\u0026#34;right\u0026#34;) plt.tight_layout() plt.show() Shared Utilities # These functions are reused by all our models. train_epoch does one pass with mixed precision. 
evaluate runs validation or testing. print_architecture displays every layer with its output shape and parameter count. We define the loss once, shared by every model.\ndef train_epoch(model, loader, criterion, optimizer, scaler): model.train() running_loss, correct, total = 0.0, 0, 0 for inputs, targets in loader: inputs, targets = inputs.to(device), targets.to(device) optimizer.zero_grad() with autocast(): outputs = model(inputs) loss = criterion(outputs, targets) scaler.scale(loss).backward() scaler.step(optimizer) scaler.update() running_loss += loss.item() _, pred = outputs.max(1) total += targets.size(0) correct += pred.eq(targets).sum().item() return running_loss / len(loader), 100.0 * correct / total @torch.no_grad() def evaluate(model, loader, criterion): model.eval() running_loss, correct, total = 0.0, 0, 0 for inputs, targets in loader: inputs, targets = inputs.to(device), targets.to(device) with autocast(): outputs = model(inputs) loss = criterion(outputs, targets) running_loss += loss.item() _, pred = outputs.max(1) total += targets.size(0) correct += pred.eq(targets).sum().item() return running_loss / len(loader), 100.0 * correct / total def print_architecture(model, input_size=(1, 3, IMG_SIZE, IMG_SIZE)): x = torch.randn(*input_size).to(device) model = model.to(device) model.eval() print(f\u0026#34;{\u0026#39;Layer\u0026#39;:\u0026lt;35} {\u0026#39;Output Shape\u0026#39;:\u0026lt;25} {\u0026#39;Params\u0026#39;:\u0026gt;10}\u0026#34;) print(\u0026#34;=\u0026#34; * 73) hooks = [] def make_hook(name): def hook(module, inp, out): params = sum(p.numel() for p in module.parameters()) shape = str(list(out.shape)) if not isinstance(out, (list, tuple)) else str([list(o.shape) for o in out]) if params \u0026gt; 0 or isinstance(module, (nn.ReLU, nn.MaxPool2d, nn.AdaptiveAvgPool2d, nn.Dropout, nn.Flatten)): print(f\u0026#34;{name:\u0026lt;35} {shape:\u0026lt;25} {params:\u0026gt;10,}\u0026#34;) return hook for name, m in model.named_modules(): if not name: 
continue hooks.append(m.register_forward_hook(make_hook(name))) with torch.no_grad(): _ = model(x) for h in hooks: h.remove() total = sum(p.numel() for p in model.parameters()) print(\u0026#34;=\u0026#34; * 73) print(f\u0026#34;{\u0026#39;Total\u0026#39;:\u0026lt;35} {\u0026#39;\u0026#39;:\u0026lt;25} {total:\u0026gt;10,}\u0026#34;) return total criterion = nn.CrossEntropyLoss() Part 1: Plain CNN # A basic stack of Conv2d + ReLU + MaxPool2d. No BatchNorm, no residual connections, no Dropout. This is our baseline.\nclass PlainCNN(nn.Module): def __init__(self, num_classes=NUM_CLASSES): super().__init__() self.features = nn.Sequential( nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2), nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2), nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2), nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2), ) self.pool = nn.AdaptiveAvgPool2d((1, 1)) self.fc = nn.Linear(256, num_classes) def forward(self, x): x = self.features(x) x = self.pool(x) x = x.view(x.size(0), -1) return self.fc(x) plain = PlainCNN() print_architecture(plain) Layer Output Shape Params ========================================================================= features.0 [1, 32, 224, 224] 896 features.1 [1, 32, 224, 224] 0 features.2 [1, 32, 112, 112] 0 features.3 [1, 64, 112, 112] 18,496 features.4 [1, 64, 112, 112] 0 features.5 [1, 64, 56, 56] 0 features.6 [1, 128, 56, 56] 73,856 features.7 [1, 128, 56, 56] 0 features.8 [1, 128, 28, 28] 0 features.9 [1, 256, 28, 28] 295,168 features.10 [1, 256, 28, 28] 0 features.11 [1, 256, 14, 14] 0 features [1, 256, 14, 14] 388,416 pool [1, 256, 1, 1] 0 fc [1, 6] 1,542 ========================================================================= Total 389,958 389958 Channels grow (32→64→128→256) while spatial size shrinks (224→112→56→28→14). 
The feature pyramid in its simplest form.\nplain = PlainCNN().to(device) opt = torch.optim.AdamW(plain.parameters(), lr=1e-3, weight_decay=1e-4) sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=20) scaler = GradScaler() epochs, patience = 20, 10 best_loss, epochs_no_improve = float(\u0026#34;inf\u0026#34;), 0 best_acc = 0.0 print(f\u0026#34;PlainCNN, {sum(p.numel() for p in plain.parameters()):,} parameters\\n\u0026#34;) for epoch in range(1, epochs + 1): tl, ta = train_epoch(plain, train_loader, criterion, opt, scaler) vl, va = evaluate(plain, val_loader, criterion) sched.step() if va \u0026gt; best_acc: best_acc = va improved = \u0026#34;\u0026#34; if vl \u0026lt; best_loss: best_loss = vl epochs_no_improve = 0 improved = \u0026#34; *\u0026#34; else: epochs_no_improve += 1 print(f\u0026#34;Epoch {epoch:2d} | Train Acc {ta:5.2f}% | Val Acc {va:5.2f}%{improved}\u0026#34;) if epochs_no_improve \u0026gt;= patience: print(f\u0026#34;Early stopping at epoch {epoch}\u0026#34;) break test_loss, plain_acc = evaluate(plain, test_loader, criterion) print(f\u0026#34;\\nPlainCNN test accuracy: {plain_acc:.2f}% (best val: {best_acc:.1f}%)\u0026#34;) PlainCNN, 389,958 parameters /tmp/ipykernel_26143/3925818616.py:4: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. scaler = GradScaler() /tmp/ipykernel_26143/2983837481.py:7: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(): /tmp/ipykernel_26143/2983837481.py:25: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
with autocast(): Epoch 1 | Train Acc 18.12% | Val Acc 15.83% * Epoch 2 | Train Acc 22.29% | Val Acc 15.83% Epoch 3 | Train Acc 24.79% | Val Acc 18.33% * Epoch 4 | Train Acc 24.79% | Val Acc 20.83% * Epoch 5 | Train Acc 27.92% | Val Acc 22.50% * Epoch 6 | Train Acc 30.00% | Val Acc 25.00% Epoch 7 | Train Acc 26.88% | Val Acc 23.33% Epoch 8 | Train Acc 31.04% | Val Acc 25.00% Epoch 9 | Train Acc 33.33% | Val Acc 25.00% Epoch 10 | Train Acc 34.17% | Val Acc 26.67% Epoch 11 | Train Acc 31.88% | Val Acc 27.50% * Epoch 12 | Train Acc 33.75% | Val Acc 26.67% Epoch 13 | Train Acc 34.38% | Val Acc 24.17% Epoch 14 | Train Acc 35.62% | Val Acc 25.00% Epoch 15 | Train Acc 36.88% | Val Acc 25.83% * Epoch 16 | Train Acc 38.12% | Val Acc 25.00% * Epoch 17 | Train Acc 36.67% | Val Acc 25.00% * Epoch 18 | Train Acc 37.08% | Val Acc 26.67% Epoch 19 | Train Acc 38.54% | Val Acc 25.00% Epoch 20 | Train Acc 37.92% | Val Acc 26.67% PlainCNN test accuracy: 25.08% (best val: 27.5%) Part 2: PlainCNN + Dropout # Same architecture, adding nn.Dropout(0.3) before the classifier. 
Dropout randomly zeros 30% of the feature vector during training, forcing the model to rely on distributed representations.\nclass PlainCNNDropout(nn.Module): def __init__(self, num_classes=NUM_CLASSES): super().__init__() self.features = nn.Sequential( nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2), nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2), nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2), nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2), ) self.pool = nn.AdaptiveAvgPool2d((1, 1)) self.dropout = nn.Dropout(0.3) self.fc = nn.Linear(256, num_classes) def forward(self, x): x = self.features(x) x = self.pool(x) x = x.view(x.size(0), -1) x = self.dropout(x) x = self.fc(x) return x dropper = PlainCNNDropout() print_architecture(dropper) Layer Output Shape Params ========================================================================= features.0 [1, 32, 224, 224] 896 features.1 [1, 32, 224, 224] 0 features.2 [1, 32, 112, 112] 0 features.3 [1, 64, 112, 112] 18,496 features.4 [1, 64, 112, 112] 0 features.5 [1, 64, 56, 56] 0 features.6 [1, 128, 56, 56] 73,856 features.7 [1, 128, 56, 56] 0 features.8 [1, 128, 28, 28] 0 features.9 [1, 256, 28, 28] 295,168 features.10 [1, 256, 28, 28] 0 features.11 [1, 256, 14, 14] 0 features [1, 256, 14, 14] 388,416 pool [1, 256, 1, 1] 0 dropout [1, 256] 0 fc [1, 6] 1,542 ========================================================================= Total 389,958 389958 Dropout adds zero parameters, it is a free regularizer.\ndropper = PlainCNNDropout().to(device) opt = torch.optim.AdamW(dropper.parameters(), lr=1e-3, weight_decay=1e-4) sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=20) scaler = GradScaler() best_loss, epochs_no_improve = float(\u0026#34;inf\u0026#34;), 0 best_acc = 0.0 print(f\u0026#34;PlainCNN+Dropout, {sum(p.numel() for p in dropper.parameters()):,} parameters\\n\u0026#34;) for epoch in range(1, 21): tl, ta = train_epoch(dropper, train_loader, criterion, 
opt, scaler) vl, va = evaluate(dropper, val_loader, criterion) sched.step() if va \u0026gt; best_acc: best_acc = va improved = \u0026#34;\u0026#34; if vl \u0026lt; best_loss: best_loss = vl epochs_no_improve = 0 improved = \u0026#34; *\u0026#34; else: epochs_no_improve += 1 print(f\u0026#34;Epoch {epoch:2d} | Train Acc {ta:5.2f}% | Val Acc {va:5.2f}%{improved}\u0026#34;) if epochs_no_improve \u0026gt;= 10: print(f\u0026#34;Early stopping at epoch {epoch}\u0026#34;) break test_loss, drop_acc = evaluate(dropper, test_loader, criterion) print(f\u0026#34;\\nPlainCNN+Dropout test accuracy: {drop_acc:.2f}% (best val: {best_acc:.1f}%)\u0026#34;) print(f\u0026#34;Improvement over PlainCNN: +{drop_acc - plain_acc:.1f}%\u0026#34;) PlainCNN+Dropout, 389,958 parameters /tmp/ipykernel_26143/2963331684.py:4: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. scaler = GradScaler() /tmp/ipykernel_26143/2983837481.py:7: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(): Epoch 1 | Train Acc 18.12% | Val Acc 15.00% * /tmp/ipykernel_26143/2983837481.py:25: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
with autocast(): Epoch 2 | Train Acc 19.79% | Val Acc 15.83% Epoch 3 | Train Acc 21.04% | Val Acc 15.00% * Epoch 4 | Train Acc 24.79% | Val Acc 19.17% * Epoch 5 | Train Acc 27.29% | Val Acc 23.33% * Epoch 6 | Train Acc 26.04% | Val Acc 20.83% Epoch 7 | Train Acc 30.21% | Val Acc 24.17% * Epoch 8 | Train Acc 26.67% | Val Acc 25.83% Epoch 9 | Train Acc 32.92% | Val Acc 27.50% * Epoch 10 | Train Acc 28.75% | Val Acc 26.67% Epoch 11 | Train Acc 34.17% | Val Acc 27.50% Epoch 12 | Train Acc 31.88% | Val Acc 30.00% * Epoch 13 | Train Acc 29.79% | Val Acc 25.00% Epoch 14 | Train Acc 33.75% | Val Acc 30.83% Epoch 15 | Train Acc 32.08% | Val Acc 29.17% Epoch 16 | Train Acc 35.21% | Val Acc 25.00% Epoch 17 | Train Acc 35.62% | Val Acc 28.33% Epoch 18 | Train Acc 38.33% | Val Acc 29.17% Epoch 19 | Train Acc 34.38% | Val Acc 30.00% Epoch 20 | Train Acc 36.25% | Val Acc 29.17% PlainCNN+Dropout test accuracy: 23.58% (best val: 30.8%) Improvement over PlainCNN: +-1.5% Part 3: Adding BatchNorm # Add BatchNorm2d after every convolution and remove the bias (BatchNorm provides its own shift). 
This stabilizes training and typically gives a solid accuracy boost.\nclass CNNBN(nn.Module): def __init__(self, num_classes=NUM_CLASSES): super().__init__() self.features = nn.Sequential( nn.Conv2d(3, 32, 3, padding=1, bias=False), nn.BatchNorm2d(32), nn.ReLU(), nn.MaxPool2d(2), nn.Conv2d(32, 64, 3, padding=1, bias=False), nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(2), nn.Conv2d(64, 128, 3, padding=1, bias=False), nn.BatchNorm2d(128), nn.ReLU(), nn.MaxPool2d(2), nn.Conv2d(128, 256, 3, padding=1, bias=False), nn.BatchNorm2d(256), nn.ReLU(), nn.MaxPool2d(2), ) self.pool = nn.AdaptiveAvgPool2d((1, 1)) self.fc = nn.Linear(256, num_classes) def forward(self, x): x = self.features(x) x = self.pool(x) x = x.view(x.size(0), -1) return self.fc(x) cnn_bn = CNNBN() print_architecture(cnn_bn) Layer Output Shape Params ========================================================================= features.0 [1, 32, 224, 224] 864 features.1 [1, 32, 224, 224] 64 features.2 [1, 32, 224, 224] 0 features.3 [1, 32, 112, 112] 0 features.4 [1, 64, 112, 112] 18,432 features.5 [1, 64, 112, 112] 128 features.6 [1, 64, 112, 112] 0 features.7 [1, 64, 56, 56] 0 features.8 [1, 128, 56, 56] 73,728 features.9 [1, 128, 56, 56] 256 features.10 [1, 128, 56, 56] 0 features.11 [1, 128, 28, 28] 0 features.12 [1, 256, 28, 28] 294,912 features.13 [1, 256, 28, 28] 512 features.14 [1, 256, 28, 28] 0 features.15 [1, 256, 14, 14] 0 features [1, 256, 14, 14] 388,896 pool [1, 256, 1, 1] 0 fc [1, 6] 1,542 ========================================================================= Total 390,438 390438 cnn_bn = CNNBN().to(device) opt = torch.optim.AdamW(cnn_bn.parameters(), lr=1e-3, weight_decay=1e-4) sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=20) scaler = GradScaler() best_loss, epochs_no_improve = float(\u0026#34;inf\u0026#34;), 0 best_acc = 0.0 print(f\u0026#34;CNN+BN, {sum(p.numel() for p in cnn_bn.parameters()):,} parameters\\n\u0026#34;) for epoch in range(1, 21): tl, ta = 
train_epoch(cnn_bn, train_loader, criterion, opt, scaler) vl, va = evaluate(cnn_bn, val_loader, criterion) sched.step() if va \u0026gt; best_acc: best_acc = va improved = \u0026#34;\u0026#34; if vl \u0026lt; best_loss: best_loss = vl epochs_no_improve = 0 improved = \u0026#34; *\u0026#34; else: epochs_no_improve += 1 print(f\u0026#34;Epoch {epoch:2d} | Train Acc {ta:5.2f}% | Val Acc {va:5.2f}%{improved}\u0026#34;) if epochs_no_improve \u0026gt;= 10: print(f\u0026#34;Early stopping at epoch {epoch}\u0026#34;) break test_loss, bn_acc = evaluate(cnn_bn, test_loader, criterion) print(f\u0026#34;\\nCNN+BN test accuracy: {bn_acc:.2f}%\u0026#34;) print(f\u0026#34;Improvement over PlainCNN: +{bn_acc - plain_acc:.1f}%\u0026#34;) CNN+BN, 390,438 parameters /tmp/ipykernel_26143/4188736977.py:4: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. scaler = GradScaler() /tmp/ipykernel_26143/2983837481.py:7: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(): Epoch 1 | Train Acc 22.71% | Val Acc 24.17% * /tmp/ipykernel_26143/2983837481.py:25: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
with autocast(): Epoch 2 | Train Acc 29.17% | Val Acc 17.50% Epoch 3 | Train Acc 35.21% | Val Acc 14.17% Epoch 4 | Train Acc 36.88% | Val Acc 20.00% Epoch 5 | Train Acc 38.75% | Val Acc 23.33% * Epoch 6 | Train Acc 39.17% | Val Acc 19.17% Epoch 7 | Train Acc 45.83% | Val Acc 20.83% Epoch 8 | Train Acc 43.96% | Val Acc 27.50% * Epoch 9 | Train Acc 46.67% | Val Acc 30.83% * Epoch 10 | Train Acc 49.38% | Val Acc 29.17% Epoch 11 | Train Acc 49.79% | Val Acc 35.83% * Epoch 12 | Train Acc 48.96% | Val Acc 38.33% * Epoch 13 | Train Acc 54.17% | Val Acc 33.33% Epoch 14 | Train Acc 57.50% | Val Acc 39.17% * Epoch 15 | Train Acc 57.50% | Val Acc 37.50% Epoch 16 | Train Acc 61.46% | Val Acc 42.50% * Epoch 17 | Train Acc 62.29% | Val Acc 43.33% Epoch 18 | Train Acc 63.96% | Val Acc 38.33% Epoch 19 | Train Acc 63.96% | Val Acc 41.67% * Epoch 20 | Train Acc 63.75% | Val Acc 41.67% CNN+BN test accuracy: 35.45% Improvement over PlainCNN: +10.4% Part 4: DeepResNet, Built from Scratch # Our custom residual CNN. Each block applies Conv→BN→ReLU→Conv→BN, then adds the input back via a skip connection: output = F(x) + x. When dimensions change, we project the shortcut with a 1×1 convolution.\nResidual Block # class ResBlock(nn.Module): def __init__(self, in_c, out_c, stride=1): super().__init__() self.conv1 = nn.Conv2d(in_c, out_c, 3, stride, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(out_c) self.conv2 = nn.Conv2d(out_c, out_c, 3, 1, padding=1, bias=False) self.bn2 = nn.BatchNorm2d(out_c) self.shortcut = nn.Sequential() if stride != 1 or in_c != out_c: self.shortcut = nn.Sequential( nn.Conv2d(in_c, out_c, 1, stride, bias=False), nn.BatchNorm2d(out_c), ) def forward(self, x): out = F.relu(self.bn1(self.conv1(x))) out = self.bn2(self.conv2(out)) out += self.shortcut(x) out = F.relu(out) return out Full Model # Three stages of residual blocks. Channels increase (64→128→256→512) while spatial size decreases (224→112→56→28). 
Global average pooling collapses the spatial dimensions, then dropout + a linear layer produces 6 class logits.\nclass DeepResNet(nn.Module): def __init__(self, num_classes=NUM_CLASSES): super().__init__() self.conv1 = nn.Conv2d(3, 64, 3, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(64) self.layer1 = self._make_block(64, 128, stride=2) # 224→112 self.layer2 = self._make_block(128, 256, stride=2) # 112→56 self.layer3 = self._make_block(256, 512, stride=2) # 56→28 self.pool = nn.AdaptiveAvgPool2d((1, 1)) self.dropout = nn.Dropout(0.4) self.fc = nn.Linear(512, num_classes) def _make_block(self, in_c, out_c, stride): return nn.Sequential( ResBlock(in_c, out_c, stride), ResBlock(out_c, out_c, 1), ) def forward(self, x): x = F.relu(self.bn1(self.conv1(x))) x = self.layer1(x) x = self.layer2(x) x = self.layer3(x) x = self.pool(x) x = x.view(x.size(0), -1) x = self.dropout(x) x = self.fc(x) return x resnet = DeepResNet() print_architecture(resnet) Layer Output Shape Params ========================================================================= conv1 [1, 64, 224, 224] 1,728 bn1 [1, 64, 224, 224] 128 layer1.0.conv1 [1, 128, 112, 112] 73,728 layer1.0.bn1 [1, 128, 112, 112] 256 layer1.0.conv2 [1, 128, 112, 112] 147,456 layer1.0.bn2 [1, 128, 112, 112] 256 layer1.0.shortcut.0 [1, 128, 112, 112] 8,192 layer1.0.shortcut.1 [1, 128, 112, 112] 256 layer1.0.shortcut [1, 128, 112, 112] 8,448 layer1.0 [1, 128, 112, 112] 230,144 layer1.1.conv1 [1, 128, 112, 112] 147,456 layer1.1.bn1 [1, 128, 112, 112] 256 layer1.1.conv2 [1, 128, 112, 112] 147,456 layer1.1.bn2 [1, 128, 112, 112] 256 layer1.1 [1, 128, 112, 112] 295,424 layer1 [1, 128, 112, 112] 525,568 layer2.0.conv1 [1, 256, 56, 56] 294,912 layer2.0.bn1 [1, 256, 56, 56] 512 layer2.0.conv2 [1, 256, 56, 56] 589,824 layer2.0.bn2 [1, 256, 56, 56] 512 layer2.0.shortcut.0 [1, 256, 56, 56] 32,768 layer2.0.shortcut.1 [1, 256, 56, 56] 512 layer2.0.shortcut [1, 256, 56, 56] 33,280 layer2.0 [1, 256, 56, 56] 919,040 layer2.1.conv1 [1, 256, 56, 
56] 589,824 layer2.1.bn1 [1, 256, 56, 56] 512 layer2.1.conv2 [1, 256, 56, 56] 589,824 layer2.1.bn2 [1, 256, 56, 56] 512 layer2.1 [1, 256, 56, 56] 1,180,672 layer2 [1, 256, 56, 56] 2,099,712 layer3.0.conv1 [1, 512, 28, 28] 1,179,648 layer3.0.bn1 [1, 512, 28, 28] 1,024 layer3.0.conv2 [1, 512, 28, 28] 2,359,296 layer3.0.bn2 [1, 512, 28, 28] 1,024 layer3.0.shortcut.0 [1, 512, 28, 28] 131,072 layer3.0.shortcut.1 [1, 512, 28, 28] 1,024 layer3.0.shortcut [1, 512, 28, 28] 132,096 layer3.0 [1, 512, 28, 28] 3,673,088 layer3.1.conv1 [1, 512, 28, 28] 2,359,296 layer3.1.bn1 [1, 512, 28, 28] 1,024 layer3.1.conv2 [1, 512, 28, 28] 2,359,296 layer3.1.bn2 [1, 512, 28, 28] 1,024 layer3.1 [1, 512, 28, 28] 4,720,640 layer3 [1, 512, 28, 28] 8,393,728 pool [1, 512, 1, 1] 0 dropout [1, 512] 0 fc [1, 6] 3,078 ========================================================================= Total 11,023,942 11023942 Forward Pass, Shape Trace # x = torch.randn(4, 3, IMG_SIZE, IMG_SIZE).to(device) resnet = DeepResNet().to(device) resnet.eval() with torch.no_grad(): y = resnet.conv1(x) print(f\u0026#34;After initial conv: {list(y.shape)}\u0026#34;) y = F.relu(resnet.bn1(y)) print(f\u0026#34;After BN + ReLU: {list(y.shape)}\u0026#34;) y = resnet.layer1(y) print(f\u0026#34;After ResBlock 1: {list(y.shape)} (64→128, 224→112)\u0026#34;) y = resnet.layer2(y) print(f\u0026#34;After ResBlock 2: {list(y.shape)} (128→256, 112→56)\u0026#34;) y = resnet.layer3(y) print(f\u0026#34;After ResBlock 3: {list(y.shape)} (256→512, 56→28)\u0026#34;) y = resnet.pool(y) print(f\u0026#34;After GlobalAvgPool: {list(y.shape)}\u0026#34;) y = y.view(y.size(0), -1) print(f\u0026#34;After flatten: {list(y.shape)}\u0026#34;) y = resnet.dropout(y) logits = resnet.fc(y) print(f\u0026#34;After Linear: {list(logits.shape)} (6 class logits)\u0026#34;) After initial conv: [4, 64, 224, 224] After BN + ReLU: [4, 64, 224, 224] After ResBlock 1: [4, 128, 112, 112] (64→128, 224→112) After ResBlock 2: [4, 256, 56, 56] (128→256, 112→56) After 
ResBlock 3: [4, 512, 28, 28] (256→512, 56→28) After GlobalAvgPool: [4, 512, 1, 1] After flatten: [4, 512] After Linear: [4, 6] (6 class logits) Training with Early Stopping # Up to 30 epochs with early stopping (patience = 5 epochs on validation loss). The * marks when validation loss improved.\nresnet = DeepResNet().to(device) optimizer = torch.optim.AdamW(resnet.parameters(), lr=1e-3, weight_decay=1e-4) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=20) scaler = GradScaler() epochs = 30 patience = 5 history = {\u0026#34;train_loss\u0026#34;: [], \u0026#34;train_acc\u0026#34;: [], \u0026#34;val_loss\u0026#34;: [], \u0026#34;val_acc\u0026#34;: []} best_acc = 0.0 best_loss = float(\u0026#34;inf\u0026#34;) epochs_no_improve = 0 start = time.time() for epoch in range(1, epochs + 1): train_loss, train_acc = train_epoch(resnet, train_loader, criterion, optimizer, scaler) val_loss, val_acc = evaluate(resnet, val_loader, criterion) scheduler.step() history[\u0026#34;train_loss\u0026#34;].append(train_loss) history[\u0026#34;train_acc\u0026#34;].append(train_acc) history[\u0026#34;val_loss\u0026#34;].append(val_loss) history[\u0026#34;val_acc\u0026#34;].append(val_acc) if val_acc \u0026gt; best_acc: best_acc = val_acc torch.save(resnet.state_dict(), \u0026#34;best_model.pt\u0026#34;) improved = \u0026#34;\u0026#34; if val_loss \u0026lt; best_loss: best_loss = val_loss epochs_no_improve = 0 improved = \u0026#34; *\u0026#34; else: epochs_no_improve += 1 lr_now = optimizer.param_groups[0][\u0026#34;lr\u0026#34;] print(f\u0026#34;Epoch {epoch:2d} | Train Loss {train_loss:.4f} Acc {train_acc:5.2f}% | \u0026#34; f\u0026#34;Val Loss {val_loss:.4f} Acc {val_acc:5.2f}% | LR {lr_now:.2e}{improved}\u0026#34;) if epochs_no_improve \u0026gt;= patience: print(f\u0026#34;\\nEarly stopping at epoch {epoch}\u0026#34;) break elapsed = time.time() - start print(f\u0026#34;\\nTraining: {elapsed/60:.1f} min | Best val loss: {best_loss:.4f} | Best val acc: 
{best_acc:.2f}%\u0026#34;) /tmp/ipykernel_26143/577991586.py:4: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. scaler = GradScaler() /tmp/ipykernel_26143/2983837481.py:7: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(): /tmp/ipykernel_26143/2983837481.py:25: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(): Epoch 1 | Train Loss 2.0111 Acc 18.75% | Val Loss 2.0020 Acc 15.00% | LR 9.94e-04 * Epoch 2 | Train Loss 1.7888 Acc 22.29% | Val Loss 2.8056 Acc 13.33% | LR 9.76e-04 Epoch 3 | Train Loss 1.7315 Acc 25.21% | Val Loss 1.8803 Acc 17.50% | LR 9.46e-04 * Epoch 4 | Train Loss 1.7058 Acc 30.21% | Val Loss 1.8006 Acc 19.17% | LR 9.05e-04 * Epoch 5 | Train Loss 1.6532 Acc 33.33% | Val Loss 1.9391 Acc 17.50% | LR 8.54e-04 Epoch 6 | Train Loss 1.6192 Acc 33.96% | Val Loss 1.8679 Acc 22.50% | LR 7.94e-04 Epoch 7 | Train Loss 1.5996 Acc 38.75% | Val Loss 1.8615 Acc 23.33% | LR 7.27e-04 Epoch 8 | Train Loss 1.5763 Acc 33.33% | Val Loss 1.8387 Acc 27.50% | LR 6.55e-04 Epoch 9 | Train Loss 1.5284 Acc 40.62% | Val Loss 1.7853 Acc 30.00% | LR 5.78e-04 * Epoch 10 | Train Loss 1.5364 Acc 38.96% | Val Loss 1.7848 Acc 29.17% | LR 5.00e-04 * Epoch 11 | Train Loss 1.4826 Acc 40.83% | Val Loss 1.9205 Acc 31.67% | LR 4.22e-04 Epoch 12 | Train Loss 1.4651 Acc 44.58% | Val Loss 1.9092 Acc 28.33% | LR 3.45e-04 Epoch 13 | Train Loss 1.4363 Acc 45.21% | Val Loss 1.8635 Acc 24.17% | LR 2.73e-04 Epoch 14 | Train Loss 1.3766 Acc 45.42% | Val Loss 1.8488 Acc 30.83% | LR 2.06e-04 Epoch 15 | Train Loss 1.3403 Acc 47.71% | Val Loss 1.8700 Acc 30.83% | LR 1.46e-04 Early stopping at epoch 15 Training: 3.5 min | Best val loss: 1.7848 | Best val acc: 31.67% Learning Curves # fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4)) 
ax1.plot(history[\u0026#34;train_loss\u0026#34;], label=\u0026#34;Train Loss\u0026#34;) ax1.plot(history[\u0026#34;val_loss\u0026#34;], label=\u0026#34;Val Loss\u0026#34;) ax1.set_xlabel(\u0026#34;Epoch\u0026#34;) ax1.set_ylabel(\u0026#34;Loss\u0026#34;) ax1.set_title(\u0026#34;Loss Curves\u0026#34;) ax1.legend() ax1.grid(alpha=0.3) ax2.plot(history[\u0026#34;train_acc\u0026#34;], label=\u0026#34;Train Acc\u0026#34;) ax2.plot(history[\u0026#34;val_acc\u0026#34;], label=\u0026#34;Val Acc\u0026#34;) ax2.set_xlabel(\u0026#34;Epoch\u0026#34;) ax2.set_ylabel(\u0026#34;Accuracy (%)\u0026#34;) ax2.set_title(\u0026#34;Accuracy Curves\u0026#34;) ax2.legend() ax2.grid(alpha=0.3) plt.tight_layout() plt.show() Test Evaluation # resnet.load_state_dict(torch.load(\u0026#34;best_model.pt\u0026#34;, weights_only=True)) test_loss, resnet_acc = evaluate(resnet, test_loader, criterion) print(f\u0026#34;DeepResNet Test Accuracy: {resnet_acc:.2f}%\u0026#34;) /tmp/ipykernel_26143/2983837481.py:25: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(): DeepResNet Test Accuracy: 22.91% all_preds, all_labels = [], [] resnet.eval() with torch.no_grad(): for inputs, targets in test_loader: inputs = inputs.to(device) with autocast(): outputs = resnet(inputs) _, preds = outputs.max(1) all_preds.extend(preds.cpu().numpy()) all_labels.extend(targets.numpy()) print(classification_report(all_labels, all_preds, target_names=classes, digits=3)) /tmp/ipykernel_26143/1124227186.py:6: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
with autocast(): precision recall f1-score support Abyssinian 0.531 0.265 0.354 98 american_bulldog 0.284 0.230 0.254 100 american_pit_bull 0.180 0.240 0.206 100 basset_hound 0.176 0.410 0.246 100 beagle 0.105 0.020 0.034 100 Bengal 0.253 0.210 0.230 100 accuracy 0.229 598 macro avg 0.255 0.229 0.221 598 weighted avg 0.254 0.229 0.220 598 Confusion Matrix # cm = confusion_matrix(all_labels, all_preds) cm_norm = cm.astype(\u0026#34;float\u0026#34;) / cm.sum(axis=1, keepdims=True) fig, ax = plt.subplots(figsize=(7, 6)) im = ax.imshow(cm_norm, cmap=\u0026#34;Blues\u0026#34;) ax.set_xticks(range(NUM_CLASSES)) ax.set_yticks(range(NUM_CLASSES)) ax.set_xticklabels(classes, rotation=30, ha=\u0026#34;right\u0026#34;, fontsize=8) ax.set_yticklabels(classes, fontsize=8) ax.set_xlabel(\u0026#34;Predicted\u0026#34;) ax.set_ylabel(\u0026#34;True\u0026#34;) ax.set_title(\u0026#34;Confusion Matrix (Normalized)\u0026#34;) for i in range(NUM_CLASSES): for j in range(NUM_CLASSES): val = cm[i, j] color = \u0026#34;white\u0026#34; if cm_norm[i, j] \u0026gt; 0.5 else \u0026#34;black\u0026#34; ax.text(j, i, val, ha=\u0026#34;center\u0026#34;, va=\u0026#34;center\u0026#34;, color=color, fontsize=9) plt.tight_layout() plt.show() Misclassified Examples # # Build raw test set without normalization for display test_raw = torchvision.datasets.OxfordIIITPet( root=DATA_DIR, split=\u0026#34;test\u0026#34;, target_types=\u0026#34;category\u0026#34;, download=False, transform=transforms.Compose([ transforms.Resize((IMG_SIZE, IMG_SIZE)), transforms.ToTensor(), ]), ) test_raw_labels = [test_raw[i][1] for i in range(len(test_raw))] test_raw_idx = [i for i, lbl in enumerate(test_raw_labels) if lbl \u0026lt; NUM_CLASSES] test_raw = Subset(test_raw, test_raw_idx) raw_loader = DataLoader(test_raw, batch_size=16, shuffle=False) errors = [] idx = 0 with torch.no_grad(): for images, _ in raw_loader: inp = transforms.Normalize(imagenet_mean, imagenet_std)(images).to(device) with autocast(): outputs = 
resnet(inp) probs = F.softmax(outputs, dim=1) conf, preds = probs.max(1) for i in range(len(preds)): if preds[i].item() != all_labels[idx]: errors.append((idx, images[i], all_labels[idx], preds[i].item(), conf[i].item())) idx += 1 errors.sort(key=lambda e: e[4], reverse=True) top_errors = errors[:12] fig, axes = plt.subplots(3, 4, figsize=(12, 9)) for ax, (idx, img, true_lbl, pred_lbl, conf) in zip(axes.flat, top_errors): ax.imshow(img.permute(1, 2, 0)) ax.set_title(f\u0026#34;True: {classes[true_lbl]}\\nPred: {classes[pred_lbl]} ({conf:.1%})\u0026#34;, color=\u0026#34;red\u0026#34;, fontsize=8) ax.axis(\u0026#34;off\u0026#34;) plt.suptitle(\u0026#34;Most Confident Misclassifications\u0026#34;, fontsize=12, y=1.02) plt.tight_layout() plt.show() /tmp/ipykernel_26143/2984140589.py:19: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(): Visualizing Learned Filters # filters = resnet.conv1.weight.data.cpu() fig, axes = plt.subplots(8, 8, figsize=(8, 8)) for i, ax in enumerate(axes.flat): if i \u0026lt; filters.size(0): f = filters[i] f = (f - f.min()) / (f.max() - f.min() + 1e-8) ax.imshow(f.permute(1, 2, 0)) ax.axis(\u0026#34;off\u0026#34;) plt.suptitle(\u0026#34;First Layer Filters (64 × 3×3×3)\u0026#34;, fontsize=14, y=1.01) plt.tight_layout() plt.show() sample_img, sample_label = test_set[0] sample_input = sample_img.unsqueeze(0).to(device) with torch.no_grad(): conv1_out = F.relu(resnet.bn1(resnet.conv1(sample_input))) activations = conv1_out[0].cpu() fig, axes = plt.subplots(8, 8, figsize=(10, 10)) for i, ax in enumerate(axes.flat): if i \u0026lt; activations.size(0): ax.imshow(activations[i].numpy(), cmap=\u0026#34;viridis\u0026#34;) ax.axis(\u0026#34;off\u0026#34;) plt.suptitle(f\u0026#34;Activations, {classes[sample_label]} (64 channels)\u0026#34;, fontsize=14, y=1.01) plt.tight_layout() plt.show() Part 5: Transfer Learning with MobileNetV3-Small # All models so far were 
built from scratch. Now we use a MobileNetV3-Small pretrained on ImageNet (1.4M images, 1000 classes). Its convolutional base already knows general visual features. At 224×224, the input resolution matches what the model was trained on, so all pretrained weights are preserved and no adaptation is needed beyond replacing the classifier head.\nfrom torchvision import models transfer = models.mobilenet_v3_small(weights=\u0026#39;IMAGENET1K_V1\u0026#39;) transfer.classifier[3] = nn.Linear(1024, NUM_CLASSES) transfer = transfer.to(device) print(f\u0026#34;MobileNetV3-Small, {sum(p.numel() for p in transfer.parameters()):,} params\u0026#34;) print(\u0026#34;Pretrained backbone preserved, only classifier replaced\u0026#34;) MobileNetV3-Small, 1,524,006 params Pretrained backbone preserved, only classifier replaced Feature Extraction # Freeze the backbone, train only the new classifier head for 10 epochs.\nfor param in transfer.parameters(): param.requires_grad = False transfer.classifier[3].weight.requires_grad = True transfer.classifier[3].bias.requires_grad = True opt = torch.optim.AdamW(transfer.classifier.parameters(), lr=1e-3) sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=10) scaler = GradScaler() print(\u0026#34;Feature extraction, training only the head (10 epochs)\\n\u0026#34;) for epoch in range(1, 11): tl, ta = train_epoch(transfer, train_loader, criterion, opt, scaler) vl, va = evaluate(transfer, val_loader, criterion) sched.step() print(f\u0026#34;Epoch {epoch} | Train Acc {ta:5.2f}% | Val Acc {va:5.2f}%\u0026#34;) test_loss, transfer_acc = evaluate(transfer, test_loader, criterion) print(f\u0026#34;\\nFeature extraction test accuracy: {transfer_acc:.2f}%\u0026#34;) Feature extraction, training only the head (10 epochs) /tmp/ipykernel_26143/2988297993.py:8: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
scaler = GradScaler() /tmp/ipykernel_26143/2983837481.py:7: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(): /tmp/ipykernel_26143/2983837481.py:25: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(): Epoch 1 | Train Acc 32.50% | Val Acc 51.67% Epoch 2 | Train Acc 70.62% | Val Acc 67.50% Epoch 3 | Train Acc 79.79% | Val Acc 74.17% Epoch 4 | Train Acc 88.33% | Val Acc 75.83% Epoch 5 | Train Acc 89.17% | Val Acc 79.17% Epoch 6 | Train Acc 89.58% | Val Acc 82.50% Epoch 7 | Train Acc 90.21% | Val Acc 84.17% Epoch 8 | Train Acc 90.00% | Val Acc 86.67% Epoch 9 | Train Acc 91.25% | Val Acc 86.67% Epoch 10 | Train Acc 91.04% | Val Acc 85.00% Feature extraction test accuracy: 83.95% Fine-Tuning # Unfreeze the last few blocks and train with a low learning rate for 5 epochs.\nfor param in transfer.parameters(): param.requires_grad = True opt = torch.optim.AdamW([ {\u0026#39;params\u0026#39;: transfer.features[-5:].parameters(), \u0026#39;lr\u0026#39;: 1e-5}, {\u0026#39;params\u0026#39;: transfer.classifier.parameters(), \u0026#39;lr\u0026#39;: 1e-3}, ], weight_decay=1e-4) sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=3) print(\u0026#34;Fine-tuning, unfreezing last 5 blocks (5 epochs)\\n\u0026#34;) for epoch in range(1, 6): tl, ta = train_epoch(transfer, train_loader, criterion, opt, scaler) vl, va = evaluate(transfer, val_loader, criterion) sched.step() print(f\u0026#34;Epoch {epoch} | Train Acc {ta:5.2f}% | Val Acc {va:5.2f}%\u0026#34;) test_loss, finetune_acc = evaluate(transfer, test_loader, criterion) print(f\u0026#34;\\nFine-tuned test accuracy: {finetune_acc:.2f}%\u0026#34;) Fine-tuning, unfreezing last 5 blocks (5 epochs) /tmp/ipykernel_26143/2983837481.py:7: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. 
Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(): Epoch 1 | Train Acc 87.08% | Val Acc 85.00% /tmp/ipykernel_26143/2983837481.py:25: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(): Epoch 2 | Train Acc 92.50% | Val Acc 85.00% Epoch 3 | Train Acc 95.42% | Val Acc 85.83% Epoch 4 | Train Acc 96.46% | Val Acc 84.17% Epoch 5 | Train Acc 96.25% | Val Acc 90.00% Fine-tuned test accuracy: 84.45% Summary # We started with a bare convolutional stack and progressively added architectural improvements, then leveraged pretrained weights, all on the same Oxford-IIIT Pet dataset at native 224×224 resolution.\nmodels = [\u0026#34;PlainCNN\u0026#34;, \u0026#34;+\\nDropout\u0026#34;, \u0026#34;+\\nBatchNorm\u0026#34;, \u0026#34;DeepResNet\\n(from scratch)\u0026#34;, \u0026#34;Transfer\\n(feature ext.)\u0026#34;, \u0026#34;Transfer\\n(fine-tuned)\u0026#34;] accs = [plain_acc, drop_acc, bn_acc, resnet_acc, transfer_acc, finetune_acc] colors = [\u0026#34;#e74c3c\u0026#34;, \u0026#34;#e67e22\u0026#34;, \u0026#34;#f39c12\u0026#34;, \u0026#34;#2ecc71\u0026#34;, \u0026#34;#3498db\u0026#34;, \u0026#34;#9b59b6\u0026#34;] fig, ax = plt.subplots(figsize=(10, 5)) bars = ax.bar(models, accs, color=colors) ax.bar_label(bars, fmt=\u0026#34;%.1f%%\u0026#34;, fontsize=11, fontweight=\u0026#34;bold\u0026#34;) ax.set_ylabel(\u0026#34;Test Accuracy (%)\u0026#34;) ax.set_ylim(0, 100) ax.set_title(\u0026#34;From Scratch to Transfer Learning, Accuracy Progression\u0026#34;) plt.tight_layout() plt.show() Key takeaways:\nDropout is the simplest regularization, zero cost in parameters, consistently improves over the bare baseline.\nBatchNorm gives a larger accuracy boost by stabilizing training across deeper networks.\nResidual connections push accuracy further with skip connections that let gradients flow unimpeded.\nTransfer learning with a pretrained MobileNetV3-Small reaches strong accuracy 
when inputs match the model\u0026rsquo;s native 224×224 resolution.\nFine-tuning squeezes out additional gains by adapting high-level features to the specific pet breeds.\n","date":"30 May 2026","externalUrl":null,"permalink":"/blog/notebook-cnn/","section":"My Blog Articles","summary":"","title":"CNN Image Classification, From Scratch to Transfer Learning","type":"blog"},{"content":"","date":"30 May 2026","externalUrl":null,"permalink":"/categories/computer-vision/","section":"Categories","summary":"","title":"Computer-Vision","type":"categories"},{"content":"","date":"30 May 2026","externalUrl":null,"permalink":"/categories/deep-learning/","section":"Categories","summary":"","title":"Deep-Learning","type":"categories"},{"content":"","date":"30 May 2026","externalUrl":null,"permalink":"/blog/","section":"My Blog Articles","summary":"","title":"My Blog Articles","type":"blog"},{"content":" Conceptual Foundations of Convolutional Neural Networks # Computer vision was the foundational success story of modern deep learning, driving its mainstream adoption between 2011 and 2015. This shift was characterized by a transition from engineered visual features to learned representations, primarily driven by Convolutional Neural Networks (ConvNets or CNNs).\nEarly validation came from specialized benchmarks, such as Dan Ciresan\u0026rsquo;s success in character and traffic sign recognition in 2011, followed by the breakthrough performance of Hinton\u0026rsquo;s group at the 2012 ImageNet Large Scale Visual Recognition Challenge. Despite initial institutional skepticism within the computer vision community, ConvNets became the dominant architectural paradigm by 2016. 
Today, they underpin production systems ranging from consumer image search and optical character recognition (OCR) to autonomous driving, robotics, and medical diagnostics.\nUnderstanding how these networks function requires analyzing how they exploit the structural properties of visual data.\nWhy Dense Layers Fail on Image Data # In a standard densely connected (fully connected) layer, inputs are processed as flattened vectors. For an image, this means reshaping a 2D or 3D grid of pixels into a single 1D array. This operation has two major theoretical drawbacks:\nDestruction of Spatial Topology: Flattening discards the spatial proximity of pixels. A pixel at coordinate $(x, y)$ is mathematically decoupled from its neighbors at $(x+1, y)$, forcing the network to relearn spatial relationships from scratch. Parameter Explosion: Because every input neuron connects to every output neuron, scaling to high-resolution images leads to a prohibitive number of weights, causing severe overfitting and computational bottlenecks. ConvNets solve these issues by preserving the dimensional structure of the input throughout the feature extraction phase.\nCore Characteristics: Local Patterns and Invariance # The fundamental distinction between a dense layer and a convolutional layer lies in how they observe patterns: dense layers learn global configurations across the entire input space, whereas convolutional layers learn local patterns within small, localized 2D windows.\nThis architectural constraint provides two critical mathematical properties:\nTranslation Invariance: Because the same local transformation is applied across the entire image, a visual feature (such as an edge or corner) learned in one quadrant can be recognized anywhere else. This makes ConvNets highly data-efficient; they do not need to see a feature in every possible location to generalize. Spatial Hierarchies: The visual world is naturally hierarchical. 
Early convolutional layers extract low-level, primitive features like edges, lines, and elemental textures. Subsequent layers compose these early signals into mid-level shapes (motifs, corners). The deepest layers aggregate these shapes into abstract, high-level semantic concepts (objects, faces, structures). [Raw Input Pixels] ──\u0026gt; [Edges \u0026amp; Textures] ──\u0026gt; [Shapes \u0026amp; Motifs] ──\u0026gt; [Semantic Objects] Mechanics of the Convolution Operation # Convolutions operate on rank-3 tensors known as feature maps. These tensors possess two spatial axes (height and width) and a depth axis (commonly referred to as channels). For a raw input image, the channel depth corresponds to the color space (e.g., 3 for RGB, 1 for grayscale).\nThe operation proceeds through a sequence of systematic steps:\nA window of a fixed spatial size (typically $3 \\times 3$ or $5 \\times 5$) slides systematically across the input feature map. At each position, it extracts a 3D patch equal to the window size multiplied by the input depth: $\\text{window\\_height} \\times \\text{window\\_width} \\times \\text{input\\_depth}$. This 3D patch undergoes a tensor product with a learned weight matrix\u0026mdash;the convolution kernel. The output vectors calculated at each spatial position are assembled into a new rank-3 tensor: the output feature map. In an output feature map, the depth dimension no longer represents raw colors. Instead, each channel represents a unique filter or response map. A single channel acts as a spatial map indicating where, and how strongly, a specific visual feature activated across the input.\nBorder Effects, Padding, and Strides # The geometry of sliding a window across a grid introduces changes to the spatial dimensions of the output feature map. These are governed by three primary hyper-parameters:\n1.
Border Effects and Padding # When sliding a $3 \\times 3$ window across a $5 \\times 5$ grid, the center of the window can only visit a $3 \\times 3$ sub-grid of valid locations. Consequently, the output map shrinks by two units along each spatial dimension.\nTo prevent this shrinkage and preserve spatial resolution, padding can be applied. Padding appends artificial rows and columns (typically filled with zeros) to the perimeter of the input feature map, allowing the convolution window to center on the true edge pixels of the original image.\nValid Convolution (No Padding): Same Convolution (With Padding): ■ ■ ■ ■ ■ ░ ░ ░ ░ ░ ░ ░ ■ ■ ■ ■ ■ ──\u0026gt; Output Spatial Size ░ ■ ■ ■ ■ ■ ░ ■ ■ ■ ■ ■ Shrinks to 3x3 ░ ■ ■ ■ ■ ■ ░ ──\u0026gt; Output Spatial Size ■ ■ ■ ■ ■ ░ ■ ■ ■ ■ ■ ░ Preserved at 5x5 ■ ■ ■ ■ ■ ░ ■ ■ ■ ■ ■ ░ ░ ░ ░ ░ ░ ░ ░ 2. Strides # The distance between two successive convolution windows is called the stride. While the default stride is usually 1 (moving the window one pixel at a time), a stride greater than 1 results in a strided convolution. This downsamples the output feature map by skipping input positions, effectively reducing the spatial dimensions by a factor roughly equal to the stride value.\nDownsampling via Max Pooling # While strided convolutions are used in specific network architectures, standard classification models primarily rely on max pooling to downsample feature maps.\nMax pooling operates conceptually like a hardcoded, non-linear convolution. It extracts local windows (almost universally $2 \\times 2$ windows with a stride of 2) from the input feature maps and outputs the maximum value for each channel independently. 
This halves both the height and the width of the map.\nDownsampling serves two essential structural purposes:\nBuilding Spatial Hierarchies: By shrinking the feature maps, subsequent convolution layers with the same kernel size ($3 \\times 3$) effectively \u0026ldquo;see\u0026rdquo; a larger percentage of the original input space. Without downsampling, a deep layer would still be restricted to analyzing tiny, isolated pixel neighborhoods, preventing the network from composing global concepts. Information Compression: It drastically reduces the number of coefficients passed to later stages of the network, mitigating the risk of overfitting and lowering the computational overhead. Why Max Pooling Outperforms Average Pooling # An alternative downsampling strategy is average pooling, which computes the mean value of a local patch. However, max pooling generally yields superior results in computer vision tasks.\nFeatures within a network encode the presence or activation of a specific pattern. Taking the average over a spatial neighborhood dilutes strong activation signals with surrounding quiet pixels, washing out vital structural information. Retaining the maximum value preserves the definitive presence of a feature within that region, making the network more robust to subtle spatial distortions.\nBridging the Gap: The Classification Head # A convolutional pipeline transforms raw input pixels into highly abstracted, spatially compact feature maps. However, to perform an operation like 10-way digit classification, these multi-dimensional tensors must be mapped to a discrete probability distribution.\nTo bridge this gap, modern architectures use a Global Average Pooling layer. This layer computes the mean over all spatial positions within each channel.
If the final convolutional feature map has a shape of $(\\text{Height}, \\text{Width}, \\text{Channels})$, Global Average Pooling collapses the spatial dimensions entirely, yielding a 1D vector of length equal to the number of channels. This vector is then fed into a final dense layer with a softmax activation function to produce the class probabilities.\n","date":"22 May 2026","externalUrl":null,"permalink":"/blog/cnn/","section":"My Blog Articles","summary":"","title":"Convolutional Neural Networks","type":"blog"},{"content":"Perhaps you\u0026rsquo;ve heard of vi, vim, and neovim multiple times—how great and efficient they are. And during one of those times, you decide to install it, only to open a document in the terminal that doesn\u0026rsquo;t even look good, you don\u0026rsquo;t know how to type, and worst of all, you don\u0026rsquo;t even know how to exit the editor.\nA common joke goes:\nI use Neovim because I don\u0026rsquo;t know how to close it.\nThere are several ways to approach Neovim, whether by installing the app and tailoring it to your liking, or by installing a pre-configured distribution. Among the different distributions, we have the following:\nLazyVim # NVChad # LunarVim # AstroVim # They all promise the same thing: minimal configuration and an out-of-the-box start. But they all fail at one thing:\nYou don\u0026rsquo;t know how to use Vim, so it doesn\u0026rsquo;t matter if they come pre-configured.\nWhat we want to ensure is that we actually ease the entry into using Neovim without failing.\nWhy Neovim? # It\u0026rsquo;s a valid question. With editors as good as VS Code and its derivatives, Zed, and PyCharm, it seems like using an editor in the terminal makes no sense at all. But let\u0026rsquo;s break it down.\nNeovim is a modal editor and a fork of Vim, meaning it inherited its core features.\nModal means that it has several modes.\nIn a traditional editor, if you press the d key, you type a \u0026ldquo;d\u0026rdquo;.
In Neovim, it depends on which mode you are in. This is the biggest hurdle at first, but once you master it, it\u0026rsquo;s the reason why you\u0026rsquo;ll want to use it forever.\nUltimately, you want Neovim because you\u0026rsquo;re a geek, you love the terminal, efficiency, saving time, using the keyboard, and you adore customization.\nTo a greater or lesser extent, some of these things can also be achieved with other editors, especially if you use their Vim mode.\nHow to start without failing? # The first thing you need to know is that Neovim is a modal editor, and it is vastly different from what you know. So my recommendation is NOT TO USE NEOVIM to learn Neovim.\nSay what?\nExactly. The first step is to stick with your favorite editor and turn it into a modal editor until it makes sense to you, and only then make the jump to Neovim.\nVS Code and derivatives # Use the VSCodeVim extension. VS Code might be the weakest editor to use in Vim mode, but it is enough to learn Vim motions.\nZed # This editor would be my favorite if I didn\u0026rsquo;t use Neovim. You don\u0026rsquo;t need to install any extension to configure it in Vim mode, and this option works beautifully.\nJust open the command palette with ctrl+shift+p and type workspace: toggle vim mode.\nAlternatively, you can edit your global or project configuration file (JSON file) and add:\n\u0026#34;vim_mode\u0026#34;: true PyCharm # This is the one I have the least info on, but I\u0026rsquo;ll leave the official link here.\nInstalling Neovim # Even if you are not going to use it daily yet, install vanilla Neovim so you can use the Tutor mode.\nThe installation depends on your operating system. 
In any case, it\u0026rsquo;s best to check their official website to see how to install it for your system.\nArch and derivatives # sudo pacman -S neovim Ubuntu # sudo apt install neovim Fedora # sudo dnf install neovim macOS # First install Homebrew if you don\u0026rsquo;t have it, and then run:\nbrew install neovim Windows # You can download the binaries directly or use a package manager.\nWinget # winget install Neovim.Neovim Chocolatey # choco install neovim Scoop # scoop bucket add main scoop install neovim Tutor Mode # Once installed, open a terminal and run:\nnvim Right there, run :Tutor (that\u0026rsquo;s a colon followed by the word Tutor with a capital \u0026ldquo;T\u0026rdquo;).\nFollow the tutorial step by step. Don\u0026rsquo;t try to learn everything in one day; simply apply what you learn there to your regular editor. Within two weeks, it will all start making sense.\nModes in Neovim # Don\u0026rsquo;t worry too much about understanding them perfectly right now; these descriptions will be brief. We will use Neovim to learn them even if you don\u0026rsquo;t use it for work just yet.\n1. Normal Mode: Operations Center # This is the default mode. When you open Neovim, you are here. It\u0026rsquo;s not for typing; it\u0026rsquo;s for moving and editing. Think of your fingers not as writers, but as code surgeons.\nHow to use it: You move around using h, j, k, l (instead of the arrow keys) or jump between whole words. The trick: If you press dd, you delete a whole line. If you press u, you undo the last change. All without touching the mouse or reaching for the Backspace key. 2. Insert Mode: The Typewriter # This is the mode you are used to in any other editor. Here, keys actually type text onto the screen.\nHow to enter: From Normal Mode, press the i key (for Insert). How to exit: This is the key to survival: press the Esc key. Returning to Normal Mode should become your reflex action every time you finish typing a sentence. 3.
Visual Mode: The Keyboard \u0026ldquo;Click and Drag\u0026rdquo; # Used to select blocks of text. It\u0026rsquo;s the equivalent of holding down the left mouse click and dragging it across the screen, but done with absolute precision right from the keyboard.\nHow to enter: From Normal Mode, press v. As you move, you\u0026rsquo;ll see the text highlighted. The trick: Once the text is selected, you can press y (for yank) to copy it, or d (for delete) to remove it. 4. Command Mode: The Control Panel # Allows you to execute direct commands to the editor.\nHow to enter: From Normal Mode, type : (colon). The cursor will move to the bottom of the screen. The joke writes itself: To save you type :w (write), to exit you type :q (quit), and to exit without saving the disasters you just made, you type :q!. 5. Terminal Mode: A Terminal Inside Your Editor # Neovim allows you to open a real terminal inside one of its windows. This way, you don\u0026rsquo;t have to keep switching apps to run your tests or spin up your local server. Though personally, I use it sparingly since I manage my workflow with tmux.\nHow to use it: It acts like a regular terminal, but requires a key combination (usually Ctrl-\\ Ctrl-n) to \u0026ldquo;detach\u0026rdquo; from it and return to managing the window in Normal Mode. Installing LazyVim # Requirements: # tree-sitter-cli and a C compiler (recommended to use tree-sitter) Git Neovim \u0026gt;= 0.11.2 A Nerd Font LazyGit (optional) A terminal emulator with true color support: kitty wezterm alacritty ghostty Everything above can be installed using your operating system\u0026rsquo;s package manager. Here are examples for Arch-based systems:\nsudo pacman -S ghostty lazygit neovim tree-sitter-cli git On Arch-based distros, it\u0026rsquo;s highly likely you already have the compiler installed as it\u0026rsquo;s part of base-devel. Similarly, on macOS, it should already be installed. 
Other Linux distros might require an explicit installation:\nUbuntu # sudo apt install build-essential Fedora # sudo dnf groupinstall \u0026#34;Development Tools\u0026#34; Windows well\u0026hellip; Homework assignment.\nIMPORTANT!\nIf you already had Neovim installed, back up your files first.\nMac and Linux # mv ~/.config/nvim{,.bak} Now clone the LazyVim starter repository:\ngit clone https://github.com/LazyVim/starter ~/.config/nvim Remove the .git directory:\nrm -rf ~/.config/nvim/.git Windows # Homework assignment.\nDone! Now just run:\nnvim And now, since you\u0026rsquo;ve gone through the Tutor, you should be able to use it regularly. It\u0026rsquo;s impossible to learn everything at once, so enjoy the journey.\nThe next step is to check out the LazyVim documentation.\n","date":"22 May 2026","externalUrl":null,"permalink":"/blog/lazyvim_tutorial/","section":"My Blog Articles","summary":"","title":"How to adopt Neovim without dying in the attempt","type":"blog"},{"content":"","externalUrl":null,"permalink":"/about/","section":"","summary":"","title":"","type":"page"},{"content":"","externalUrl":null,"permalink":"/authors/","section":"Authors","summary":"","title":"Authors","type":"authors"},{"content":"","externalUrl":null,"permalink":"/series/","section":"Series","summary":"","title":"Series","type":"series"},{"content":"","externalUrl":null,"permalink":"/tags/","section":"Tags","summary":"","title":"Tags","type":"tags"}]