diff --git a/mmp/a4/dataset.py b/mmp/a4/dataset.py
index a46852b..d2cc72b 100644
--- a/mmp/a4/dataset.py
+++ b/mmp/a4/dataset.py
@@ -255,7 +255,7 @@ def draw_positive_boxes(
 def main():
     anchor_grid = get_anchor_grid(
         anchor_widths=[8, 16, 32, 64, 96, 128, 160, 192],
-        aspect_ratios=[1 / 3, 1 / 2, 3 / 5, 2 / 3, 3 / 4, 1, 4 / 3, 5 / 3, 2, 2.5, 3],
+        aspect_ratios=[1 / 2, 2 / 3, 1, 4 / 3, 5 / 3, 2, 2.5, 3],
         num_rows=28,
         num_cols=28,
         scale_factor=8,
diff --git a/mmp/a5/document.pdf b/mmp/a5/document.pdf
new file mode 100644
index 0000000..1d68e21
Binary files /dev/null and b/mmp/a5/document.pdf differ
diff --git a/mmp/a5/document.tex b/mmp/a5/document.tex
new file mode 100644
index 0000000..3ab24f8
--- /dev/null
+++ b/mmp/a5/document.tex
@@ -0,0 +1,82 @@
+\documentclass[11pt,a4paper]{article}
+
+% Language and encoding settings
+\usepackage[utf8]{inputenc}
+\usepackage[T1]{fontenc}
+\usepackage[english]{babel}
+
+% Page formatting
+\usepackage[left=1in, right=1in, top=1in, bottom=1in]{geometry}
+\usepackage{setspace}
+\onehalfspacing
+
+% Header/Footer
+\usepackage{fancyhdr}
+\pagestyle{fancy}
+\fancyhf{} % clear all header and footer fields
+\fancyhead[L]{\textbf{\course}}
+\fancyhead[C]{Assignment \assignmentnumber}
+\fancyhead[R]{\name}
+\fancyfoot[C]{\thepage}
+
+% Other packages
+\usepackage{enumitem}
+\usepackage{graphicx}
+
+% Custom commands for easy detail insertion
+\newcommand{\assignmentnumber}{05} % <-- CHANGE Assignment Number
+\newcommand{\name}{Simon Franken} % <-- CHANGE YOUR NAME
+\newcommand{\course}{Multimedia Project WiSe 2526} % <-- CHANGE COURSE NAME
+\newcommand{\duedate}{2025-11-26} % <-- CHANGE DUE DATE
+
+% Title formatting
+\usepackage{titling}
+\pretitle{
+    \vspace*{2cm}
+    \begin{center}
+    \LARGE\bfseries
+}
+\posttitle{\par\end{center}\vspace{1cm}}
+
+\begin{document}
+
+\title{Assignment \assignmentnumber}
+\author{\name}
+\date{\duedate}
+
+\maketitle
+
+\begin{center}
+    \textbf{Course:} \course
+\end{center}
+\vspace{0.5cm}
+
+%------------------ START OF ASSIGNMENT -----------------------
+
+% Write your solutions below
+
+\section*{Exercise 5.2 Training}
+\begin{figure}[htp]
+    \centering
+    \includegraphics[width=14cm]{image0.png}
+\end{figure}
+
+\section*{Exercise 5.3 Negative Mining}
+\begin{figure}[htp]
+    \centering
+    \includegraphics[width=14cm]{image1.jpg}
+    \includegraphics[width=14cm]{image2.png}
+\end{figure}
+
+The idea of negative mining is to restore a certain balance between positive and negative labels by sampling a random subset of the negatives. This is especially important in our case, since there are far more negative boxes than positive ones.
+In my experiments, the loss was actually higher with negative mining, but the accuracy was better.
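+Concretely, the sampling rule implemented in \texttt{get\_random\_sampling\_mask} keeps every positive anchor and draws
+\[
+    n_{\mathrm{sampled}} = \min\bigl(\lfloor r \cdot n_{\mathrm{pos}} \rfloor,\; n_{\mathrm{neg}}\bigr)
+\]
+negatives uniformly at random, where $r$ is the negative ratio (here $r = 3$), $n_{\mathrm{pos}}$ is the number of positive anchors, and $n_{\mathrm{neg}}$ the number of available negatives.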
+
+
+
+
+%------------------ END OF ASSIGNMENT -----------------------
+
+\end{document}
\ No newline at end of file
diff --git a/mmp/a5/image0.png b/mmp/a5/image0.png
new file mode 100644
index 0000000..553de31
Binary files /dev/null and b/mmp/a5/image0.png differ
diff --git a/mmp/a5/image1.jpg b/mmp/a5/image1.jpg
new file mode 100644
index 0000000..2152f52
Binary files /dev/null and b/mmp/a5/image1.jpg differ
diff --git a/mmp/a5/image2.png b/mmp/a5/image2.png
new file mode 100644
index 0000000..1c8c214
Binary files /dev/null and b/mmp/a5/image2.png differ
diff --git a/mmp/a5/main.py b/mmp/a5/main.py
index f05f48b..0905199 100644
--- a/mmp/a5/main.py
+++ b/mmp/a5/main.py
@@ -1,7 +1,16 @@
+import argparse
 import torch
 import torch.optim as optim
+import torch.nn as nn
+from torch.utils.data import DataLoader
+from torch import Tensor
+from tqdm import tqdm
+import datetime
 
 from .model import MmpNet
+from ..a4.anchor_grid import get_anchor_grid
+from ..a4.dataset import get_dataloader
+from ..a2.main import get_criterion_optimizer
 
 
 def step(
@@ -11,11 +20,19 @@ def step(
     img_batch: torch.Tensor,
     lbl_batch: torch.Tensor,
 ) -> float:
-    """Performs one update step for the model
+    """Performs one update step and returns the batch loss as a float."""
+    model.train()
+    optimizer.zero_grad()
 
-    @return: The loss for the specified batch. Return a float and not a PyTorch tensor
-    """
-    raise NotImplementedError()
+    device = next(model.parameters()).device
+    img_batch = img_batch.to(device)
+    lbl_batch = lbl_batch.to(device)
+
+    outputs = model(img_batch)
+    loss = criterion(outputs, lbl_batch)
+    loss.backward()
+    optimizer.step()
+
+    return loss.item()
 
 
 def get_random_sampling_mask(labels: torch.Tensor, neg_ratio: float) -> torch.Tensor:
@@ -26,13 +43,229 @@ def get_random_sampling_mask(labels: torch.Tensor, neg_ratio: float) -> torch.Te
     Hint: after computing the mask, check if the neg_ratio is fulfilled.
     @return: A tensor with the same shape as labels
     """
-    assert labels.min() >= 0 and labels.max() <= 1  # remove this line if you want
-    raise NotImplementedError()
+    # Flatten for easier indexing
+    labels_flat = labels.view(-1)
+    pos_indices = (labels_flat == 1).nonzero(as_tuple=True)[0]
+    neg_indices = (labels_flat == 0).nonzero(as_tuple=True)[0]
+
+    num_pos = pos_indices.numel()
+    num_neg = neg_indices.numel()
+
+    num_neg_to_sample = min(int(neg_ratio * num_pos), num_neg)
+
+    perm = torch.randperm(num_neg, device=labels.device)
+    sampled_neg_indices = neg_indices[perm[:num_neg_to_sample]]
+
+    mask_flat = torch.zeros_like(labels_flat, dtype=torch.long)
+    mask_flat[pos_indices] = 1
+    mask_flat[sampled_neg_indices] = 1
+
+    # Reshape to original shape
+    mask = mask_flat.view_as(labels)
+    return mask
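+
+
+# Illustrative example (not part of the pipeline): with neg_ratio=3.0 and two
+# positives among ten labels, the mask keeps both positives and at most six
+# randomly chosen negatives:
+#   labels = torch.tensor([1, 0, 0, 0, 0, 0, 0, 0, 1, 0])
+#   mask = get_random_sampling_mask(labels, neg_ratio=3.0)
+#   assert (mask[labels == 1] == 1).all() and mask.sum() <= 8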
+ """ + with torch.no_grad(): + probs = torch.softmax(output, dim=-1)[..., 1] + + preds = probs >= threshold + + TP = ((preds == 1) & (labels == 1)).sum().item() + FP = ((preds == 1) & (labels == 0)).sum().item() + FN = ((preds == 0) & (labels == 1)).sum().item() + TN = ((preds == 0) & (labels == 0)).sum().item() + + precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0 + recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0 + f1 = ( + 2 * precision * recall / (precision + recall) + if (precision + recall) > 0 + else 0.0 + ) + accuracy = (TP + TN) / (TP + TN + FP + FN) if (TP + TN + FP + FN) > 0 else 0.0 + + return ( + precision, + recall, + f1, + accuracy, + ) + + +def evaluate( + model: MmpNet, + criterion, + dataloader: DataLoader, +) -> tuple[float, float, float, float]: + device = next(model.parameters()).device + model.eval() + total_loss = 0.0 + total_samples = 0 + all_outputs = [] + all_labels = [] + with torch.no_grad(): + for img_batch, lbl_batch, _ in dataloader: + img_batch = img_batch.to(device) + lbl_batch = lbl_batch.to(device) + + outputs = model(img_batch) + loss = criterion(outputs, lbl_batch) + batch_size = img_batch.size(0) + total_loss += loss.item() * batch_size + total_samples += batch_size + + all_outputs.append(outputs.cpu()) + all_labels.append(lbl_batch.cpu()) + avg_loss = total_loss / total_samples if total_samples > 0 else 0.0 + if all_outputs and all_labels: + outputs_cat = torch.cat(all_outputs) + labels_cat = torch.cat(all_labels) + precision, recall, f1, acc = get_detection_metrics( + outputs_cat, labels_cat, threshold=0.5 + ) + else: + precision = recall = f1 = 0.0 + return avg_loss, precision, recall, f1, acc + + +def train( + model: MmpNet, + loader: DataLoader, + criterion: nn.Module, + optimizer: optim.Optimizer, +): + model.train() + running_loss = 0.0 + total_samples = 0 + + progress_bar = tqdm(loader, desc="Training", unit="batch") + + for img_batch, lbl_batch, _ in progress_bar: + loss = step( + model=model, + criterion=criterion, + optimizer=optimizer, + img_batch=img_batch, + lbl_batch=lbl_batch, + ) + batch_size = img_batch.size(0) + running_loss += loss * batch_size + total_samples += batch_size + progress_bar.set_postfix( + {"loss": running_loss / total_samples if total_samples > 0 else 0.0} + ) + + epoch_loss = running_loss / total_samples if total_samples > 0 else 0.0 + progress_bar.close() + return epoch_loss + + +class NegativeMiningCriterion(nn.Module): + def __init__(self, neg_ratio=3.0, enable_negative_mining: bool = True): + super().__init__() + self.backbone = nn.CrossEntropyLoss(reduction="none") + self.neg_ratio = neg_ratio + self.enable_negative_mining = enable_negative_mining + + def forward(self, outputs, labels): + outputs_flat = outputs.view(-1, outputs.shape[-1]) + labels_flat = labels.view(-1).long() + unfiltered = self.backbone(outputs_flat, labels_flat) + assert unfiltered.shape == labels_flat.shape + + if not self.enable_negative_mining: + return unfiltered.mean() + + mask = get_random_sampling_mask(labels_flat, self.neg_ratio) + filtered_loss = unfiltered[mask == 1] + return filtered_loss.mean() def main(): - """Put your training code for exercises 5.2 and 5.3 here""" - raise NotImplementedError() + parser = argparse.ArgumentParser() + parser.add_argument( + "--tensorboard", + nargs="?", + const=True, + default=False, + help="Enable TensorBoard logging. 
 
 
 def main():
-    """Put your training code for exercises 5.2 and 5.3 here"""
-    raise NotImplementedError()
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--tensorboard",
+        nargs="?",
+        const=True,
+        default=False,
+        help="Enable TensorBoard logging. "
+        "If a label is provided, it will be used in the log directory name.",
+    )
+    args = parser.parse_args()
+
+    if args.tensorboard:
+        from torch.utils.tensorboard import SummaryWriter
+
+        timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+        if isinstance(args.tensorboard, str):
+            label = args.tensorboard
+            log_dir = f"runs/a5_mmpnet_{label}_{timestamp}"
+        else:
+            log_dir = f"runs/a5_mmpnet_{timestamp}"
+        writer = SummaryWriter(log_dir=log_dir)
+    else:
+        writer = None
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = MmpNet(num_aspect_ratios=8, num_widths=8).to(device)
+
+    anchor_grid = get_anchor_grid(
+        anchor_widths=[8, 16, 32, 64, 96, 128, 160, 192],
+        aspect_ratios=[1 / 2, 2 / 3, 1, 4 / 3, 5 / 3, 2, 2.5, 3],
+        num_rows=7,
+        num_cols=7,
+        scale_factor=32,
+    )
+
+    dataloader_train = get_dataloader(
+        path_to_data=".data/mmp-public-3.2/train",
+        image_size=224,
+        batch_size=32,
+        num_workers=9,
+        is_test=False,
+        is_train=True,
+        anchor_grid=anchor_grid,
+    )
+    dataloader_val = get_dataloader(
+        path_to_data=".data/mmp-public-3.2/val",
+        image_size=224,
+        batch_size=32,
+        num_workers=9,
+        is_test=False,
+        is_train=False,
+        anchor_grid=anchor_grid,
+    )
+    _, optimizer = get_criterion_optimizer(model=model)
+    criterion = NegativeMiningCriterion(enable_negative_mining=True)
+    criterion_eval = NegativeMiningCriterion(enable_negative_mining=False)
+    num_epochs = 10
+
+    for epoch in range(num_epochs):
+        train_loss = train(
+            model=model,
+            loader=dataloader_train,
+            criterion=criterion,
+            optimizer=optimizer,
+        )
+        avg_loss, precision, recall, f1, acc = evaluate(
+            model=model, criterion=criterion_eval, dataloader=dataloader_val
+        )
+
+        if writer is not None:
+            writer.add_scalar("Loss/train_epoch", train_loss, epoch)
+            writer.add_scalar("Loss/eval_epoch", avg_loss, epoch)
+            writer.add_scalar("Acc/precision", precision, epoch)
+            writer.add_scalar("Acc/recall", recall, epoch)
+            writer.add_scalar("Acc/acc", acc, epoch)
+            writer.add_scalar("Acc/f1", f1, epoch)
+
+    if writer is not None:
+        writer.close()
 
 
 if __name__ == "__main__":
diff --git a/mmp/a5/model.py b/mmp/a5/model.py
index fef4c95..a45f852 100644
--- a/mmp/a5/model.py
+++ b/mmp/a5/model.py
@@ -1,9 +1,40 @@
 import torch
+from torchvision import models
+from torchvision.models import MobileNet_V2_Weights
+from torch import nn
 
 
 class MmpNet(torch.nn.Module):
-    def __init__(self, num_widths: int, num_aspect_ratios: int):
-        raise NotImplementedError()
+    def __init__(self, num_widths: int, num_aspect_ratios: int, num_classes: int = 2):
+        super().__init__()
+        self.backbone = models.mobilenet_v2(
+            weights=MobileNet_V2_Weights.DEFAULT
+        ).features
+        self.num_widths = num_widths
+        self.num_aspect_ratios = num_aspect_ratios
+        self.num_classes = num_classes
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        raise NotImplementedError()
+        with torch.no_grad():
+            dummy = torch.zeros(1, 3, 224, 224)
+            backbone_out = self.backbone(dummy)
+            in_channels = backbone_out.shape[1]
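+
+        # Running a dummy forward through the backbone infers its output
+        # channel count (1280 for MobileNetV2) instead of hardcoding it, so
+        # the head stays compatible if the backbone is swapped.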
+
+        self.head = nn.Conv2d(
+            in_channels=in_channels,
+            kernel_size=3,
+            out_channels=self.get_required_output_channels(),
+            stride=1,
+            padding=1,
+        )
+
+    def get_required_output_channels(self):
+        return self.num_widths * self.num_aspect_ratios * self.num_classes
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.backbone(x)
+        x = self.head(x)
+        b, out_c, h, w = x.shape
+        x = x.view(b, self.num_widths, self.num_aspect_ratios, self.num_classes, h, w)
+        x = x.permute(0, 1, 2, 4, 5, 3).contiguous()
+        # Now: (batch, num_widths, num_aspect_ratios, h, w, num_classes)
+        return x
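+
+
+# Sanity check (illustrative, using the defaults from a5/main.py):
+# MmpNet(num_widths=8, num_aspect_ratios=8) gives a head with 8 * 8 * 2 = 128
+# output channels; a (1, 3, 224, 224) input yields an output of shape
+# (1, 8, 8, 7, 7, 2), since MobileNetV2 downsamples by 32 (224 / 32 = 7).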