Compare commits

...

10 Commits

Author SHA1 Message Date
franksim   a6f70005f2   adds nms and eval   2025-12-02 11:04:47 +01:00
franksim   3b6a588719   Merge branch 'assignment-a5' into 'main' (Assignment a5; see merge request mmc-mmp/mmp_wise2526_franksim!4)   2025-11-18 09:53:18 +01:00
franksim   c50d9e83b8   adapts metrics   2025-11-18 09:14:23 +01:00
franksim   f21fb57303   adds documentation   2025-11-16 16:39:17 +01:00
franksim   edbad414e2   adds solutions   2025-11-16 16:28:13 +01:00
franksim   721e46b768   undo renaming   2025-11-11 11:09:42 +01:00
franksim   56e21a1e54   renaming   2025-11-11 11:08:07 +01:00
franksim   7245042b54   performance improvements   2025-11-11 10:52:27 +01:00
franksim   a00ddedb23   adapts doc   2025-11-09 17:52:35 +01:00
franksim   5c8e06f62f   small impovements   2025-11-09 17:49:50 +01:00
54 changed files with 865 additions and 990 deletions


@@ -25,6 +25,7 @@ class AnnotationRect:
         self.x2 *= factor
         self.y1 *= factor
         self.y2 *= factor
+        return self

     @staticmethod
     def fromarray(arr: np.ndarray):

12 binary image files added, 24 binary image files removed (not shown).


@@ -5,7 +5,7 @@ import numpy as np
 import torch
 from torch.utils.data import DataLoader
 from ..a3.annotation import read_groundtruth_file, AnnotationRect
-from .label_grid import get_label_grid, draw_annotation_rects
+from .label_grid import get_label_grid
 import matplotlib.pyplot as plt
 import matplotlib.patches as patches
 from .anchor_grid import get_anchor_grid
@@ -29,10 +29,11 @@ class MMP_Dataset(torch.utils.data.Dataset):
         @param is_test: Whether this is the test set (True) or the validation/training set (False)
         """
         self.image_size = image_size
-        self.images: Sequence[Tuple[str, Sequence[AnnotationRect]]] = []
+        self.images: Sequence[Tuple[str, str | None]] = []
         self.anchor_grid = anchor_grid
         self.min_iou = min_iou
         self.is_test = is_test
+        self.path_to_data = path_to_data

         img_pattern = re.compile(r"^(\d+)\.jpg$")
         files = set(os.listdir(path_to_data))
@@ -42,11 +43,10 @@ class MMP_Dataset(torch.utils.data.Dataset):
             img_file = os.path.join(path_to_data, fname)
             if is_test:
                 self.images.append((img_file, None))
-            else:
-                annotations = read_groundtruth_file(
-                    os.path.join(path_to_data, f"{match.group(1)}.gt_data.txt")
-                )
-                self.images.append((img_file, annotations))
+            annotation_file = os.path.join(
+                path_to_data, f"{match.group(1)}.gt_data.txt"
+            )
+            self.images.append((img_file, annotation_file))

         self.images.sort(
             key=lambda x: int(re.match(r"(.*/)(\d+)(\.jpg)", x[0]).group(2))
@@ -73,13 +73,13 @@ class MMP_Dataset(torch.utils.data.Dataset):
         if self.is_test:
             return (img_tensor, torch.Tensor(), int(img_id))

-        scaled_annotations = []
-        for annotation in self.images[idx][1]:
-            annotation.scale(self.image_size / max(img.size[0], img.size[1]))
-            scaled_annotations.append(annotation)
+        annotations = [
+            annotation.scale(self.image_size / max(img.size[0], img.size[1]))
+            for annotation in read_groundtruth_file(self.images[idx][1])
+        ]

         label_grid = get_label_grid(
-            anchor_grid=self.anchor_grid, gts=scaled_annotations, min_iou=self.min_iou
+            anchor_grid=self.anchor_grid, gts=annotations, min_iou=self.min_iou
         )
         return (img_tensor, label_grid, int(img_id))
@@ -120,21 +120,93 @@ def get_dataloader(
     return dataloader

-def calculate_max_coverage(loader: DataLoader, min_iou: float) -> float:
+def calculate_max_coverage(loader, min_iou):
     """
-    @param loader: A DataLoader object, generated with the get_dataloader function.
-    @param min_iou: Minimum IoU overlap that is required to count a ground truth box as covered.
-    @return: Ratio of how mamy ground truth boxes are covered by a label grid box. Must be a value between 0 and 1.
+    @param loader: DataLoader object.
+    @param min_iou: Minimum IoU overlap to count a ground truth box as covered.
+    @return: Ratio of how many ground truth boxes are covered by a label grid box. Value between 0 and 1.
     """
-    raise NotImplementedError()
+    total_boxes = 0
+    covered_boxes = 0
+    dataset = loader.dataset
+    anchor_grid = dataset.anchor_grid  # Shape: (H, W, 4)
+    # Reshape anchor grid to (N, 4)
+    anchors = anchor_grid.reshape(-1, 4)
+    for img, _, img_id in loader:
+        for batch_index in range(len(img)):
+            gts_file = os.path.join(
+                dataset.path_to_data,
+                f"{str(img_id[batch_index].item()).zfill(8)}.gt_data.txt",
+            )
+            # Load and scale ground truth boxes if necessary
+            with Image.open(
+                os.path.join(
+                    dataset.path_to_data,
+                    f"{str(img_id[batch_index].item()).zfill(8)}.jpg",
+                )
+            ) as original_image:
+                original_w, original_h = original_image.size
+            # Assume square resize for model, get transform size from img tensor
+            transformed_size = img[batch_index].shape[-1]
+            scale = transformed_size / max(original_w, original_h)
+            annotations = [
+                annotation.scale(scale)
+                for annotation in read_groundtruth_file(gts_file)
+            ]
+            gt_boxes = np.stack(
+                [np.array(a) for a in annotations], axis=0
+            )  # shape (M, 4)
+            total_boxes += len(gt_boxes)
+            # Vectorized IoU calculation: (M, N)
+            ious = compute_ious_vectorized(gt_boxes, anchors)  # shape (M, N)
+            # Count ground truths for which any anchor box matches min_iou
+            covered = (ious >= min_iou).any(axis=1).sum()
+            covered_boxes += covered
+    return covered_boxes / total_boxes if total_boxes > 0 else 0.0

-def print_img_tensor_with_annotations(
-    img: torch.Tensor, annotations: Sequence["AnnotationRect"], output_file: str
+def compute_ious_vectorized(boxes1, boxes2):
+    """
+    Compute the IoU matrix between each box in boxes1 and each box in boxes2.
+    boxes1: (M, 4), boxes2: (N, 4) -- format [x1, y1, x2, y2]
+    Returns: (M, N) IoU
+    """
+    # Expand to (M, N, 4)
+    boxes1 = boxes1[:, None, :]  # (M, 1, 4)
+    boxes2 = boxes2[None, :, :]  # (1, N, 4)
+    # Intersection box
+    inter_x1 = np.maximum(boxes1[..., 0], boxes2[..., 0])
+    inter_y1 = np.maximum(boxes1[..., 1], boxes2[..., 1])
+    inter_x2 = np.minimum(boxes1[..., 2], boxes2[..., 2])
+    inter_y2 = np.minimum(boxes1[..., 3], boxes2[..., 3])
+    inter_w = np.clip(inter_x2 - inter_x1, 0, None)
+    inter_h = np.clip(inter_y2 - inter_y1, 0, None)
+    inter_area = inter_w * inter_h
+    area1 = (boxes1[..., 2] - boxes1[..., 0]) * (boxes1[..., 3] - boxes1[..., 1])
+    area2 = (boxes2[..., 2] - boxes2[..., 0]) * (boxes2[..., 3] - boxes2[..., 1])
+    union_area = area1 + area2 - inter_area
+    return inter_area / (union_area + 1e-6)

+def draw_image_tensor_with_annotations(
+    img: torch.Tensor,
+    annotations: Sequence["AnnotationRect"] | None,
+    output_file: str,
 ):
     # Convert tensor to numpy, permute dimensions
-    img_np = img.permute(1, 2, 0).cpu().numpy()
-    img_np = img_np.astype(np.uint8)
+    img_np = img.permute(1, 2, 0).numpy()
+    img_np = np.clip(img_np, 0, 1)

     fig, ax = plt.subplots(1)
     ax.imshow(img_np)
@@ -152,40 +224,43 @@ def print_img_tensor_with_annotations(
     plt.close(fig)

-def print_positive_boxes(
+def denormalize_image_tensor(
+    img: torch.Tensor,
+    mean=torch.tensor([0.485, 0.456, 0.406]).view(-1, 1, 1),
+    std=torch.tensor([0.229, 0.224, 0.225]).view(-1, 1, 1),
+) -> torch.Tensor:
+    img_denormalized = img * std + mean
+    return img_denormalized

+def draw_positive_boxes(
     img_tensor: torch.Tensor,
     label_grid: np.ndarray,
     img_id: torch.Tensor,
     anchor_grid: np.ndarray,
-    path_to_data: str,
 ):
     annotations = [
         AnnotationRect.fromarray(anchor_grid[idx])
         for idx in np.ndindex(anchor_grid.shape[:-1])
         if label_grid[idx]
     ]
-    print_img_tensor_with_annotations(
+    draw_image_tensor_with_annotations(
         img_tensor,
         annotations=annotations,
-        output_file=f"mmp/a4/{img_id}_transformed.png",
-    )
-    draw_annotation_rects(
-        annotations=annotations,
-        image=f"{os.path.join(path_to_data, f'{str(img_id.item()).zfill(8)}.jpg')}",
-        output_path=f"mmp/a4/{img_id}_original.png",
+        output_file=f"mmp/a4/.output/{img_id}_transformed.png",
     )

 def main():
     anchor_grid = get_anchor_grid(
-        anchor_widths=[16, 32, 64, 96, 128, 144, 150, 160, 192, 224, 256],
-        aspect_ratios=[1 / 3, 1 / 2, 3 / 5, 2 / 3, 3 / 4, 1, 4 / 3, 5 / 3, 2, 2.5, 3],
-        num_rows=32,
-        num_cols=32,
-        scale_factor=20,
+        anchor_widths=[8, 16, 32, 64, 96, 128, 160, 192],
+        aspect_ratios=[1 / 2, 2 / 3, 1, 4 / 3, 5 / 3, 2, 2.5, 3],
+        num_rows=28,
+        num_cols=28,
+        scale_factor=8,
     )
     dataloader = get_dataloader(
-        num_workers=6,
+        num_workers=9,
         is_train=True,
         is_test=False,
         batch_size=8,
@@ -194,13 +269,14 @@ def main():
         anchor_grid=anchor_grid,
     )
-    # print(calculate_coverage(dataloader, 0.7))
     for img, label, img_id in islice(dataloader, 12):
-        print_positive_boxes(
-            img_tensor=img[5],
+        draw_positive_boxes(
+            img_tensor=denormalize_image_tensor(img=img[5]),
             label_grid=label[5],
             img_id=img_id[5],
             anchor_grid=anchor_grid,
-            path_to_data=".data/mmp-public-3.2/train",
         )

Binary file not shown.


@@ -47,7 +47,7 @@
 \maketitle
 \begin{center}
     \textbf{Course:} \course
 \end{center}
 \vspace{0.5cm}
@@ -57,14 +57,22 @@
 \section*{Exercise 4.2 Label Grid}
 \begin{enumerate}[label=\alph*)]
     \setcounter{enumi}{2}
     \item \begin{figure}[htp]
               \centering
               \includegraphics[width=4cm]{output.jpg}
               \caption{output.txt}
           \end{figure}
 \end{enumerate}
+
+\section*{Exercise 4.3 Finalizing the Data Pipeline}
+\begin{enumerate}[label=\alph*)]
+    \setcounter{enumi}{2}
+    \item The generated images can be found at `.output/`.
+\end{enumerate}
+
 %------------------ END OF ASSIGNMENT -----------------------
 \end{document}

mmp/a5/document.pdf (new binary file, not shown)

mmp/a5/document.tex (new file, 82 lines)

@@ -0,0 +1,82 @@
\documentclass[11pt,a4paper]{article}
% Language and encoding settings
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage[english]{babel}
% Page formatting
\usepackage[left=1in, right=1in, top=1in, bottom=1in]{geometry}
\usepackage{setspace}
\onehalfspacing
% Header/Footer
\usepackage{fancyhdr}
\pagestyle{fancy}
\fancyhf{} % clear all header and footer fields
\fancyhead[L]{\textbf{\course}}
\fancyhead[C]{Assignment \assignmentnumber}
\fancyhead[R]{\name}
\fancyfoot[C]{\thepage}
% Other packages
\usepackage{enumitem}
\usepackage{graphicx}
% Custom commands for easy detail insertion
\newcommand{\assignmentnumber}{05} % <-- CHANGE Assignment Number
\newcommand{\name}{Simon Franken} % <-- CHANGE YOUR NAME
\newcommand{\course}{Multimedia Project WiSe 2526} % <-- CHANGE COURSE NAME
\newcommand{\duedate}{2025-11-26} % <-- CHANGE DUE DATE
% Title formatting
\usepackage{titling}
\pretitle{
\vspace*{2cm}
\begin{center}
\LARGE\bfseries
}
\posttitle{\par\end{center}\vspace{1cm}}
\begin{document}
\title{Assignment \assignmentnumber}
\author{\name}
\date{\duedate}
\maketitle
\begin{center}
\textbf{Course:} \course
\end{center}
\vspace{0.5cm}
%------------------ START OF ASSIGNMENT -----------------------
% Write your solutions below
\section*{Exercise 5.2 Training}
\begin{figure}[htp]
\centering
\includegraphics[width=14cm]{image0.png}
\end{figure}
\section*{Exercise 5.3 Negative Mining}
\begin{figure}[htp]
\centering
\includegraphics[width=14cm]{image1.jpg}
\includegraphics[width=14cm]{image2.png}
\end{figure}
The idea of negative mining is to restore a certain balance between positive and negative labels.
This is especially important in our case, since there are far more negative boxes than positive ones; the balance is achieved by sampling a random subset of the negatives.
In my experiments the loss was actually higher with negative mining, but the accuracy was better.
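As an illustration, a minimal sketch of this sampling idea (a simplified variant of the \texttt{get\_random\_sampling\_mask} helper in the code; the name \texttt{sample\_negatives} is used here only for exposition):
\begin{verbatim}
import torch

def sample_negatives(labels: torch.Tensor, neg_ratio: float) -> torch.Tensor:
    # Keep every positive; keep at most neg_ratio * #positives negatives,
    # chosen uniformly at random (labels are 0/1).
    flat = labels.reshape(-1)
    pos = (flat == 1).nonzero(as_tuple=True)[0]
    neg = (flat == 0).nonzero(as_tuple=True)[0]
    k = min(int(neg_ratio * pos.numel()), neg.numel())
    keep = neg[torch.randperm(neg.numel(), device=labels.device)[:k]]
    mask = torch.zeros_like(flat, dtype=torch.bool)
    mask[pos] = True
    mask[keep] = True
    return mask.reshape(labels.shape)
\end{verbatim}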
%------------------ END OF ASSIGNMENT -----------------------
\end{document}

mmp/a5/image0.png (new binary file, 34 KiB, not shown)
mmp/a5/image1.jpg (new binary file, 80 KiB, not shown)
mmp/a5/image2.png (new binary file, 95 KiB, not shown)


@@ -1,7 +1,16 @@
+import argparse
 import torch
 import torch.optim as optim
+import torch.nn as nn
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+import datetime

 from .model import MmpNet
+from ..a4.anchor_grid import get_anchor_grid
+from ..a4.dataset import get_dataloader
+from ..a2.main import get_criterion_optimizer
+from ..a6.main import evaluate as evaluate_v2

 def step(
@@ -11,11 +20,19 @@ def step(
     img_batch: torch.Tensor,
     lbl_batch: torch.Tensor,
 ) -> float:
-    """Performs one update step for the model
-
-    @return: The loss for the specified batch. Return a float and not a PyTorch tensor
-    """
-    raise NotImplementedError()
+    model.train()
+    optimizer.zero_grad()
+    device = next(model.parameters()).device
+    img_batch = img_batch.to(device)
+    lbl_batch = lbl_batch.to(device)
+    outputs = model(img_batch)
+    loss = criterion(outputs, lbl_batch)
+    loss.backward()
+    optimizer.step()
+    return loss.item()

 def get_random_sampling_mask(labels: torch.Tensor, neg_ratio: float) -> torch.Tensor:
@@ -26,13 +43,187 @@ def get_random_sampling_mask(labels: torch.Tensor, neg_ratio: float) -> torch.Tensor:
     Hint: after computing the mask, check if the neg_ratio is fulfilled.
     @return: A tensor with the same shape as labels
     """
-    assert labels.min() >= 0 and labels.max() <= 1  # remove this line if you want
-    raise NotImplementedError()
+    # Flatten for easier indexing
+    labels_flat = labels.view(-1)
+    pos_indices = (labels_flat == 1).nonzero(as_tuple=True)[0]
+    neg_indices = (labels_flat == 0).nonzero(as_tuple=True)[0]
+    num_pos = pos_indices.numel()
+    num_neg = neg_indices.numel()
+    num_neg_to_sample = min(int(neg_ratio * num_pos), num_neg)
+    perm = torch.randperm(num_neg, device=labels.device)
+    sampled_neg_indices = neg_indices[perm[:num_neg_to_sample]]
+    mask_flat = torch.zeros_like(labels_flat, dtype=torch.long)
+    mask_flat[pos_indices] = 1
+    mask_flat[sampled_neg_indices] = 1
+    # Reshape to original shape
+    mask = mask_flat.view_as(labels)
+    return mask
+
+def evaluate(
+    model: MmpNet,
+    criterion,
+    dataloader: DataLoader,
+) -> float:
+    device = next(model.parameters()).device
+    model.eval()
+    total_loss = 0.0
+    total_samples = 0
+    all_outputs = []
+    all_labels = []
+    with torch.no_grad():
+        for img_batch, lbl_batch, _ in dataloader:
+            img_batch = img_batch.to(device)
+            lbl_batch = lbl_batch.to(device)
+            outputs = model(img_batch)
+            loss = criterion(outputs, lbl_batch)
+            batch_size = img_batch.size(0)
+            total_loss += loss.item() * batch_size
+            total_samples += batch_size
+            all_outputs.append(outputs.cpu())
+            all_labels.append(lbl_batch.cpu())
+    avg_loss = total_loss / total_samples if total_samples > 0 else 0.0
+    return avg_loss
+
+def train(
+    model: MmpNet,
+    loader: DataLoader,
+    criterion: nn.Module,
+    optimizer: optim.Optimizer,
+):
+    model.train()
+    running_loss = 0.0
+    total_samples = 0
+    progress_bar = tqdm(loader, desc="Training", unit="batch")
+    for img_batch, lbl_batch, _ in progress_bar:
+        loss = step(
+            model=model,
+            criterion=criterion,
+            optimizer=optimizer,
+            img_batch=img_batch,
+            lbl_batch=lbl_batch,
+        )
+        batch_size = img_batch.size(0)
+        running_loss += loss * batch_size
+        total_samples += batch_size
+        progress_bar.set_postfix(
+            {"loss": running_loss / total_samples if total_samples > 0 else 0.0}
+        )
+    epoch_loss = running_loss / total_samples if total_samples > 0 else 0.0
+    progress_bar.close()
+    return epoch_loss
+
+class NegativeMiningCriterion(nn.Module):
+    def __init__(self, neg_ratio=3.0, enable_negative_mining: bool = True):
+        super().__init__()
+        self.backbone = nn.CrossEntropyLoss(reduction="none")
+        self.neg_ratio = neg_ratio
+        self.enable_negative_mining = enable_negative_mining
+
+    def forward(self, outputs, labels):
+        outputs_flat = outputs.view(-1, outputs.shape[-1])
+        labels_flat = labels.view(-1).long()
+        unfiltered = self.backbone(outputs_flat, labels_flat)
+        assert unfiltered.shape == labels_flat.shape
+        if not self.enable_negative_mining:
+            return unfiltered.mean()
+        mask = get_random_sampling_mask(labels_flat, self.neg_ratio)
+        filtered_loss = unfiltered[mask == 1]
+        return filtered_loss.mean()

 def main():
-    """Put your training code for exercises 5.2 and 5.3 here"""
-    raise NotImplementedError()
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--tensorboard",
+        nargs="?",
+        const=True,
+        default=False,
+        help="Enable TensorBoard logging. If a label is provided, it will be used in the log directory name.",
+    )
+    args = parser.parse_args()
+    if args.tensorboard:
+        from torch.utils.tensorboard import SummaryWriter
+
+        timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+        if isinstance(args.tensorboard, str):
+            label = args.tensorboard
+            log_dir = f"runs/a5_mmpnet_{label}_{timestamp}"
+        else:
+            log_dir = f"runs/a5_mmpnet_{timestamp}"
+        writer = SummaryWriter(log_dir=log_dir)
+    else:
+        writer = None
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = MmpNet(num_aspect_ratios=8, num_widths=8).to(device)
+    anchor_grid = get_anchor_grid(
+        anchor_widths=[8, 16, 32, 64, 96, 128, 160, 192],
+        aspect_ratios=[1 / 2, 2 / 3, 1, 4 / 3, 5 / 3, 2, 2.5, 3],
+        num_rows=7,
+        num_cols=7,
+        scale_factor=32,
+    )
+    dataloader_train = get_dataloader(
+        path_to_data=".data/mmp-public-3.2/train",
+        image_size=224,
+        batch_size=32,
+        num_workers=9,
+        is_test=False,
+        is_train=True,
+        anchor_grid=anchor_grid,
+    )
+    dataloader_val = get_dataloader(
+        path_to_data=".data/mmp-public-3.2/val",
+        image_size=224,
+        batch_size=32,
+        num_workers=9,
+        is_test=False,
+        is_train=False,
+        anchor_grid=anchor_grid,
+    )
+    _, optimizer = get_criterion_optimizer(model=model)
+    criterion = NegativeMiningCriterion(enable_negative_mining=True)
+    criterion_eval = NegativeMiningCriterion(enable_negative_mining=False)
+    num_epochs = 5
+    for epoch in range(num_epochs):
+        train_loss = train(
+            model=model,
+            loader=dataloader_train,
+            criterion=criterion,
+            optimizer=optimizer,
+        )
+        avg_loss = evaluate(
+            model=model, criterion=criterion_eval, dataloader=dataloader_val
+        )
+        _ = evaluate_v2(
+            model=model, device=device, anchor_grid=anchor_grid, loader=dataloader_train
+        )
+        if writer is not None:
+            writer.add_scalar("Loss/train_epoch", train_loss, epoch)
+            writer.add_scalar("Loss/eval_epoch", avg_loss, epoch)
+    if writer is not None:
+        writer.close()

 if __name__ == "__main__":


@@ -1,9 +1,40 @@
 import torch
+from torchvision import models
+from torchvision.models import MobileNet_V2_Weights
+from torch import nn

 class MmpNet(torch.nn.Module):
-    def __init__(self, num_widths: int, num_aspect_ratios: int):
-        raise NotImplementedError()
+    def __init__(self, num_widths: int, num_aspect_ratios: int, num_classes: int = 2):
+        super().__init__()
+        self.backbone = models.mobilenet_v2(
+            weights=MobileNet_V2_Weights.DEFAULT
+        ).features
+        self.num_widths = num_widths
+        self.num_aspect_ratios = num_aspect_ratios
+        self.num_classes = num_classes

-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        raise NotImplementedError()
+        with torch.no_grad():
+            dummy = torch.zeros(1, 3, 224, 224)
+            backbone_out = self.backbone(dummy)
+            in_channels = backbone_out.shape[1]
+        self.head = nn.Conv2d(
+            in_channels=in_channels,
+            kernel_size=3,
+            out_channels=self.get_required_output_channels(),
+            stride=1,
+            padding=1,
+        )
+
+    def get_required_output_channels(self):
+        return self.num_widths * self.num_aspect_ratios * self.num_classes
+
+    def forward(self, x: torch.Tensor):
+        x = self.backbone(x)
+        x = self.head(x)
+        b, out_c, h, w = x.shape
+        x = x.view(b, self.num_widths, self.num_aspect_ratios, self.num_classes, h, w)
+        x = x.permute(0, 1, 2, 4, 5, 3).contiguous()
+        # Now: (batch, num_widths, num_aspect_ratios, h, w, num_classes)
+        return x
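As a quick sanity check of the head's output layout, a minimal sketch (assuming the same 224x224 input as the dummy pass above; MobileNetV2's feature extractor downsamples by a factor of 32, so the spatial grid is 7x7):

import torch
from mmp.a5.model import MmpNet

# 8 widths x 8 aspect ratios x 2 classes on a 7x7 grid
model = MmpNet(num_widths=8, num_aspect_ratios=8)
out = model(torch.zeros(2, 3, 224, 224))
print(out.shape)  # expected: torch.Size([2, 8, 8, 7, 7, 2])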

mmp/a6/__init__.py (new empty file)


@@ -1,25 +1,144 @@
 from typing import List, Tuple

 import torch
 import numpy as np
+from tqdm import tqdm
+import os
+from torch.utils.data import DataLoader
+from mmp.a6.evallib import calculate_ap_pr
+from ..a4.label_grid import iou
 from ..a5.model import MmpNet
-from ..a3.annotation import AnnotationRect
+from ..a3.annotation import AnnotationRect, read_groundtruth_file
+from .nms import non_maximum_suppression

 def batch_inference(
     model: MmpNet, images: torch.Tensor, device: torch.device, anchor_grid: np.ndarray
 ) -> List[List[Tuple[AnnotationRect, float]]]:
-    raise NotImplementedError()
+    score_thresh = 0.5
+    nms_thresh = 0.3
+    model = model.to(device)
+    model.eval()
+    images = images.to(device)
+    anchor_grid = anchor_grid  # shape [W, R, h, w, 4]
+    results = []
+    with torch.no_grad():
+        outputs = model(images)  # (B, W, R, h, w, 2)
+        probs = torch.softmax(outputs, dim=-1)[..., 1]  # (B, W, R, h, w)
+        probs_np = probs.cpu().numpy()
+        batch_size = outputs.shape[0]
+        for b in range(batch_size):
+            detections = []
+            for idx in np.ndindex(anchor_grid.shape[:-1]):
+                score = probs_np[b][idx]
+                # if score >= score_thresh:
+                box = anchor_grid[idx]
+                rect = AnnotationRect.fromarray(box)
+                detections.append((rect, float(score)))
+            detections_nms = non_maximum_suppression(detections, nms_thresh)
+            results.append(detections_nms)
+    return results

-def evaluate() -> float:  # feel free to change the arguments
+def evaluate(
+    model: MmpNet, loader: DataLoader, device: torch.device, anchor_grid: np.ndarray
+) -> float:
     """Evaluates a specified model on the whole validation dataset.

     @return: AP for the validation set as a float.
     You decide which arguments this function should receive
     """
-    raise NotImplementedError()
+    path_to_data = ".data/mmp-public-3.2/train"
+    progress_bar = tqdm(loader, desc="Evaluation", unit="batch")
+    image_count = 0
+    ap_total = 0
+    for img_batch, _, id_batch in progress_bar:
+        inference = batch_inference(
+            anchor_grid=anchor_grid, device=device, images=img_batch, model=model
+        )
+        gts = get_gts_for_batch(id_batch=id_batch, gt_base_path=path_to_data)
+        dict_detections = {
+            img_id.item(): inference[idx] for idx, img_id in enumerate(id_batch)
+        }
+        dict_gt = {img_id.item(): gts[idx] for idx, img_id in enumerate(id_batch)}
+        average_precision, precision, recall = calculate_ap_pr(dict_detections, dict_gt)
+        ap_total = (ap_total * image_count + average_precision) / (
+            image_count + id_batch.shape[0]
+        )
+        image_count += id_batch.shape[0]
+        progress_bar.set_postfix(
+            {
+                "ap": ap_total,
+            }
+        )
+    return ap_total
+
+def get_gts_for_batch(
+    id_batch: torch.Tensor, gt_base_path: str
+) -> List[List[AnnotationRect]]:
+    return [
+        read_groundtruth_file(
+            os.path.join(gt_base_path, f"{str(img_id.item()).zfill(8)}.gt_data.txt")
+        )
+        for img_id in id_batch
+    ]
+
+def calc_tp_fp_fn(
+    detections: List[Tuple[AnnotationRect, float]],
+    gts: List[AnnotationRect],
+    iou_threshold: float = 0.5,
+    confidence_threshold: float = 0.5,
+) -> tuple[int, int, int]:
+    """
+    Calculates true positive, false positive, and false negative counts for object detection results on a single image.
+
+    Args:
+        detections: List of (AnnotationRect, confidence) tuples representing predicted boxes and scores. Should be sorted by descending confidence.
+        gts: List of AnnotationRect for ground truth.
+        iou_threshold: Minimum IoU to consider a detection a true positive.
+        confidence_threshold: Minimum confidence required to include a detection.
+
+    Returns:
+        num_tp: Number of true positives (int).
+        num_fp: Number of false positives (int).
+        num_fn: Number of false negatives (int).
+    """
+    detections = [det for det in detections if det[1] >= confidence_threshold]
+    detections.sort(key=lambda x: x[1], reverse=True)
+    matches = set()
+    fp = 0
+    tp = 0
+    for det_rect, _ in detections:
+        iou_map = [iou(det_rect, gt_rect) for gt_rect in gts]
+        if len(iou_map) == 0:
+            fp += 1
+            continue
+        max_idx = np.argmax(iou_map)
+        if max_idx in matches or iou_map[max_idx] < iou_threshold:
+            fp += 1
+            continue
+        matches.add(max_idx)
+        tp += 1
+    fn = len(gts) - len(matches)
+    return tp, fp, fn

 def evaluate_test():  # feel free to change the arguments

File diff suppressed because it is too large.


@@ -1,6 +1,9 @@
+import os
 from typing import List, Sequence, Tuple

 from ..a3.annotation import AnnotationRect
+from ..a4.label_grid import iou, draw_annotation_rects
+from collections import defaultdict

 def non_maximum_suppression(
@@ -12,4 +15,68 @@ def non_maximum_suppression(
     @return: A list of tuples of the remaining boxes after NMS together with their scores
     """
-    raise NotImplementedError()
+    if not boxes_scores:
+        return []
+    # Sort the boxes by score in descending order
+    boxes_scores_sorted = sorted(boxes_scores, key=lambda bs: bs[1], reverse=True)
+    result = []
+    while boxes_scores_sorted:
+        # Select the box with the highest score and remove it from the list
+        curr_box, curr_score = boxes_scores_sorted.pop(0)
+        result.append((curr_box, curr_score))
+        # Remove boxes with IoU > threshold
+        new_boxes = []
+        for box, score in boxes_scores_sorted:
+            if iou(curr_box, box) <= threshold:
+                new_boxes.append((box, score))
+        boxes_scores_sorted = new_boxes
+    return result
+
+def read_boxes_from_file(filepath: str) -> List[Tuple[str, AnnotationRect, float]]:
+    """
+    Reads a file containing bounding boxes and scores in the format:
+    {image_number} {x1} {y1} {x2} {y2} {score}
+    Returns a list of tuples: (image_number, AnnotationRect, score)
+    """
+    boxes: List[Tuple[str, AnnotationRect, float]] = []
+    with open(filepath, "r") as f:
+        for line in f:
+            parts = line.strip().split()
+            if len(parts) != 6:
+                continue
+            img_id = parts[0]
+            x1, y1, x2, y2 = map(int, parts[1:5])
+            annotation_rect = AnnotationRect(x1, y1, x2, y2)
+            score = float(parts[5])
+            boxes.append((img_id, annotation_rect, score))
+    return boxes
+
+def main():
+    boxes = read_boxes_from_file("mmp/a6/model_output.txt")
+    grouped = defaultdict(list)
+    for image_id, rect, score in boxes:
+        grouped[image_id].append((rect, score))
+    for image_id, rects_scores in grouped.items():
+        filtered_boxes = non_maximum_suppression(rects_scores, 0.3)
+        annotation_rects = [rect for rect, score in filtered_boxes if score > 0.5]
+        input_path = f".data/mmp-public-3.2/test/{image_id}.jpg"
+        output_path = f"mmp/a6/nms_output_{image_id}.png"
+        if not os.path.exists(input_path):
+            continue
+        draw_annotation_rects(
+            input_path,
+            annotation_rects,
+            rect_color=(255, 0, 0),
+            rect_width=2,
+            output_path=output_path,
+        )
+
+if __name__ == "__main__":
+    main()
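To see the suppression above in action, a tiny worked example with hypothetical boxes (assuming iou uses the plain (x2 - x1) * (y2 - y1) area convention):

a = AnnotationRect(0, 0, 10, 10)    # score 0.9
b = AnnotationRect(1, 1, 10, 10)    # score 0.8; IoU(a, b) = 81 / 100 = 0.81
c = AnnotationRect(20, 20, 30, 30)  # score 0.7; no overlap with a
kept = non_maximum_suppression([(a, 0.9), (b, 0.8), (c, 0.7)], 0.5)
# kept == [(a, 0.9), (c, 0.7)]: b is suppressed by a, the distant c survives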

3 binary image files added (not shown).