# Encoder: fetch the backbone identified by `model_name` through torch.hub
# from the "huggingface/pytorch-image-models" repository.
# NOTE(review): no extra keyword arguments (e.g. `pretrained`) are forwarded
# to the hub entry point here — confirm how the weights are meant to be loaded.
encoder = torch.hub.load(
    repo_or_dir="huggingface/pytorch-image-models",
    model=model_name,
)
# ... (remaining encoder setup is elided in this excerpt)

# Temporal convolution stack: 3D conv -> batch norm -> ReLU -> max-pool that
# collapses the two trailing axes -> flatten to a 3D tensor.
temporal_layers = [
    nn.Conv3d(
        1024,  # input channels; per the original note, these come from the encoder
        self.temporal_conv_channels,
        self.temporal_conv_kernel_size,
        padding="same",
        # Dilation is applied only along the first kernel axis
        # (presumably time — confirm against the encoder output layout).
        dilation=(self.temporal_conv_dilation, 1, 1),
    ),
    nn.BatchNorm3d(num_features=self.temporal_conv_channels, eps=1e-3),
    nn.ReLU(inplace=True),
    # `None` keeps the first pooled axis at its input size; the other two
    # axes are reduced to 1 x 1.
    nn.AdaptiveMaxPool3d(output_size=(None, 1, 1)),
    # Merge axes 2..4, dropping the two singleton axes left by the pooling.
    nn.Flatten(start_dim=2, end_dim=4),
]
self.temporal_conv = nn.Sequential(*temporal_layers)

def get_sims(self, embs, temperature):
    """Return the softmax-normalised self-similarity matrix of ``embs``.

    Pairwise Euclidean distances between the vectors in ``embs`` are squared,
    negated and divided by ``temperature``; a softmax over the last axis then
    turns every row into weights that sum to one.

    Args:
        embs: Embeddings accepted by ``torch.cdist``, e.g. shape ``(N, D, E)``
            (assumed: batch, sequence length, embedding size — confirm with
            the caller).
        temperature: Divisor applied to the negated squared distances; smaller
            values give sharper rows.

    Returns:
        Tensor with the pairwise shape produced by ``torch.cdist`` — e.g.
        ``(N, D, D)`` for the input above.
    """
    pairwise_dist = torch.cdist(embs, embs)
    logits = -(pairwise_dist ** 2) / temperature
    return torch.softmax(logits, dim=-1)

# Counting module (self-similarity > conv > transformer > classifier).
# First stage: a 2D convolution followed by ReLU.
similarity_conv = nn.Conv2d(
    1,  # one input channel — assumes the self-similarity matrix is fed as a single-channel map
    self.conv_channels,
    self.conv_kernel_size,
    padding="same",
)
self.conv_3x3_layer = nn.Sequential(similarity_conv, nn.ReLU(inplace=True))

class TransformerHead(nn.Module):
    """Prediction head: one transformer layer followed by a three-layer MLP."""

    class TransformerLayer(nn.Module):
        """Linear projection, learned positional offsets, then one encoder layer."""

        def __init__(
            self, in_features: int, n_head: int, out_features: int, num_frames: int
        ):
            """Build the projection, the positional parameter and the encoder layer.

            Args:
                in_features: Size of the last axis of the input.
                n_head: Number of attention heads.
                out_features: Model width after the projection; also used as
                    the feed-forward width of the encoder layer.
                num_frames: Sequence length covered by the positional parameter.
            """
            super().__init__()
            self.input_projection = nn.Linear(in_features, out_features)
            # Learned positional offsets, drawn from N(0, 0.02^2) at init;
            # the leading axis of size 1 broadcasts over the batch.
            initial_positions = torch.normal(
                mean=0, std=0.02, size=(1, num_frames, out_features)
            )
            self.pos_encoding = nn.Parameter(initial_positions)
            # Pre-norm (norm_first=True) encoder layer operating on
            # batch-first tensors.
            self.transformer_layer = nn.TransformerEncoderLayer(
                out_features,
                n_head,
                dim_feedforward=out_features,
                activation="relu",
                layer_norm_eps=1e-6,
                batch_first=True,
                norm_first=True,
            )

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            """Project ``x``, add the positional offsets and run the encoder layer.

            ``x`` is expected as ``(N, num_frames, in_features)`` so that the
            projected tensor broadcasts against the positional parameter; the
            result is ``(N, num_frames, out_features)``.
            """
            projected = self.input_projection(x)
            return self.transformer_layer(projected + self.pos_encoding)

    def __init__(
        self,
        in_features: int,
        n_head: int,
        hidden_features: int,
        out_features: int,
        num_frames: int,
    ):
        """Assemble the transformer layer and the MLP into ``predictor_head``.

        Args:
            in_features: Size of the last axis of the input.
            n_head: Number of attention heads of the transformer layer.
            hidden_features: Width of the transformer layer and hidden MLP layers.
            out_features: Size of the last axis of the output.
            num_frames: Sequence length expected by the transformer layer.
        """
        super().__init__()

        # Stages are created in execution order: transformer layer first,
        # then two hidden Linear+ReLU pairs and the output Linear.
        stages = [
            self.TransformerLayer(in_features, n_head, hidden_features, num_frames),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_features, out_features),
        ]
        self.predictor_head = nn.Sequential(*stages)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through the full head and return its output."""
        return self.predictor_head(x)

    # Forward pass
    period_length_pred, periodicity_pred, final_embs = model(frames)

    # --- Period-length loss (per-frame classification) ---
    # A ground-truth period length p becomes class index p - 2, clamped into
    # [0, model.num_frames // 2 - 1]. With num_frames == 64 this maps period
    # lengths 2-33 onto class indices 0-31.
    period_length_classes = torch.clamp(
        period_length_gt - 2, min=0, max=model.num_frames // 2 - 1
    ).long()

    # Frames with no periodicity (period_length_gt == 0) are assigned class 0
    # (period length 2). The clamp above already yields 0 for them; the
    # explicit assignment only documents the intent.
    # NOTE(review): these frames still contribute to period_loss as class 0 —
    # confirm this is intended rather than masking them out of the loss.
    non_periodic_mask = period_length_gt == 0
    period_length_classes[non_periodic_mask] = 0

    # period_length_pred shape: (batch_size, num_frames, num_classes)
    # period_length_classes shape: (batch_size, num_frames)
    batch_size, num_frames, num_classes = period_length_pred.shape
    # reshape (rather than view) also works when the tensors are not
    # contiguous in memory, where view would raise.
    period_loss = period_length_criterion(
        period_length_pred.reshape(-1, num_classes),  # (batch_size * num_frames, num_classes)
        period_length_classes.reshape(-1),  # (batch_size * num_frames,)
    )

    # --- Periodicity loss (per-frame binary classification) ---
    periodicity_logits = periodicity_pred.squeeze(-1)
    # Probabilities are still exposed under their original name for any
    # later use.
    periodicity_pred_sigmoid = torch.sigmoid(periodicity_logits)
    # The loss is computed from the raw logits: the fused sigmoid + BCE is
    # numerically more stable than applying sigmoid and BCE separately, and
    # it is safe under autocast.
    # Assumes periodicity_gt is a float tensor shaped like the logits — TODO confirm.
    periodicity_loss = F.binary_cross_entropy_with_logits(
        periodicity_logits, periodicity_gt
    )

Computer Vision - Proyecto Final: RepNet

Introducción

Repeticiones

Objetivos

Retos/Dificultades

Acercamiento al problema

Predicciones

Descripción general de la arquitectura

El Dataset (Countix)

Datasets:

Arquitectura del modelo

Encoder

Self-similarity Matrix

Predictor

Entrenamiento

Datos sintéticos

Camera Motion Augmentation

Función de pérdida

Evaluación

Inferencia

Conclusiones

Futuras mejoras potenciales

¡Gracias!