feat(skills): add pytorch-patterns skill

2026-03-31 17:41:36 +08:00 · 2026-03-31 17:41:36 +08:00 · 06282c7314
commit 06282c7314
parent cd6e1ebd27
1 changed files with 528 additions and 0 deletions
--- a/.opencode/skills/pytorch-patterns/SKILL.md
+++ b/.opencode/skills/pytorch-patterns/SKILL.md
@ -0,0 +1,528 @@
 ---
 name: pytorch-patterns
 description: Use when writing PyTorch code to follow best practices and common patterns
 ---
 # PyTorch Patterns Skill
 ## Overview
 This skill provides best practices and common patterns for writing PyTorch code. Use this when implementing neural networks, training loops, data loading, and related deep learning infrastructure.
 **Announce:** "I'm using the pytorch-patterns skill for best practice code."
 ---
 ## Model Definition
 ### Basic nn.Module Pattern
 ```python
 from typing import NamedTuple
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 class ModelConfig:
    """Configuration for the model."""
    def __init__(
        self,
        input_dim: int = 768,
        hidden_dim: int = 512,
        output_dim: int = 10,
        dropout: float = 0.1,
        num_layers: int = 2,
    ):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.dropout = dropout
        self.num_layers = num_layers
 class ModelOutput(NamedTuple):
    """Typed output container for model forward pass."""
    logits: torch.Tensor
    hidden_states: torch.Tensor
    attention_weights: torch.Tensor | None = None
 class MyModel(nn.Module):
    """Example model following best practices."""
    def __init__(self, config: ModelConfig):
        super().__init__()
        self.config = config
        # Define layers
        self.input_proj = nn.Linear(config.input_dim, config.hidden_dim)
        self.layers = nn.ModuleList([
            nn.Linear(config.hidden_dim, config.hidden_dim)
            for _ in range(config.num_layers)
        ])
        self.output_proj = nn.Linear(config.hidden_dim, config.output_dim)
        self.dropout = nn.Dropout(config.dropout)
        self.layer_norm = nn.LayerNorm(config.hidden_dim)
        # Initialize weights
        self._init_weights()
    def _init_weights(self):
        """Initialize model weights with appropriate schemes."""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.LayerNorm):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
    @property
    def device(self) -> torch.device:
        """Get the device of model parameters."""
        return next(self.parameters()).device
    def forward(self, x: torch.Tensor) -> ModelOutput:
        """Forward pass with typed output."""
        # Input projection
        hidden = self.input_proj(x)
        hidden = self.layer_norm(hidden)
        hidden = F.gelu(hidden)
        # Process through layers
        for layer in self.layers:
            residual = hidden
            hidden = layer(hidden)
            hidden = self.dropout(hidden)
            hidden = F.gelu(hidden)
            hidden = hidden + residual  # Residual connection
        # Output projection
        logits = self.output_proj(hidden)
        return ModelOutput(
            logits=logits,
            hidden_states=hidden,
            attention_weights=None,
        )
 ```
 ---
 ## Device Management
 ### Device Property Pattern
 ```python
 class DeviceAwareModule(nn.Module):
    """Module with device awareness."""
    @property
    def device(self) -> torch.device:
        """Infer device from model parameters."""
        return next(self.parameters()).device
    @property
    def dtype(self) -> torch.dtype:
        """Infer dtype from model parameters."""
        return next(self.parameters()).dtype
 ```
 ### Training Script Device Setup
 ```python
 def get_device() -> torch.device:
    """Get the best available device."""
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"Using CUDA: {torch.cuda.get_device_name(0)}")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Using Apple MPS")
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device
 # Usage
 device = get_device()
 model = MyModel(config).to(device)
 ```
 ---
 ## Training Loop
 ### Standard Training Epoch
 ```python
 def train_epoch(
    model: nn.Module,
    dataloader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: nn.Module,
    device: torch.device,
    max_grad_norm: float | None = 1.0,
 ) -> dict[str, float]:
    """Train for one epoch."""
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        # Move to device
        inputs = inputs.to(device)
        targets = targets.to(device)
        # Forward pass
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.logits, targets)
        # Backward pass
        loss.backward()
        # Gradient clipping (optional but recommended)
        if max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        # Update weights
        optimizer.step()
        # Accumulate metrics (use .item() to prevent memory leak)
        total_loss += loss.item()
        num_batches += 1
    return {
        "train_loss": total_loss / num_batches,
    }
 ```
 ### Evaluation Function
 ```python
@torch.no_grad()
 def evaluate(
    model: nn.Module,
    dataloader: torch.utils.data.DataLoader,
    criterion: nn.Module,
    device: torch.device,
 ) -> dict[str, float]:
    """Evaluate the model."""
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = model(inputs)
        loss = criterion(outputs.logits, targets)
        total_loss += loss.item()
        # Calculate accuracy
        predictions = outputs.logits.argmax(dim=-1)
        correct += (predictions == targets).sum().item()
        total += targets.size(0)
    return {
        "eval_loss": total_loss / len(dataloader),
        "accuracy": correct / total,
    }
 ```
 ---
 ## Data Loading
 ### Custom Dataset Pattern
 ```python
 from torch.utils.data import Dataset, DataLoader
 class CustomDataset(Dataset):
    """Example custom dataset."""
    def __init__(self, data: list, transform=None):
        self.data = data
        self.transform = transform
    def __len__(self) -> int:
        return len(self.data)
    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        item = self.data[idx]
        features = torch.tensor(item["features"], dtype=torch.float32)
        label = torch.tensor(item["label"], dtype=torch.long)
        if self.transform:
            features = self.transform(features)
        return features, label
 ```
 ### DataLoader Helper
 ```python
 def get_dataloader(
    dataset: Dataset,
    batch_size: int = 32,
    shuffle: bool = True,
    num_workers: int = 4,
    pin_memory: bool = True,
    drop_last: bool = False,
 ) -> DataLoader:
    """Create a DataLoader with sensible defaults."""
    # Adjust for Windows/macOS compatibility
    if num_workers > 0:
        import platform
        if platform.system() == "Windows":
            num_workers = 0  # Windows has issues with multiprocessing
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory and torch.cuda.is_available(),
        drop_last=drop_last,
        persistent_workers=num_workers > 0,
    )
 ```
 ---
 ## Checkpointing
 ### Save Checkpoint
 ```python
 def save_checkpoint(
    model: nn.Module,
    optimizer: torch.optim.Optimizer,
    epoch: int,
    loss: float,
    path: str,
    scheduler: torch.optim.lr_scheduler._LRScheduler | None = None,
    **kwargs,
 ) -> None:
    """Save a training checkpoint."""
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "loss": loss,
    }
    if scheduler is not None:
        checkpoint["scheduler_state_dict"] = scheduler.state_dict()
    # Add any additional data
    checkpoint.update(kwargs)
    torch.save(checkpoint, path)
    print(f"Checkpoint saved to {path}")
 ```
 ### Load Checkpoint
 ```python
 def load_checkpoint(
    path: str,
    model: nn.Module,
    optimizer: torch.optim.Optimizer | None = None,
    scheduler: torch.optim.lr_scheduler._LRScheduler | None = None,
    device: torch.device | None = None,
 ) -> dict:
    """Load a training checkpoint."""
    # Use weights_only=True for security (prevents arbitrary code execution)
    checkpoint = torch.load(path, map_location=device, weights_only=True)
    model.load_state_dict(checkpoint["model_state_dict"])
    if optimizer is not None and "optimizer_state_dict" in checkpoint:
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    if scheduler is not None and "scheduler_state_dict" in checkpoint:
        scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
    print(f"Loaded checkpoint from {path} (epoch {checkpoint.get('epoch', 'unknown')})")
    return checkpoint
 ```
 ---
 ## Reproducibility
 ### Set Seed Function
 ```python
 import random
 import numpy as np
 import torch
 def set_seed(seed: int = 42, deterministic: bool = True) -> None:
    """Set random seeds for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # For multi-GPU
    if deterministic:
        # Makes operations deterministic but may reduce performance
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    else:
        # Better performance but non-deterministic
        torch.backends.cudnn.deterministic = False
        torch.backends.cudnn.benchmark = True
 # Usage at start of training script
 set_seed(42)
 ```
 ---
 ## Common Gotchas
 ### 1. In-Place Operations Breaking Autograd
 ```python
 # BAD - In-place operation can break autograd graph
 def bad_forward(x):
    x += 1  # In-place modification
    return x * 2
 # GOOD - Create new tensor
 def good_forward(x):
    x = x + 1  # Creates new tensor
    return x * 2
 ```
 ### 2. Memory Leaks from Not Detaching
 ```python
 # BAD - Keeps computation graph in memory
 losses = []
 for batch in dataloader:
    loss = model(batch)
    losses.append(loss)  # Holds entire graph!
 # GOOD - Detach with .item() for scalars
 losses = []
 for batch in dataloader:
    loss = model(batch)
    losses.append(loss.item())  # Just the number, no graph
 # GOOD - Detach for non-scalar tensors
 features = []
 for batch in dataloader:
    feat = model.encode(batch)
    features.append(feat.detach().cpu())  # Detached, moved to CPU
 ```
 ### 3. Mixed Precision Training
 ```python
 from torch.cuda.amp import autocast, GradScaler
 def train_with_amp(
    model: nn.Module,
    dataloader: DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: nn.Module,
    device: torch.device,
 ) -> float:
    """Training with Automatic Mixed Precision."""
    model.train()
    scaler = GradScaler()
    total_loss = 0.0
    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        # Forward pass with autocast
        with autocast():
            outputs = model(inputs)
            loss = criterion(outputs.logits, targets)
        # Backward pass with scaled gradients
        scaler.scale(loss).backward()
        # Unscale before clipping
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Step with scaler
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    return total_loss / len(dataloader)
 ```
 ### 4. Forgetting model.eval() and model.train()
 ```python
 # BAD - Dropout/BatchNorm behave incorrectly
 def evaluate_bad(model, dataloader):
    # model.train() is still active!
    for batch in dataloader:
        output = model(batch)
 # GOOD - Always set mode explicitly
 def evaluate_good(model, dataloader):
    model.eval()  # Disable dropout, use running stats for BatchNorm
    with torch.no_grad():
        for batch in dataloader:
            output = model(batch)
    model.train()  # Restore training mode if continuing
 ```
 ### 5. Proper Tensor Creation on Device
 ```python
 # BAD - Creates on CPU then moves (slow)
 tensor = torch.zeros(100, 100).to(device)
 # GOOD - Create directly on device
 tensor = torch.zeros(100, 100, device=device)
 # GOOD - Create with same device/dtype as reference
 tensor = torch.zeros_like(reference_tensor)
 tensor = torch.empty(100, 100, device=model.device, dtype=model.dtype)
 ```
 ---
 ## Quick Reference Checklist
 - [ ] Use `nn.Module` with proper `__init__` and `forward`
 - [ ] Initialize weights with `_init_weights` method
 - [ ] Use `@property device` for device inference
 - [ ] Always use `.item()` when logging scalar losses
 - [ ] Use `@torch.no_grad()` decorator for evaluation
 - [ ] Call `model.train()` and `model.eval()` explicitly
 - [ ] Use `weights_only=True` when loading checkpoints
 - [ ] Set seeds for reproducibility at script start
 - [ ] Avoid in-place operations in forward pass
 - [ ] Use mixed precision for faster training on GPU
 - [ ] Clip gradients to prevent exploding gradients
 - [ ] Use `pin_memory=True` with CUDA for faster data transfer