feat(skills): add pytorch-patterns skill

2026-03-31 17:41:36 +08:00 · 2026-03-31 17:41:36 +08:00 · 06282c7314
commit 06282c7314
parent cd6e1ebd27
1 changed files with 528 additions and 0 deletions
--- a/.opencode/skills/pytorch-patterns/SKILL.md
+++ b/.opencode/skills/pytorch-patterns/SKILL.md
@@ -0,0 +1,528 @@
+---
+name: pytorch-patterns
+description: Use when writing PyTorch code to follow best practices and common patterns
+---
+
+# PyTorch Patterns Skill
+
+## Overview
+
+This skill provides best practices and common patterns for writing PyTorch code. Use this when implementing neural networks, training loops, data loading, and related deep learning infrastructure.
+
+**Announce:** "I'm using the pytorch-patterns skill for best practice code."
+
+---
+
+## Model Definition
+
+### Basic nn.Module Pattern
+
+```python
+from dataclasses import dataclass
+from typing import NamedTuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class ModelConfig:
+    """Configuration for the model."""
+    def __init__(
+        self,
+        input_dim: int = 768,
+        hidden_dim: int = 512,
+        output_dim: int = 10,
+        dropout: float = 0.1,
+        num_layers: int = 2,
+    ):
+        self.input_dim = input_dim
+        self.hidden_dim = hidden_dim
+        self.output_dim = output_dim
+        self.dropout = dropout
+        self.num_layers = num_layers
+
+
+class ModelOutput(NamedTuple):
+    """Typed output container for model forward pass.
+
+    As a ``NamedTuple`` it supports both attribute access (``out.logits``)
+    and positional unpacking, in the field order declared below.
+    """
+    # Result of the model's output projection.
+    logits: torch.Tensor
+    # Hidden representation that was fed into the output projection.
+    hidden_states: torch.Tensor
+    # Optional attention weights; defaults to None when not provided.
+    attention_weights: torch.Tensor | None = None
+
+
+class MyModel(nn.Module):
+    """Example model following best practices."""
+    
+    def __init__(self, config: ModelConfig):
+        super().__init__()
+        self.config = config
+        
+        # Define layers
+        self.input_proj = nn.Linear(config.input_dim, config.hidden_dim)
+        self.layers = nn.ModuleList([
+            nn.Linear(config.hidden_dim, config.hidden_dim)
+            for _ in range(config.num_layers)
+        ])
+        self.output_proj = nn.Linear(config.hidden_dim, config.output_dim)
+        self.dropout = nn.Dropout(config.dropout)
+        self.layer_norm = nn.LayerNorm(config.hidden_dim)
+        
+        # Initialize weights
+        self._init_weights()
+    
+    def _init_weights(self):
+        """Initialize model weights with appropriate schemes."""
+        for module in self.modules():
+            if isinstance(module, nn.Linear):
+                nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.zeros_(module.bias)
+            elif isinstance(module, nn.LayerNorm):
+                nn.init.ones_(module.weight)
+                nn.init.zeros_(module.bias)
+    
+    @property
+    def device(self) -> torch.device:
+        """Get the device of model parameters."""
+        return next(self.parameters()).device
+    
+    def forward(self, x: torch.Tensor) -> ModelOutput:
+        """Forward pass with typed output."""
+        # Input projection
+        hidden = self.input_proj(x)
+        hidden = self.layer_norm(hidden)
+        hidden = F.gelu(hidden)
+        
+        # Process through layers
+        for layer in self.layers:
+            residual = hidden
+            hidden = layer(hidden)
+            hidden = self.dropout(hidden)
+            hidden = F.gelu(hidden)
+            hidden = hidden + residual  # Residual connection
+        
+        # Output projection
+        logits = self.output_proj(hidden)
+        
+        return ModelOutput(
+            logits=logits,
+            hidden_states=hidden,
+            attention_weights=None,
+        )
+```
+
+---
+
+## Device Management
+
+### Device Property Pattern
+
+```python
+class DeviceAwareModule(nn.Module):
+    """Module with device awareness."""
+    
+    @property
+    def device(self) -> torch.device:
+        """Infer device from model parameters."""
+        return next(self.parameters()).device
+    
+    @property
+    def dtype(self) -> torch.dtype:
+        """Infer dtype from model parameters."""
+        return next(self.parameters()).dtype
+```
+
+### Training Script Device Setup
+
+```python
+def get_device() -> torch.device:
+    """Get the best available device."""
+    if torch.cuda.is_available():
+        device = torch.device("cuda")
+        print(f"Using CUDA: {torch.cuda.get_device_name(0)}")
+    elif torch.backends.mps.is_available():
+        device = torch.device("mps")
+        print("Using Apple MPS")
+    else:
+        device = torch.device("cpu")
+        print("Using CPU")
+    return device
+
+
+# Usage
+device = get_device()
+model = MyModel(config).to(device)
+```
+
+---
+
+## Training Loop
+
+### Standard Training Epoch
+
+```python
+def train_epoch(
+    model: nn.Module,
+    dataloader: torch.utils.data.DataLoader,
+    optimizer: torch.optim.Optimizer,
+    criterion: nn.Module,
+    device: torch.device,
+    max_grad_norm: float | None = 1.0,
+) -> dict[str, float]:
+    """Train for one epoch."""
+    model.train()
+    total_loss = 0.0
+    num_batches = 0
+    
+    for batch_idx, (inputs, targets) in enumerate(dataloader):
+        # Move to device
+        inputs = inputs.to(device)
+        targets = targets.to(device)
+        
+        # Forward pass
+        optimizer.zero_grad()
+        outputs = model(inputs)
+        loss = criterion(outputs.logits, targets)
+        
+        # Backward pass
+        loss.backward()
+        
+        # Gradient clipping (optional but recommended)
+        if max_grad_norm is not None:
+            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
+        
+        # Update weights
+        optimizer.step()
+        
+        # Accumulate metrics (use .item() to prevent memory leak)
+        total_loss += loss.item()
+        num_batches += 1
+    
+    return {
+        "train_loss": total_loss / num_batches,
+    }
+```
+
+### Evaluation Function
+
+```python
+@torch.no_grad()
+def evaluate(
+    model: nn.Module,
+    dataloader: torch.utils.data.DataLoader,
+    criterion: nn.Module,
+    device: torch.device,
+) -> dict[str, float]:
+    """Evaluate the model."""
+    model.eval()
+    total_loss = 0.0
+    correct = 0
+    total = 0
+    
+    for inputs, targets in dataloader:
+        inputs = inputs.to(device)
+        targets = targets.to(device)
+        
+        outputs = model(inputs)
+        loss = criterion(outputs.logits, targets)
+        
+        total_loss += loss.item()
+        
+        # Calculate accuracy
+        predictions = outputs.logits.argmax(dim=-1)
+        correct += (predictions == targets).sum().item()
+        total += targets.size(0)
+    
+    return {
+        "eval_loss": total_loss / len(dataloader),
+        "accuracy": correct / total,
+    }
+```
+
+---
+
+## Data Loading
+
+### Custom Dataset Pattern
+
+```python
+from torch.utils.data import Dataset, DataLoader
+
+
+class CustomDataset(Dataset):
+    """Example custom dataset."""
+    
+    def __init__(self, data: list, transform=None):
+        self.data = data
+        self.transform = transform
+    
+    def __len__(self) -> int:
+        return len(self.data)
+    
+    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
+        item = self.data[idx]
+        features = torch.tensor(item["features"], dtype=torch.float32)
+        label = torch.tensor(item["label"], dtype=torch.long)
+        
+        if self.transform:
+            features = self.transform(features)
+        
+        return features, label
+```
+
+### DataLoader Helper
+
+```python
+def get_dataloader(
+    dataset: Dataset,
+    batch_size: int = 32,
+    shuffle: bool = True,
+    num_workers: int = 4,
+    pin_memory: bool = True,
+    drop_last: bool = False,
+) -> DataLoader:
+    """Create a DataLoader with sensible defaults."""
+    # Adjust for Windows/macOS compatibility
+    if num_workers > 0:
+        import platform
+        if platform.system() == "Windows":
+            num_workers = 0  # Windows has issues with multiprocessing
+    
+    return DataLoader(
+        dataset,
+        batch_size=batch_size,
+        shuffle=shuffle,
+        num_workers=num_workers,
+        pin_memory=pin_memory and torch.cuda.is_available(),
+        drop_last=drop_last,
+        persistent_workers=num_workers > 0,
+    )
+```
+
+---
+
+## Checkpointing
+
+### Save Checkpoint
+
+```python
+def save_checkpoint(
+    model: nn.Module,
+    optimizer: torch.optim.Optimizer,
+    epoch: int,
+    loss: float,
+    path: str,
+    scheduler: torch.optim.lr_scheduler._LRScheduler | None = None,
+    **kwargs,
+) -> None:
+    """Save a training checkpoint."""
+    checkpoint = {
+        "epoch": epoch,
+        "model_state_dict": model.state_dict(),
+        "optimizer_state_dict": optimizer.state_dict(),
+        "loss": loss,
+    }
+    
+    if scheduler is not None:
+        checkpoint["scheduler_state_dict"] = scheduler.state_dict()
+    
+    # Add any additional data
+    checkpoint.update(kwargs)
+    
+    torch.save(checkpoint, path)
+    print(f"Checkpoint saved to {path}")
+```
+
+### Load Checkpoint
+
+```python
+def load_checkpoint(
+    path: str,
+    model: nn.Module,
+    optimizer: torch.optim.Optimizer | None = None,
+    scheduler: torch.optim.lr_scheduler._LRScheduler | None = None,
+    device: torch.device | None = None,
+) -> dict:
+    """Load a training checkpoint."""
+    # Use weights_only=True for security (prevents arbitrary code execution)
+    checkpoint = torch.load(path, map_location=device, weights_only=True)
+    
+    model.load_state_dict(checkpoint["model_state_dict"])
+    
+    if optimizer is not None and "optimizer_state_dict" in checkpoint:
+        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
+    
+    if scheduler is not None and "scheduler_state_dict" in checkpoint:
+        scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
+    
+    print(f"Loaded checkpoint from {path} (epoch {checkpoint.get('epoch', 'unknown')})")
+    return checkpoint
+```
+
+---
+
+## Reproducibility
+
+### Set Seed Function
+
+```python
+import random
+import numpy as np
+import torch
+
+
+def set_seed(seed: int = 42, deterministic: bool = True) -> None:
+    """Set random seeds for reproducibility."""
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)  # For multi-GPU
+    
+    if deterministic:
+        # Makes operations deterministic but may reduce performance
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+    else:
+        # Better performance but non-deterministic
+        torch.backends.cudnn.deterministic = False
+        torch.backends.cudnn.benchmark = True
+
+
+# Usage at start of training script
+set_seed(42)
+```
+
+---
+
+## Common Gotchas
+
+### 1. In-Place Operations Breaking Autograd
+
+```python
+# BAD - In-place operation can break autograd graph
+def bad_forward(x):
+    """Anti-pattern: increment ``x`` in place, then return it doubled."""
+    x += 1  # In-place modification: for a tensor this also changes the caller's object
+    return x * 2
+
+# GOOD - Create new tensor
+def good_forward(x):
+    """Return ``(x + 1) * 2`` without modifying the argument."""
+    x = x + 1  # Creates new tensor; only the local name is rebound
+    return x * 2
+```
+
+### 2. Memory Leaks from Not Detaching
+
+```python
+# BAD - Keeps computation graph in memory
+# Every stored loss tensor keeps its batch's graph alive, so memory use
+# grows with each iteration.
+losses = []
+for batch in dataloader:
+    loss = model(batch)
+    losses.append(loss)  # Holds entire graph!
+
+# GOOD - Detach with .item() for scalars
+# .item() converts the scalar tensor to a plain Python number.
+losses = []
+for batch in dataloader:
+    loss = model(batch)
+    losses.append(loss.item())  # Just the number, no graph
+
+# GOOD - Detach for non-scalar tensors
+# .detach() drops the graph reference; .cpu() stores the copy on the CPU.
+features = []
+for batch in dataloader:
+    feat = model.encode(batch)
+    features.append(feat.detach().cpu())  # Detached, moved to CPU
+```
+
+### 3. Mixed Precision Training
+
+```python
+from torch.cuda.amp import autocast, GradScaler
+
+
+def train_with_amp(
+    model: nn.Module,
+    dataloader: DataLoader,
+    optimizer: torch.optim.Optimizer,
+    criterion: nn.Module,
+    device: torch.device,
+) -> float:
+    """Training with Automatic Mixed Precision."""
+    model.train()
+    scaler = GradScaler()
+    total_loss = 0.0
+    
+    for inputs, targets in dataloader:
+        inputs = inputs.to(device)
+        targets = targets.to(device)
+        
+        optimizer.zero_grad()
+        
+        # Forward pass with autocast
+        with autocast():
+            outputs = model(inputs)
+            loss = criterion(outputs.logits, targets)
+        
+        # Backward pass with scaled gradients
+        scaler.scale(loss).backward()
+        
+        # Unscale before clipping
+        scaler.unscale_(optimizer)
+        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+        
+        # Step with scaler
+        scaler.step(optimizer)
+        scaler.update()
+        
+        total_loss += loss.item()
+    
+    return total_loss / len(dataloader)
+```
+
+### 4. Forgetting model.eval() and model.train()
+
+```python
+# BAD - Dropout/BatchNorm behave incorrectly
+def evaluate_bad(model, dataloader):
+    """Anti-pattern: run inference without switching the model to eval mode."""
+    # model.train() is still active!
+    # There is also no torch.no_grad() here, so autograd still records each forward pass.
+    for batch in dataloader:
+        output = model(batch)
+
+# GOOD - Always set mode explicitly
+def evaluate_good(model, dataloader):
+    model.eval()  # Disable dropout, use running stats for BatchNorm
+    with torch.no_grad():
+        for batch in dataloader:
+            output = model(batch)
+    model.train()  # Restore training mode if continuing
+```
+
+### 5. Proper Tensor Creation on Device
+
+```python
+# BAD - Creates on CPU then moves (slow)
+# Allocates on the CPU first, then copies to the target device.
+tensor = torch.zeros(100, 100).to(device)
+
+# GOOD - Create directly on device
+tensor = torch.zeros(100, 100, device=device)
+
+# GOOD - Create with same device/dtype as reference
+# zeros_like copies shape, dtype and device from the reference tensor.
+tensor = torch.zeros_like(reference_tensor)
+# model.device / model.dtype rely on properties like those in the Device Property Pattern.
+# Note: torch.empty returns uninitialized memory; fill it before reading.
+tensor = torch.empty(100, 100, device=model.device, dtype=model.dtype)
+```
+
+---
+
+## Quick Reference Checklist
+
+- [ ] Use `nn.Module` with proper `__init__` and `forward`
+- [ ] Initialize weights with `_init_weights` method
+- [ ] Expose a `device` property that infers the device from the model's parameters
+- [ ] Always use `.item()` when accumulating or logging scalar losses, so the autograd graph is not kept alive
+- [ ] Use `@torch.no_grad()` decorator for evaluation
+- [ ] Call `model.train()` and `model.eval()` explicitly
+- [ ] Use `weights_only=True` when loading checkpoints
+- [ ] Set seeds for reproducibility at script start
+- [ ] Avoid in-place operations in forward pass
+- [ ] Use mixed precision for faster training on GPU
+- [ ] Clip gradients to prevent exploding gradients
+- [ ] Use `pin_memory=True` with CUDA for faster data transfer