PaperTool/.opencode/skills/pytorch-patterns/SKILL.md

529 lines
14 KiB
Markdown

---
name: pytorch-patterns
description: Use when writing PyTorch code to follow best practices and common patterns
---
# PyTorch Patterns Skill
## Overview
This skill provides best practices and common patterns for writing PyTorch code. Use it when implementing neural networks, training loops, data loading, checkpointing, and related deep learning infrastructure.
**Announce:** "I'm using the pytorch-patterns skill for best practice code."
---
## Model Definition
### Basic nn.Module Pattern
```python
from dataclasses import dataclass
from typing import NamedTuple

import torch
import torch.nn as nn
import torch.nn.functional as F
@dataclass
class ModelConfig:
    """Hyperparameters for the example model.

    Declared as a dataclass so the config gets ``__repr__`` and ``__eq__``
    for free, which makes it easy to log and to compare in tests. Field
    order and defaults match the original positional signature.

    Attributes:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of the hidden representation.
        output_dim: Number of output logits.
        dropout: Dropout probability used inside the model.
        num_layers: Number of hidden layers.
    """

    input_dim: int = 768
    hidden_dim: int = 512
    output_dim: int = 10
    dropout: float = 0.1
    num_layers: int = 2
class ModelOutput(NamedTuple):
    """Typed output container for model forward pass.

    Being a ``NamedTuple``, the result can be read by field name or
    unpacked positionally in the order the fields are declared below.
    """

    # Raw scores produced by the model's output projection.
    logits: torch.Tensor
    # In MyModel this is the final hidden representation that is fed to the
    # output projection.
    hidden_states: torch.Tensor
    # Optional; defaults to None (MyModel always passes None).
    attention_weights: torch.Tensor | None = None
class MyModel(nn.Module):
    """Reference feed-forward network with residual hidden layers.

    The network projects the input to ``hidden_dim``, normalizes and
    activates it, runs it through ``num_layers`` residual blocks, and
    finally projects to ``output_dim`` logits.
    """

    def __init__(self, config: ModelConfig):
        super().__init__()
        self.config = config
        width = config.hidden_dim

        # Sub-modules are created in the order they are used in forward().
        self.input_proj = nn.Linear(config.input_dim, width)
        hidden_blocks = [nn.Linear(width, width) for _ in range(config.num_layers)]
        self.layers = nn.ModuleList(hidden_blocks)
        self.output_proj = nn.Linear(width, config.output_dim)
        self.dropout = nn.Dropout(config.dropout)
        self.layer_norm = nn.LayerNorm(width)

        # Replace PyTorch's default initialization with the scheme below.
        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform weights and zero biases for linears; unit/zero for norms."""
        for submodule in self.modules():
            if isinstance(submodule, nn.LayerNorm):
                nn.init.ones_(submodule.weight)
                nn.init.zeros_(submodule.bias)
                continue
            if not isinstance(submodule, nn.Linear):
                continue
            nn.init.xavier_uniform_(submodule.weight)
            if submodule.bias is not None:
                nn.init.zeros_(submodule.bias)

    @property
    def device(self) -> torch.device:
        """Device on which the first registered parameter lives."""
        first_param = next(self.parameters())
        return first_param.device

    def forward(self, x: torch.Tensor) -> ModelOutput:
        """Run the network and return logits together with the final hidden state."""
        hidden = F.gelu(self.layer_norm(self.input_proj(x)))

        for block in self.layers:
            # Linear -> dropout -> GELU, then add the block's input back.
            hidden = F.gelu(self.dropout(block(hidden))) + hidden

        return ModelOutput(
            logits=self.output_proj(hidden),
            hidden_states=hidden,
            attention_weights=None,
        )
```
---
## Device Management
### Device Property Pattern
```python
class DeviceAwareModule(nn.Module):
"""Module with device awareness."""
@property
def device(self) -> torch.device:
"""Infer device from model parameters."""
return next(self.parameters()).device
@property
def dtype(self) -> torch.dtype:
"""Infer dtype from model parameters."""
return next(self.parameters()).dtype
```
### Training Script Device Setup
```python
def get_device() -> torch.device:
    """Pick CUDA if present, otherwise Apple MPS, otherwise the CPU.

    Prints which backend was selected before returning it.
    """
    if torch.cuda.is_available():
        print(f"Using CUDA: {torch.cuda.get_device_name(0)}")
        return torch.device("cuda")

    if torch.backends.mps.is_available():
        print("Using Apple MPS")
        return torch.device("mps")

    print("Using CPU")
    return torch.device("cpu")
# Usage: pick the device once, build the model, then move it there.
device = get_device()
config = ModelConfig()  # defaults; override fields as needed
model = MyModel(config).to(device)
```
---
## Training Loop
### Standard Training Epoch
```python
def train_epoch(
model: nn.Module,
dataloader: torch.utils.data.DataLoader,
optimizer: torch.optim.Optimizer,
criterion: nn.Module,
device: torch.device,
max_grad_norm: float | None = 1.0,
) -> dict[str, float]:
"""Train for one epoch."""
model.train()
total_loss = 0.0
num_batches = 0
for batch_idx, (inputs, targets) in enumerate(dataloader):
# Move to device
inputs = inputs.to(device)
targets = targets.to(device)
# Forward pass
optimizer.zero_grad()
outputs = model(inputs)
loss = criterion(outputs.logits, targets)
# Backward pass
loss.backward()
# Gradient clipping (optional but recommended)
if max_grad_norm is not None:
torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
# Update weights
optimizer.step()
# Accumulate metrics (use .item() to prevent memory leak)
total_loss += loss.item()
num_batches += 1
return {
"train_loss": total_loss / num_batches,
}
```
### Evaluation Function
```python
@torch.no_grad()
def evaluate(
    model: nn.Module,
    dataloader: torch.utils.data.DataLoader,
    criterion: nn.Module,
    device: torch.device,
) -> dict[str, float]:
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Network whose forward pass returns an object with a
            ``logits`` attribute.
        dataloader: Iterable yielding ``(inputs, targets)`` batches. It does
            not need to support ``len()``.
        criterion: Loss called as ``criterion(logits, targets)``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``{"eval_loss": mean_batch_loss, "accuracy": correct / total}``.
        Both values are ``nan`` when no batches (or no samples) were seen.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    correct = 0
    total = 0

    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = model(inputs)

        total_loss += criterion(outputs.logits, targets).item()
        # Count batches explicitly rather than relying on len(dataloader),
        # which is undefined for generators and iterable-style datasets.
        num_batches += 1

        # Accuracy: highest-scoring class along the last dimension.
        predictions = outputs.logits.argmax(dim=-1)
        correct += (predictions == targets).sum().item()
        total += targets.size(0)

    nan = float("nan")
    return {
        "eval_loss": total_loss / num_batches if num_batches else nan,
        "accuracy": correct / total if total else nan,
    }
```
---
## Data Loading
### Custom Dataset Pattern
```python
from torch.utils.data import Dataset, DataLoader
class CustomDataset(Dataset):
    """Map-style dataset over a list of ``{"features": ..., "label": ...}`` records.

    Args:
        data: Indexable collection of mappings with ``"features"`` and
            ``"label"`` keys.
        transform: Optional callable applied to the feature tensor.
    """

    def __init__(self, data: list, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Return ``(features, label)`` for record ``idx`` as float32 / int64 tensors."""
        item = self.data[idx]
        features = torch.tensor(item["features"], dtype=torch.float32)
        label = torch.tensor(item["label"], dtype=torch.long)
        # Compare against None explicitly: a callable that defines __len__
        # or __bool__ can be falsy and would be skipped by a truth test.
        if self.transform is not None:
            features = self.transform(features)
        return features, label
```
### DataLoader Helper
```python
def get_dataloader(
    dataset: Dataset,
    batch_size: int = 32,
    shuffle: bool = True,
    num_workers: int = 4,
    pin_memory: bool = True,
    drop_last: bool = False,
) -> DataLoader:
    """Build a DataLoader whose worker and pinning settings suit the host.

    Worker processes are disabled on Windows, pinned memory is requested
    only when CUDA is available, and workers are kept alive between epochs
    whenever any are used.
    """
    if num_workers > 0:
        import platform

        # Windows has issues with multiprocessing, so load in the main process.
        num_workers = 0 if platform.system() == "Windows" else num_workers

    use_pinned = pin_memory and torch.cuda.is_available()
    keep_workers_alive = num_workers > 0
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "num_workers": num_workers,
        "pin_memory": use_pinned,
        "drop_last": drop_last,
        "persistent_workers": keep_workers_alive,
    }
    return DataLoader(dataset, **loader_options)
```
---
## Checkpointing
### Save Checkpoint
```python
def save_checkpoint(
model: nn.Module,
optimizer: torch.optim.Optimizer,
epoch: int,
loss: float,
path: str,
scheduler: torch.optim.lr_scheduler._LRScheduler | None = None,
**kwargs,
) -> None:
"""Save a training checkpoint."""
checkpoint = {
"epoch": epoch,
"model_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
"loss": loss,
}
if scheduler is not None:
checkpoint["scheduler_state_dict"] = scheduler.state_dict()
# Add any additional data
checkpoint.update(kwargs)
torch.save(checkpoint, path)
print(f"Checkpoint saved to {path}")
```
### Load Checkpoint
```python
def load_checkpoint(
path: str,
model: nn.Module,
optimizer: torch.optim.Optimizer | None = None,
scheduler: torch.optim.lr_scheduler._LRScheduler | None = None,
device: torch.device | None = None,
) -> dict:
"""Load a training checkpoint."""
# Use weights_only=True for security (prevents arbitrary code execution)
checkpoint = torch.load(path, map_location=device, weights_only=True)
model.load_state_dict(checkpoint["model_state_dict"])
if optimizer is not None and "optimizer_state_dict" in checkpoint:
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
if scheduler is not None and "scheduler_state_dict" in checkpoint:
scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
print(f"Loaded checkpoint from {path} (epoch {checkpoint.get('epoch', 'unknown')})")
return checkpoint
```
---
## Reproducibility
### Set Seed Function
```python
import random
import numpy as np
import torch
def set_seed(seed: int = 42, deterministic: bool = True) -> None:
    """Seed the Python, NumPy and PyTorch RNGs; configure cuDNN when CUDA exists.

    With ``deterministic=True`` cuDNN is asked for deterministic kernels and
    autotuning is switched off; with ``False`` the opposite is set.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # For multi-GPU

    # Deterministic kernels may reduce performance; benchmark mode is
    # faster but non-deterministic, so the two flags are always opposites.
    want_deterministic = bool(deterministic)
    torch.backends.cudnn.deterministic = want_deterministic
    torch.backends.cudnn.benchmark = not want_deterministic


# Usage at start of training script
set_seed(42)
```
---
## Common Gotchas
### 1. In-Place Operations Breaking Autograd
```python
# BAD - In-place operation can break autograd graph
def bad_forward(x):
    """Anti-example: add 1 to ``x`` in place, then return the doubled result.

    Kept deliberately wrong to illustrate the pitfall; do not copy.
    """
    # For a tensor argument, += writes into the object the caller passed in
    # rather than producing a new tensor.
    x += 1  # In-place modification
    return x * 2
# GOOD - Out-of-place add leaves the caller's tensor untouched
def good_forward(x):
    """Return ``(x + 1) * 2`` without modifying ``x``."""
    shifted = x + 1  # Creates new tensor
    return shifted * 2
```
### 2. Memory Leaks from Not Detaching
```python
# BAD - Keeps computation graph in memory
# (assumes model(batch) returns a scalar loss tensor that requires grad)
losses = []
for batch in dataloader:
    loss = model(batch)
    # The stored tensor still references its autograd graph, so the graph
    # stays alive for as long as the list does.
    losses.append(loss)  # Holds entire graph!

# GOOD - Detach with .item() for scalars
losses = []
for batch in dataloader:
    loss = model(batch)
    # .item() converts the tensor to a plain Python number.
    losses.append(loss.item())  # Just the number, no graph

# GOOD - Detach for non-scalar tensors
features = []
for batch in dataloader:
    feat = model.encode(batch)
    # detach() drops the graph reference; cpu() moves the copy to the CPU.
    features.append(feat.detach().cpu())  # Detached, moved to CPU
```
### 3. Mixed Precision Training
```python
from torch.cuda.amp import autocast, GradScaler
def train_with_amp(
    model: nn.Module,
    dataloader: DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: nn.Module,
    device: torch.device,
) -> float:
    """Train for one epoch with Automatic Mixed Precision.

    AMP is enabled only when ``device`` is a CUDA device. On any other
    device the autocast context and the gradient scaler are disabled, so
    the same loop runs in full precision instead of failing when the
    scaler is handed non-CUDA tensors.

    Args:
        model: Network whose forward pass returns an object with a
            ``logits`` attribute.
        dataloader: Iterable yielding ``(inputs, targets)`` batches. It does
            not need to support ``len()``.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, targets)``.
        device: Device each batch is moved to.

    Returns:
        Mean batch loss, or ``nan`` if the dataloader yielded no batches.
    """
    model.train()
    use_amp = device.type == "cuda"
    # NOTE: on PyTorch >= 2.4 the preferred spelling is
    # torch.amp.GradScaler("cuda", enabled=use_amp).
    scaler = GradScaler(enabled=use_amp)
    total_loss = 0.0
    num_batches = 0

    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()

        # Forward pass with autocast (a no-op when use_amp is False).
        with torch.autocast(device_type="cuda", enabled=use_amp):
            outputs = model(inputs)
            loss = criterion(outputs.logits, targets)

        # Backward pass with scaled gradients
        scaler.scale(loss).backward()

        # Unscale before clipping so the threshold applies to true gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Step with scaler
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches if num_batches else float("nan")
```
### 4. Forgetting model.eval() and model.train()
```python
# BAD - Dropout/BatchNorm behave incorrectly
def evaluate_bad(model, dataloader):
    """Anti-example: run the model over the data without switching to eval mode."""
    # model.train() is still active!
    # Neither model.eval() nor torch.no_grad() is used here, so the model
    # runs in whatever mode the caller left it in.
    for batch in dataloader:
        output = model(batch)
# GOOD - Always set mode explicitly, and put it back afterwards
def evaluate_good(model, dataloader):
    """Run ``model`` over ``dataloader`` in eval mode without tracking gradients.

    The mode the model was in on entry is restored on exit, even if a batch
    raises. A model that was already in eval mode therefore stays in eval
    mode instead of being switched to training.
    """
    was_training = model.training
    model.eval()  # Disable dropout, use running stats for BatchNorm
    try:
        with torch.no_grad():
            for batch in dataloader:
                model(batch)  # consume the outputs here
    finally:
        model.train(was_training)  # Restore the caller's mode
```
### 5. Proper Tensor Creation on Device
```python
# BAD - Creates on CPU then moves (slow)
# The tensor is allocated first and .to(device) is applied afterwards.
tensor = torch.zeros(100, 100).to(device)

# GOOD - Create directly on device
tensor = torch.zeros(100, 100, device=device)

# GOOD - Create with same device/dtype as reference
tensor = torch.zeros_like(reference_tensor)
# Relies on the model exposing `device` and `dtype` attributes.
tensor = torch.empty(100, 100, device=model.device, dtype=model.dtype)
```
---
## Quick Reference Checklist
- [ ] Use `nn.Module` with proper `__init__` and `forward`
- [ ] Initialize weights with `_init_weights` method
- [ ] Use `@property device` for device inference
- [ ] Always use `.item()` when logging scalar losses
- [ ] Use `@torch.no_grad()` decorator for evaluation
- [ ] Call `model.train()` and `model.eval()` explicitly
- [ ] Use `weights_only=True` when loading checkpoints
- [ ] Set seeds for reproducibility at script start
- [ ] Avoid in-place operations on tensors that autograd still needs in the forward pass
- [ ] Use mixed precision for faster training on GPU
- [ ] Clip gradients to prevent exploding gradients
- [ ] Use `pin_memory=True` with CUDA for faster data transfer