feat(skills): add pytorch-patterns skill
This commit is contained in:
parent
cd6e1ebd27
commit
06282c7314
528
.opencode/skills/pytorch-patterns/SKILL.md
Normal file
528
.opencode/skills/pytorch-patterns/SKILL.md
Normal file
@ -0,0 +1,528 @@
|
|||||||
|
---
|
||||||
|
name: pytorch-patterns
|
||||||
|
description: Use when writing PyTorch code to follow best practices and common patterns
|
||||||
|
---
|
||||||
|
|
||||||
|
# PyTorch Patterns Skill
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
This skill provides best practices and common patterns for writing PyTorch code. Use this when implementing neural networks, training loops, data loading, and related deep learning infrastructure.
|
||||||
|
|
||||||
|
**Announce:** "I'm using the pytorch-patterns skill for best practice code."
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Model Definition
|
||||||
|
|
||||||
|
### Basic nn.Module Pattern
|
||||||
|
|
||||||
|
```python
|
||||||
|
from typing import NamedTuple
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
|
||||||
|
class ModelConfig:
|
||||||
|
"""Configuration for the model."""
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
input_dim: int = 768,
|
||||||
|
hidden_dim: int = 512,
|
||||||
|
output_dim: int = 10,
|
||||||
|
dropout: float = 0.1,
|
||||||
|
num_layers: int = 2,
|
||||||
|
):
|
||||||
|
self.input_dim = input_dim
|
||||||
|
self.hidden_dim = hidden_dim
|
||||||
|
self.output_dim = output_dim
|
||||||
|
self.dropout = dropout
|
||||||
|
self.num_layers = num_layers
|
||||||
|
|
||||||
|
|
||||||
|
class ModelOutput(NamedTuple):
|
||||||
|
"""Typed output container for model forward pass."""
|
||||||
|
logits: torch.Tensor
|
||||||
|
hidden_states: torch.Tensor
|
||||||
|
attention_weights: torch.Tensor | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class MyModel(nn.Module):
|
||||||
|
"""Example model following best practices."""
|
||||||
|
|
||||||
|
def __init__(self, config: ModelConfig):
|
||||||
|
super().__init__()
|
||||||
|
self.config = config
|
||||||
|
|
||||||
|
# Define layers
|
||||||
|
self.input_proj = nn.Linear(config.input_dim, config.hidden_dim)
|
||||||
|
self.layers = nn.ModuleList([
|
||||||
|
nn.Linear(config.hidden_dim, config.hidden_dim)
|
||||||
|
for _ in range(config.num_layers)
|
||||||
|
])
|
||||||
|
self.output_proj = nn.Linear(config.hidden_dim, config.output_dim)
|
||||||
|
self.dropout = nn.Dropout(config.dropout)
|
||||||
|
self.layer_norm = nn.LayerNorm(config.hidden_dim)
|
||||||
|
|
||||||
|
# Initialize weights
|
||||||
|
self._init_weights()
|
||||||
|
|
||||||
|
def _init_weights(self):
|
||||||
|
"""Initialize model weights with appropriate schemes."""
|
||||||
|
for module in self.modules():
|
||||||
|
if isinstance(module, nn.Linear):
|
||||||
|
nn.init.xavier_uniform_(module.weight)
|
||||||
|
if module.bias is not None:
|
||||||
|
nn.init.zeros_(module.bias)
|
||||||
|
elif isinstance(module, nn.LayerNorm):
|
||||||
|
nn.init.ones_(module.weight)
|
||||||
|
nn.init.zeros_(module.bias)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def device(self) -> torch.device:
|
||||||
|
"""Get the device of model parameters."""
|
||||||
|
return next(self.parameters()).device
|
||||||
|
|
||||||
|
def forward(self, x: torch.Tensor) -> ModelOutput:
|
||||||
|
"""Forward pass with typed output."""
|
||||||
|
# Input projection
|
||||||
|
hidden = self.input_proj(x)
|
||||||
|
hidden = self.layer_norm(hidden)
|
||||||
|
hidden = F.gelu(hidden)
|
||||||
|
|
||||||
|
# Process through layers
|
||||||
|
for layer in self.layers:
|
||||||
|
residual = hidden
|
||||||
|
hidden = layer(hidden)
|
||||||
|
hidden = self.dropout(hidden)
|
||||||
|
hidden = F.gelu(hidden)
|
||||||
|
hidden = hidden + residual # Residual connection
|
||||||
|
|
||||||
|
# Output projection
|
||||||
|
logits = self.output_proj(hidden)
|
||||||
|
|
||||||
|
return ModelOutput(
|
||||||
|
logits=logits,
|
||||||
|
hidden_states=hidden,
|
||||||
|
attention_weights=None,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Device Management
|
||||||
|
|
||||||
|
### Device Property Pattern
|
||||||
|
|
||||||
|
```python
|
||||||
|
class DeviceAwareModule(nn.Module):
|
||||||
|
"""Module with device awareness."""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def device(self) -> torch.device:
|
||||||
|
"""Infer device from model parameters."""
|
||||||
|
return next(self.parameters()).device
|
||||||
|
|
||||||
|
@property
|
||||||
|
def dtype(self) -> torch.dtype:
|
||||||
|
"""Infer dtype from model parameters."""
|
||||||
|
return next(self.parameters()).dtype
|
||||||
|
```
|
||||||
|
|
||||||
|
### Training Script Device Setup
|
||||||
|
|
||||||
|
```python
|
||||||
|
def get_device() -> torch.device:
|
||||||
|
"""Get the best available device."""
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
device = torch.device("cuda")
|
||||||
|
print(f"Using CUDA: {torch.cuda.get_device_name(0)}")
|
||||||
|
elif torch.backends.mps.is_available():
|
||||||
|
device = torch.device("mps")
|
||||||
|
print("Using Apple MPS")
|
||||||
|
else:
|
||||||
|
device = torch.device("cpu")
|
||||||
|
print("Using CPU")
|
||||||
|
return device
|
||||||
|
|
||||||
|
|
||||||
|
# Usage
|
||||||
|
device = get_device()
|
||||||
|
model = MyModel(config).to(device)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Training Loop
|
||||||
|
|
||||||
|
### Standard Training Epoch
|
||||||
|
|
||||||
|
```python
|
||||||
|
def train_epoch(
|
||||||
|
model: nn.Module,
|
||||||
|
dataloader: torch.utils.data.DataLoader,
|
||||||
|
optimizer: torch.optim.Optimizer,
|
||||||
|
criterion: nn.Module,
|
||||||
|
device: torch.device,
|
||||||
|
max_grad_norm: float | None = 1.0,
|
||||||
|
) -> dict[str, float]:
|
||||||
|
"""Train for one epoch."""
|
||||||
|
model.train()
|
||||||
|
total_loss = 0.0
|
||||||
|
num_batches = 0
|
||||||
|
|
||||||
|
for batch_idx, (inputs, targets) in enumerate(dataloader):
|
||||||
|
# Move to device
|
||||||
|
inputs = inputs.to(device)
|
||||||
|
targets = targets.to(device)
|
||||||
|
|
||||||
|
# Forward pass
|
||||||
|
optimizer.zero_grad()
|
||||||
|
outputs = model(inputs)
|
||||||
|
loss = criterion(outputs.logits, targets)
|
||||||
|
|
||||||
|
# Backward pass
|
||||||
|
loss.backward()
|
||||||
|
|
||||||
|
# Gradient clipping (optional but recommended)
|
||||||
|
if max_grad_norm is not None:
|
||||||
|
torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
|
||||||
|
|
||||||
|
# Update weights
|
||||||
|
optimizer.step()
|
||||||
|
|
||||||
|
# Accumulate metrics (use .item() to prevent memory leak)
|
||||||
|
total_loss += loss.item()
|
||||||
|
num_batches += 1
|
||||||
|
|
||||||
|
return {
|
||||||
|
"train_loss": total_loss / num_batches,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Evaluation Function
|
||||||
|
|
||||||
|
```python
|
||||||
|
@torch.no_grad()
|
||||||
|
def evaluate(
|
||||||
|
model: nn.Module,
|
||||||
|
dataloader: torch.utils.data.DataLoader,
|
||||||
|
criterion: nn.Module,
|
||||||
|
device: torch.device,
|
||||||
|
) -> dict[str, float]:
|
||||||
|
"""Evaluate the model."""
|
||||||
|
model.eval()
|
||||||
|
total_loss = 0.0
|
||||||
|
correct = 0
|
||||||
|
total = 0
|
||||||
|
|
||||||
|
for inputs, targets in dataloader:
|
||||||
|
inputs = inputs.to(device)
|
||||||
|
targets = targets.to(device)
|
||||||
|
|
||||||
|
outputs = model(inputs)
|
||||||
|
loss = criterion(outputs.logits, targets)
|
||||||
|
|
||||||
|
total_loss += loss.item()
|
||||||
|
|
||||||
|
# Calculate accuracy
|
||||||
|
predictions = outputs.logits.argmax(dim=-1)
|
||||||
|
correct += (predictions == targets).sum().item()
|
||||||
|
total += targets.size(0)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"eval_loss": total_loss / len(dataloader),
|
||||||
|
"accuracy": correct / total,
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Data Loading
|
||||||
|
|
||||||
|
### Custom Dataset Pattern
|
||||||
|
|
||||||
|
```python
|
||||||
|
from torch.utils.data import Dataset, DataLoader
|
||||||
|
|
||||||
|
|
||||||
|
class CustomDataset(Dataset):
|
||||||
|
"""Example custom dataset."""
|
||||||
|
|
||||||
|
def __init__(self, data: list, transform=None):
|
||||||
|
self.data = data
|
||||||
|
self.transform = transform
|
||||||
|
|
||||||
|
def __len__(self) -> int:
|
||||||
|
return len(self.data)
|
||||||
|
|
||||||
|
def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
item = self.data[idx]
|
||||||
|
features = torch.tensor(item["features"], dtype=torch.float32)
|
||||||
|
label = torch.tensor(item["label"], dtype=torch.long)
|
||||||
|
|
||||||
|
if self.transform:
|
||||||
|
features = self.transform(features)
|
||||||
|
|
||||||
|
return features, label
|
||||||
|
```
|
||||||
|
|
||||||
|
### DataLoader Helper
|
||||||
|
|
||||||
|
```python
|
||||||
|
def get_dataloader(
|
||||||
|
dataset: Dataset,
|
||||||
|
batch_size: int = 32,
|
||||||
|
shuffle: bool = True,
|
||||||
|
num_workers: int = 4,
|
||||||
|
pin_memory: bool = True,
|
||||||
|
drop_last: bool = False,
|
||||||
|
) -> DataLoader:
|
||||||
|
"""Create a DataLoader with sensible defaults."""
|
||||||
|
# Adjust for Windows/macOS compatibility
|
||||||
|
if num_workers > 0:
|
||||||
|
import platform
|
||||||
|
if platform.system() == "Windows":
|
||||||
|
num_workers = 0 # Windows has issues with multiprocessing
|
||||||
|
|
||||||
|
return DataLoader(
|
||||||
|
dataset,
|
||||||
|
batch_size=batch_size,
|
||||||
|
shuffle=shuffle,
|
||||||
|
num_workers=num_workers,
|
||||||
|
pin_memory=pin_memory and torch.cuda.is_available(),
|
||||||
|
drop_last=drop_last,
|
||||||
|
persistent_workers=num_workers > 0,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Checkpointing
|
||||||
|
|
||||||
|
### Save Checkpoint
|
||||||
|
|
||||||
|
```python
|
||||||
|
def save_checkpoint(
|
||||||
|
model: nn.Module,
|
||||||
|
optimizer: torch.optim.Optimizer,
|
||||||
|
epoch: int,
|
||||||
|
loss: float,
|
||||||
|
path: str,
|
||||||
|
scheduler: torch.optim.lr_scheduler._LRScheduler | None = None,
|
||||||
|
**kwargs,
|
||||||
|
) -> None:
|
||||||
|
"""Save a training checkpoint."""
|
||||||
|
checkpoint = {
|
||||||
|
"epoch": epoch,
|
||||||
|
"model_state_dict": model.state_dict(),
|
||||||
|
"optimizer_state_dict": optimizer.state_dict(),
|
||||||
|
"loss": loss,
|
||||||
|
}
|
||||||
|
|
||||||
|
if scheduler is not None:
|
||||||
|
checkpoint["scheduler_state_dict"] = scheduler.state_dict()
|
||||||
|
|
||||||
|
# Add any additional data
|
||||||
|
checkpoint.update(kwargs)
|
||||||
|
|
||||||
|
torch.save(checkpoint, path)
|
||||||
|
print(f"Checkpoint saved to {path}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Load Checkpoint
|
||||||
|
|
||||||
|
```python
|
||||||
|
def load_checkpoint(
|
||||||
|
path: str,
|
||||||
|
model: nn.Module,
|
||||||
|
optimizer: torch.optim.Optimizer | None = None,
|
||||||
|
scheduler: torch.optim.lr_scheduler._LRScheduler | None = None,
|
||||||
|
device: torch.device | None = None,
|
||||||
|
) -> dict:
|
||||||
|
"""Load a training checkpoint."""
|
||||||
|
# Use weights_only=True for security (prevents arbitrary code execution)
|
||||||
|
checkpoint = torch.load(path, map_location=device, weights_only=True)
|
||||||
|
|
||||||
|
model.load_state_dict(checkpoint["model_state_dict"])
|
||||||
|
|
||||||
|
if optimizer is not None and "optimizer_state_dict" in checkpoint:
|
||||||
|
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
|
||||||
|
|
||||||
|
if scheduler is not None and "scheduler_state_dict" in checkpoint:
|
||||||
|
scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
|
||||||
|
|
||||||
|
print(f"Loaded checkpoint from {path} (epoch {checkpoint.get('epoch', 'unknown')})")
|
||||||
|
return checkpoint
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Reproducibility
|
||||||
|
|
||||||
|
### Set Seed Function
|
||||||
|
|
||||||
|
```python
|
||||||
|
import random
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
def set_seed(seed: int = 42, deterministic: bool = True) -> None:
|
||||||
|
"""Set random seeds for reproducibility."""
|
||||||
|
random.seed(seed)
|
||||||
|
np.random.seed(seed)
|
||||||
|
torch.manual_seed(seed)
|
||||||
|
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
torch.cuda.manual_seed(seed)
|
||||||
|
torch.cuda.manual_seed_all(seed) # For multi-GPU
|
||||||
|
|
||||||
|
if deterministic:
|
||||||
|
# Makes operations deterministic but may reduce performance
|
||||||
|
torch.backends.cudnn.deterministic = True
|
||||||
|
torch.backends.cudnn.benchmark = False
|
||||||
|
else:
|
||||||
|
# Better performance but non-deterministic
|
||||||
|
torch.backends.cudnn.deterministic = False
|
||||||
|
torch.backends.cudnn.benchmark = True
|
||||||
|
|
||||||
|
|
||||||
|
# Usage at start of training script
|
||||||
|
set_seed(42)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Common Gotchas
|
||||||
|
|
||||||
|
### 1. In-Place Operations Breaking Autograd
|
||||||
|
|
||||||
|
```python
|
||||||
|
# BAD - In-place operation can break autograd graph
|
||||||
|
def bad_forward(x):
|
||||||
|
x += 1 # In-place modification
|
||||||
|
return x * 2
|
||||||
|
|
||||||
|
# GOOD - Create new tensor
|
||||||
|
def good_forward(x):
|
||||||
|
x = x + 1 # Creates new tensor
|
||||||
|
return x * 2
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Memory Leaks from Not Detaching
|
||||||
|
|
||||||
|
```python
|
||||||
|
# BAD - Keeps computation graph in memory
|
||||||
|
losses = []
|
||||||
|
for batch in dataloader:
|
||||||
|
loss = model(batch)
|
||||||
|
losses.append(loss) # Holds entire graph!
|
||||||
|
|
||||||
|
# GOOD - Detach with .item() for scalars
|
||||||
|
losses = []
|
||||||
|
for batch in dataloader:
|
||||||
|
loss = model(batch)
|
||||||
|
losses.append(loss.item()) # Just the number, no graph
|
||||||
|
|
||||||
|
# GOOD - Detach for non-scalar tensors
|
||||||
|
features = []
|
||||||
|
for batch in dataloader:
|
||||||
|
feat = model.encode(batch)
|
||||||
|
features.append(feat.detach().cpu()) # Detached, moved to CPU
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Mixed Precision Training
|
||||||
|
|
||||||
|
```python
|
||||||
|
from torch.cuda.amp import autocast, GradScaler
|
||||||
|
|
||||||
|
|
||||||
|
def train_with_amp(
|
||||||
|
model: nn.Module,
|
||||||
|
dataloader: DataLoader,
|
||||||
|
optimizer: torch.optim.Optimizer,
|
||||||
|
criterion: nn.Module,
|
||||||
|
device: torch.device,
|
||||||
|
) -> float:
|
||||||
|
"""Training with Automatic Mixed Precision."""
|
||||||
|
model.train()
|
||||||
|
scaler = GradScaler()
|
||||||
|
total_loss = 0.0
|
||||||
|
|
||||||
|
for inputs, targets in dataloader:
|
||||||
|
inputs = inputs.to(device)
|
||||||
|
targets = targets.to(device)
|
||||||
|
|
||||||
|
optimizer.zero_grad()
|
||||||
|
|
||||||
|
# Forward pass with autocast
|
||||||
|
with autocast():
|
||||||
|
outputs = model(inputs)
|
||||||
|
loss = criterion(outputs.logits, targets)
|
||||||
|
|
||||||
|
# Backward pass with scaled gradients
|
||||||
|
scaler.scale(loss).backward()
|
||||||
|
|
||||||
|
# Unscale before clipping
|
||||||
|
scaler.unscale_(optimizer)
|
||||||
|
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
|
||||||
|
|
||||||
|
# Step with scaler
|
||||||
|
scaler.step(optimizer)
|
||||||
|
scaler.update()
|
||||||
|
|
||||||
|
total_loss += loss.item()
|
||||||
|
|
||||||
|
return total_loss / len(dataloader)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Forgetting model.eval() and model.train()
|
||||||
|
|
||||||
|
```python
|
||||||
|
# BAD - Dropout/BatchNorm behave incorrectly
|
||||||
|
def evaluate_bad(model, dataloader):
|
||||||
|
# model.train() is still active!
|
||||||
|
for batch in dataloader:
|
||||||
|
output = model(batch)
|
||||||
|
|
||||||
|
# GOOD - Always set mode explicitly
|
||||||
|
def evaluate_good(model, dataloader):
|
||||||
|
model.eval() # Disable dropout, use running stats for BatchNorm
|
||||||
|
with torch.no_grad():
|
||||||
|
for batch in dataloader:
|
||||||
|
output = model(batch)
|
||||||
|
model.train() # Restore training mode if continuing
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Proper Tensor Creation on Device
|
||||||
|
|
||||||
|
```python
|
||||||
|
# BAD - Creates on CPU then moves (slow)
|
||||||
|
tensor = torch.zeros(100, 100).to(device)
|
||||||
|
|
||||||
|
# GOOD - Create directly on device
|
||||||
|
tensor = torch.zeros(100, 100, device=device)
|
||||||
|
|
||||||
|
# GOOD - Create with same device/dtype as reference
|
||||||
|
tensor = torch.zeros_like(reference_tensor)
|
||||||
|
tensor = torch.empty(100, 100, device=model.device, dtype=model.dtype)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Quick Reference Checklist
|
||||||
|
|
||||||
|
- [ ] Use `nn.Module` with proper `__init__` and `forward`
|
||||||
|
- [ ] Initialize weights with `_init_weights` method
|
||||||
|
- [ ] Use `@property device` for device inference
|
||||||
|
- [ ] Always use `.item()` when logging scalar losses
|
||||||
|
- [ ] Use `@torch.no_grad()` decorator for evaluation
|
||||||
|
- [ ] Call `model.train()` and `model.eval()` explicitly
|
||||||
|
- [ ] Use `weights_only=True` when loading checkpoints
|
||||||
|
- [ ] Set seeds for reproducibility at script start
|
||||||
|
- [ ] Avoid in-place operations in forward pass
|
||||||
|
- [ ] Use mixed precision for faster training on GPU
|
||||||
|
- [ ] Clip gradients to prevent exploding gradients
|
||||||
|
- [ ] Use `pin_memory=True` with CUDA for faster data transfer
|
||||||
Loading…
Reference in New Issue
Block a user