feat(skills): add verification skill
Replication result verification methodology.
This commit is contained in:
parent
06282c7314
commit
849cfe5409
190
.opencode/skills/verification/SKILL.md
Normal file
190
.opencode/skills/verification/SKILL.md
Normal file
@ -0,0 +1,190 @@
|
|||||||
|
---
|
||||||
|
name: verification
|
||||||
|
description: Use when verifying replication results against a paper's reported values
|
||||||
|
---
|
||||||
|
|
||||||
|
# Replication Verification
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Systematic approach to verifying that replicated code produces results matching the original paper.
|
||||||
|
|
||||||
|
**Announce at start:** "I'm using the verification skill to validate replication accuracy."
|
||||||
|
|
||||||
|
## Verification Levels
|
||||||
|
|
||||||
|
### Level 1: Code Correctness
|
||||||
|
- Unit tests pass
|
||||||
|
- No runtime errors
|
||||||
|
- Gradient flow works
|
||||||
|
|
||||||
|
### Level 2: Behavioral Match
|
||||||
|
- Output shapes correct
|
||||||
|
- Value ranges reasonable
|
||||||
|
- Edge cases handled
|
||||||
|
|
||||||
|
### Level 3: Numerical Match
|
||||||
|
- Results within tolerance of paper
|
||||||
|
- Trends match (even if absolute values differ)
|
||||||
|
- Statistical significance considered
|
||||||
|
|
||||||
|
## Test Design for Replication
|
||||||
|
|
||||||
|
### Shape Tests
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_model_output_shape():
    """Check that the model output has the shape stated in the paper."""
    # Paper Section 3.2: "Output dimension is 512"
    expected_shape = (batch_size, seq_len, 512)

    net = MyModel(config)
    inputs = torch.randn(batch_size, seq_len, input_dim)

    assert net(inputs).shape == expected_shape
|
||||||
|
```
|
||||||
|
|
||||||
|
### Value Range Tests
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_attention_weights_sum():
    """Attention weights should sum to 1 (paper Eq. 3).

    The weights returned by the layer are summed over their last axis and
    every resulting entry is expected to be (numerically) 1.
    """
    model = AttentionLayer(config)
    x = torch.randn(batch_size, seq_len, dim)
    _, attn_weights = model(x, return_attention=True)

    # Softmax output sums to 1. The reference is built with ones_like so it
    # always has the shape, dtype and device of the summed weights (for
    # example when the layer returns an extra per-head axis) instead of
    # assuming a CPU float32 tensor of shape (batch_size, seq_len).
    row_sums = attn_weights.sum(dim=-1)
    assert torch.allclose(row_sums, torch.ones_like(row_sums))
|
||||||
|
```
|
||||||
|
|
||||||
|
### Gradient Tests
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_gradient_flow():
    """Verify gradients flow through all trainable parameters.

    Runs one forward/backward pass on random input and checks that every
    parameter that requires a gradient received a finite one.
    """
    model = MyModel(config)
    # (batch, seq, feature) layout, matching the input MyModel is given in
    # test_model_output_shape.
    x = torch.randn(batch_size, seq_len, input_dim, requires_grad=True)
    out = model(x)
    loss = out.sum()
    loss.backward()

    for name, param in model.named_parameters():
        if not param.requires_grad:
            # Frozen parameters never receive a gradient; nothing to check.
            continue
        assert param.grad is not None, f"No gradient for {name}"
        # isfinite rejects both NaN and +/-inf (exploding gradients).
        assert torch.isfinite(param.grad).all(), f"Non-finite gradient for {name}"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Numerical Match Tests
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_loss_value_reasonable():
    """Check the initial loss against the range expected from paper Figure 2."""
    model = MyModel(config)
    # ... setup ...

    # Paper reports initial loss ~2.3 (cross-entropy on 10 classes)
    loss = compute_loss(model, data)
    in_range = 2.0 < loss.item() < 3.0
    assert in_range, f"Initial loss {loss.item()} outside expected range"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Comparison Methodology
|
||||||
|
|
||||||
|
### Absolute Comparison
|
||||||
|
|
||||||
|
```python
|
||||||
|
def compare_absolute(paper_value: float, our_value: float, tolerance: float = 0.01):
    """Compare two values using an absolute tolerance.

    Args:
        paper_value: Value reported in the paper.
        our_value: Value produced by the replication.
        tolerance: Largest absolute difference still counted as a match.

    Returns:
        A ``(matches, diff)`` tuple: whether the absolute difference is
        within ``tolerance``, and the absolute difference itself.
    """
    import math  # local import keeps the snippet self-contained

    diff = abs(paper_value - our_value)
    # Decimal inputs rarely have exact binary representations, so a
    # difference nominally equal to the tolerance (0.92 vs 0.91 with 0.01)
    # can come out a few ulps above it; count that boundary case as a match.
    on_boundary = math.isclose(diff, tolerance, rel_tol=1e-9, abs_tol=0.0)
    return (diff <= tolerance or on_boundary), diff
|
||||||
|
```
|
||||||
|
|
||||||
|
### Relative Comparison
|
||||||
|
|
||||||
|
```python
|
||||||
|
def compare_relative(paper_value: float, our_value: float, tolerance: float = 0.05):
    """Compare with relative tolerance (5% default).

    Args:
        paper_value: Reference value reported in the paper.
        our_value: Value produced by the replication.
        tolerance: Largest relative difference (as a fraction of
            ``abs(paper_value)``) still counted as a match.

    Returns:
        A ``(matches, relative_diff)`` tuple. When ``paper_value`` is zero the
        relative difference is undefined: only an exact zero matches, and the
        second element is the absolute value of ``our_value`` instead.
    """
    import math  # local import keeps the snippet self-contained

    if paper_value == 0:
        return our_value == 0, abs(our_value)

    relative_diff = abs(paper_value - our_value) / abs(paper_value)
    # A difference nominally equal to the tolerance (1.0 vs 1.05 at 5%) can
    # land a few ulps above it in binary floating point; count it as a match.
    on_boundary = math.isclose(relative_diff, tolerance, rel_tol=1e-9, abs_tol=0.0)
    return (relative_diff <= tolerance or on_boundary), relative_diff
|
||||||
|
```
|
||||||
|
|
||||||
|
### Statistical Comparison
|
||||||
|
|
||||||
|
```python
|
||||||
|
def compare_with_variance(
    paper_mean: float,
    paper_std: float,
    our_values: List[float],
    confidence: float = 0.95,
):
    """Compare our runs with the paper's mean, considering both variances.

    The gap between the means is expressed as a z-score against the combined
    standard deviation ``sqrt(paper_std**2 + our_std**2)`` and checked against
    the two-sided normal critical value for ``confidence`` (about 1.96 at the
    default 0.95).

    Args:
        paper_mean: Mean reported in the paper.
        paper_std: Standard deviation reported in the paper.
        our_values: Results of our individual runs.
        confidence: Two-sided confidence level, strictly between 0 and 1.

    Returns:
        A ``(matches, z_score)`` tuple. With no runs the result is
        ``(False, nan)``. When the combined standard deviation is zero the
        z-score is ``0.0`` if the means agree and ``inf`` otherwise.

    Raises:
        ValueError: If ``confidence`` is not strictly between 0 and 1.
    """
    import math
    from statistics import NormalDist

    if not 0.0 < confidence < 1.0:
        raise ValueError(f"confidence must be in (0, 1), got {confidence}")
    # len() rather than truthiness so numpy arrays are accepted as well.
    if len(our_values) == 0:
        return False, float("nan")

    our_mean = float(np.mean(our_values))
    # np.std defaults to the population standard deviation (ddof=0).
    our_std = float(np.std(our_values))

    # Two-sided critical value: confidence 0.95 -> ~1.96 standard deviations.
    z_threshold = NormalDist().inv_cdf((1.0 + confidence) / 2.0)

    combined_std = math.sqrt(paper_std**2 + our_std**2)
    gap = abs(paper_mean - our_mean)

    if combined_std == 0.0:
        # No spread on either side: dividing would give nan/inf, so decide
        # directly on whether the two means agree.
        same = math.isclose(paper_mean, our_mean, rel_tol=1e-9, abs_tol=1e-12)
        return same, (0.0 if same else float("inf"))

    z_score = gap / combined_std
    return bool(z_score < z_threshold), z_score
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common Difference Sources
|
||||||
|
|
||||||
|
### Acceptable Differences
|
||||||
|
|
||||||
|
| Source | Typical Impact | Mitigation |
|
||||||
|
|--------|---------------|------------|
|
||||||
|
| Random seed | 1-2% | Run multiple seeds |
|
||||||
|
| Floating point | < 0.1% | Use float64 for verification |
|
||||||
|
| Framework differences | 1-3% | Document and accept |
|
||||||
|
| Hardware differences | 0.5-1% | Note in report |
|
||||||
|
|
||||||
|
### Concerning Differences
|
||||||
|
|
||||||
|
| Source | Typical Impact | Action |
|
||||||
|
|--------|---------------|--------|
|
||||||
|
| Wrong architecture | > 10% | Review code vs paper |
|
||||||
|
| Wrong hyperparameters | 5-20% | Verify all settings |
|
||||||
|
| Data preprocessing | Variable | Match paper exactly |
|
||||||
|
| Evaluation protocol | Variable | Check train/val/test split |
|
||||||
|
|
||||||
|
## Verification Checklist
|
||||||
|
|
||||||
|
### Before Comparison
|
||||||
|
|
||||||
|
- [ ] Seeds set for reproducibility
|
||||||
|
- [ ] Same evaluation data as paper
|
||||||
|
- [ ] Same preprocessing pipeline
|
||||||
|
- [ ] Same evaluation metrics
|
||||||
|
|
||||||
|
### During Comparison
|
||||||
|
|
||||||
|
- [ ] Run multiple times with different seeds
|
||||||
|
- [ ] Record mean and standard deviation
|
||||||
|
- [ ] Compare trends, not just final values
|
||||||
|
- [ ] Check intermediate checkpoints if available
|
||||||
|
|
||||||
|
### After Comparison
|
||||||
|
|
||||||
|
- [ ] Document all differences
|
||||||
|
- [ ] Explain likely causes
|
||||||
|
- [ ] Determine if differences are acceptable
|
||||||
|
- [ ] Suggest improvements if needed
|
||||||
|
|
||||||
|
## Report Template
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
## Verification Result: {Metric Name}
|
||||||
|
|
||||||
|
**Paper Value**: {value} ± {std}
|
||||||
|
**Our Value**: {value} ± {std}
|
||||||
|
**Difference**: {absolute} ({relative}%)
|
||||||
|
|
||||||
|
**Status**: MATCH | ACCEPTABLE | INVESTIGATE | MISMATCH
|
||||||
|
|
||||||
|
**Analysis**:
|
||||||
|
{explanation of difference}
|
||||||
|
|
||||||
|
**Confidence**: {HIGH | MEDIUM | LOW}
|
||||||
|
{reasoning for confidence level}
|
||||||
|
```
|
||||||
Loading…
Reference in New Issue
Block a user