From 849cfe5409617638c3abb9a840d22b63f62b05c7 Mon Sep 17 00:00:00 2001
From: hc <1328308360@qq.com>
Date: Tue, 31 Mar 2026 17:43:41 +0800
Subject: [PATCH] feat(skills): add verification skill

Replication result verification methodology.
---
 .opencode/skills/verification/SKILL.md | 215 +++++++++++++++++++++++++
 1 file changed, 215 insertions(+)
 create mode 100644 .opencode/skills/verification/SKILL.md

diff --git a/.opencode/skills/verification/SKILL.md b/.opencode/skills/verification/SKILL.md
new file mode 100644
index 0000000..0366d80
--- /dev/null
+++ b/.opencode/skills/verification/SKILL.md
@@ -0,0 +1,215 @@
+---
+name: verification
+description: Use when verifying replication results against the paper's reported values
+---
+
+# Replication Verification
+
+## Overview
+
+Systematic approach to verifying that replicated code produces results matching the original paper.
+
+**Announce at start:** "I'm using the verification skill to validate replication accuracy."
+
+## Verification Levels
+
+### Level 1: Code Correctness
+- Unit tests pass
+- No runtime errors
+- Gradient flow works
+
+### Level 2: Behavioral Match
+- Output shapes correct
+- Value ranges reasonable
+- Edge cases handled
+
+### Level 3: Numerical Match
+- Results within tolerance of paper
+- Trends match (even if absolute values differ)
+- Statistical significance considered
+
+## Test Design for Replication
+
+### Shape Tests
+
+```python
+def test_model_output_shape():
+    """Verify model produces correct output shape per paper."""
+    model = MyModel(config)
+    x = torch.randn(batch_size, seq_len, input_dim)
+    out = model(x)
+
+    # Paper Section 3.2: "Output dimension is 512"
+    assert out.shape == (batch_size, seq_len, 512)
+```
+
+### Value Range Tests
+
+```python
+def test_attention_weights_sum():
+    """Attention weights should sum to 1 (paper Eq. 3)."""
+    model = AttentionLayer(config)
+    x = torch.randn(batch_size, seq_len, dim)
+    _, attn_weights = model(x, return_attention=True)
+
+    # Softmax output sums to 1. Build the reference from the reduced tensor
+    # so its shape (e.g. extra head axis), dtype, and device always match.
+    row_sums = attn_weights.sum(dim=-1)
+    assert torch.allclose(row_sums, torch.ones_like(row_sums))
+```
+
+### Gradient Tests
+
+```python
+def test_gradient_flow():
+    """Verify finite gradients reach every trainable parameter."""
+    model = MyModel(config)
+    x = torch.randn(batch_size, seq_len, input_dim)
+    out = model(x)
+    loss = out.sum()
+    loss.backward()
+
+    for name, param in model.named_parameters():
+        if not param.requires_grad:
+            continue  # frozen parameters never receive a gradient
+        assert param.grad is not None, f"No gradient for {name}"
+        assert torch.isfinite(param.grad).all(), f"Non-finite gradient for {name}"
+```
+
+### Numerical Match Tests
+
+```python
+def test_loss_value_reasonable():
+    """Loss should be in expected range per paper Figure 2."""
+    model = MyModel(config)
+    # ... setup ...
+
+    loss = compute_loss(model, data)
+
+    # Paper reports initial loss ~2.3 (cross-entropy on 10 classes)
+    assert 2.0 < loss.item() < 3.0, f"Initial loss {loss.item()} outside expected range"
+```
+
+## Comparison Methodology
+
+### Absolute Comparison
+
+```python
+def compare_absolute(paper_value: float, our_value: float, tolerance: float = 0.01):
+    """Compare with absolute tolerance."""
+    diff = abs(paper_value - our_value)
+    return diff <= tolerance, diff
+```
+
+### Relative Comparison
+
+```python
+def compare_relative(paper_value: float, our_value: float, tolerance: float = 0.05):
+    """Compare with relative tolerance (5% default)."""
+    if paper_value == 0:
+        return our_value == 0, abs(our_value)
+    relative_diff = abs(paper_value - our_value) / abs(paper_value)
+    return relative_diff <= tolerance, relative_diff
+```
+
+### Statistical Comparison
+
+```python
+from statistics import NormalDist
+from typing import List
+
+import numpy as np
+
+
+def compare_with_variance(
+    paper_mean: float,
+    paper_std: float,
+    our_values: List[float],
+    confidence: float = 0.95,
+):
+    """Compare our runs to the paper's mean, accounting for both variances.
+
+    Returns ``(matches, z_score)``. ``matches`` is True when the z-score of
+    the mean difference is below the two-sided normal critical value for
+    ``confidence`` (about 1.96 at the default 0.95).
+    """
+    if len(our_values) == 0:
+        raise ValueError("our_values must contain at least one result")
+    if not 0.0 < confidence < 1.0:
+        raise ValueError("confidence must be strictly between 0 and 1")
+
+    our_mean = float(np.mean(our_values))
+    our_std = float(np.std(our_values))
+
+    # Spread of the difference between two independent estimates.
+    combined_std = float(np.sqrt(paper_std**2 + our_std**2))
+    gap = abs(paper_mean - our_mean)
+    if combined_std == 0.0:
+        # No variance on either side: only an exact match can pass.
+        return gap == 0.0, (0.0 if gap == 0.0 else float("inf"))
+
+    z_score = gap / combined_std
+    threshold = NormalDist().inv_cdf(0.5 + confidence / 2.0)
+    return z_score < threshold, z_score
+```
+
+## Common Difference Sources
+
+### Acceptable Differences
+
+| Source | Typical Impact | Mitigation |
+|--------|---------------|------------|
+| Random seed | 1-2% | Run multiple seeds |
+| Floating point | < 0.1% | Use float64 for verification |
+| Framework differences | 1-3% | Document and accept |
+| Hardware differences | 0.5-1% | Note in report |
+
+### Concerning Differences
+
+| Source | Typical Impact | Action |
+|--------|---------------|--------|
+| Wrong architecture | > 10% | Review code vs paper |
+| Wrong hyperparameters | 5-20% | Verify all settings |
+| Data preprocessing | Variable | Match paper exactly |
+| Evaluation protocol | Variable | Check train/val/test split |
+
+## Verification Checklist
+
+### Before Comparison
+
+- [ ] Seeds set for reproducibility
+- [ ] Same evaluation data as paper
+- [ ] Same preprocessing pipeline
+- [ ] Same evaluation metrics
+
+### During Comparison
+
+- [ ] Run multiple times with different seeds
+- [ ] Record mean and standard deviation
+- [ ] Compare trends, not just final values
+- [ ] Check intermediate checkpoints if available
+
+### After Comparison
+
+- [ ] Document all differences
+- [ ] Explain likely causes
+- [ ] Determine if differences are acceptable
+- [ ] Suggest improvements if needed
+
+## Report Template
+
+```markdown
+## Verification Result: {Metric Name}
+
+**Paper Value**: {value} ± {std}
+**Our Value**: {value} ± {std}
+**Difference**: {absolute} ({relative}%)
+
+**Status**: MATCH | ACCEPTABLE | INVESTIGATE | MISMATCH
+
+**Analysis**:
+{explanation of difference}
+
+**Confidence**: {HIGH | MEDIUM | LOW}
+{reasoning for confidence level}
+```