Root cause: test-runner was giving overly optimistic results due to:
1. Context bias - knew the implementation, tended to defend it
2. No actual visual comparison - just wrote 'ACCEPTABLE' without looking
3. No structural validation - accepted 35x scale differences as 'acceptable'

Solution:
- New result-verifier agent that performs blind visual comparison
- Strict pass/fail criteria for structural validation
- Updated test-runner to use result-verifier for each figure
- Clear guidelines: structural mismatches = FAIL, not ACCEPTABLE

Test result: verifier correctly identified Fig3 as FAIL with 7 specific issues:
- Wrong X-axis variable (channels vs power)
- Wrong Y-axis scale (5x difference)
- Wrong curve count (5 vs 4)
- etc.
27 lines
650 B
Python
27 lines
650 B
Python
"""
tests/test_baselines.py
"""
|
|
|
|
import pytest
|
|
import numpy as np
|
|
from src.models.baselines import BaselineModels
|
|
|
|
|
|
def test_baselines_initialization():
    """Verify that ``BaselineModels`` exposes the ``mu`` it was constructed with."""
    expected_mu = 19.0

    baseline = BaselineModels(mu=expected_mu, L=12)

    # Exact equality is intentional: the same float literal goes in and is
    # expected to come back out unchanged.
    assert baseline.mu == expected_mu
|
|
|
|
|
|
def test_calculate_baseline_sse():
    """Check the ordering of ``calculate_baseline_sse`` results across baselines.

    For three linear SNR points, the "ideal" result must have three entries,
    every "ideal" value must be strictly greater than the matching "5G"
    value, and every "5G" value strictly greater than the matching "4G" one.
    """
    baseline = BaselineModels(mu=19.0, L=12)
    snr_points = np.array([10.0, 100.0, 1000.0])

    # Evaluate the baselines in the same order as before: ideal, 5G, 4G.
    ideal, five_g, four_g = [
        baseline.calculate_baseline_sse(snr_points, label)
        for label in ("ideal", "5G", "4G")
    ]

    assert len(ideal) == 3
    assert np.all(ideal > five_g)
    assert np.all(five_g > four_g)
|