import numpy as np

"""
Baseline: EqualAllocation
=========================

Purpose (lower bound):
- This baseline represents a simple heuristic approach with no learning
  involved. It serves as a lower bound for performance comparison, showing
  the system behavior under a naive, fixed resource allocation strategy.

Difference from Co-MADDPG:
1. Learning: no learning vs. deep reinforcement learning.
2. Action selection: always fixed at [0.5, 0.5, 0.5] for all resource
   parameters (subcarrier fraction, power, m_param).

Contribution:
- Contributes to performance baseline tables as the "Random/Fixed"
  comparison point.
"""


class DummyBuffer:
    """Dummy replay buffer that satisfies train.py's push/len interface."""

    def push(self, *args):
        """Discard the transition — no learning is performed."""
        pass

    def __len__(self):
        """Always report 0 samples so training code never draws a batch."""
        return 0


class EqualAllocation:
    """EqualAllocation algorithm implementation (fixed-action heuristic)."""

    def __init__(self, config):
        """Store the configuration and attach a no-op replay buffer.

        Args:
            config: dict-like configuration; only the optional 'reward'
                sub-dict is read (in compute_rewards).
        """
        self.config = config
        self.replay_buffer = DummyBuffer()

    def select_action(self, obs_s, obs_b, explore=True):
        """Return the fixed action [0.5, 0.5, 0.5] for both agents.

        Args:
            obs_s: observation of agent S (ignored).
            obs_b: observation of agent B (ignored).
            explore: accepted for interface compatibility (ignored).

        Returns:
            Tuple of two float32 arrays, one per agent, each [0.5, 0.5, 0.5].
        """
        action = np.array([0.5, 0.5, 0.5], dtype=np.float32)
        return action, action.copy()

    def compute_rewards(self, qoe_s, qoe_b, qoe_sys):
        """Compute rewards using a fixed lambda=0.5 for monitoring consistency.

        Each agent's reward is a balanced (lam-weighted) combination of a
        cooperative term (own + other's + system QoE) and a competitive term
        (own + system QoE). Weights come from config['reward'] with the
        defaults shown below.

        Args:
            qoe_s: QoE of agent S.
            qoe_b: QoE of agent B.
            qoe_sys: system-level QoE.

        Returns:
            (r_s, r_b, lam): per-agent rewards and the fixed lambda used.
        """
        lam = 0.5
        rew_cfg = self.config.get('reward', {})
        coop_self = rew_cfg.get('coop_self', 0.5)
        coop_other = rew_cfg.get('coop_other', 0.3)
        coop_sys = rew_cfg.get('coop_sys', 0.2)
        comp_self = rew_cfg.get('comp_self', 0.8)
        comp_sys = rew_cfg.get('comp_sys', 0.2)

        # Reward for agent S: blend of cooperative and competitive terms.
        r_coop_s = coop_self * qoe_s + coop_other * qoe_b + coop_sys * qoe_sys
        r_comp_s = comp_self * qoe_s + comp_sys * qoe_sys
        r_s = lam * r_coop_s + (1 - lam) * r_comp_s

        # Reward for agent B: symmetric to S with roles swapped.
        r_coop_b = coop_self * qoe_b + coop_other * qoe_s + coop_sys * qoe_sys
        r_comp_b = comp_self * qoe_b + comp_sys * qoe_sys
        r_b = lam * r_coop_b + (1 - lam) * r_comp_b

        return r_s, r_b, lam

    def update(self):
        """No update performed in this heuristic baseline; returns None."""
        return None

    def save(self, path):
        """No state to save."""
        pass

    def load(self, path):
        """No state to load."""
        pass