import os

import numpy as np
import torch
import torch.nn.functional as F

from agents.actor import Actor
from agents.critic import Critic
from agents.replay_buffer import ReplayBuffer
from agents.noise import OUNoise

"""
Baseline: FixedLambda
=====================

Purpose (ablation):
- This baseline is used to evaluate the benefit of the dynamic lambda
  switching mechanism in Co-MADDPG.
- It fixes λ at a constant value (0.5), balancing cooperation and
  competition equally throughout training.

Difference from Co-MADDPG:
1. Lambda (λ): fixed at 0.5, whereas Co-MADDPG dynamically adjusts λ
   based on the system state.
2. Update order: retains the Stackelberg update order (follower B first,
   then leader S), same as Co-MADDPG.

Contribution:
- Supports performance sensitivity analysis regarding the choice of λ and
  shows why a fixed cooperation/competition balance is suboptimal.
"""


class FixedLambda:
    """FixedLambda algorithm implementation (fixed λ = 0.5 baseline)."""

    def __init__(self, config):
        """Build actors, critics, target networks, optimizers, replay buffer
        and exploration noise for the two agents S (leader) and B (follower).

        Args:
            config: nested dict with 'training', 'env', 'network'
                (and optionally 'reward') sections.
        """
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Hyperparameters: discount factor, soft-update rate, batch size,
        # and the fixed λ = 0.5 that defines this baseline.
        self.gamma = config['training']['gamma']
        self.tau = config['training']['tau']
        self.batch_size = config['training']['batch_size']
        self.fixed_lambda = 0.5

        # Per-agent observation/action dimensions.
        # NOTE(review): obs_dim = num_subcarriers + 4 extra scalar features —
        # confirm against the environment's observation layout.
        self.obs_dim = config['env']['num_subcarriers'] + 4
        self.act_dim = 3

        # Actor networks and their target networks (targets start as copies).
        hidden_a = config['network']['actor_hidden']
        hidden_c = config['network']['critic_hidden']
        self.actor_s = Actor(self.obs_dim, self.act_dim, hidden_a).to(self.device)
        self.actor_b = Actor(self.obs_dim, self.act_dim, hidden_a).to(self.device)
        self.actor_s_target = Actor(self.obs_dim, self.act_dim, hidden_a).to(self.device)
        self.actor_b_target = Actor(self.obs_dim, self.act_dim, hidden_a).to(self.device)
        self.actor_s_target.load_state_dict(self.actor_s.state_dict())
        self.actor_b_target.load_state_dict(self.actor_b.state_dict())

        # Joint critics for centralized training: each critic sees the
        # concatenated observations and actions of both agents.
        obs_total = self.obs_dim * 2
        act_total = self.act_dim * 2
        self.critic_s = Critic(obs_total, act_total, hidden_c).to(self.device)
        self.critic_b = Critic(obs_total, act_total, hidden_c).to(self.device)
        self.critic_s_target = Critic(obs_total, act_total, hidden_c).to(self.device)
        self.critic_b_target = Critic(obs_total, act_total, hidden_c).to(self.device)
        self.critic_s_target.load_state_dict(self.critic_s.state_dict())
        self.critic_b_target.load_state_dict(self.critic_b.state_dict())

        # Optimizers for actors and critics.
        self.actor_s_optimizer = torch.optim.Adam(self.actor_s.parameters(),
                                                  lr=config['training']['actor_lr'])
        self.actor_b_optimizer = torch.optim.Adam(self.actor_b.parameters(),
                                                  lr=config['training']['actor_lr'])
        self.critic_s_optimizer = torch.optim.Adam(self.critic_s.parameters(),
                                                   lr=config['training']['critic_lr'])
        self.critic_b_optimizer = torch.optim.Adam(self.critic_b.parameters(),
                                                   lr=config['training']['critic_lr'])

        # Experience replay and OU noise for exploration (one process per agent).
        self.replay_buffer = ReplayBuffer(config['training']['buffer_capacity'])
        self.noise_s = OUNoise(self.act_dim,
                               theta=config['training']['ou_theta'],
                               sigma_init=config['training']['ou_sigma_init'],
                               sigma_min=config['training']['ou_sigma_min'])
        self.noise_b = OUNoise(self.act_dim,
                               theta=config['training']['ou_theta'],
                               sigma_init=config['training']['ou_sigma_init'],
                               sigma_min=config['training']['ou_sigma_min'])

    def select_action(self, obs_s, obs_b, explore=True):
        """Select actions for both agents given their observations.

        Args:
            obs_s: observation array for agent S, length ``obs_dim``.
            obs_b: observation array for agent B, length ``obs_dim``.
            explore: if True, add OU noise before clipping (training mode).

        Returns:
            Tuple ``(act_s, act_b)`` of numpy arrays clipped to [0, 1].
        """
        # Switch to eval mode for deterministic forward passes
        # (e.g. in case the actors contain dropout/batchnorm layers).
        self.actor_s.eval()
        self.actor_b.eval()
        with torch.no_grad():
            obs_s_t = torch.FloatTensor(obs_s).unsqueeze(0).to(self.device)
            obs_b_t = torch.FloatTensor(obs_b).unsqueeze(0).to(self.device)
            act_s = self.actor_s(obs_s_t).cpu().numpy()[0]
            act_b = self.actor_b(obs_b_t).cpu().numpy()[0]
        self.actor_s.train()
        self.actor_b.train()

        if explore:
            # Add OU noise during training for exploration, then clip
            # back into the valid action range.
            act_s = np.clip(act_s + self.noise_s.sample(), 0.0, 1.0)
            act_b = np.clip(act_b + self.noise_b.sample(), 0.0, 1.0)
        else:
            act_s = np.clip(act_s, 0.0, 1.0)
            act_b = np.clip(act_b, 0.0, 1.0)
        return act_s, act_b

    def compute_rewards(self, qoe_s, qoe_b, qoe_sys):
        """Compute rewards with fixed λ = 0.5.

        Formula: ``r_i = λ * r_coop_i + (1 - λ) * r_comp_i`` with λ = 0.5,
        i.e. an equal-weight sum of the cooperative and competitive terms.

        Args:
            qoe_s: QoE metric of agent S.
            qoe_b: QoE metric of agent B.
            qoe_sys: system-level QoE metric.

        Returns:
            Tuple ``(r_s, r_b, lam)`` — both agents' rewards and the λ used.
        """
        lam = self.fixed_lambda
        rew_cfg = self.config.get('reward', {})
        coop_self = rew_cfg.get('coop_self', 0.5)
        coop_other = rew_cfg.get('coop_other', 0.3)
        coop_sys = rew_cfg.get('coop_sys', 0.2)
        comp_self = rew_cfg.get('comp_self', 0.8)
        comp_sys = rew_cfg.get('comp_sys', 0.2)

        # Cooperative and competitive components for S.
        r_coop_s = coop_self * qoe_s + coop_other * qoe_b + coop_sys * qoe_sys
        r_comp_s = comp_self * qoe_s + comp_sys * qoe_sys
        r_s = lam * r_coop_s + (1 - lam) * r_comp_s

        # Cooperative and competitive components for B (mirror of S).
        r_coop_b = coop_self * qoe_b + coop_other * qoe_s + coop_sys * qoe_sys
        r_comp_b = comp_self * qoe_b + comp_sys * qoe_sys
        r_b = lam * r_coop_b + (1 - lam) * r_comp_b
        return r_s, r_b, lam

    def update(self):
        """Update networks using the Stackelberg update order.

        Order: follower B updates first, then leader S updates considering
        B's (freshly recomputed) response.

        Returns:
            Dict of the four scalar losses, or ``None`` if the replay buffer
            does not yet hold a full batch.
        """
        if len(self.replay_buffer) < self.batch_size:
            return None

        # Sample a batch of transitions from the replay buffer.
        obs_s, obs_b, act_s, act_b, rew_s, rew_b, next_obs_s, next_obs_b, dones = \
            self.replay_buffer.sample(self.batch_size)

        # Convert to tensors on the training device.
        obs_s = torch.FloatTensor(obs_s).to(self.device)
        obs_b = torch.FloatTensor(obs_b).to(self.device)
        act_s = torch.FloatTensor(act_s).to(self.device)
        act_b = torch.FloatTensor(act_b).to(self.device)
        rew_s = torch.FloatTensor(rew_s).unsqueeze(1).to(self.device)
        rew_b = torch.FloatTensor(rew_b).unsqueeze(1).to(self.device)
        next_obs_s = torch.FloatTensor(next_obs_s).to(self.device)
        next_obs_b = torch.FloatTensor(next_obs_b).to(self.device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(self.device)

        # Centralized (joint) observations and actions.
        joint_obs = torch.cat([obs_s, obs_b], dim=1)
        joint_next_obs = torch.cat([next_obs_s, next_obs_b], dim=1)
        joint_act = torch.cat([act_s, act_b], dim=1)

        # TD targets for both critics, using target actors/critics.
        with torch.no_grad():
            next_act_s = self.actor_s_target(next_obs_s)
            next_act_b = self.actor_b_target(next_obs_b)
            joint_next_act = torch.cat([next_act_s, next_act_b], dim=1)
            target_q_s = rew_s + self.gamma * (1 - dones) * \
                self.critic_s_target(joint_next_obs, joint_next_act)
            target_q_b = rew_b + self.gamma * (1 - dones) * \
                self.critic_b_target(joint_next_obs, joint_next_act)

        # --- Stackelberg: update follower B first ---

        # Update Critic B on the sampled joint actions.
        current_q_b = self.critic_b(joint_obs, joint_act)
        critic_loss_b = F.mse_loss(current_q_b, target_q_b)
        self.critic_b_optimizer.zero_grad()
        critic_loss_b.backward()
        self.critic_b_optimizer.step()

        # Update Actor B (follower): maximize Q_b w.r.t. B's own action,
        # holding S's sampled action fixed.
        new_act_b = self.actor_b(obs_b)
        actor_loss_b = -self.critic_b(joint_obs, torch.cat([act_s, new_act_b], dim=1)).mean()
        self.actor_b_optimizer.zero_grad()
        actor_loss_b.backward()
        self.actor_b_optimizer.step()

        # --- Then update leader S ---

        # Re-compute the follower's best response (after B's update) for the
        # leader's critic update; detached so no gradients flow into B.
        with torch.no_grad():
            act_b_br = self.actor_b(obs_b)
        joint_act_leader = torch.cat([act_s, act_b_br], dim=1)

        # Update Critic S.
        current_q_s = self.critic_s(joint_obs, joint_act_leader)
        critic_loss_s = F.mse_loss(current_q_s, target_q_s)
        self.critic_s_optimizer.zero_grad()
        critic_loss_s.backward()
        self.critic_s_optimizer.step()

        # Update Actor S (leader) against the follower's best response.
        with torch.no_grad():
            act_b_br2 = self.actor_b(obs_b)
        new_act_s = self.actor_s(obs_s)
        actor_loss_s = -self.critic_s(joint_obs, torch.cat([new_act_s, act_b_br2], dim=1)).mean()
        self.actor_s_optimizer.zero_grad()
        actor_loss_s.backward()
        self.actor_s_optimizer.step()

        # Soft-update all target networks: θ' ← τθ + (1 − τ)θ'.
        for target, source in [
            (self.critic_s_target, self.critic_s),
            (self.critic_b_target, self.critic_b),
            (self.actor_s_target, self.actor_s),
            (self.actor_b_target, self.actor_b),
        ]:
            for tp, sp in zip(target.parameters(), source.parameters()):
                tp.data.copy_(self.tau * sp.data + (1.0 - self.tau) * tp.data)

        return {
            'actor_loss_s': actor_loss_s.item(),
            'actor_loss_b': actor_loss_b.item(),
            'critic_loss_s': critic_loss_s.item(),
            'critic_loss_b': critic_loss_b.item(),
        }

    def save(self, path):
        """Save the four online networks' state dicts under ``path``."""
        os.makedirs(path, exist_ok=True)
        torch.save(self.actor_s.state_dict(), os.path.join(path, "actor_s.pth"))
        torch.save(self.actor_b.state_dict(), os.path.join(path, "actor_b.pth"))
        torch.save(self.critic_s.state_dict(), os.path.join(path, "critic_s.pth"))
        torch.save(self.critic_b.state_dict(), os.path.join(path, "critic_b.pth"))

    def load(self, path):
        """Load models from disk and re-sync the target networks."""
        self.actor_s.load_state_dict(
            torch.load(os.path.join(path, "actor_s.pth"), map_location=self.device))
        self.actor_b.load_state_dict(
            torch.load(os.path.join(path, "actor_b.pth"), map_location=self.device))
        self.critic_s.load_state_dict(
            torch.load(os.path.join(path, "critic_s.pth"), map_location=self.device))
        self.critic_b.load_state_dict(
            torch.load(os.path.join(path, "critic_b.pth"), map_location=self.device))
        # Targets are reset to exact copies of the loaded online networks.
        self.actor_s_target.load_state_dict(self.actor_s.state_dict())
        self.actor_b_target.load_state_dict(self.actor_b.state_dict())
        self.critic_s_target.load_state_dict(self.critic_s.state_dict())
        self.critic_b_target.load_state_dict(self.critic_b.state_dict())