import os

import numpy as np
import torch
import torch.nn.functional as F

from agents.actor import Actor
from agents.critic import Critic
from agents.replay_buffer import ReplayBuffer
from agents.noise import OUNoise

"""
Baseline: FixedLambda
=====================

Purpose (ablation):
- This baseline is used to evaluate the benefit of the dynamic lambda
  switching mechanism in Co-MADDPG.
- It fixes λ at a constant value (0.5), balancing cooperation and
  competition equally throughout training.

Difference from Co-MADDPG:
1. Lambda (λ): fixed at 0.5, whereas Co-MADDPG dynamically adjusts λ
   based on the system state.
2. Update order: retains the Stackelberg update order (follower B first,
   then leader S), same as Co-MADDPG.

Contribution:
- Supports performance sensitivity analysis regarding the choice of λ and
  shows why a fixed cooperation/competition balance is suboptimal.
"""


class FixedLambda:
    """FixedLambda algorithm implementation (fixed λ = 0.5 baseline)."""

    def __init__(self, config):
        """Build actors, critics, target networks, optimizers, replay buffer
        and exploration noise for the two agents S (leader) and B (follower).

        Args:
            config: nested dict with 'training', 'env', 'network'
                (and optionally 'reward') sections.
        """
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Hyperparameters: discount factor, soft-update rate, batch size,
        # and the fixed λ = 0.5 that defines this baseline.
        self.gamma = config['training']['gamma']
        self.tau = config['training']['tau']
        self.batch_size = config['training']['batch_size']
        self.fixed_lambda = 0.5

        # Per-agent observation/action dimensions.
        # NOTE(review): obs_dim = num_subcarriers + 4 extra scalar features —
        # confirm against the environment's observation layout.
        self.obs_dim = config['env']['num_subcarriers'] + 4
        self.act_dim = 3

        # Actor networks and their target networks (targets start as copies).
        hidden_a = config['network']['actor_hidden']
        hidden_c = config['network']['critic_hidden']
        self.actor_s = Actor(self.obs_dim, self.act_dim, hidden_a).to(self.device)
        self.actor_b = Actor(self.obs_dim, self.act_dim, hidden_a).to(self.device)
        self.actor_s_target = Actor(self.obs_dim, self.act_dim, hidden_a).to(self.device)
        self.actor_b_target = Actor(self.obs_dim, self.act_dim, hidden_a).to(self.device)
        self.actor_s_target.load_state_dict(self.actor_s.state_dict())
        self.actor_b_target.load_state_dict(self.actor_b.state_dict())

        # Joint critics for centralized training: each critic sees the
        # concatenated observations and actions of both agents.
        obs_total = self.obs_dim * 2
        act_total = self.act_dim * 2
        self.critic_s = Critic(obs_total, act_total, hidden_c).to(self.device)
        self.critic_b = Critic(obs_total, act_total, hidden_c).to(self.device)
        self.critic_s_target = Critic(obs_total, act_total, hidden_c).to(self.device)
        self.critic_b_target = Critic(obs_total, act_total, hidden_c).to(self.device)
        self.critic_s_target.load_state_dict(self.critic_s.state_dict())
        self.critic_b_target.load_state_dict(self.critic_b.state_dict())

        # Optimizers for actors and critics.
        self.actor_s_optimizer = torch.optim.Adam(self.actor_s.parameters(),
                                                  lr=config['training']['actor_lr'])
        self.actor_b_optimizer = torch.optim.Adam(self.actor_b.parameters(),
                                                  lr=config['training']['actor_lr'])
        self.critic_s_optimizer = torch.optim.Adam(self.critic_s.parameters(),
                                                   lr=config['training']['critic_lr'])
        self.critic_b_optimizer = torch.optim.Adam(self.critic_b.parameters(),
                                                   lr=config['training']['critic_lr'])

        # Experience replay and OU noise for exploration (one process per agent).
        self.replay_buffer = ReplayBuffer(config['training']['buffer_capacity'])
        self.noise_s = OUNoise(self.act_dim,
                               theta=config['training']['ou_theta'],
                               sigma_init=config['training']['ou_sigma_init'],
                               sigma_min=config['training']['ou_sigma_min'])
        self.noise_b = OUNoise(self.act_dim,
                               theta=config['training']['ou_theta'],
                               sigma_init=config['training']['ou_sigma_init'],
                               sigma_min=config['training']['ou_sigma_min'])

    def select_action(self, obs_s, obs_b, explore=True):
        """Select actions for both agents given their observations.

        Args:
            obs_s: observation array for agent S, length ``obs_dim``.
            obs_b: observation array for agent B, length ``obs_dim``.
            explore: if True, add OU noise before clipping (training mode).

        Returns:
            Tuple ``(act_s, act_b)`` of numpy arrays clipped to [0, 1].
        """
        # Switch to eval mode for deterministic forward passes
        # (e.g. in case the actors contain dropout/batchnorm layers).
        self.actor_s.eval()
        self.actor_b.eval()
        with torch.no_grad():
            obs_s_t = torch.FloatTensor(obs_s).unsqueeze(0).to(self.device)
            obs_b_t = torch.FloatTensor(obs_b).unsqueeze(0).to(self.device)
            act_s = self.actor_s(obs_s_t).cpu().numpy()[0]
            act_b = self.actor_b(obs_b_t).cpu().numpy()[0]
        self.actor_s.train()
        self.actor_b.train()

        if explore:
            # Add OU noise during training for exploration, then clip
            # back into the valid action range.
            act_s = np.clip(act_s + self.noise_s.sample(), 0.0, 1.0)
            act_b = np.clip(act_b + self.noise_b.sample(), 0.0, 1.0)
        else:
            act_s = np.clip(act_s, 0.0, 1.0)
            act_b = np.clip(act_b, 0.0, 1.0)
        return act_s, act_b

    def compute_rewards(self, qoe_s, qoe_b, qoe_sys):
        """Compute rewards with fixed λ = 0.5.

        Formula: ``r_i = λ * r_coop_i + (1 - λ) * r_comp_i`` with λ = 0.5,
        i.e. an equal-weight sum of the cooperative and competitive terms.

        Args:
            qoe_s: QoE metric of agent S.
            qoe_b: QoE metric of agent B.
            qoe_sys: system-level QoE metric.

        Returns:
            Tuple ``(r_s, r_b, lam)`` — both agents' rewards and the λ used.
        """
        lam = self.fixed_lambda
        rew_cfg = self.config.get('reward', {})
        coop_self = rew_cfg.get('coop_self', 0.5)
        coop_other = rew_cfg.get('coop_other', 0.3)
        coop_sys = rew_cfg.get('coop_sys', 0.2)
        comp_self = rew_cfg.get('comp_self', 0.8)
        comp_sys = rew_cfg.get('comp_sys', 0.2)

        # Cooperative and competitive components for S.
        r_coop_s = coop_self * qoe_s + coop_other * qoe_b + coop_sys * qoe_sys
        r_comp_s = comp_self * qoe_s + comp_sys * qoe_sys
        r_s = lam * r_coop_s + (1 - lam) * r_comp_s

        # Cooperative and competitive components for B (mirror of S).
        r_coop_b = coop_self * qoe_b + coop_other * qoe_s + coop_sys * qoe_sys
        r_comp_b = comp_self * qoe_b + comp_sys * qoe_sys
        r_b = lam * r_coop_b + (1 - lam) * r_comp_b
        return r_s, r_b, lam

    def update(self):
        """Update networks using the Stackelberg update order.

        Order: follower B updates first, then leader S updates considering
        B's (freshly recomputed) response.

        Returns:
            Dict of the four scalar losses, or ``None`` if the replay buffer
            does not yet hold a full batch.
        """
        if len(self.replay_buffer) < self.batch_size:
            return None

        # Sample a batch of transitions from the replay buffer.
        obs_s, obs_b, act_s, act_b, rew_s, rew_b, next_obs_s, next_obs_b, dones = \
            self.replay_buffer.sample(self.batch_size)

        # Convert to tensors on the training device.
        obs_s = torch.FloatTensor(obs_s).to(self.device)
        obs_b = torch.FloatTensor(obs_b).to(self.device)
        act_s = torch.FloatTensor(act_s).to(self.device)
        act_b = torch.FloatTensor(act_b).to(self.device)
        rew_s = torch.FloatTensor(rew_s).unsqueeze(1).to(self.device)
        rew_b = torch.FloatTensor(rew_b).unsqueeze(1).to(self.device)
        next_obs_s = torch.FloatTensor(next_obs_s).to(self.device)
        next_obs_b = torch.FloatTensor(next_obs_b).to(self.device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(self.device)

        # Centralized (joint) observations and actions.
        joint_obs = torch.cat([obs_s, obs_b], dim=1)
        joint_next_obs = torch.cat([next_obs_s, next_obs_b], dim=1)
        joint_act = torch.cat([act_s, act_b], dim=1)

        # TD targets for both critics, using target actors/critics.
        with torch.no_grad():
            next_act_s = self.actor_s_target(next_obs_s)
            next_act_b = self.actor_b_target(next_obs_b)
            joint_next_act = torch.cat([next_act_s, next_act_b], dim=1)
            target_q_s = rew_s + self.gamma * (1 - dones) * \
                self.critic_s_target(joint_next_obs, joint_next_act)
            target_q_b = rew_b + self.gamma * (1 - dones) * \
                self.critic_b_target(joint_next_obs, joint_next_act)

        # --- Stackelberg: update follower B first ---

        # Update Critic B on the sampled joint actions.
        current_q_b = self.critic_b(joint_obs, joint_act)
        critic_loss_b = F.mse_loss(current_q_b, target_q_b)
        self.critic_b_optimizer.zero_grad()
        critic_loss_b.backward()
        self.critic_b_optimizer.step()

        # Update Actor B (follower): maximize Q_b w.r.t. B's own action,
        # holding S's sampled action fixed.
        new_act_b = self.actor_b(obs_b)
        actor_loss_b = -self.critic_b(joint_obs, torch.cat([act_s, new_act_b], dim=1)).mean()
        self.actor_b_optimizer.zero_grad()
        actor_loss_b.backward()
        self.actor_b_optimizer.step()

        # --- Then update leader S ---

        # Re-compute the follower's best response (after B's update) for the
        # leader's critic update; detached so no gradients flow into B.
        with torch.no_grad():
            act_b_br = self.actor_b(obs_b)
        joint_act_leader = torch.cat([act_s, act_b_br], dim=1)

        # Update Critic S.
        current_q_s = self.critic_s(joint_obs, joint_act_leader)
        critic_loss_s = F.mse_loss(current_q_s, target_q_s)
        self.critic_s_optimizer.zero_grad()
        critic_loss_s.backward()
        self.critic_s_optimizer.step()

        # Update Actor S (leader) against the follower's best response.
        with torch.no_grad():
            act_b_br2 = self.actor_b(obs_b)
        new_act_s = self.actor_s(obs_s)
        actor_loss_s = -self.critic_s(joint_obs, torch.cat([new_act_s, act_b_br2], dim=1)).mean()
        self.actor_s_optimizer.zero_grad()
        actor_loss_s.backward()
        self.actor_s_optimizer.step()

        # Soft-update all target networks: θ' ← τθ + (1 − τ)θ'.
        for target, source in [
            (self.critic_s_target, self.critic_s),
            (self.critic_b_target, self.critic_b),
            (self.actor_s_target, self.actor_s),
            (self.actor_b_target, self.actor_b),
        ]:
            for tp, sp in zip(target.parameters(), source.parameters()):
                tp.data.copy_(self.tau * sp.data + (1.0 - self.tau) * tp.data)

        return {
            'actor_loss_s': actor_loss_s.item(),
            'actor_loss_b': actor_loss_b.item(),
            'critic_loss_s': critic_loss_s.item(),
            'critic_loss_b': critic_loss_b.item(),
        }

    def save(self, path):
        """Save the four online networks' state dicts under ``path``."""
        os.makedirs(path, exist_ok=True)
        torch.save(self.actor_s.state_dict(), os.path.join(path, "actor_s.pth"))
        torch.save(self.actor_b.state_dict(), os.path.join(path, "actor_b.pth"))
        torch.save(self.critic_s.state_dict(), os.path.join(path, "critic_s.pth"))
        torch.save(self.critic_b.state_dict(), os.path.join(path, "critic_b.pth"))

    def load(self, path):
        """Load models from disk and re-sync the target networks."""
        self.actor_s.load_state_dict(
            torch.load(os.path.join(path, "actor_s.pth"), map_location=self.device))
        self.actor_b.load_state_dict(
            torch.load(os.path.join(path, "actor_b.pth"), map_location=self.device))
        self.critic_s.load_state_dict(
            torch.load(os.path.join(path, "critic_s.pth"), map_location=self.device))
        self.critic_b.load_state_dict(
            torch.load(os.path.join(path, "critic_b.pth"), map_location=self.device))
        # Targets are reset to exact copies of the loaded online networks.
        self.actor_s_target.load_state_dict(self.actor_s.state_dict())
        self.actor_b_target.load_state_dict(self.actor_b.state_dict())
        self.critic_s_target.load_state_dict(self.critic_s.state_dict())
        self.critic_b_target.load_state_dict(self.critic_b.state_dict())