"""
Baseline: PureCompetitive
=========================

Purpose (ablation):
- This baseline removes the cooperative component from the MADDPG framework.
- It serves as an ablation study demonstrating that pure competition (λ=0)
  leads to resource wastage and suboptimal system-wide utility.

Differences from Co-MADDPG:
1. Lambda (λ): fixed at 0.0 (pure competition), whereas Co-MADDPG uses a dynamic λ.
2. Update order: both actors are updated simultaneously, whereas Co-MADDPG follows
   a Stackelberg (leader-follower) update order.

Contribution:
- Feeds the comparison figures illustrating the "Price of Anarchy" in
  resource allocation.
"""

import os

import numpy as np
import torch
import torch.nn.functional as F

from agents.actor import Actor
from agents.critic import Critic
from agents.noise import OUNoise
from agents.replay_buffer import ReplayBuffer


class PureCompetitive:
    """PureCompetitive algorithm implementation."""

    def __init__(self, config):
        # Configuration and device
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Hyperparameters: discount factor, soft-update coefficient, batch size
        self.gamma = config['training']['gamma']
        self.tau = config['training']['tau']
        self.batch_size = config['training']['batch_size']

        # Dimensions: observation (num_subcarriers + 4), action (3)
        self.obs_dim = config['env']['num_subcarriers'] + 4
        self.act_dim = 3

        # Agents: semantic (s) and traditional (b) actors with their target networks
        self.actor_s = Actor(self.obs_dim, self.act_dim, config['network']['actor_hidden']).to(self.device)
        self.actor_b = Actor(self.obs_dim, self.act_dim, config['network']['actor_hidden']).to(self.device)
        self.actor_s_target = Actor(self.obs_dim, self.act_dim, config['network']['actor_hidden']).to(self.device)
        self.actor_b_target = Actor(self.obs_dim, self.act_dim, config['network']['actor_hidden']).to(self.device)
        self.actor_s_target.load_state_dict(self.actor_s.state_dict())
        self.actor_b_target.load_state_dict(self.actor_b.state_dict())

        # Joint critics: centralized training over both agents' observations and actions
        self.critic_s = Critic(self.obs_dim * 2, self.act_dim * 2, config['network']['critic_hidden']).to(self.device)
        self.critic_b = Critic(self.obs_dim * 2, self.act_dim * 2, config['network']['critic_hidden']).to(self.device)
        self.critic_s_target = Critic(self.obs_dim * 2, self.act_dim * 2, config['network']['critic_hidden']).to(self.device)
        self.critic_b_target = Critic(self.obs_dim * 2, self.act_dim * 2, config['network']['critic_hidden']).to(self.device)
        self.critic_s_target.load_state_dict(self.critic_s.state_dict())
        self.critic_b_target.load_state_dict(self.critic_b.state_dict())

        # Optimizers for all networks
        self.actor_s_optimizer = torch.optim.Adam(self.actor_s.parameters(), lr=config['training']['actor_lr'])
        self.actor_b_optimizer = torch.optim.Adam(self.actor_b.parameters(), lr=config['training']['actor_lr'])
        self.critic_s_optimizer = torch.optim.Adam(self.critic_s.parameters(), lr=config['training']['critic_lr'])
        self.critic_b_optimizer = torch.optim.Adam(self.critic_b.parameters(), lr=config['training']['critic_lr'])

        # Experience replay and Ornstein-Uhlenbeck noise for exploration
        self.replay_buffer = ReplayBuffer(config['training']['buffer_capacity'])
        self.noise_s = OUNoise(self.act_dim,
                               theta=config['training']['ou_theta'],
                               sigma_init=config['training']['ou_sigma_init'],
                               sigma_min=config['training']['ou_sigma_min'])
        self.noise_b = OUNoise(self.act_dim,
                               theta=config['training']['ou_theta'],
                               sigma_init=config['training']['ou_sigma_init'],
                               sigma_min=config['training']['ou_sigma_min'])
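
    # Note on the critic interface (inferred from the calls below, so treat the
    # exact Critic signature as an assumption): each joint critic consumes
    # cat([obs_s, obs_b]) of width obs_dim * 2 and cat([act_s, act_b]) of width
    # act_dim * 2, i.e. centralized training with decentralized execution --
    # at execution time each actor sees only its own observation.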

    def select_action(self, obs_s, obs_b, explore=True):
        """Select actions for both agents given their local observations."""
        obs_s = torch.FloatTensor(obs_s).unsqueeze(0).to(self.device)
        obs_b = torch.FloatTensor(obs_b).unsqueeze(0).to(self.device)

        self.actor_s.eval()
        self.actor_b.eval()
        with torch.no_grad():
            # Forward pass through the actors
            act_s = self.actor_s(obs_s).cpu().numpy()[0]
            act_b = self.actor_b(obs_b).cpu().numpy()[0]
        self.actor_s.train()
        self.actor_b.train()

        if explore:
            # Add OU noise for exploration and clip back into the action range
            act_s = np.clip(act_s + self.noise_s.sample(), 0.0, 1.0)
            act_b = np.clip(act_b + self.noise_b.sample(), 0.0, 1.0)
        return act_s, act_b

    def compute_rewards(self, qoe_s, qoe_b, qoe_sys):
        """
        Compute rewards under pure competition (λ = 0).

        Formula: r_i = comp_self * qoe_i + comp_sys * qoe_sys

        With λ fixed at 0, each reward consists solely of the competitive term:
        the agent's own QoE plus a penalty term on the system-wide QoE.
        """
        lam = 0.0
        r_s = self.config['reward']['comp_self'] * qoe_s + self.config['reward']['comp_sys'] * qoe_sys
        r_b = self.config['reward']['comp_self'] * qoe_b + self.config['reward']['comp_sys'] * qoe_sys
        return r_s, r_b, lam
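
    # Worked example with hypothetical coefficients (placeholders, not values
    # taken from any config file): with comp_self = 1.0 and comp_sys = -0.5,
    # inputs qoe_s = 0.8, qoe_b = 0.6, qoe_sys = 1.4 give
    #   r_s = 1.0 * 0.8 - 0.5 * 1.4 =  0.10
    #   r_b = 1.0 * 0.6 - 0.5 * 1.4 = -0.10
    # Each agent is paid for its own QoE and shares the same system-level
    # penalty; there is no cooperative term linking the two rewards.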

    def update(self):
        """
        Update the networks from a sampled batch of experience.

        Update order: simultaneous -- both actors are updated in the same step,
        each policy gradient evaluated against the other agent's replayed
        action (no leader-follower structure).
        """
        if len(self.replay_buffer) < self.batch_size:
            return None

        # Sample a batch from the replay buffer
        obs_s, obs_b, act_s, act_b, rew_s, rew_b, next_obs_s, next_obs_b, dones = \
            self.replay_buffer.sample(self.batch_size)

        # Convert to tensors
        obs_s = torch.FloatTensor(obs_s).to(self.device)
        obs_b = torch.FloatTensor(obs_b).to(self.device)
        act_s = torch.FloatTensor(act_s).to(self.device)
        act_b = torch.FloatTensor(act_b).to(self.device)
        rew_s = torch.FloatTensor(rew_s).unsqueeze(1).to(self.device)
        rew_b = torch.FloatTensor(rew_b).unsqueeze(1).to(self.device)
        next_obs_s = torch.FloatTensor(next_obs_s).to(self.device)
        next_obs_b = torch.FloatTensor(next_obs_b).to(self.device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(self.device)

        # Centralized observations and actions
        joint_obs = torch.cat([obs_s, obs_b], dim=1)
        joint_next_obs = torch.cat([next_obs_s, next_obs_b], dim=1)
        joint_act = torch.cat([act_s, act_b], dim=1)

        # 1. Critic update
        with torch.no_grad():
            # Target actions for the next state
            next_act_s = self.actor_s_target(next_obs_s)
            next_act_b = self.actor_b_target(next_obs_b)
            joint_next_act = torch.cat([next_act_s, next_act_b], dim=1)

            # Bellman targets: y_i = r_i + γ (1 - done) Q'_i(o', a')
            target_q_s = rew_s + self.gamma * (1 - dones) * self.critic_s_target(joint_next_obs, joint_next_act)
            target_q_b = rew_b + self.gamma * (1 - dones) * self.critic_b_target(joint_next_obs, joint_next_act)

        # Current Q values and MSE loss
        current_q_s = self.critic_s(joint_obs, joint_act)
        current_q_b = self.critic_b(joint_obs, joint_act)
        critic_loss_s = F.mse_loss(current_q_s, target_q_s)
        critic_loss_b = F.mse_loss(current_q_b, target_q_b)

        # Backpropagation for the critics
        self.critic_s_optimizer.zero_grad()
        critic_loss_s.backward()
        self.critic_s_optimizer.step()
        self.critic_b_optimizer.zero_grad()
        critic_loss_b.backward()
        self.critic_b_optimizer.step()

        # 2. Actor update (simultaneous)
        new_act_s = self.actor_s(obs_s)
        new_act_b = self.actor_b(obs_b)

        # Policy loss from each joint critic, holding the other agent's
        # replayed action fixed
        actor_loss_s = -self.critic_s(joint_obs, torch.cat([new_act_s, act_b], dim=1)).mean()
        actor_loss_b = -self.critic_b(joint_obs, torch.cat([act_s, new_act_b], dim=1)).mean()

        # Backpropagation for the actors
        self.actor_s_optimizer.zero_grad()
        actor_loss_s.backward()
        self.actor_s_optimizer.step()
        self.actor_b_optimizer.zero_grad()
        actor_loss_b.backward()
        self.actor_b_optimizer.step()

        # 3. Soft update of the target networks: θ' ← τ θ + (1 - τ) θ'
        for target_param, param in zip(self.critic_s_target.parameters(), self.critic_s.parameters()):
            target_param.data.copy_(self.tau * param.data + (1.0 - self.tau) * target_param.data)
        for target_param, param in zip(self.critic_b_target.parameters(), self.critic_b.parameters()):
            target_param.data.copy_(self.tau * param.data + (1.0 - self.tau) * target_param.data)
        for target_param, param in zip(self.actor_s_target.parameters(), self.actor_s.parameters()):
            target_param.data.copy_(self.tau * param.data + (1.0 - self.tau) * target_param.data)
        for target_param, param in zip(self.actor_b_target.parameters(), self.actor_b.parameters()):
            target_param.data.copy_(self.tau * param.data + (1.0 - self.tau) * target_param.data)

        return {
            'actor_loss_s': actor_loss_s.item(),
            'actor_loss_b': actor_loss_b.item(),
            'critic_loss_s': critic_loss_s.item(),
            'critic_loss_b': critic_loss_b.item(),
        }
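
    # For contrast, a Stackelberg-style actor update (the order Co-MADDPG is
    # described as using; the sketch below is an assumption about that variant,
    # not an exact reproduction of it) is sequential rather than simultaneous:
    # the leader's actor steps first, and the follower's policy loss is then
    # recomputed against the leader's *updated* policy, e.g.:
    #
    #     new_act_s = self.actor_s(obs_s)          # leader updates first
    #     actor_loss_s = -self.critic_s(joint_obs, torch.cat([new_act_s, act_b], dim=1)).mean()
    #     ...step actor_s_optimizer...
    #     with torch.no_grad():
    #         led_act_s = self.actor_s(obs_s)      # leader's fresh policy
    #     new_act_b = self.actor_b(obs_b)          # follower best-responds
    #     actor_loss_b = -self.critic_b(joint_obs, torch.cat([led_act_s, new_act_b], dim=1)).mean()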

    def save(self, path):
        """Save all model weights to disk."""
        os.makedirs(path, exist_ok=True)
        torch.save(self.actor_s.state_dict(), os.path.join(path, "actor_s.pth"))
        torch.save(self.actor_b.state_dict(), os.path.join(path, "actor_b.pth"))
        torch.save(self.critic_s.state_dict(), os.path.join(path, "critic_s.pth"))
        torch.save(self.critic_b.state_dict(), os.path.join(path, "critic_b.pth"))

    def load(self, path):
        """Load model weights from disk and resynchronize the target networks."""
        self.actor_s.load_state_dict(torch.load(os.path.join(path, "actor_s.pth"), map_location=self.device))
        self.actor_b.load_state_dict(torch.load(os.path.join(path, "actor_b.pth"), map_location=self.device))
        self.critic_s.load_state_dict(torch.load(os.path.join(path, "critic_s.pth"), map_location=self.device))
        self.critic_b.load_state_dict(torch.load(os.path.join(path, "critic_b.pth"), map_location=self.device))
        self.actor_s_target.load_state_dict(self.actor_s.state_dict())
        self.actor_b_target.load_state_dict(self.actor_b.state_dict())
        self.critic_s_target.load_state_dict(self.critic_s.state_dict())
        self.critic_b_target.load_state_dict(self.critic_b.state_dict())
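

# Minimal training-loop sketch (illustrative only). Assumptions not confirmed
# by this file: the config path and environment module are hypothetical; the
# environment exposes reset() -> (obs_s, obs_b) and
# step(act_s, act_b) -> (next_obs_s, next_obs_b, qoe_s, qoe_b, qoe_sys, done);
# and ReplayBuffer.push accepts the same nine fields that sample() returns.
if __name__ == "__main__":
    import yaml

    with open("configs/pure_competitive.yaml") as f:  # hypothetical path
        config = yaml.safe_load(f)

    from envs.sem_env import SemEnv  # hypothetical environment module
    env = SemEnv(config)
    agent = PureCompetitive(config)

    for episode in range(config['training']['num_episodes']):
        obs_s, obs_b = env.reset()
        done = False
        while not done:
            act_s, act_b = agent.select_action(obs_s, obs_b, explore=True)
            next_obs_s, next_obs_b, qoe_s, qoe_b, qoe_sys, done = env.step(act_s, act_b)
            r_s, r_b, _ = agent.compute_rewards(qoe_s, qoe_b, qoe_sys)
            agent.replay_buffer.push(obs_s, obs_b, act_s, act_b,
                                     r_s, r_b, next_obs_s, next_obs_b, done)
            agent.update()
            obs_s, obs_b = next_obs_s, next_obs_b

    agent.save("checkpoints/pure_competitive")  # hypothetical output path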