"""Baseline: SingleAgentDQN (single-agent DQN baseline).

Purpose (non-MARL baseline):
    - This baseline represents a traditional single-agent approach to the
      resource allocation problem.
    - It uses a centralized DQN that controls both user groups by
      discretizing the continuous action space.

Difference from Co-MADDPG:
    1. Algorithm class: non-MARL (DQN) vs MARL (Co-MADDPG).
    2. Action space: discrete (48 action combinations) vs continuous.
    3. Architecture: centralized control vs decentralized execution with CTDE.
    4. Exploration: epsilon-greedy vs OU noise.

Contribution:
    - Contributes to performance tables showing the limitations of
      discretization and centralized control in complex multi-user scenarios.
"""

import itertools
import os
import random
from collections import deque

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# ---- Discrete action mapping ----
# 4 levels for the subcarrier fraction, 4 for the power fraction, 3 for m_param.
N_SUB_LEVELS = [0.25, 0.5, 0.75, 1.0]
P_FRAC_LEVELS = [0.25, 0.5, 0.75, 1.0]
M_PARAM_LEVELS = [0.33, 0.66, 1.0]
NUM_ACTIONS = len(N_SUB_LEVELS) * len(P_FRAC_LEVELS) * len(M_PARAM_LEVELS)  # 48

# Lookup table: action index -> np.array([n_sub_frac, p_frac, m_param]).
# itertools.product varies the last factor fastest, so the index order is
# n_sub-major, then p_frac, then m_param.
_ACTION_TABLE = [
    np.array(levels, dtype=np.float32)
    for levels in itertools.product(N_SUB_LEVELS, P_FRAC_LEVELS, M_PARAM_LEVELS)
]


class DQNNet(nn.Module):
    """Simple fully connected Q-network with two hidden layers of 256 units."""

    def __init__(self, state_dim, num_actions):
        """Build the MLP.

        Args:
            state_dim: Size of the input state vector.
            num_actions: Number of discrete actions (size of the output layer).
        """
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, num_actions),
        )

    def forward(self, x):
        """Map a batch of states to one Q-value per discrete action."""
        return self.net(x)


class DQNReplayBuffer:
    """Replay buffer wrapper for SingleAgentDQN.

    Accepts the multi-agent 9-argument ``push`` signature but stores
    transitions in a form suitable for DQN: a concatenated state, the two
    discrete action indices, one scalar reward, the next state and a done flag.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transitions."""
        self.buffer = deque(maxlen=capacity)
        self._last_action_s_idx = 0
        self._last_action_b_idx = 0

    def set_last_actions(self, idx_s, idx_b):
        """Remember the discrete action indices to store on the next push."""
        self._last_action_s_idx = idx_s
        self._last_action_b_idx = idx_b

    def push(self, obs_s, obs_b, act_s, act_b, rew_s, rew_b,
             next_obs_s, next_obs_b, done=False):
        """Store a multi-agent step as a single-agent transition.

        ``act_s`` and ``act_b`` are accepted for signature compatibility only
        and are not stored; the action indices recorded are the ones most
        recently passed to ``set_last_actions``.

        Args:
            obs_s, obs_b: Per-group observations; concatenated into the state.
            act_s, act_b: Ignored (see above).
            rew_s, rew_b: Per-group rewards; their mean is stored.
            next_obs_s, next_obs_b: Per-group next observations.
            done: Episode-termination flag, stored as a float.
        """
        # Concatenate observations to form the centralized state.
        state = np.concatenate([
            np.asarray(obs_s, dtype=np.float32),
            np.asarray(obs_b, dtype=np.float32),
        ])
        next_state = np.concatenate([
            np.asarray(next_obs_s, dtype=np.float32),
            np.asarray(next_obs_b, dtype=np.float32),
        ])
        # Average the two rewards into one scalar single-agent reward.
        reward = 0.5 * (float(rew_s) + float(rew_b))
        self.buffer.append((
            state,
            self._last_action_s_idx,
            self._last_action_b_idx,
            reward,
            next_state,
            float(done),
        ))

    def sample(self, batch_size):
        """Sample ``batch_size`` transitions uniformly without replacement.

        Returns:
            Tuple ``(states, a_s, a_b, rewards, next_states, dones)`` of NumPy
            arrays, each with ``batch_size`` entries along the first axis.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        batch = random.sample(self.buffer, batch_size)
        states, a_s, a_b, rewards, next_states, dones = zip(*batch)
        return (
            np.array(states),
            np.array(a_s),
            np.array(a_b),
            np.array(rewards, dtype=np.float32),
            np.array(next_states),
            np.array(dones, dtype=np.float32),
        )

    def __len__(self):
        return len(self.buffer)


class _EpsilonAdapter:
    """Expose epsilon decay through a ``decay_sigma`` method.

    This lets the existing training loop in train.py, which decays
    exploration noise via ``noise_s.decay_sigma(episode)``, drive the
    epsilon-greedy schedule without modification.
    """

    def __init__(self, decay_fn):
        self._decay_fn = decay_fn

    def decay_sigma(self, ep):
        """Forward the episode number to the wrapped decay callback."""
        return self._decay_fn(ep)


class SingleAgentDQN:
    """Centralized DQN baseline with one Q-network per user group.

    Both networks see the same concatenated state and are trained on the same
    scalar reward; each selects one of ``NUM_ACTIONS`` discrete actions that is
    mapped back to a continuous action vector through the lookup table.
    """

    def __init__(self, config):
        """Build networks, optimizers, exploration schedule and replay buffer.

        Args:
            config: Nested dict. Reads ``config['training']`` keys ``gamma``,
                ``batch_size``, ``tau``, ``buffer_capacity`` and optionally
                ``actor_lr`` (default 1e-4), plus
                ``config['env']['num_subcarriers']``.
        """
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Hyperparameters.
        self.gamma = config['training']['gamma']
        self.batch_size = config['training']['batch_size']
        self.tau = config['training']['tau']

        # Dimensions: the state is the concatenation of both observations.
        self.obs_dim = config['env']['num_subcarriers'] + 4
        self.state_dim = self.obs_dim * 2
        self.num_actions = NUM_ACTIONS

        # Two DQN heads: one for semantic (s) actions, one for traditional (b)
        # actions, each with its own target network.
        self.q_net_s = DQNNet(self.state_dim, self.num_actions).to(self.device)
        self.q_net_b = DQNNet(self.state_dim, self.num_actions).to(self.device)
        self.q_target_s = DQNNet(self.state_dim, self.num_actions).to(self.device)
        self.q_target_b = DQNNet(self.state_dim, self.num_actions).to(self.device)
        self.q_target_s.load_state_dict(self.q_net_s.state_dict())
        self.q_target_b.load_state_dict(self.q_net_b.state_dict())

        # Optimizers.
        lr = config['training'].get('actor_lr', 1e-4)
        self.optimizer_s = torch.optim.Adam(self.q_net_s.parameters(), lr=lr)
        self.optimizer_b = torch.optim.Adam(self.q_net_b.parameters(), lr=lr)

        # Epsilon-greedy exploration: linear schedule from epsilon_start to
        # epsilon_min over epsilon_decay_episodes episodes.
        self.epsilon_start = 1.0
        self.epsilon = self.epsilon_start
        self.epsilon_min = 0.01
        self.epsilon_decay_episodes = 3000

        # Specialized replay buffer.
        self.replay_buffer = DQNReplayBuffer(config['training']['buffer_capacity'])

        # Discrete action index tracking.
        self._last_action_s_idx = 0
        self._last_action_b_idx = 0

        # Adapter so train.py's existing noise-decay call drives epsilon decay.
        self.noise_s = _EpsilonAdapter(self._decay_epsilon)

    def select_action(self, obs_s, obs_b, explore=True):
        """Select discrete actions using an epsilon-greedy policy.

        Side effect: the chosen indices are recorded on this object and on the
        replay buffer so that the next ``replay_buffer.push`` stores them.

        Args:
            obs_s: Observation of the semantic group.
            obs_b: Observation of the traditional group.
            explore: If True, pick random actions with probability epsilon.

        Returns:
            Tuple ``(action_s, action_b)`` of float32 arrays
            ``[n_sub_frac, p_frac, m_param]`` copied from the lookup table.
        """
        if explore and random.random() < self.epsilon:
            # Random exploration: independent uniform draw for each head.
            idx_s = random.randrange(self.num_actions)
            idx_b = random.randrange(self.num_actions)
        else:
            # Exploit the learned Q-values; the state tensor is only needed here.
            state = np.concatenate([obs_s, obs_b]).astype(np.float32)
            state_t = torch.as_tensor(
                state, dtype=torch.float32, device=self.device
            ).unsqueeze(0)
            self.q_net_s.eval()
            self.q_net_b.eval()
            with torch.no_grad():
                q_s = self.q_net_s(state_t)
                q_b = self.q_net_b(state_t)
            self.q_net_s.train()
            self.q_net_b.train()
            idx_s = q_s.argmax(dim=1).item()
            idx_b = q_b.argmax(dim=1).item()

        # Record the indices for the upcoming replay-buffer push.
        self._last_action_s_idx = idx_s
        self._last_action_b_idx = idx_b
        self.replay_buffer.set_last_actions(idx_s, idx_b)

        # Return copies so callers cannot mutate the shared lookup table.
        return _ACTION_TABLE[idx_s].copy(), _ACTION_TABLE[idx_b].copy()

    def compute_rewards(self, qoe_s, qoe_b, qoe_sys):
        """Compute the scalar reward shared by both heads.

        Because a single agent controls both groups, the reward is the mean of
        the two group QoE values: ``r = 0.5 * (qoe_s + qoe_b)``.

        Args:
            qoe_s: QoE of the semantic group.
            qoe_b: QoE of the traditional group.
            qoe_sys: Unused; accepted for interface compatibility.

        Returns:
            Tuple ``(r, r, lam)`` where ``lam`` is the constant 0.5.
        """
        lam = 0.5
        r = 0.5 * (qoe_s + qoe_b)
        return r, r, lam

    def update(self):
        """Run one training step on both Q-networks.

        Returns:
            ``None`` if the buffer holds fewer than ``batch_size`` transitions,
            otherwise a dict ``{'loss_s': float, 'loss_b': float}``.
        """
        if len(self.replay_buffer) < self.batch_size:
            return None

        states, a_s, a_b, rewards, next_states, dones = \
            self.replay_buffer.sample(self.batch_size)

        # Convert the batch to tensors; scalars become (batch, 1) columns so
        # they line up with gather()/max(keepdim=True) outputs.
        def as_float(arr):
            return torch.as_tensor(arr, dtype=torch.float32, device=self.device)

        def as_index(arr):
            return torch.as_tensor(arr, dtype=torch.long, device=self.device)

        states_t = as_float(states)
        next_states_t = as_float(next_states)
        rewards_t = as_float(rewards).unsqueeze(1)
        dones_t = as_float(dones).unsqueeze(1)
        a_s_t = as_index(a_s).unsqueeze(1)
        a_b_t = as_index(a_b).unsqueeze(1)

        # 1. Update the semantic head.
        loss_s = self._update_head(
            self.q_net_s, self.q_target_s, self.optimizer_s,
            states_t, a_s_t, rewards_t, next_states_t, dones_t,
        )
        # 2. Update the traditional head.
        loss_b = self._update_head(
            self.q_net_b, self.q_target_b, self.optimizer_b,
            states_t, a_b_t, rewards_t, next_states_t, dones_t,
        )

        # 3. Soft-update the target networks.
        self._soft_update(self.q_target_s, self.q_net_s)
        self._soft_update(self.q_target_b, self.q_net_b)

        return {'loss_s': loss_s, 'loss_b': loss_b}

    def _update_head(self, q_net, q_target, optimizer,
                     states, actions, rewards, next_states, dones):
        """Take one TD(0) gradient step on ``q_net`` and return the MSE loss."""
        q_values = q_net(states).gather(1, actions)
        with torch.no_grad():
            next_q = q_target(next_states).max(1, keepdim=True)[0]
            # Terminal transitions (done == 1) do not bootstrap.
            target = rewards + self.gamma * (1 - dones) * next_q
        loss = F.mse_loss(q_values, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        return loss.item()

    def _soft_update(self, target, source):
        """Polyak-average ``source`` parameters into ``target`` with rate tau."""
        with torch.no_grad():
            for tp, sp in zip(target.parameters(), source.parameters()):
                tp.copy_(self.tau * sp + (1.0 - self.tau) * tp)

    def _decay_epsilon(self, episode):
        """Set epsilon for ``episode`` on a linear schedule.

        Epsilon moves linearly from ``epsilon_start`` at episode 0 to
        ``epsilon_min`` at ``epsilon_decay_episodes`` and stays there. The
        value depends only on ``episode``, not on the current epsilon, so
        calling this once per episode (or repeatedly for the same episode)
        yields the intended schedule instead of a compounding decay.
        """
        span = max(1, self.epsilon_decay_episodes)
        frac = min(1.0, max(0.0, episode / span))
        value = self.epsilon_start + frac * (self.epsilon_min - self.epsilon_start)
        # Guard against float rounding dipping below the floor.
        self.epsilon = max(self.epsilon_min, value)

    def save(self, path):
        """Save both online Q-networks' weights into directory ``path``."""
        os.makedirs(path, exist_ok=True)
        torch.save(self.q_net_s.state_dict(), os.path.join(path, "q_net_s.pth"))
        torch.save(self.q_net_b.state_dict(), os.path.join(path, "q_net_b.pth"))

    def load(self, path):
        """Load both Q-networks from ``path`` and sync the target networks.

        NOTE: ``torch.load`` unpickles the file; only load checkpoints from
        trusted sources.
        """
        self.q_net_s.load_state_dict(
            torch.load(os.path.join(path, "q_net_s.pth"), map_location=self.device)
        )
        self.q_net_b.load_state_dict(
            torch.load(os.path.join(path, "q_net_b.pth"), map_location=self.device)
        )
        self.q_target_s.load_state_dict(self.q_net_s.state_dict())
        self.q_target_b.load_state_dict(self.q_net_b.state_dict())