import os

import numpy as np
import torch
import torch.nn.functional as F

from agents.actor import Actor
from agents.critic import Critic
from agents.noise import OUNoise
from agents.replay_buffer import ReplayBuffer
"""
|
||
Baseline: PureCooperative (纯协作基线)
|
||
=====================================
|
||
Purpose (ablation):
|
||
- This baseline removes the competitive component from the MADDPG framework.
|
||
- It serves as an ablation study to demonstrate the necessity of competitive modeling (λ < 1) for system performance.
|
||
- 目的(消融实验):该基线移除了 MADDPG 框架中的竞争成分。作为消融实验,用于证明在系统中引入竞争建模(λ < 1)对性能提升的必要性。
|
||
|
||
Difference from Co-MADDPG:
|
||
1. Lambda (λ): Fixed at 1.0 (pure cooperation), whereas Co-MADDPG uses dynamic λ.
|
||
2. Update Order: Uses simultaneous updates for both actors, whereas Co-MADDPG uses Stackelberg update order.
|
||
3. 与 Co-MADDPG 的区别:
|
||
- Lambda (λ): 固定为 1.0(纯协作),而 Co-MADDPG 使用动态 λ。
|
||
- 更新顺序:两个参与者同时更新(Simultaneous Update),而 Co-MADDPG 使用 Stackelberg 博弈更新顺序。
|
||
|
||
Contribution:
|
||
- Contributes to performance comparison figures and tables (e.g., convergence speed and final QoE) to show how pure cooperation handles resource conflicts.
|
||
- 贡献:用于性能对比图表(如收敛速度和最终 QoE),展示纯协作模式在处理资源冲突时的表现。
|
||
"""
|
||
|
||
class PureCooperative:
    """Pure-cooperation baseline: two-agent MADDPG with competition removed.

    Ablation baseline for Co-MADDPG. Differences from Co-MADDPG:
      1. Lambda is fixed at 1.0 (pure cooperation) instead of dynamic.
      2. Both actors are updated simultaneously, not in Stackelberg order.

    Used to demonstrate that competitive modeling (lambda < 1) is necessary
    for system performance (convergence speed, final QoE).
    """

    def __init__(self, config):
        """Build actors, centralized critics, targets, optimizers, buffer, noise.

        Args:
            config: nested dict with 'training', 'env', 'network' and
                'reward' sections (see the exact key accesses below).
        """
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Core hyperparameters.
        self.gamma = config['training']['gamma']            # discount factor
        self.tau = config['training']['tau']                # soft-update rate
        self.batch_size = config['training']['batch_size']

        # Per-agent observation: subcarrier vector plus 4 scalar features.
        # Each agent outputs a 3-dimensional continuous action.
        self.obs_dim = config['env']['num_subcarriers'] + 4
        self.act_dim = 3

        # Decentralized actors for the semantic (s) and traditional (b)
        # agents, each with a target copy initialized to the online weights.
        actor_hidden = config['network']['actor_hidden']
        self.actor_s = Actor(self.obs_dim, self.act_dim, actor_hidden).to(self.device)
        self.actor_b = Actor(self.obs_dim, self.act_dim, actor_hidden).to(self.device)
        self.actor_s_target = Actor(self.obs_dim, self.act_dim, actor_hidden).to(self.device)
        self.actor_b_target = Actor(self.obs_dim, self.act_dim, actor_hidden).to(self.device)
        self.actor_s_target.load_state_dict(self.actor_s.state_dict())
        self.actor_b_target.load_state_dict(self.actor_b.state_dict())

        # Centralized critics (CTDE): each critic conditions on both agents'
        # observations and actions, hence the doubled input dimensions.
        critic_hidden = config['network']['critic_hidden']
        self.critic_s = Critic(self.obs_dim * 2, self.act_dim * 2, critic_hidden).to(self.device)
        self.critic_b = Critic(self.obs_dim * 2, self.act_dim * 2, critic_hidden).to(self.device)
        self.critic_s_target = Critic(self.obs_dim * 2, self.act_dim * 2, critic_hidden).to(self.device)
        self.critic_b_target = Critic(self.obs_dim * 2, self.act_dim * 2, critic_hidden).to(self.device)
        self.critic_s_target.load_state_dict(self.critic_s.state_dict())
        self.critic_b_target.load_state_dict(self.critic_b.state_dict())

        # One Adam optimizer per network.
        actor_lr = config['training']['actor_lr']
        critic_lr = config['training']['critic_lr']
        self.actor_s_optimizer = torch.optim.Adam(self.actor_s.parameters(), lr=actor_lr)
        self.actor_b_optimizer = torch.optim.Adam(self.actor_b.parameters(), lr=actor_lr)
        self.critic_s_optimizer = torch.optim.Adam(self.critic_s.parameters(), lr=critic_lr)
        self.critic_b_optimizer = torch.optim.Adam(self.critic_b.parameters(), lr=critic_lr)

        # Shared replay buffer; independent OU exploration noise per agent.
        self.replay_buffer = ReplayBuffer(config['training']['buffer_capacity'])
        ou_kwargs = dict(
            theta=config['training']['ou_theta'],
            sigma_init=config['training']['ou_sigma_init'],
            sigma_min=config['training']['ou_sigma_min'],
        )
        self.noise_s = OUNoise(self.act_dim, **ou_kwargs)
        self.noise_b = OUNoise(self.act_dim, **ou_kwargs)

    def select_action(self, obs_s, obs_b, explore=True):
        """Return one action per agent for the given observations.

        Args:
            obs_s: semantic agent observation (1-D array-like, length obs_dim).
            obs_b: traditional agent observation (same shape as obs_s).
            explore: when True, add OU noise and clip actions to [0, 1].

        Returns:
            Tuple (act_s, act_b) of numpy arrays of length act_dim.
        """
        obs_s = torch.FloatTensor(obs_s).unsqueeze(0).to(self.device)
        obs_b = torch.FloatTensor(obs_b).unsqueeze(0).to(self.device)

        # Eval mode for a deterministic forward pass (in case the actors
        # contain dropout/batch-norm); restore training mode afterwards.
        self.actor_s.eval()
        self.actor_b.eval()
        with torch.no_grad():
            act_s = self.actor_s(obs_s).cpu().numpy()[0]
            act_b = self.actor_b(obs_b).cpu().numpy()[0]
        self.actor_s.train()
        self.actor_b.train()

        if explore:
            # Exploration: perturb with OU noise, keep actions inside [0, 1].
            act_s = np.clip(act_s + self.noise_s.sample(), 0.0, 1.0)
            act_b = np.clip(act_b + self.noise_b.sample(), 0.0, 1.0)

        return act_s, act_b

    def compute_rewards(self, qoe_s, qoe_b, qoe_sys):
        """Compute per-agent rewards under pure cooperation (lambda = 1).

        Formula: r_i = coop_self * qoe_i + coop_other * qoe_j + coop_sys * qoe_sys.
        Because lambda = 1, the reward is purely cooperative: each agent
        values its own QoE, the other agent's QoE, and the system QoE.

        Args:
            qoe_s: QoE of the semantic agent.
            qoe_b: QoE of the traditional agent.
            qoe_sys: system-wide QoE.

        Returns:
            Tuple (r_s, r_b, lam); lam is always 1.0 for this baseline.
        """
        lam = 1.0
        weights = self.config['reward']
        w_self = weights['coop_self']
        w_other = weights['coop_other']
        w_sys = weights['coop_sys']
        r_s = w_self * qoe_s + w_other * qoe_b + w_sys * qoe_sys
        r_b = w_self * qoe_b + w_other * qoe_s + w_sys * qoe_sys
        return r_s, r_b, lam

    def update(self):
        """Run one gradient step for both critics and both actors.

        Actors are updated simultaneously: each actor's loss pairs its own
        freshly computed action with the OTHER agent's replayed (buffered)
        action, so neither update depends on the other's current step —
        unlike Co-MADDPG's Stackelberg ordering.

        Returns:
            Dict of scalar losses, or None when the buffer holds fewer than
            batch_size transitions.
        """
        if len(self.replay_buffer) < self.batch_size:
            return None

        # Sample a minibatch of joint transitions.
        obs_s, obs_b, act_s, act_b, rew_s, rew_b, next_obs_s, next_obs_b, dones = self.replay_buffer.sample(self.batch_size)

        obs_s = torch.FloatTensor(obs_s).to(self.device)
        obs_b = torch.FloatTensor(obs_b).to(self.device)
        act_s = torch.FloatTensor(act_s).to(self.device)
        act_b = torch.FloatTensor(act_b).to(self.device)
        rew_s = torch.FloatTensor(rew_s).unsqueeze(1).to(self.device)
        rew_b = torch.FloatTensor(rew_b).unsqueeze(1).to(self.device)
        next_obs_s = torch.FloatTensor(next_obs_s).to(self.device)
        next_obs_b = torch.FloatTensor(next_obs_b).to(self.device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(self.device)

        # Centralized (joint) inputs for the critics.
        joint_obs = torch.cat([obs_s, obs_b], dim=1)
        joint_next_obs = torch.cat([next_obs_s, next_obs_b], dim=1)
        joint_act = torch.cat([act_s, act_b], dim=1)

        # --- 1. Critic update --------------------------------------------
        with torch.no_grad():
            # Target actions from target actors; bootstrapping is masked out
            # on terminal transitions via (1 - dones).
            next_act_s = self.actor_s_target(next_obs_s)
            next_act_b = self.actor_b_target(next_obs_b)
            joint_next_act = torch.cat([next_act_s, next_act_b], dim=1)
            target_q_s = rew_s + self.gamma * (1 - dones) * self.critic_s_target(joint_next_obs, joint_next_act)
            target_q_b = rew_b + self.gamma * (1 - dones) * self.critic_b_target(joint_next_obs, joint_next_act)

        current_q_s = self.critic_s(joint_obs, joint_act)
        current_q_b = self.critic_b(joint_obs, joint_act)
        critic_loss_s = F.mse_loss(current_q_s, target_q_s)
        critic_loss_b = F.mse_loss(current_q_b, target_q_b)

        self.critic_s_optimizer.zero_grad()
        critic_loss_s.backward()
        self.critic_s_optimizer.step()

        self.critic_b_optimizer.zero_grad()
        critic_loss_b.backward()
        self.critic_b_optimizer.step()

        # --- 2. Actor update (simultaneous) ------------------------------
        new_act_s = self.actor_s(obs_s)
        new_act_b = self.actor_b(obs_b)

        # Deterministic policy gradient: maximize the joint critic's Q for
        # this agent's fresh action paired with the other's buffered action.
        actor_loss_s = -self.critic_s(joint_obs, torch.cat([new_act_s, act_b], dim=1)).mean()
        actor_loss_b = -self.critic_b(joint_obs, torch.cat([act_s, new_act_b], dim=1)).mean()

        self.actor_s_optimizer.zero_grad()
        actor_loss_s.backward()
        self.actor_s_optimizer.step()

        self.actor_b_optimizer.zero_grad()
        actor_loss_b.backward()
        self.actor_b_optimizer.step()

        # --- 3. Soft (Polyak) target-network updates ---------------------
        self._soft_update(self.critic_s_target, self.critic_s)
        self._soft_update(self.critic_b_target, self.critic_b)
        self._soft_update(self.actor_s_target, self.actor_s)
        self._soft_update(self.actor_b_target, self.actor_b)

        return {
            'actor_loss_s': actor_loss_s.item(),
            'actor_loss_b': actor_loss_b.item(),
            'critic_loss_s': critic_loss_s.item(),
            'critic_loss_b': critic_loss_b.item()
        }

    def _soft_update(self, target, source):
        """Polyak update: target <- tau * source + (1 - tau) * target."""
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.copy_(self.tau * s_param.data + (1.0 - self.tau) * t_param.data)

    def save(self, path):
        """Save the four online networks' state dicts under `path`."""
        os.makedirs(path, exist_ok=True)
        torch.save(self.actor_s.state_dict(), os.path.join(path, "actor_s.pth"))
        torch.save(self.actor_b.state_dict(), os.path.join(path, "actor_b.pth"))
        torch.save(self.critic_s.state_dict(), os.path.join(path, "critic_s.pth"))
        torch.save(self.critic_b.state_dict(), os.path.join(path, "critic_b.pth"))

    def load(self, path):
        """Load online networks from `path` and sync targets to them.

        NOTE(review): torch.load unpickles arbitrary objects — only load
        checkpoints from trusted sources (consider weights_only=True on
        torch >= 1.13).
        """
        self.actor_s.load_state_dict(torch.load(os.path.join(path, "actor_s.pth"), map_location=self.device))
        self.actor_b.load_state_dict(torch.load(os.path.join(path, "actor_b.pth"), map_location=self.device))
        self.critic_s.load_state_dict(torch.load(os.path.join(path, "critic_s.pth"), map_location=self.device))
        self.critic_b.load_state_dict(torch.load(os.path.join(path, "critic_b.pth"), map_location=self.device))
        self.actor_s_target.load_state_dict(self.actor_s.state_dict())
        self.actor_b_target.load_state_dict(self.actor_b.state_dict())
        self.critic_s_target.load_state_dict(self.critic_s.state_dict())
        self.critic_b_target.load_state_dict(self.critic_b.state_dict())