281 lines
13 KiB
Python
281 lines
13 KiB
Python
import os
|
||
import numpy as np
|
||
import torch
|
||
import torch.nn.functional as F
|
||
|
||
from agents.actor import Actor
|
||
from agents.critic import Critic
|
||
from agents.replay_buffer import ReplayBuffer
|
||
from agents.noise import OUNoise
|
||
|
||
"""
|
||
Baseline: FixedLambda (固定 λ 基线)
|
||
=====================================
|
||
Purpose (ablation):
|
||
- This baseline is used to evaluate the benefit of the dynamic lambda switching mechanism in Co-MADDPG.
|
||
- It fixes λ at a constant value (0.5), balancing cooperation and competition equally throughout the training.
|
||
- 目的(消融实验):该基线用于评估 Co-MADDPG 中动态 λ 切换机制的收益。它将 λ 固定为常数(0.5),在整个训练过程中平衡协作与竞争。
|
||
|
||
Difference from Co-MADDPG:
|
||
1. Lambda (λ): Fixed at 0.5, whereas Co-MADDPG dynamically adjusts λ based on system state.
|
||
2. Update Order: Retains the Stackelberg update order (follower B first, then leader S), same as Co-MADDPG.
|
||
3. 与 Co-MADDPG 的区别:
|
||
- Lambda (λ): 固定为 0.5,而 Co-MADDPG 根据系统状态动态调整 λ。
|
||
- 更新顺序:保留了 Stackelberg 博弈更新顺序(先更新从属者 B,再更新主导者 S),与 Co-MADDPG 一致。
|
||
|
||
Contribution:
|
||
- Contributes to performance sensitivity analysis regarding the choice of λ and shows why a fixed balance is suboptimal.
|
||
- 贡献:用于关于 λ 选择的性能敏感性分析,展示为什么固定比例的平衡并非最优。
|
||
"""
|
||
|
||
class FixedLambda:
    """Fixed-lambda ablation baseline for Co-MADDPG.

    The cooperation/competition mixing coefficient lambda is pinned to a
    constant 0.5 instead of being adapted to the system state as in
    Co-MADDPG. The Stackelberg update order (follower B first, then
    leader S) is kept identical to Co-MADDPG, so any performance gap
    versus Co-MADDPG isolates the effect of the lambda schedule alone.
    """

    def __init__(self, config):
        """Build actors, critics, target copies, optimizers, replay buffer and noise.

        Args:
            config: Nested configuration dict with 'training', 'env' and
                'network' sections (an optional 'reward' section is read
                lazily in compute_rewards).
        """
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Core hyperparameters; lambda is the ablated quantity and is held
        # constant at 0.5 (equal cooperation/competition weighting).
        self.gamma = config['training']['gamma']
        self.tau = config['training']['tau']
        self.batch_size = config['training']['batch_size']
        self.fixed_lambda = 0.5

        # Per-agent observation and action dimensionality.
        # NOTE(review): obs looks like one feature per subcarrier plus 4
        # extra scalars, and actions are 3-dimensional — confirm against
        # the environment definition.
        self.obs_dim = config['env']['num_subcarriers'] + 4
        self.act_dim = 3

        # Online actors for the two agents (S = leader, B = follower) plus
        # target copies initialised with identical weights.
        hidden_a = config['network']['actor_hidden']
        hidden_c = config['network']['critic_hidden']

        self.actor_s = Actor(self.obs_dim, self.act_dim, hidden_a).to(self.device)
        self.actor_b = Actor(self.obs_dim, self.act_dim, hidden_a).to(self.device)
        self.actor_s_target = Actor(self.obs_dim, self.act_dim, hidden_a).to(self.device)
        self.actor_b_target = Actor(self.obs_dim, self.act_dim, hidden_a).to(self.device)
        self.actor_s_target.load_state_dict(self.actor_s.state_dict())
        self.actor_b_target.load_state_dict(self.actor_b.state_dict())

        # Centralized critics for CTDE: each critic consumes the
        # concatenated observations and actions of BOTH agents.
        obs_total = self.obs_dim * 2
        act_total = self.act_dim * 2
        self.critic_s = Critic(obs_total, act_total, hidden_c).to(self.device)
        self.critic_b = Critic(obs_total, act_total, hidden_c).to(self.device)
        self.critic_s_target = Critic(obs_total, act_total, hidden_c).to(self.device)
        self.critic_b_target = Critic(obs_total, act_total, hidden_c).to(self.device)
        self.critic_s_target.load_state_dict(self.critic_s.state_dict())
        self.critic_b_target.load_state_dict(self.critic_b.state_dict())

        # One Adam optimizer per network; actors and critics use separate
        # learning rates from the config.
        self.actor_s_optimizer = torch.optim.Adam(self.actor_s.parameters(), lr=config['training']['actor_lr'])
        self.actor_b_optimizer = torch.optim.Adam(self.actor_b.parameters(), lr=config['training']['actor_lr'])
        self.critic_s_optimizer = torch.optim.Adam(self.critic_s.parameters(), lr=config['training']['critic_lr'])
        self.critic_b_optimizer = torch.optim.Adam(self.critic_b.parameters(), lr=config['training']['critic_lr'])

        # Shared experience replay and per-agent Ornstein-Uhlenbeck
        # exploration noise (sigma decays from sigma_init to sigma_min).
        self.replay_buffer = ReplayBuffer(config['training']['buffer_capacity'])
        self.noise_s = OUNoise(self.act_dim, theta=config['training']['ou_theta'],
                               sigma_init=config['training']['ou_sigma_init'],
                               sigma_min=config['training']['ou_sigma_min'])
        self.noise_b = OUNoise(self.act_dim, theta=config['training']['ou_theta'],
                               sigma_init=config['training']['ou_sigma_init'],
                               sigma_min=config['training']['ou_sigma_min'])

    def select_action(self, obs_s, obs_b, explore=True):
        """Select actions for both agents given their observations.

        Args:
            obs_s: Observation vector for agent S.
            obs_b: Observation vector for agent B.
            explore: When True, perturb the deterministic policy output
                with OU noise before clipping.

        Returns:
            Tuple (act_s, act_b) of numpy arrays, each clipped to [0, 1].
        """
        # Eval mode for the forward pass, restored to train mode after
        # (relevant if the actors contain mode-dependent layers).
        self.actor_s.eval()
        self.actor_b.eval()
        with torch.no_grad():
            obs_s_t = torch.FloatTensor(obs_s).unsqueeze(0).to(self.device)
            obs_b_t = torch.FloatTensor(obs_b).unsqueeze(0).to(self.device)
            act_s = self.actor_s(obs_s_t).cpu().numpy()[0]
            act_b = self.actor_b(obs_b_t).cpu().numpy()[0]
        self.actor_s.train()
        self.actor_b.train()

        if explore:
            # Training exploration: add OU noise, then clip back into the
            # valid action range.
            act_s = np.clip(act_s + self.noise_s.sample(), 0.0, 1.0)
            act_b = np.clip(act_b + self.noise_b.sample(), 0.0, 1.0)
        else:
            act_s = np.clip(act_s, 0.0, 1.0)
            act_b = np.clip(act_b, 0.0, 1.0)

        return act_s, act_b

    def compute_rewards(self, qoe_s, qoe_b, qoe_sys):
        """Compute per-agent rewards with fixed lambda = 0.5.

        Formula: r_i = lam * r_coop_i + (1 - lam) * r_comp_i, i.e. an
        equal-weight blend of a cooperative term (own + other + system
        QoE) and a competitive term (own + system QoE only). The blend
        weights come from the optional 'reward' config section, with the
        defaults shown below.

        Args:
            qoe_s: QoE of agent S.
            qoe_b: QoE of agent B.
            qoe_sys: System-level QoE.

        Returns:
            Tuple (r_s, r_b, lam); lam is always 0.5 here but is returned
            to keep the interface aligned with the dynamic-lambda agent.
        """
        lam = self.fixed_lambda
        rew_cfg = self.config.get('reward', {})
        coop_self = rew_cfg.get('coop_self', 0.5)
        coop_other = rew_cfg.get('coop_other', 0.3)
        coop_sys = rew_cfg.get('coop_sys', 0.2)
        comp_self = rew_cfg.get('comp_self', 0.8)
        comp_sys = rew_cfg.get('comp_sys', 0.2)

        # Cooperative and competitive components for agent S.
        r_coop_s = coop_self * qoe_s + coop_other * qoe_b + coop_sys * qoe_sys
        r_comp_s = comp_self * qoe_s + comp_sys * qoe_sys
        r_s = lam * r_coop_s + (1 - lam) * r_comp_s

        # Cooperative and competitive components for agent B (symmetric,
        # with the roles of qoe_s and qoe_b swapped).
        r_coop_b = coop_self * qoe_b + coop_other * qoe_s + coop_sys * qoe_sys
        r_comp_b = comp_self * qoe_b + comp_sys * qoe_sys
        r_b = lam * r_coop_b + (1 - lam) * r_comp_b

        return r_s, r_b, lam

    def update(self):
        """Run one training step using the Stackelberg update order.

        Order: the follower B (critic then actor) updates first; the
        leader S then updates against B's refreshed best response.

        Returns:
            None if the buffer holds fewer than batch_size transitions,
            otherwise a dict with the four scalar loss values.
        """
        if len(self.replay_buffer) < self.batch_size:
            return None

        # Sample a minibatch of joint transitions from the replay buffer.
        obs_s, obs_b, act_s, act_b, rew_s, rew_b, next_obs_s, next_obs_b, dones = \
            self.replay_buffer.sample(self.batch_size)

        # Move everything to the training device as float tensors.
        # Rewards and done flags gain a trailing unit dim so they
        # broadcast against the critics' (batch, 1) outputs.
        obs_s = torch.FloatTensor(obs_s).to(self.device)
        obs_b = torch.FloatTensor(obs_b).to(self.device)
        act_s = torch.FloatTensor(act_s).to(self.device)
        act_b = torch.FloatTensor(act_b).to(self.device)
        rew_s = torch.FloatTensor(rew_s).unsqueeze(1).to(self.device)
        rew_b = torch.FloatTensor(rew_b).unsqueeze(1).to(self.device)
        next_obs_s = torch.FloatTensor(next_obs_s).to(self.device)
        next_obs_b = torch.FloatTensor(next_obs_b).to(self.device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(self.device)

        # Centralized (joint) inputs for the critics.
        joint_obs = torch.cat([obs_s, obs_b], dim=1)
        joint_next_obs = torch.cat([next_obs_s, next_obs_b], dim=1)
        joint_act = torch.cat([act_s, act_b], dim=1)

        # TD targets from the target actors/critics; (1 - dones) zeroes
        # the bootstrap term at terminal transitions.
        with torch.no_grad():
            next_act_s = self.actor_s_target(next_obs_s)
            next_act_b = self.actor_b_target(next_obs_b)
            joint_next_act = torch.cat([next_act_s, next_act_b], dim=1)
            target_q_s = rew_s + self.gamma * (1 - dones) * self.critic_s_target(joint_next_obs, joint_next_act)
            target_q_b = rew_b + self.gamma * (1 - dones) * self.critic_b_target(joint_next_obs, joint_next_act)

        # --- Stackelberg step 1: update the follower B first ---

        # Critic B: regress Q_b(joint_obs, joint_act) onto the TD target.
        current_q_b = self.critic_b(joint_obs, joint_act)
        critic_loss_b = F.mse_loss(current_q_b, target_q_b)
        self.critic_b_optimizer.zero_grad()
        critic_loss_b.backward()
        self.critic_b_optimizer.step()

        # Actor B (follower): ascend Q_b using B's fresh action while the
        # replayed action of S is held fixed.
        new_act_b = self.actor_b(obs_b)
        actor_loss_b = -self.critic_b(joint_obs, torch.cat([act_s, new_act_b], dim=1)).mean()
        self.actor_b_optimizer.zero_grad()
        actor_loss_b.backward()
        self.actor_b_optimizer.step()

        # --- Stackelberg step 2: then update the leader S ---

        # Recompute the follower's best response with the just-updated
        # policy; detached under no_grad so nothing backpropagates into
        # actor B during the leader's updates.
        with torch.no_grad():
            act_b_br = self.actor_b(obs_b)
        joint_act_leader = torch.cat([act_s, act_b_br], dim=1)

        # Critic S: deliberately evaluated against the follower's best
        # response rather than the replayed act_b (leader's viewpoint).
        current_q_s = self.critic_s(joint_obs, joint_act_leader)
        critic_loss_s = F.mse_loss(current_q_s, target_q_s)
        self.critic_s_optimizer.zero_grad()
        critic_loss_s.backward()
        self.critic_s_optimizer.step()

        # Actor S (leader): ascend Q_s given the follower's (detached)
        # best response; new_act_s stays outside no_grad so gradients
        # reach actor S.
        with torch.no_grad():
            act_b_br2 = self.actor_b(obs_b)
        new_act_s = self.actor_s(obs_s)
        actor_loss_s = -self.critic_s(joint_obs, torch.cat([new_act_s, act_b_br2], dim=1)).mean()
        self.actor_s_optimizer.zero_grad()
        actor_loss_s.backward()
        self.actor_s_optimizer.step()

        # Polyak (soft) update of all four target networks:
        # target <- tau * source + (1 - tau) * target.
        for target, source in [
            (self.critic_s_target, self.critic_s),
            (self.critic_b_target, self.critic_b),
            (self.actor_s_target, self.actor_s),
            (self.actor_b_target, self.actor_b),
        ]:
            for tp, sp in zip(target.parameters(), source.parameters()):
                tp.data.copy_(self.tau * sp.data + (1.0 - self.tau) * tp.data)

        return {
            'actor_loss_s': actor_loss_s.item(),
            'actor_loss_b': actor_loss_b.item(),
            'critic_loss_s': critic_loss_s.item(),
            'critic_loss_b': critic_loss_b.item(),
        }

    def save(self, path):
        """Save the four online networks' state dicts under `path`.

        Target networks are not saved; load() re-syncs them from the
        online networks.
        """
        os.makedirs(path, exist_ok=True)
        torch.save(self.actor_s.state_dict(), os.path.join(path, "actor_s.pth"))
        torch.save(self.actor_b.state_dict(), os.path.join(path, "actor_b.pth"))
        torch.save(self.critic_s.state_dict(), os.path.join(path, "critic_s.pth"))
        torch.save(self.critic_b.state_dict(), os.path.join(path, "critic_b.pth"))

    def load(self, path):
        """Load online networks from `path` and sync targets to match.

        map_location keeps checkpoints loadable across CPU/GPU machines.
        """
        self.actor_s.load_state_dict(torch.load(os.path.join(path, "actor_s.pth"), map_location=self.device))
        self.actor_b.load_state_dict(torch.load(os.path.join(path, "actor_b.pth"), map_location=self.device))
        self.critic_s.load_state_dict(torch.load(os.path.join(path, "critic_s.pth"), map_location=self.device))
        self.critic_b.load_state_dict(torch.load(os.path.join(path, "critic_b.pth"), map_location=self.device))
        # Hard-sync all target networks to the freshly loaded weights.
        self.actor_s_target.load_state_dict(self.actor_s.state_dict())
        self.actor_b_target.load_state_dict(self.actor_b.state_dict())
        self.critic_s_target.load_state_dict(self.critic_s.state_dict())
        self.critic_b_target.load_state_dict(self.critic_b.state_dict())