# SemanticCommunication/code/baselines/fixed_lambda.py
import os
import numpy as np
import torch
import torch.nn.functional as F
from agents.actor import Actor
from agents.critic import Critic
from agents.replay_buffer import ReplayBuffer
from agents.noise import OUNoise
"""
Baseline: FixedLambda (固定 λ 基线)
=====================================
Purpose (ablation):
- This baseline is used to evaluate the benefit of the dynamic lambda switching mechanism in Co-MADDPG.
- It fixes λ at a constant value (0.5), balancing cooperation and competition equally throughout the training.
- 目的(消融实验):该基线用于评估 Co-MADDPG 中动态 λ 切换机制的收益。它将 λ 固定为常数(0.5),在整个训练过程中平衡协作与竞争。
Difference from Co-MADDPG:
1. Lambda (λ): Fixed at 0.5, whereas Co-MADDPG dynamically adjusts λ based on system state.
2. Update Order: Retains the Stackelberg update order (follower B first, then leader S), same as Co-MADDPG.
3. 与 Co-MADDPG 的区别:
- Lambda (λ): 固定为 0.5,而 Co-MADDPG 根据系统状态动态调整 λ。
- 更新顺序:保留了 Stackelberg 博弈更新顺序(先更新从属者 B,再更新主导者 S),与 Co-MADDPG 一致。
Contribution:
- Contributes to performance sensitivity analysis regarding the choice of λ and shows why a fixed balance is suboptimal.
- 贡献:用于关于 λ 选择的性能敏感性分析,展示为什么固定比例的平衡并非最优。
"""
class FixedLambda:
    """
    FixedLambda baseline (ablation of Co-MADDPG's dynamic lambda).

    Fixes the cooperation/competition mixing weight at lambda = 0.5 for the
    entire training run, while keeping Co-MADDPG's Stackelberg update order
    (follower B first, then leader S). Comparing this baseline against
    Co-MADDPG isolates the benefit of dynamically adjusting lambda and
    supports a sensitivity analysis over the choice of lambda.
    """
    def __init__(self, config):
        """
        Build actors, critics, target networks, optimizers, replay buffer
        and exploration noise from a nested ``config`` dict.

        Args:
            config: dict with at least a ``training`` section (gamma, tau,
                batch_size, actor_lr, critic_lr, buffer_capacity, ou_theta,
                ou_sigma_init, ou_sigma_min), an ``env`` section
                (num_subcarriers) and a ``network`` section
                (actor_hidden, critic_hidden).
        """
        self.config = config
        # Use GPU when available; all networks and batches live on this device.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Core hyperparameters: discount factor, soft-update rate, batch size,
        # and the constant reward-mixing weight (the point of this baseline).
        self.gamma = config['training']['gamma']
        self.tau = config['training']['tau']
        self.batch_size = config['training']['batch_size']
        self.fixed_lambda = 0.5
        # Per-agent observation size: one feature per subcarrier plus 4 extra
        # scalars (assumed env state features — TODO confirm against the env).
        self.obs_dim = config['env']['num_subcarriers'] + 4
        self.act_dim = 3
        # Online actors and their target copies for the two agents S and B.
        hidden_a = config['network']['actor_hidden']
        hidden_c = config['network']['critic_hidden']
        self.actor_s = Actor(self.obs_dim, self.act_dim, hidden_a).to(self.device)
        self.actor_b = Actor(self.obs_dim, self.act_dim, hidden_a).to(self.device)
        self.actor_s_target = Actor(self.obs_dim, self.act_dim, hidden_a).to(self.device)
        self.actor_b_target = Actor(self.obs_dim, self.act_dim, hidden_a).to(self.device)
        # Targets start as exact copies of the online networks.
        self.actor_s_target.load_state_dict(self.actor_s.state_dict())
        self.actor_b_target.load_state_dict(self.actor_b.state_dict())
        # Centralized critics: each sees both observations and both actions.
        obs_total = self.obs_dim * 2
        act_total = self.act_dim * 2
        self.critic_s = Critic(obs_total, act_total, hidden_c).to(self.device)
        self.critic_b = Critic(obs_total, act_total, hidden_c).to(self.device)
        self.critic_s_target = Critic(obs_total, act_total, hidden_c).to(self.device)
        self.critic_b_target = Critic(obs_total, act_total, hidden_c).to(self.device)
        self.critic_s_target.load_state_dict(self.critic_s.state_dict())
        self.critic_b_target.load_state_dict(self.critic_b.state_dict())
        # One Adam optimizer per network.
        self.actor_s_optimizer = torch.optim.Adam(self.actor_s.parameters(), lr=config['training']['actor_lr'])
        self.actor_b_optimizer = torch.optim.Adam(self.actor_b.parameters(), lr=config['training']['actor_lr'])
        self.critic_s_optimizer = torch.optim.Adam(self.critic_s.parameters(), lr=config['training']['critic_lr'])
        self.critic_b_optimizer = torch.optim.Adam(self.critic_b.parameters(), lr=config['training']['critic_lr'])
        # Shared experience replay plus an independent OU noise process per agent.
        self.replay_buffer = ReplayBuffer(config['training']['buffer_capacity'])
        self.noise_s = OUNoise(self.act_dim, theta=config['training']['ou_theta'],
                               sigma_init=config['training']['ou_sigma_init'],
                               sigma_min=config['training']['ou_sigma_min'])
        self.noise_b = OUNoise(self.act_dim, theta=config['training']['ou_theta'],
                               sigma_init=config['training']['ou_sigma_init'],
                               sigma_min=config['training']['ou_sigma_min'])

    def select_action(self, obs_s, obs_b, explore=True):
        """
        Select actions for both agents from their current policies.

        Args:
            obs_s, obs_b: per-agent observation vectors (array-like of
                length ``obs_dim``).
            explore: when True, perturb the actions with OU noise; when
                False, act greedily.

        Returns:
            Tuple ``(act_s, act_b)`` of numpy arrays clipped to [0, 1].
        """
        # Switch to eval mode for inference (matters if Actor uses
        # dropout/batchnorm — can't tell from here), restore train mode after.
        self.actor_s.eval()
        self.actor_b.eval()
        with torch.no_grad():
            obs_s_t = torch.FloatTensor(obs_s).unsqueeze(0).to(self.device)
            obs_b_t = torch.FloatTensor(obs_b).unsqueeze(0).to(self.device)
            act_s = self.actor_s(obs_s_t).cpu().numpy()[0]
            act_b = self.actor_b(obs_b_t).cpu().numpy()[0]
        self.actor_s.train()
        self.actor_b.train()
        if explore:
            # Exploration: add OU noise, then clip back to the valid range.
            act_s = np.clip(act_s + self.noise_s.sample(), 0.0, 1.0)
            act_b = np.clip(act_b + self.noise_b.sample(), 0.0, 1.0)
        else:
            act_s = np.clip(act_s, 0.0, 1.0)
            act_b = np.clip(act_b, 0.0, 1.0)
        return act_s, act_b

    def compute_rewards(self, qoe_s, qoe_b, qoe_sys):
        """
        Mix cooperative and competitive reward terms with fixed lambda = 0.5.

        r_i = lam * r_coop_i + (1 - lam) * r_comp_i, where the cooperative
        term weights the agent's own QoE, the other agent's QoE and the
        system QoE, while the competitive term ignores the other agent.
        Coefficient defaults (0.5/0.3/0.2 and 0.8/0.2) can be overridden via
        an optional ``reward`` section of the config.

        Args:
            qoe_s, qoe_b, qoe_sys: QoE values for agent S, agent B and the
                overall system (presumably scalars — confirm with caller).

        Returns:
            Tuple ``(r_s, r_b, lam)``: both rewards and the (constant)
            lambda that was used.
        """
        lam = self.fixed_lambda
        rew_cfg = self.config.get('reward', {})
        coop_self = rew_cfg.get('coop_self', 0.5)
        coop_other = rew_cfg.get('coop_other', 0.3)
        coop_sys = rew_cfg.get('coop_sys', 0.2)
        comp_self = rew_cfg.get('comp_self', 0.8)
        comp_sys = rew_cfg.get('comp_sys', 0.2)
        # Agent S: the cooperative term includes B's QoE, the competitive one does not.
        r_coop_s = coop_self * qoe_s + coop_other * qoe_b + coop_sys * qoe_sys
        r_comp_s = comp_self * qoe_s + comp_sys * qoe_sys
        r_s = lam * r_coop_s + (1 - lam) * r_comp_s
        # Agent B: symmetric to S with the roles swapped.
        r_coop_b = coop_self * qoe_b + coop_other * qoe_s + coop_sys * qoe_sys
        r_comp_b = comp_self * qoe_b + comp_sys * qoe_sys
        r_b = lam * r_coop_b + (1 - lam) * r_comp_b
        return r_s, r_b, lam

    def update(self):
        """
        Run one training step with the Stackelberg update order: the
        follower B (critic, then actor) is updated first; the leader S is
        then updated against B's refreshed best-response actions. Ends with
        a soft (Polyak) update of all four target networks.

        Returns:
            Dict with the four scalar losses, or ``None`` when the replay
            buffer does not yet hold a full batch.
        """
        if len(self.replay_buffer) < self.batch_size:
            return None
        # Sample a batch of transitions from the replay buffer.
        obs_s, obs_b, act_s, act_b, rew_s, rew_b, next_obs_s, next_obs_b, dones = \
            self.replay_buffer.sample(self.batch_size)
        # Move the batch to the training device; rewards and done flags get a
        # trailing unit dim to match the critics' (batch, 1) outputs.
        obs_s = torch.FloatTensor(obs_s).to(self.device)
        obs_b = torch.FloatTensor(obs_b).to(self.device)
        act_s = torch.FloatTensor(act_s).to(self.device)
        act_b = torch.FloatTensor(act_b).to(self.device)
        rew_s = torch.FloatTensor(rew_s).unsqueeze(1).to(self.device)
        rew_b = torch.FloatTensor(rew_b).unsqueeze(1).to(self.device)
        next_obs_s = torch.FloatTensor(next_obs_s).to(self.device)
        next_obs_b = torch.FloatTensor(next_obs_b).to(self.device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(self.device)
        # Centralized (joint) inputs for the critics.
        joint_obs = torch.cat([obs_s, obs_b], dim=1)
        joint_next_obs = torch.cat([next_obs_s, next_obs_b], dim=1)
        joint_act = torch.cat([act_s, act_b], dim=1)
        # TD targets from the target actors/critics (no gradients flow here).
        with torch.no_grad():
            next_act_s = self.actor_s_target(next_obs_s)
            next_act_b = self.actor_b_target(next_obs_b)
            joint_next_act = torch.cat([next_act_s, next_act_b], dim=1)
            target_q_s = rew_s + self.gamma * (1 - dones) * self.critic_s_target(joint_next_obs, joint_next_act)
            target_q_b = rew_b + self.gamma * (1 - dones) * self.critic_b_target(joint_next_obs, joint_next_act)
        # --- Stackelberg step 1: update the follower B first ---
        # Critic B regresses toward its TD target on the stored joint action.
        current_q_b = self.critic_b(joint_obs, joint_act)
        critic_loss_b = F.mse_loss(current_q_b, target_q_b)
        self.critic_b_optimizer.zero_grad()
        critic_loss_b.backward()
        self.critic_b_optimizer.step()
        # Actor B maximizes Q_b given S's stored action (gradient ascent via
        # minimizing the negated critic value).
        new_act_b = self.actor_b(obs_b)
        actor_loss_b = -self.critic_b(joint_obs, torch.cat([act_s, new_act_b], dim=1)).mean()
        self.actor_b_optimizer.zero_grad()
        actor_loss_b.backward()
        self.actor_b_optimizer.step()
        # --- Stackelberg step 2: then update the leader S ---
        # Recompute the follower's action with its freshly updated policy
        # (no gradients) to serve as the best response in the leader's update.
        with torch.no_grad():
            act_b_br = self.actor_b(obs_b)
        joint_act_leader = torch.cat([act_s, act_b_br], dim=1)
        # NOTE(review): critic S is evaluated on the recomputed follower
        # action rather than the stored one; this deviates from vanilla
        # MADDPG and appears to be the intended Stackelberg design — confirm.
        current_q_s = self.critic_s(joint_obs, joint_act_leader)
        critic_loss_s = F.mse_loss(current_q_s, target_q_s)
        self.critic_s_optimizer.zero_grad()
        critic_loss_s.backward()
        self.critic_s_optimizer.step()
        # Actor S maximizes Q_s paired with the follower's best response.
        with torch.no_grad():
            act_b_br2 = self.actor_b(obs_b)
        new_act_s = self.actor_s(obs_s)
        actor_loss_s = -self.critic_s(joint_obs, torch.cat([new_act_s, act_b_br2], dim=1)).mean()
        self.actor_s_optimizer.zero_grad()
        actor_loss_s.backward()
        self.actor_s_optimizer.step()
        # Soft (Polyak) update: target <- tau * online + (1 - tau) * target.
        for target, source in [
            (self.critic_s_target, self.critic_s),
            (self.critic_b_target, self.critic_b),
            (self.actor_s_target, self.actor_s),
            (self.actor_b_target, self.actor_b),
        ]:
            for tp, sp in zip(target.parameters(), source.parameters()):
                tp.data.copy_(self.tau * sp.data + (1.0 - self.tau) * tp.data)
        return {
            'actor_loss_s': actor_loss_s.item(),
            'actor_loss_b': actor_loss_b.item(),
            'critic_loss_s': critic_loss_s.item(),
            'critic_loss_b': critic_loss_b.item(),
        }

    def save(self, path):
        """Save the four online networks' weights as .pth files under ``path``."""
        os.makedirs(path, exist_ok=True)
        torch.save(self.actor_s.state_dict(), os.path.join(path, "actor_s.pth"))
        torch.save(self.actor_b.state_dict(), os.path.join(path, "actor_b.pth"))
        torch.save(self.critic_s.state_dict(), os.path.join(path, "critic_s.pth"))
        torch.save(self.critic_b.state_dict(), os.path.join(path, "critic_b.pth"))

    def load(self, path):
        """Load online weights from ``path`` and sync all targets to them."""
        self.actor_s.load_state_dict(torch.load(os.path.join(path, "actor_s.pth"), map_location=self.device))
        self.actor_b.load_state_dict(torch.load(os.path.join(path, "actor_b.pth"), map_location=self.device))
        self.critic_s.load_state_dict(torch.load(os.path.join(path, "critic_s.pth"), map_location=self.device))
        self.critic_b.load_state_dict(torch.load(os.path.join(path, "critic_b.pth"), map_location=self.device))
        # Hard-sync targets so evaluation after load matches the saved policy.
        self.actor_s_target.load_state_dict(self.actor_s.state_dict())
        self.actor_b_target.load_state_dict(self.actor_b.state_dict())
        self.critic_s_target.load_state_dict(self.critic_s.state_dict())
        self.critic_b_target.load_state_dict(self.critic_b.state_dict())