SemanticCommunication/code/baselines/pure_comp.py

import os
import torch
import torch.nn.functional as F
import numpy as np
from agents.actor import Actor
from agents.critic import Critic
from agents.replay_buffer import ReplayBuffer
from agents.noise import OUNoise
"""
Baseline: PureCompetitive (纯竞争基线)
=====================================
Purpose (ablation):
- This baseline removes the cooperative component from the MADDPG framework.
- It serves as an ablation study to demonstrate that pure competition (λ=0) leads to resource wastage and suboptimal system-wide utility.
- 目的(消融实验):该基线移除了 MADDPG 框架中的协作成分。作为消融实验,用于证明纯竞争模式(λ=0会导致资源浪费和系统级效用降低。
Difference from Co-MADDPG:
1. Lambda (λ): Fixed at 0.0 (pure competition), whereas Co-MADDPG uses dynamic λ.
2. Update Order: Uses simultaneous updates for both actors, whereas Co-MADDPG uses Stackelberg update order.
3. 与 Co-MADDPG 的区别:
- Lambda (λ): 固定为 0.0(纯竞争),而 Co-MADDPG 使用动态 λ。
- 更新顺序两个参与者同时更新Simultaneous Update而 Co-MADDPG 使用 Stackelberg 博弈更新顺序。
Contribution:
- Contributes to comparison figures showing the "Price of Anarchy" in resource allocation.
- 贡献:用于对比图表,展示资源分配中的“无政府代价”。
"""
class PureCompetitive:
"""
PureCompetitive algorithm implementation.
纯竞争算法实现。
"""
def __init__(self, config):
# Initialize configuration and select the device (GPU if available)
self.config = config
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters: gamma (discount factor), tau (soft-update rate), batch size
self.gamma = config['training']['gamma']
self.tau = config['training']['tau']
self.batch_size = config['training']['batch_size']
# Dimensions: observation = num_subcarriers + 4, action = 3
self.obs_dim = config['env']['num_subcarriers'] + 4
self.act_dim = 3
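# E.g. with a hypothetical num_subcarriers = 64, obs_dim = 68; the joint critics
# below then see 2 * 68 observation dims and 2 * 3 action dims.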
# Agents: semantic (s) and traditional (b) actors plus their target networks
self.actor_s = Actor(self.obs_dim, self.act_dim, config['network']['actor_hidden']).to(self.device)
self.actor_b = Actor(self.obs_dim, self.act_dim, config['network']['actor_hidden']).to(self.device)
self.actor_s_target = Actor(self.obs_dim, self.act_dim, config['network']['actor_hidden']).to(self.device)
self.actor_b_target = Actor(self.obs_dim, self.act_dim, config['network']['actor_hidden']).to(self.device)
self.actor_s_target.load_state_dict(self.actor_s.state_dict())
self.actor_b_target.load_state_dict(self.actor_b.state_dict())
# Joint critics: centralized training; each critic takes both agents'
# observations and actions (obs_dim*2, act_dim*2)
self.critic_s = Critic(self.obs_dim*2, self.act_dim*2, config['network']['critic_hidden']).to(self.device)
self.critic_b = Critic(self.obs_dim*2, self.act_dim*2, config['network']['critic_hidden']).to(self.device)
self.critic_s_target = Critic(self.obs_dim*2, self.act_dim*2, config['network']['critic_hidden']).to(self.device)
self.critic_b_target = Critic(self.obs_dim*2, self.act_dim*2, config['network']['critic_hidden']).to(self.device)
self.critic_s_target.load_state_dict(self.critic_s.state_dict())
self.critic_b_target.load_state_dict(self.critic_b.state_dict())
# Optimizers for all actor and critic networks
self.actor_s_optimizer = torch.optim.Adam(self.actor_s.parameters(), lr=config['training']['actor_lr'])
self.actor_b_optimizer = torch.optim.Adam(self.actor_b.parameters(), lr=config['training']['actor_lr'])
self.critic_s_optimizer = torch.optim.Adam(self.critic_s.parameters(), lr=config['training']['critic_lr'])
self.critic_b_optimizer = torch.optim.Adam(self.critic_b.parameters(), lr=config['training']['critic_lr'])
# Experience replay buffer and Ornstein-Uhlenbeck noise processes for exploration
self.replay_buffer = ReplayBuffer(config['training']['buffer_capacity'])
self.noise_s = OUNoise(self.act_dim, theta=config['training']['ou_theta'], sigma_init=config['training']['ou_sigma_init'], sigma_min=config['training']['ou_sigma_min'])
self.noise_b = OUNoise(self.act_dim, theta=config['training']['ou_theta'], sigma_init=config['training']['ou_sigma_init'], sigma_min=config['training']['ou_sigma_min'])
def select_action(self, obs_s, obs_b, explore=True):
"""
Select actions for both agents given observations.
根据观察结果为两个智能体选择动作。
"""
obs_s = torch.FloatTensor(obs_s).unsqueeze(0).to(self.device)
obs_b = torch.FloatTensor(obs_b).unsqueeze(0).to(self.device)
self.actor_s.eval()
self.actor_b.eval()
with torch.no_grad():
# Forward pass through both actors (evaluation mode, no gradients)
act_s = self.actor_s(obs_s).cpu().numpy()[0]
act_b = self.actor_b(obs_b).cpu().numpy()[0]
self.actor_s.train()
self.actor_b.train()
if explore:
# Add Ornstein-Uhlenbeck noise for exploration and clip actions to [0, 1]
act_s = np.clip(act_s + self.noise_s.sample(), 0.0, 1.0)
act_b = np.clip(act_b + self.noise_b.sample(), 0.0, 1.0)
return act_s, act_b
def compute_rewards(self, qoe_s, qoe_b, qoe_sys):
"""
Compute rewards based on pure competition (λ=0).
基于纯竞争计算奖励 (λ=0)。
Formula: r_i = comp_self * qoe_i + comp_sys * qoe_sys
公式说明:由于 λ=0奖励完全由竞争项组成仅考虑自身 QoE 以及系统总 QoE 的惩罚项。
"""
lam = 0.0
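# Worked example (hypothetical numbers, not taken from the config): with
# comp_self = 1.0, comp_sys = -0.1, qoe_s = 0.8, qoe_sys = 1.5,
# r_s = 1.0 * 0.8 + (-0.1) * 1.5 = 0.65; no λ-weighted cooperative term appears
# because λ is hard-coded to 0 in this baseline.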
r_s = self.config['reward']['comp_self'] * qoe_s + self.config['reward']['comp_sys'] * qoe_sys
r_b = self.config['reward']['comp_self'] * qoe_b + self.config['reward']['comp_sys'] * qoe_sys
return r_s, r_b, lam
def update(self):
"""
Update the networks using sampled experiences.
使用采样的经验更新网络。
Update order: Simultaneous updates (both actors update based on current policy of the other).
更新顺序:同时更新(两个 Actor 基于对方当前的策略进行更新)。
"""
if len(self.replay_buffer) < self.batch_size:
return None
# Sample a mini-batch of transitions from the replay buffer
obs_s, obs_b, act_s, act_b, rew_s, rew_b, next_obs_s, next_obs_b, dones = self.replay_buffer.sample(self.batch_size)
# Convert the batch to tensors on the training device
obs_s = torch.FloatTensor(obs_s).to(self.device)
obs_b = torch.FloatTensor(obs_b).to(self.device)
act_s = torch.FloatTensor(act_s).to(self.device)
act_b = torch.FloatTensor(act_b).to(self.device)
rew_s = torch.FloatTensor(rew_s).unsqueeze(1).to(self.device)
rew_b = torch.FloatTensor(rew_b).unsqueeze(1).to(self.device)
next_obs_s = torch.FloatTensor(next_obs_s).to(self.device)
next_obs_b = torch.FloatTensor(next_obs_b).to(self.device)
dones = torch.FloatTensor(dones).unsqueeze(1).to(self.device)
# Centralized (joint) observations and actions for the critics
joint_obs = torch.cat([obs_s, obs_b], dim=1)
joint_next_obs = torch.cat([next_obs_s, next_obs_b], dim=1)
joint_act = torch.cat([act_s, act_b], dim=1)
# 1. Critic update
with torch.no_grad():
# Target actors produce the next-state actions
next_act_s = self.actor_s_target(next_obs_s)
next_act_b = self.actor_b_target(next_obs_b)
joint_next_act = torch.cat([next_act_s, next_act_b], dim=1)
# Compute the TD targets
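# TD target: y_i = r_i + gamma * (1 - done) * Q_target_i(joint_next_obs, joint_next_act)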
target_q_s = rew_s + self.gamma * (1 - dones) * self.critic_s_target(joint_next_obs, joint_next_act)
target_q_b = rew_b + self.gamma * (1 - dones) * self.critic_b_target(joint_next_obs, joint_next_act)
# Current Q estimates and MSE critic losses
current_q_s = self.critic_s(joint_obs, joint_act)
current_q_b = self.critic_b(joint_obs, joint_act)
critic_loss_s = F.mse_loss(current_q_s, target_q_s)
critic_loss_b = F.mse_loss(current_q_b, target_q_b)
# Backpropagate and step both critic optimizers
self.critic_s_optimizer.zero_grad()
critic_loss_s.backward()
self.critic_s_optimizer.step()
self.critic_b_optimizer.zero_grad()
critic_loss_b.backward()
self.critic_b_optimizer.step()
# 2. Actor update (simultaneous)
new_act_s = self.actor_s(obs_s)
new_act_b = self.actor_b(obs_b)
# Policy loss from the joint critic: each actor's fresh action is paired with
# the other agent's replayed action from the batch
actor_loss_s = -self.critic_s(joint_obs, torch.cat([new_act_s, act_b], dim=1)).mean()
actor_loss_b = -self.critic_b(joint_obs, torch.cat([act_s, new_act_b], dim=1)).mean()
# Backpropagate and step both actor optimizers
self.actor_s_optimizer.zero_grad()
actor_loss_s.backward()
self.actor_s_optimizer.step()
self.actor_b_optimizer.zero_grad()
actor_loss_b.backward()
self.actor_b_optimizer.step()
# 3. Soft update of all target networks
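# Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target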
for target_param, param in zip(self.critic_s_target.parameters(), self.critic_s.parameters()):
target_param.data.copy_(self.tau * param.data + (1.0 - self.tau) * target_param.data)
for target_param, param in zip(self.critic_b_target.parameters(), self.critic_b.parameters()):
target_param.data.copy_(self.tau * param.data + (1.0 - self.tau) * target_param.data)
for target_param, param in zip(self.actor_s_target.parameters(), self.actor_s.parameters()):
target_param.data.copy_(self.tau * param.data + (1.0 - self.tau) * target_param.data)
for target_param, param in zip(self.actor_b_target.parameters(), self.actor_b.parameters()):
target_param.data.copy_(self.tau * param.data + (1.0 - self.tau) * target_param.data)
return {
'actor_loss_s': actor_loss_s.item(),
'actor_loss_b': actor_loss_b.item(),
'critic_loss_s': critic_loss_s.item(),
'critic_loss_b': critic_loss_b.item()
}
def save(self, path):
"""
Save models to disk.
将模型保存至磁盘。
"""
os.makedirs(path, exist_ok=True)
torch.save(self.actor_s.state_dict(), os.path.join(path, "actor_s.pth"))
torch.save(self.actor_b.state_dict(), os.path.join(path, "actor_b.pth"))
torch.save(self.critic_s.state_dict(), os.path.join(path, "critic_s.pth"))
torch.save(self.critic_b.state_dict(), os.path.join(path, "critic_b.pth"))
def load(self, path):
"""
Load models from disk.
从磁盘加载模型。
"""
self.actor_s.load_state_dict(torch.load(os.path.join(path, "actor_s.pth"), map_location=self.device))
self.actor_b.load_state_dict(torch.load(os.path.join(path, "actor_b.pth"), map_location=self.device))
self.critic_s.load_state_dict(torch.load(os.path.join(path, "critic_s.pth"), map_location=self.device))
self.critic_b.load_state_dict(torch.load(os.path.join(path, "critic_b.pth"), map_location=self.device))
self.actor_s_target.load_state_dict(self.actor_s.state_dict())
self.actor_b_target.load_state_dict(self.actor_b.state_dict())
self.critic_s_target.load_state_dict(self.critic_s.state_dict())
self.critic_b_target.load_state_dict(self.critic_b.state_dict())
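# ---------------------------------------------------------------------------
# Usage sketch (comments only, not executed). The calls mirror the methods
# defined in this file; concrete values, paths and the replay-buffer push
# signature are assumptions, not taken from this repository.
#
#   agent = PureCompetitive(config)   # config with env/network/training/reward sections
#   act_s, act_b = agent.select_action(obs_s, obs_b, explore=True)
#   r_s, r_b, lam = agent.compute_rewards(qoe_s, qoe_b, qoe_sys)   # lam is always 0.0
#   # ... push the transition into agent.replay_buffer (see agents/replay_buffer.py) ...
#   losses = agent.update()            # returns None until batch_size transitions are stored
#   agent.save("checkpoints/pure_comp")   # hypothetical checkpoint path
# ---------------------------------------------------------------------------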