583 lines
23 KiB
Python
583 lines
23 KiB
Python
"""
|
||
Skill A: Analyst (情报分析师)
|
||
==============================
|
||
职能:NLP 提取、新闻分级、叙事打分
|
||
|
||
输入:新闻文本列表
|
||
输出:Narrative_JSON (叙事对象、评分、生命周期阶段)
|
||
|
||
设计原则:
|
||
- 核心分析逻辑必须通过 LLM 完成,而非简单关键词匹配
|
||
- 关键词仅作为 LLM 分析的辅助信息和兜底方案
|
||
- 支持 LLM 失败时的优雅降级
|
||
"""
|
||
|
||
import json
|
||
import re
|
||
from datetime import datetime
|
||
from typing import Dict, List, Optional, Tuple
|
||
from pathlib import Path
|
||
import os
|
||
import sys
|
||
|
||
# 添加项目根目录到路径
|
||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||
from core.config import config, get_llm_client, llm_call
|
||
|
||
|
||
class NewsAnalyzer:
    """
    Intelligence analyst: grades news items and aggregates them into narratives.

    Capabilities:
        1. Semantic news analysis through an LLM (primary path).
        2. News grading (level A / B / C).
        3. Sector (narrative topic) extraction and ETF association.
        4. Heat scoring and sentiment tagging.
        5. Lifecycle-stage inference.

    Flow:
        1. ``analyze_news`` first tries the LLM when it is enabled.
        2. The LLM reply is parsed as JSON and normalized.
        3. If the call or the parsing fails, the keyword rule engine is used.
    """

    # System prompt sent with every LLM analysis request.
    SYSTEM_PROMPT = """你是专业的 A 股市场情报分析师,专门负责解读政策新闻和市场信息。

你的核心任务是:
1. 判断新闻的重要性级别 (A/B/C级)
2. 识别受影响的行业板块
3. 评估新闻对市场的影响强度
4. 判断叙事所处的生命周期阶段

评级标准:
- A级:央行/国务院级别的重大政策(如降准降息、印花税调整、重大改革),通常能引发市场大幅波动
- B级:部委级别的行业政策(如工信部新规、发改委规划),对特定行业有显著影响
- C级:一般性新闻、行业动态、公司公告,影响有限

生命周期阶段:
- incubation (潜伏期): 政策风向初现,尚未被市场充分认知
- fermentation (发酵期): 市场开始关注,资金试探性流入
- realization (兑现期): 政策落地,市场充分反应
- decay (衰退期): 利好出尽,边际效应递减

请严格按照 JSON 格式返回分析结果。"""

    # Closed value sets used to validate what the LLM returns.
    VALID_LEVELS = ("A", "B", "C")
    VALID_STAGES = ("incubation", "fermentation", "realization", "decay")
    VALID_SENTIMENTS = ("positive", "negative", "neutral")

    # ==================== Rule-engine keyword tables (fallback path) ====================

    # Any of these substrings makes a news item level A.
    LEVEL_A_KEYWORDS = [
        "印花税", "降准", "降息", "降低存款准备金率",
        "央行宣布", "国务院", "重磅", "历史性", "里程碑",
        "全面降息", "万亿", "大规模刺激", "救市", "重大改革"
    ]

    # Any of these substrings (when no level-A keyword matched) makes it level B.
    LEVEL_B_KEYWORDS = [
        "部委", "指导意见", "规划", "通知",
        "发改委", "工信部", "证监会", "银保监会",
        "新政", "改革", "试点", "专项", "行动计划"
    ]

    def __init__(self, llm_client=None, use_llm: bool = True):
        """
        Args:
            llm_client: Client object for the LLM. When omitted, the default
                client from ``get_llm_client()`` is used.
            use_llm: Whether to analyse with the LLM (default True). LLM mode
                is only enabled when a client is actually available.
        """
        self.llm_client = llm_client or get_llm_client()
        self.use_llm = use_llm and (self.llm_client is not None)
        self.asset_map = self._load_asset_map()

        # Sector keyword table: reference info for the LLM prompt and the
        # matching table for the rule engine.
        self.sector_keywords = {}
        # Asset id -> display name.
        self.sector_names = {}
        if self.asset_map:
            for asset_id, asset_data in self.asset_map.get("assets", {}).items():
                self.sector_keywords[asset_id] = asset_data.get("keywords", [])
                self.sector_names[asset_id] = asset_data.get("name", asset_id)

        if self.use_llm:
            print("[Analyst] ✅ LLM 模式启用 - 将使用大模型进行智能分析")
        else:
            print("[Analyst] ⚠️ 降级模式 - 将使用规则引擎(关键词匹配)")

    def _load_asset_map(self) -> Optional[Dict]:
        """
        Load ``core/asset_map.json``.

        The path is tried relative to the current working directory first
        (the historical behaviour) and then relative to the parent directory
        of this file's directory, so the analyzer also works when started
        from another directory.

        Returns:
            The parsed mapping, or None when the file is missing, unreadable
            or not a JSON object (a warning is printed in each case).
        """
        candidates = (
            Path("core") / "asset_map.json",
            Path(__file__).resolve().parent.parent / "core" / "asset_map.json",
        )
        asset_map_path = next((p for p in candidates if p.exists()), None)
        if asset_map_path is None:
            print("[Analyst] 警告: 未找到 asset_map.json")
            return None

        try:
            with open(asset_map_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            # A broken map degrades to "no asset map" instead of crashing __init__.
            print(f"[Analyst] 警告: asset_map.json 加载失败: {e}")
            return None

        if not isinstance(data, dict):
            print("[Analyst] 警告: asset_map.json 格式无效(顶层应为对象)")
            return None
        return data

    def _get_available_sectors_prompt(self) -> str:
        """Return one ``- id (name): kw1, kw2`` line per asset, or "" without an asset map."""
        if not self.asset_map:
            return ""

        sectors_info = []
        for asset_id, asset_data in self.asset_map.get("assets", {}).items():
            name = asset_data.get("name", asset_id)
            keywords = asset_data.get("keywords", [])[:5]  # first five keywords only
            sectors_info.append(f"- {asset_id} ({name}): {', '.join(keywords)}")

        return "\n".join(sectors_info)

    def _build_analysis_prompt(self, news_text: str, title: str) -> str:
        """Build the user prompt: title, body, available sectors and the JSON reply schema."""
        sectors_info = self._get_available_sectors_prompt()

        return f"""请分析以下新闻,并以 JSON 格式返回结果:

【新闻标题】
{title}

【新闻正文】
{news_text}

【可用板块列表】
{sectors_info}

请返回如下格式的 JSON(不要包含其他文字):
{{
"level": "A/B/C",
"level_reason": "判断级别的理由(50字内)",
"sectors": ["asset_id1", "asset_id2"],
"sector_reason": "选择这些板块的理由(50字内)",
"score": 0-100,
"score_reason": "评分依据(30字内)",
"lifecycle_stage": "incubation/fermentation/realization/decay",
"stage_reason": "生命周期判断理由(30字内)",
"sentiment": "positive/negative/neutral",
"key_signal": "新闻中最关键的投资信号(30字内)"
}}"""

    def _collect_related_etfs(self, sectors: List[str]) -> List[str]:
        """Return the ETFs of the given sectors, de-duplicated in first-seen order."""
        if not self.asset_map:
            return []
        assets = self.asset_map.get("assets", {})
        etfs = []
        for sector in sectors:
            etfs.extend(assets.get(sector, {}).get("etfs", []))
        # dict.fromkeys keeps insertion order, so the result is deterministic
        # (a plain set would reorder between runs).
        return list(dict.fromkeys(etfs))

    def analyze_news(self, news_text: str, title: str = "") -> Dict:
        """
        Analyse a single news item (main entry point).

        The LLM is tried first when enabled; on failure the rule engine is used.

        Args:
            news_text: News body.
            title: News title (optional).

        Returns:
            Analysis result dictionary.
        """
        if self.use_llm:
            result = self._analyze_with_llm(news_text, title)
            if result:
                return result
            print("[Analyst] ⚠️ LLM 分析失败,降级到规则引擎")

        # Fallback: keyword rule engine.
        return self._analyze_with_rules(news_text, title)

    def _analyze_with_llm(self, news_text: str, title: str) -> Optional[Dict]:
        """
        Analyse a news item with the LLM.

        Args:
            news_text: News body.
            title: News title.

        Returns:
            Analysis result dictionary, or None when the call fails, returns
            nothing, or the reply cannot be parsed.
        """
        user_prompt = self._build_analysis_prompt(news_text, title)

        try:
            # NOTE(review): the request goes through the module-level
            # ``llm_call`` helper; ``self.llm_client`` is not passed to it here.
            llm_output = llm_call(
                messages=[
                    {"role": "system", "content": self.SYSTEM_PROMPT},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0.3,
                max_tokens=800
            )

            if not llm_output:
                return None

            return self._parse_llm_output(llm_output, title)

        except json.JSONDecodeError as e:
            print(f"[Analyst] LLM 返回格式解析失败: {e}")
            return None
        except Exception as e:
            # Boundary handler: any LLM/parsing problem must lead to the
            # rule-engine fallback, never to a crash of the caller.
            print(f"[Analyst] LLM 分析异常: {e}")
            return None

    @staticmethod
    def _extract_json_text(llm_output: str) -> str:
        """Strip markdown fences and surrounding prose from an LLM reply, keeping the JSON object."""
        text = llm_output.strip()
        if text.startswith("```"):
            text = re.sub(r'^```(?:json)?\s*', '', text)
            text = re.sub(r'\s*```$', '', text)

        # Models sometimes wrap the object in explanatory text; keep only the
        # outermost {...} span when one exists.
        start, end = text.find("{"), text.rfind("}")
        if start != -1 and end > start:
            text = text[start:end + 1]
        return text

    def _parse_llm_output(self, llm_output: str, title: str) -> Dict:
        """
        Parse and normalize the raw LLM reply into an analysis result.

        Args:
            llm_output: Raw text returned by the LLM.
            title: News title, copied into the result.

        Returns:
            Analysis result dictionary with ``analysis_method == "llm"``.

        Raises:
            json.JSONDecodeError: The reply contains no parseable JSON.
            ValueError: The parsed JSON is not an object.
        """
        llm_result = json.loads(self._extract_json_text(llm_output))
        if not isinstance(llm_result, dict):
            raise ValueError(
                f"LLM 返回的 JSON 顶层不是对象: {type(llm_result).__name__}"
            )

        # Level: anything outside A/B/C falls back to the lowest grade.
        level = str(llm_result.get("level") or "C").strip().upper()
        if level not in self.VALID_LEVELS:
            level = "C"

        # Sectors: keep only ids known to the asset map, once each, in reply order.
        raw_sectors = llm_result.get("sectors") or []
        if isinstance(raw_sectors, str):
            raw_sectors = [raw_sectors]  # a bare string must not be iterated per character
        valid_sectors = list(dict.fromkeys(
            s for s in raw_sectors
            if isinstance(s, str) and s in self.sector_keywords
        ))

        # Score: a non-numeric value falls back to the neutral default instead
        # of discarding the whole analysis; the value is clamped to 0-100.
        try:
            score = float(llm_result.get("score", 50))
        except (TypeError, ValueError):
            score = 50.0
        score = max(0, min(100, score))

        lifecycle_stage = str(llm_result.get("lifecycle_stage") or "incubation").strip().lower()
        if lifecycle_stage not in self.VALID_STAGES:
            lifecycle_stage = "incubation"

        # Sentiment is counted by exact value during aggregation, so normalize it here.
        sentiment = str(llm_result.get("sentiment") or "neutral").strip().lower()
        if sentiment not in self.VALID_SENTIMENTS:
            sentiment = "neutral"

        return {
            "title": title,
            "level": level,
            "level_reason": llm_result.get("level_reason", ""),
            "sectors": valid_sectors,
            "sector_reason": llm_result.get("sector_reason", ""),
            "score": round(score, 2),
            "score_reason": llm_result.get("score_reason", ""),
            "lifecycle_stage": lifecycle_stage,
            "stage_reason": llm_result.get("stage_reason", ""),
            "related_etfs": self._collect_related_etfs(valid_sectors),
            "sentiment": sentiment,
            "key_signal": llm_result.get("key_signal", ""),
            "timestamp": datetime.now().isoformat(),
            "analysis_method": "llm",
            "summary": f"{level}级 {len(valid_sectors)}板块 评分{score:.0f} [{sentiment}]"
        }

    def _analyze_with_rules(self, news_text: str, title: str) -> Dict:
        """
        Rule-engine analysis (fallback path).

        Used when the LLM is unavailable; everything is derived from keyword
        matching on the title plus body.

        Args:
            news_text: News body.
            title: News title.

        Returns:
            Analysis result dictionary with ``analysis_method == "rules"``.
        """
        full_text = f"{title}\n{news_text}" if title else news_text

        level = self._classify_level_by_keywords(full_text)
        sectors = self._extract_sectors_by_keywords(full_text)
        score = self._calculate_score_by_rules(full_text, level, sectors)
        stage = self._determine_stage_by_rules(level, score)

        return {
            "title": title,
            "level": level,
            "level_reason": "基于关键词匹配(降级模式)",
            "sectors": sectors,
            "sector_reason": "基于关键词匹配(降级模式)",
            "score": score,
            "score_reason": "规则引擎计算",
            "lifecycle_stage": stage,
            "stage_reason": "基于级别和评分推断",
            "related_etfs": self._collect_related_etfs(sectors),
            "sentiment": "neutral",
            "key_signal": "",
            "timestamp": datetime.now().isoformat(),
            "analysis_method": "rules",
            "summary": f"{level}级 {len(sectors)}板块 评分{score} [降级模式]"
        }

    # ==================== Rule-engine helpers (fallback path) ====================

    def _classify_level_by_keywords(self, text: str) -> str:
        """Grade the text: "A" if any level-A keyword occurs, else "B" for level-B keywords, else "C"."""
        if any(keyword in text for keyword in self.LEVEL_A_KEYWORDS):
            return "A"
        if any(keyword in text for keyword in self.LEVEL_B_KEYWORDS):
            return "B"
        return "C"

    def _extract_sectors_by_keywords(self, text: str) -> List[str]:
        """Return the sectors that have at least one non-empty keyword occurring in the text."""
        # An empty keyword is skipped: "" is a substring of every text and
        # would otherwise match the sector unconditionally.
        return [
            sector
            for sector, keywords in self.sector_keywords.items()
            if any(keyword and keyword in text for keyword in keywords)
        ]

    def _calculate_score_by_rules(self, text: str, level: str, sectors: List[str]) -> float:
        """Score = base by level (85/65/45) + 5 per sector (max 15) + 10 for a positive word, capped at 100."""
        base_score = {"A": 85, "B": 65, "C": 45}.get(level, 45)
        sector_bonus = min(len(sectors) * 5, 15)

        # Positive-wording bonus.
        positive_keywords = ["利好", "刺激", "支持", "鼓励", "加快", "推动", "大力"]
        emotion_bonus = 10 if any(kw in text for kw in positive_keywords) else 0

        return min(base_score + sector_bonus + emotion_bonus, 100)

    def _determine_stage_by_rules(self, level: str, score: float) -> str:
        """Infer the lifecycle stage from level and score."""
        if level == "A" and score >= 80:
            return "realization"
        if level in ("A", "B") and score >= 60:
            return "fermentation"
        return "incubation"

    def batch_analyze(
        self,
        news_list: List[Dict[str, str]]
    ) -> List[Dict]:
        """
        Analyse a batch of news items, printing one progress line per item.

        Args:
            news_list: News items, each a dict with "title" and "text".

        Returns:
            One analysis result per input item, in input order.
        """
        results = []
        llm_count = 0
        rules_count = 0

        mode = "LLM智能分析" if self.use_llm else "规则引擎(降级模式)"
        print(f"[Analyst] 开始批量分析 {len(news_list)} 条新闻 | 模式: {mode}")

        for i, news in enumerate(news_list, 1):
            # "or" also covers an explicit None value, not only a missing key.
            title = news.get("title") or ""
            text = news.get("text") or ""

            # Shortened title for the progress line.
            display_title = title[:25] + "..." if len(title) > 25 else title

            result = self.analyze_news(text, title)
            results.append(result)

            # Track which path produced the result.
            if result.get("analysis_method") == "llm":
                llm_count += 1
                method_icon = "🤖"
            else:
                rules_count += 1
                method_icon = "📏"

            print(f"[Analyst] {method_icon} ({i}/{len(news_list)}) {result['level']}级 | {result['score']:.0f}分 | {display_title}")

        print(f"[Analyst] ✅ 批量分析完成 | LLM: {llm_count}条, 规则: {rules_count}条")
        return results

    def generate_narrative_json(
        self,
        analysis_results: List[Dict]
    ) -> Dict:
        """
        Build the standardized Narrative_JSON from per-news results.

        Aggregation:
            1. Group news by sector (a news item contributes to each of its sectors once).
            2. Compute the average and the maximum score per sector.
            3. Collect up to five distinct key signals, in first-seen order.
            4. Derive the lifecycle stage: a level-A item sets the stage to its
               own stage (the last level-A item wins); a level-B item lifts
               "incubation" to "fermentation".

        Args:
            analysis_results: Output of ``batch_analyze``.

        Returns:
            Dict with "timestamp", "total_news" and "narratives" (sorted by
            ``max_score`` descending).
        """
        sector_narratives: Dict[str, Dict] = {}

        for result in analysis_results:
            # dict.fromkeys removes repeated sector ids so that one news item
            # cannot be counted twice for the same sector.
            for sector in dict.fromkeys(result['sectors']):
                if sector not in sector_narratives:
                    sector_narratives[sector] = {
                        "topic": sector,
                        "topic_name": self.sector_names.get(sector, sector),
                        "news_count": 0,
                        "avg_score": 0,  # holds the running sum until post-processing
                        "max_score": 0,
                        "level_a_count": 0,
                        "level_b_count": 0,
                        "related_etfs": {},  # dict used as an insertion-ordered set
                        "lifecycle_stage": "incubation",
                        "key_signals": [],
                        "sentiments": [],  # temporary, removed in post-processing
                        "analysis_methods": {"llm": 0, "rules": 0}
                    }

                narrative = sector_narratives[sector]
                narrative["news_count"] += 1
                narrative["avg_score"] += result['score']
                narrative["max_score"] = max(narrative["max_score"], result['score'])
                narrative["related_etfs"].update(dict.fromkeys(result['related_etfs']))

                # Level statistics and lifecycle-stage derivation.
                if result['level'] == "A":
                    narrative["level_a_count"] += 1
                    narrative["lifecycle_stage"] = result['lifecycle_stage']
                elif result['level'] == "B":
                    narrative["level_b_count"] += 1
                    if narrative["lifecycle_stage"] == "incubation":
                        narrative["lifecycle_stage"] = "fermentation"

                # Key signals (only the LLM path produces non-empty ones).
                if result.get("key_signal"):
                    narrative["key_signals"].append(result["key_signal"])

                narrative["sentiments"].append(result.get("sentiment", "neutral"))

                method = result.get("analysis_method", "rules")
                narrative["analysis_methods"][method] = narrative["analysis_methods"].get(method, 0) + 1

        # Post-processing: averages, sentiment majority, de-duplication.
        for narrative in sector_narratives.values():
            narrative["avg_score"] = round(
                narrative["avg_score"] / narrative["news_count"], 2
            )
            narrative["related_etfs"] = list(narrative["related_etfs"])

            sentiments = narrative["sentiments"]
            positive_count = sentiments.count("positive")
            negative_count = sentiments.count("negative")
            if positive_count > negative_count:
                narrative["overall_sentiment"] = "positive"
            elif negative_count > positive_count:
                narrative["overall_sentiment"] = "negative"
            else:
                narrative["overall_sentiment"] = "neutral"

            # Order-preserving de-duplication: the first five distinct signals
            # are kept, so the selection is stable across runs.
            narrative["key_signals"] = list(dict.fromkeys(narrative["key_signals"]))[:5]

            # Drop the temporary field.
            del narrative["sentiments"]

        sorted_narratives = sorted(
            sector_narratives.values(),
            key=lambda x: x['max_score'],
            reverse=True
        )

        return {
            "timestamp": datetime.now().isoformat(),
            "total_news": len(analysis_results),
            "narratives": sorted_narratives
        }
|
||
|
||
|
||
# ==================== 测试代码 ====================
|
||
|
||
if __name__ == "__main__":
    # Manual smoke test: analyse three sample news items, print the per-item
    # results, then print the aggregated narratives.
    banner = "=" * 60

    print(banner)
    print("Skill A: Analyst 情报分析师测试")
    print(banner)

    # Analyzer with default settings (LLM enabled when a client is available).
    analyst = NewsAnalyzer()

    # Sample news items.
    test_news = [
        {
            "title": "央行宣布降准0.5个百分点 释放长期资金约1万亿",
            "text": "中国人民银行决定于2025年1月5日降低金融机构存款准备金率0.5个百分点,"
                    "此次降准为全面降准,除已执行5%存款准备金率的部分法人金融机构外,"
                    "对其他金融机构普遍下调存款准备金率0.5个百分点,此次降准共计释放长期资金约1万亿元。"
        },
        {
            "title": "工信部发布低空经济发展指导意见 推动eVTOL产业化",
            "text": "工业和信息化部发布《低空经济高质量发展指导意见》,提出到2027年,"
                    "低空经济规模达到1万亿元,培育10家以上龙头企业。重点支持无人机、"
                    "电动垂直起降飞行器(eVTOL)等装备的研发和产业化。"
        },
        {
            "title": "AI算力需求持续旺盛 多家数据中心公司业绩预增",
            "text": "随着人工智能应用的快速发展,算力需求呈现爆发式增长。多家上市公司发布业绩预告,"
                    "预计2024年净利润增长50%以上。分析师认为,智算中心建设将成为未来3年的投资主线。"
        }
    ]

    # Batch analysis.
    print("\n")
    results = analyst.batch_analyze(test_news)

    print("\n" + banner)
    print("详细分析结果:")
    print(banner)

    for index, item in enumerate(results, 1):
        sector_text = ', '.join(item['sectors']) if item['sectors'] else '未识别'
        etf_text = ', '.join(item['related_etfs'][:4]) if item['related_etfs'] else '无'

        print(f"\n【新闻 {index}】{item['title'][:50]}")
        print(f" 📊 分析方法: {item.get('analysis_method', 'unknown').upper()}")
        print(f" 📌 级别: {item['level']} | 评分: {item['score']}")
        print(f" 🎯 板块: {sector_text}")
        print(f" 🔄 周期阶段: {item['lifecycle_stage']}")
        print(f" 📈 情绪: {item.get('sentiment', 'neutral')}")
        if item.get('key_signal'):
            print(f" 💡 关键信号: {item['key_signal']}")
        if item.get('level_reason'):
            print(f" 📝 级别理由: {item['level_reason']}")
        print(f" 🏷️ 关联 ETF: {etf_text}...")

    # Aggregated narrative JSON.
    print("\n" + banner)
    print("聚合叙事 JSON:")
    print(banner)

    narrative_json = analyst.generate_narrative_json(results)
    print(f"\n📰 总新闻数: {narrative_json['total_news']}")
    print(f"🎯 识别叙事: {len(narrative_json['narratives'])} 个\n")

    for entry in narrative_json['narratives']:
        topic_name = entry.get('topic_name', entry['topic'])

        # Show the highest news level present for this narrative.
        if entry['level_a_count'] > 0:
            level_info = f"A级×{entry['level_a_count']}"
        elif entry.get('level_b_count', 0) > 0:
            level_info = f"B级×{entry.get('level_b_count', 0)}"
        else:
            level_info = "C级"

        print(f" 📍 {topic_name} ({entry['topic']})")
        print(f" 评分: {entry['max_score']:.0f} (平均{entry['avg_score']:.0f}) | "
              f"新闻: {entry['news_count']}条 | {level_info}")
        print(f" 阶段: {entry['lifecycle_stage']} | 情绪: {entry.get('overall_sentiment', 'neutral')}")
        if entry.get('key_signals'):
            print(f" 信号: {'; '.join(entry['key_signals'][:2])}")
        print()

    print("✅ Analyst 模块测试完成")
|