MND-IA/skills/analyst.py
2025-12-31 19:58:09 +08:00

583 lines
23 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
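Skill A: Analyst (intelligence analyst)
=======================================
Role: NLP extraction, news grading, narrative scoring
Input: a list of news texts
Output: Narrative_JSON (narrative subject, score, lifecycle stage)
Design principles:
- The core analysis logic must be carried out by the LLM, not by simple keyword matching
- Keywords serve only as auxiliary information for the LLM analysis and as a fallback
- Degrade gracefully when the LLM fails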
"""
import json
import re
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from pathlib import Path
import os
import sys
# Add the project root directory to sys.path so that `core` can be imported
sys.path.insert(0, str(Path(__file__).parent.parent))
from core.config import config, get_llm_client, llm_call
class NewsAnalyzer:
    """
    Intelligence analyst: analyses news text and extracts market narratives.

    Capabilities:
    1. Semantic news analysis through an LLM (primary path)
    2. News grading into levels A/B/C
    3. Sector (narrative topic) extraction and ETF association
    4. Heat scoring and sentiment judgement
    5. Lifecycle-stage inference

    Flow:
    1. Try the LLM first and parse its structured JSON answer.
    2. If the call or the parsing fails, fall back to the keyword rule engine.
    """

    # Closed sets used to validate the fields returned by the LLM.
    _VALID_LEVELS = ("A", "B", "C")
    _VALID_STAGES = ("incubation", "fermentation", "realization", "decay")
    _VALID_SENTIMENTS = ("positive", "negative", "neutral")

    # System prompt for the LLM analysis (runtime text, sent verbatim).
    SYSTEM_PROMPT = """你是专业的 A 股市场情报分析师,专门负责解读政策新闻和市场信息。
你的核心任务是:
1. 判断新闻的重要性级别 (A/B/C级)
2. 识别受影响的行业板块
3. 评估新闻对市场的影响强度
4. 判断叙事所处的生命周期阶段
评级标准:
- A级央行/国务院级别的重大政策(如降准降息、印花税调整、重大改革),通常能引发市场大幅波动
- B级部委级别的行业政策如工信部新规、发改委规划对特定行业有显著影响
- C级一般性新闻、行业动态、公司公告影响有限
生命周期阶段:
- incubation (潜伏期): 政策风向初现,尚未被市场充分认知
- fermentation (发酵期): 市场开始关注,资金试探性流入
- realization (兑现期): 政策落地,市场充分反应
- decay (衰退期): 利好出尽,边际效应递减
请严格按照 JSON 格式返回分析结果。"""

    def __init__(self, llm_client=None, use_llm: bool = True):
        """
        Args:
            llm_client: LLM client object; when omitted, the default client
                returned by ``get_llm_client()`` is used.
            use_llm: whether to analyse with the LLM (default True). The LLM
                path is only enabled when a client is actually available.
        """
        # NOTE(review): the client is stored here, but _analyze_with_llm goes
        # through the module-level llm_call(); confirm an injected client is
        # honoured by that helper.
        self.llm_client = llm_client or get_llm_client()
        self.use_llm = use_llm and (self.llm_client is not None)
        self.asset_map = self._load_asset_map()

        # Sector keyword map: reference material for the LLM and the data
        # source of the rule-engine fallback.
        self.sector_keywords = {}
        # Asset id -> display name.
        self.sector_names = {}
        if self.asset_map:
            for asset_id, asset_data in self.asset_map.get("assets", {}).items():
                self.sector_keywords[asset_id] = asset_data.get("keywords", [])
                self.sector_names[asset_id] = asset_data.get("name", asset_id)

        if self.use_llm:
            print("[Analyst] ✅ LLM 模式启用 - 将使用大模型进行智能分析")
        else:
            print("[Analyst] ⚠️ 降级模式 - 将使用规则引擎(关键词匹配)")

    def _load_asset_map(self) -> Optional[Dict]:
        """
        Load the asset map (``core/asset_map.json``).

        The path relative to the current working directory is tried first;
        if it is absent, the path relative to this file's grandparent
        directory (the same root that is put on ``sys.path`` for the ``core``
        imports) is tried, so the analyzer also works when started from
        another directory.

        Returns:
            The parsed JSON document, or None when the file cannot be found.
        """
        relative_path = Path("core") / "asset_map.json"
        candidates = [
            relative_path,
            Path(__file__).resolve().parent.parent / relative_path,
        ]
        for candidate in candidates:
            if candidate.is_file():
                with open(candidate, 'r', encoding='utf-8') as f:
                    return json.load(f)
        print("[Analyst] 警告: 未找到 asset_map.json")
        return None

    @staticmethod
    def _dedupe(items) -> List:
        """Return the items without duplicates, keeping first-seen order."""
        return list(dict.fromkeys(items))

    def _collect_etfs(self, sectors: List[str]) -> List[str]:
        """Return the ETFs listed in the asset map for the given sectors (unique, ordered)."""
        if not self.asset_map:
            return []
        assets = self.asset_map.get("assets", {})
        etfs = []
        for sector in sectors:
            etfs.extend(assets.get(sector, {}).get("etfs", []))
        return self._dedupe(etfs)

    def _get_available_sectors_prompt(self) -> str:
        """Render the known sectors (id, name, first five keywords) for the prompt."""
        if not self.asset_map:
            return ""
        sectors_info = []
        for asset_id, asset_data in self.asset_map.get("assets", {}).items():
            name = asset_data.get("name", asset_id)
            keywords = asset_data.get("keywords", [])[:5]  # first five keywords only
            sectors_info.append(f"- {asset_id} ({name}): {', '.join(keywords)}")
        return "\n".join(sectors_info)

    def _build_analysis_prompt(self, news_text: str, title: str) -> str:
        """Build the user prompt sent to the LLM for one news item."""
        sectors_info = self._get_available_sectors_prompt()
        return f"""请分析以下新闻,并以 JSON 格式返回结果:
【新闻标题】
{title}
【新闻正文】
{news_text}
【可用板块列表】
{sectors_info}
请返回如下格式的 JSON不要包含其他文字
{{
"level": "A/B/C",
"level_reason": "判断级别的理由50字内",
"sectors": ["asset_id1", "asset_id2"],
"sector_reason": "选择这些板块的理由50字内",
"score": 0-100,
"score_reason": "评分依据30字内",
"lifecycle_stage": "incubation/fermentation/realization/decay",
"stage_reason": "生命周期判断理由30字内",
"sentiment": "positive/negative/neutral",
"key_signal": "新闻中最关键的投资信号30字内"
}}"""

    def analyze_news(self, news_text: str, title: str = "") -> Dict:
        """
        Analyse a single news item (main entry point).

        The LLM is used when enabled; if it yields no usable result the rule
        engine is used instead.

        Args:
            news_text: news body
            title: news title (optional)

        Returns:
            Analysis result dictionary.
        """
        if self.use_llm:
            result = self._analyze_with_llm(news_text, title)
            if result:
                return result
            print("[Analyst] ⚠️ LLM 分析失败,降级到规则引擎")
        # Fallback: rule engine.
        return self._analyze_with_rules(news_text, title)

    @staticmethod
    def _extract_json_text(llm_output: str) -> str:
        """
        Strip markdown fences and surrounding prose from an LLM answer.

        Returns the text between the first ``{`` and the last ``}`` when both
        are present, otherwise the fence-stripped text unchanged.
        """
        text = llm_output.strip()
        if text.startswith("```"):
            text = re.sub(r'^```(?:json)?\s*', '', text)
            text = re.sub(r'\s*```$', '', text)
        # Models sometimes add an explanation around the object; keep only the
        # outermost JSON object so json.loads does not fail on that prose.
        start, end = text.find("{"), text.rfind("}")
        if 0 <= start < end:
            text = text[start:end + 1]
        return text

    def _normalize_llm_result(self, llm_result: Dict, title: str) -> Dict:
        """
        Validate the decoded LLM answer and convert it to the result schema.

        Unknown levels fall back to "C", unknown lifecycle stages to
        "incubation", unknown sentiments to "neutral"; sectors not present in
        the asset map are dropped and duplicates removed; the score is clamped
        to 0-100.

        Raises:
            ValueError / TypeError: when the answer is not a JSON object or
                the score is not numeric (the caller treats this as failure).
        """
        if not isinstance(llm_result, dict):
            raise ValueError("LLM result is not a JSON object")

        level = str(llm_result.get("level", "C")).strip().upper()
        if level not in self._VALID_LEVELS:
            level = "C"

        raw_sectors = llm_result.get("sectors", [])
        if isinstance(raw_sectors, str):
            raw_sectors = [raw_sectors]
        elif not isinstance(raw_sectors, (list, tuple)):
            raw_sectors = []
        # Keep only known sectors; de-duplicate so one news item is never
        # counted twice for the same sector during aggregation.
        valid_sectors = self._dedupe(
            s for s in raw_sectors
            if isinstance(s, str) and s in self.sector_keywords
        )

        score = float(llm_result.get("score", 50))
        score = max(0.0, min(100.0, score))  # clamp to 0-100

        lifecycle_stage = llm_result.get("lifecycle_stage", "incubation")
        if lifecycle_stage not in self._VALID_STAGES:
            lifecycle_stage = "incubation"

        sentiment = str(llm_result.get("sentiment", "neutral")).strip().lower()
        if sentiment not in self._VALID_SENTIMENTS:
            sentiment = "neutral"

        return {
            "title": title,
            "level": level,
            "level_reason": llm_result.get("level_reason", ""),
            "sectors": valid_sectors,
            "sector_reason": llm_result.get("sector_reason", ""),
            "score": round(score, 2),
            "score_reason": llm_result.get("score_reason", ""),
            "lifecycle_stage": lifecycle_stage,
            "stage_reason": llm_result.get("stage_reason", ""),
            "related_etfs": self._collect_etfs(valid_sectors),
            "sentiment": sentiment,
            "key_signal": llm_result.get("key_signal", ""),
            "timestamp": datetime.now().isoformat(),
            "analysis_method": "llm",
            "summary": f"{level}{len(valid_sectors)}板块 评分{score:.0f} [{sentiment}]"
        }

    def _analyze_with_llm(self, news_text: str, title: str) -> Optional[Dict]:
        """
        Analyse one news item with the LLM.

        Args:
            news_text: news body
            title: news title

        Returns:
            Analysis result dictionary, or None on any failure (empty answer,
            undecodable JSON, invalid field types, exception from the call).
        """
        user_prompt = self._build_analysis_prompt(news_text, title)
        try:
            llm_output = llm_call(
                messages=[
                    {"role": "system", "content": self.SYSTEM_PROMPT},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0.3,
                max_tokens=800
            )
            if not llm_output:
                return None
            llm_result = json.loads(self._extract_json_text(llm_output))
            return self._normalize_llm_result(llm_result, title)
        except json.JSONDecodeError as e:
            print(f"[Analyst] LLM 返回格式解析失败: {e}")
            return None
        except Exception as e:
            # Deliberate best-effort boundary: any failure here triggers the
            # rule-engine fallback in analyze_news instead of aborting.
            print(f"[Analyst] LLM 分析异常: {e}")
            return None

    def _analyze_with_rules(self, news_text: str, title: str) -> Dict:
        """
        Rule-engine analysis (fallback when the LLM is unavailable).

        Args:
            news_text: news body
            title: news title

        Returns:
            Analysis result dictionary with the same keys as the LLM path.
        """
        full_text = f"{title}\n{news_text}" if title else news_text
        # 1. grading, 2. sector detection, 3. score, 4. lifecycle stage
        level = self._classify_level_by_keywords(full_text)
        sectors = self._extract_sectors_by_keywords(full_text)
        score = self._calculate_score_by_rules(full_text, level, sectors)
        stage = self._determine_stage_by_rules(level, score)
        return {
            "title": title,
            "level": level,
            "level_reason": "基于关键词匹配(降级模式)",
            "sectors": sectors,
            "sector_reason": "基于关键词匹配(降级模式)",
            "score": score,
            "score_reason": "规则引擎计算",
            "lifecycle_stage": stage,
            "stage_reason": "基于级别和评分推断",
            "related_etfs": self._collect_etfs(sectors),
            "sentiment": "neutral",
            "key_signal": "",
            "timestamp": datetime.now().isoformat(),
            "analysis_method": "rules",
            "summary": f"{level}{len(sectors)}板块 评分{score} [降级模式]"
        }

    # ==================== Rule-engine helpers (fallback path) ====================
    # Keyword lists used for news grading.
    LEVEL_A_KEYWORDS = [
        "印花税", "降准", "降息", "降低存款准备金率",
        "央行宣布", "国务院", "重磅", "历史性", "里程碑",
        "全面降息", "万亿", "大规模刺激", "救市", "重大改革"
    ]
    LEVEL_B_KEYWORDS = [
        "部委", "指导意见", "规划", "通知",
        "发改委", "工信部", "证监会", "银保监会",
        "新政", "改革", "试点", "专项", "行动计划"
    ]

    def _classify_level_by_keywords(self, text: str) -> str:
        """Grade the text: "A" if any level-A keyword occurs, else "B", else "C"."""
        if any(keyword in text for keyword in self.LEVEL_A_KEYWORDS):
            return "A"
        if any(keyword in text for keyword in self.LEVEL_B_KEYWORDS):
            return "B"
        return "C"

    def _extract_sectors_by_keywords(self, text: str) -> List[str]:
        """Return every sector for which at least one keyword occurs in the text."""
        return [
            sector
            for sector, keywords in self.sector_keywords.items()
            if any(keyword in text for keyword in keywords)
        ]

    def _calculate_score_by_rules(self, text: str, level: str, sectors: List[str]) -> float:
        """Score = base by level + 5 per sector (max 15) + 10 for positive wording, capped at 100."""
        base_score = {"A": 85, "B": 65, "C": 45}.get(level, 45)
        sector_bonus = min(len(sectors) * 5, 15)
        # Positive-wording detection.
        positive_keywords = ["利好", "刺激", "支持", "鼓励", "加快", "推动", "大力"]
        emotion_bonus = 10 if any(kw in text for kw in positive_keywords) else 0
        return min(base_score + sector_bonus + emotion_bonus, 100)

    def _determine_stage_by_rules(self, level: str, score: float) -> str:
        """Infer the lifecycle stage from level and score."""
        if level == "A" and score >= 80:
            return "realization"
        if level in ["A", "B"] and score >= 60:
            return "fermentation"
        return "incubation"

    def batch_analyze(
        self,
        news_list: List[Dict[str, str]]
    ) -> List[Dict]:
        """
        Analyse a batch of news items.

        Args:
            news_list: news items, each a dict with "title" and "text"

        Returns:
            List of analysis results, in input order.
        """
        results = []
        llm_count = 0
        rules_count = 0
        mode = "LLM智能分析" if self.use_llm else "规则引擎(降级模式)"
        print(f"[Analyst] 开始批量分析 {len(news_list)} 条新闻 | 模式: {mode}")
        for i, news in enumerate(news_list, 1):
            title = news.get("title", "")
            text = news.get("text", "")
            # Shorten the title for the progress line.
            display_title = title[:25] + "..." if len(title) > 25 else title
            result = self.analyze_news(text, title)
            results.append(result)
            # Track which analysis path produced the result.
            if result.get("analysis_method") == "llm":
                llm_count += 1
                method_icon = "🤖"
            else:
                rules_count += 1
                method_icon = "📏"
            print(f"[Analyst] {method_icon} ({i}/{len(news_list)}) {result['level']}级 | {result['score']:.0f}分 | {display_title}")
        print(f"[Analyst] ✅ 批量分析完成 | LLM: {llm_count}条, 规则: {rules_count}")
        return results

    def generate_narrative_json(
        self,
        analysis_results: List[Dict]
    ) -> Dict:
        """
        Build the standardised Narrative_JSON (consumed by the Strategist).

        Aggregation:
        1. Group the news by sector (each news item counts once per sector).
        2. Compute average and maximum score per sector.
        3. Collect up to five distinct key signals, in first-seen order.
        4. Derive the lifecycle stage and the overall sentiment.

        Args:
            analysis_results: output of batch_analyze

        Returns:
            Dict with "timestamp", "total_news" and "narratives" (sorted by
            max score, descending).
        """
        sector_narratives = {}
        for result in analysis_results:
            # De-duplicate so a repeated sector id cannot double-count a news item.
            for sector in dict.fromkeys(result['sectors']):
                if sector not in sector_narratives:
                    sector_narratives[sector] = {
                        "topic": sector,
                        "topic_name": self.sector_names.get(sector, sector),
                        "news_count": 0,
                        "avg_score": 0,  # holds the running sum until post-processing
                        "max_score": 0,
                        "level_a_count": 0,
                        "level_b_count": 0,
                        "related_etfs": {},  # dict used as an insertion-ordered set
                        "lifecycle_stage": "incubation",
                        "key_signals": [],
                        "sentiments": [],  # temporary, removed in post-processing
                        "analysis_methods": {"llm": 0, "rules": 0}
                    }
                narrative = sector_narratives[sector]
                narrative["news_count"] += 1
                narrative["avg_score"] += result['score']
                narrative["max_score"] = max(narrative["max_score"], result['score'])
                narrative["related_etfs"].update(dict.fromkeys(result['related_etfs']))

                # Level statistics; an A-level item dictates the stage, a
                # B-level item only lifts the stage out of incubation.
                if result['level'] == "A":
                    narrative["level_a_count"] += 1
                    narrative["lifecycle_stage"] = result['lifecycle_stage']
                elif result['level'] == "B":
                    narrative["level_b_count"] += 1
                    if narrative["lifecycle_stage"] == "incubation":
                        narrative["lifecycle_stage"] = "fermentation"

                if result.get("key_signal"):
                    narrative["key_signals"].append(result["key_signal"])
                narrative["sentiments"].append(result.get("sentiment", "neutral"))

                method = result.get("analysis_method", "rules")
                narrative["analysis_methods"][method] = narrative["analysis_methods"].get(method, 0) + 1

        # Post-processing: averages, sentiment summary, de-duplication.
        for narrative in sector_narratives.values():
            narrative["avg_score"] = round(
                narrative["avg_score"] / narrative["news_count"], 2
            )
            narrative["related_etfs"] = list(narrative["related_etfs"])

            sentiments = narrative["sentiments"]
            positive_count = sentiments.count("positive")
            negative_count = sentiments.count("negative")
            if positive_count > negative_count:
                narrative["overall_sentiment"] = "positive"
            elif negative_count > positive_count:
                narrative["overall_sentiment"] = "negative"
            else:
                narrative["overall_sentiment"] = "neutral"

            # Order-preserving de-duplication keeps the selected top five
            # stable between runs (a set would pick arbitrary entries).
            narrative["key_signals"] = self._dedupe(narrative["key_signals"])[:5]
            del narrative["sentiments"]

        sorted_narratives = sorted(
            sector_narratives.values(),
            key=lambda x: x['max_score'],
            reverse=True
        )
        return {
            "timestamp": datetime.now().isoformat(),
            "total_news": len(analysis_results),
            "narratives": sorted_narratives
        }
# ==================== Test code ====================
if __name__ == "__main__":
    # Manual smoke test: analyse three sample news items and print both the
    # per-item results and the aggregated narrative JSON.
    banner = "=" * 60
    print(banner)
    print("Skill A: Analyst 情报分析师测试")
    print(banner)

    # Analyzer with default settings (LLM enabled when a client is available).
    analyst = NewsAnalyzer()

    # Sample news items.
    test_news = [
        {
            "title": "央行宣布降准0.5个百分点 释放长期资金约1万亿",
            "text": "中国人民银行决定于2025年1月5日降低金融机构存款准备金率0.5个百分点,"
                    "此次降准为全面降准除已执行5%存款准备金率的部分法人金融机构外,"
                    "对其他金融机构普遍下调存款准备金率0.5个百分点此次降准共计释放长期资金约1万亿元。"
        },
        {
            "title": "工信部发布低空经济发展指导意见 推动eVTOL产业化",
            "text": "工业和信息化部发布《低空经济高质量发展指导意见》提出到2027年"
                    "低空经济规模达到1万亿元培育10家以上龙头企业。重点支持无人机、"
                    "电动垂直起降飞行器eVTOL等装备的研发和产业化。"
        },
        {
            "title": "AI算力需求持续旺盛 多家数据中心公司业绩预增",
            "text": "随着人工智能应用的快速发展,算力需求呈现爆发式增长。多家上市公司发布业绩预告,"
                    "预计2024年净利润增长50%以上。分析师认为智算中心建设将成为未来3年的投资主线。"
        }
    ]

    # Batch analysis.
    print("\n")
    results = analyst.batch_analyze(test_news)

    print("\n" + banner)
    print("详细分析结果:")
    print(banner)
    for idx, item in enumerate(results, 1):
        sector_text = ', '.join(item['sectors']) if item['sectors'] else '未识别'
        etf_text = ', '.join(item['related_etfs'][:4]) if item['related_etfs'] else ''
        print(f"\n【新闻 {idx}{item['title'][:50]}")
        print(f" 📊 分析方法: {item.get('analysis_method', 'unknown').upper()}")
        print(f" 📌 级别: {item['level']} | 评分: {item['score']}")
        print(f" 🎯 板块: {sector_text}")
        print(f" 🔄 周期阶段: {item['lifecycle_stage']}")
        print(f" 📈 情绪: {item.get('sentiment', 'neutral')}")
        if item.get('key_signal'):
            print(f" 💡 关键信号: {item['key_signal']}")
        if item.get('level_reason'):
            print(f" 📝 级别理由: {item['level_reason']}")
        print(f" 🏷️ 关联 ETF: {etf_text}...")

    # Aggregated narrative JSON.
    print("\n" + banner)
    print("聚合叙事 JSON:")
    print(banner)
    narrative_json = analyst.generate_narrative_json(results)
    print(f"\n📰 总新闻数: {narrative_json['total_news']}")
    print(f"🎯 识别叙事: {len(narrative_json['narratives'])}\n")
    for entry in narrative_json['narratives']:
        topic_name = entry.get('topic_name', entry['topic'])
        # Show the highest level present for the topic.
        if entry['level_a_count'] > 0:
            level_info = f"A级×{entry['level_a_count']}"
        elif entry.get('level_b_count', 0) > 0:
            level_info = f"B级×{entry.get('level_b_count', 0)}"
        else:
            level_info = "C级"
        print(f" 📍 {topic_name} ({entry['topic']})")
        print(f" 评分: {entry['max_score']:.0f} (平均{entry['avg_score']:.0f}) | "
              f"新闻: {entry['news_count']}条 | {level_info}")
        print(f" 阶段: {entry['lifecycle_stage']} | 情绪: {entry.get('overall_sentiment', 'neutral')}")
        if entry.get('key_signals'):
            print(f" 信号: {'; '.join(entry['key_signals'][:2])}")
        print()
    print("✅ Analyst 模块测试完成")