# MND-IA/skills/crawler.py
"""
Skill F: Crawler (Xinwen Lianbo scraper)
========================================
Role: fetch the daily transcript of Xinwen Lianbo (新闻联播, CCTV's daily news broadcast).
Data source: https://cn.govopendata.com/xinwenlianbo/
Output: a news list in the format required by Skill A, i.e. [{"title": "", "text": ""}]
"""
import cloudscraper
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from typing import List, Dict, Optional
from pathlib import Path
import json
import time
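# Illustrative output shape handed to Skill A (placeholder values, not real broadcast data):
# [
#     {"title": "<news headline>", "text": "<full transcript of the item>"},
#     ...
# ]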
class XinwenLianboCrawler:
"""
    Xinwen Lianbo crawler - fetches the daily broadcast transcript.

    Core features:
    1. Build the URL for the requested day automatically
    2. Bypass Cloudflare protection with cloudscraper
    3. Parse the individual news items
    4. Emit output in the format required by the Analyst
"""
def __init__(self, base_url: str = "https://cn.govopendata.com/xinwenlianbo/"):
"""
Args:
            base_url: base URL of the Xinwen Lianbo transcript site
"""
self.base_url = base_url
self.cache_dir = Path("data") / "news_cache"
self.cache_dir.mkdir(parents=True, exist_ok=True)
        # Create a cloudscraper session (emulates a desktop Chrome/Windows browser)
self.scraper = cloudscraper.create_scraper(
browser={
'browser': 'chrome',
'platform': 'windows',
'desktop': True
}
)
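        # Note: the scraper object is used like a requests.Session here, so the
        # .get() call and response handling below follow the requests API.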
def _build_url(self, date: Optional[str] = None) -> str:
"""
        Build the target URL.

        Args:
            date: date string (format: YYYYMMDD); defaults to today

        Returns:
            the full URL
"""
if date is None:
date = datetime.now().strftime("%Y%m%d")
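        # Illustrative example (hypothetical date): for date "20251229" this
        # returns "https://cn.govopendata.com/xinwenlianbo/20251229/".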
return f"{self.base_url}{date}/"
def crawl_xinwenlianbo(self, date: Optional[str] = None) -> List[Dict[str, str]]:
"""
        Crawl the Xinwen Lianbo transcript.

        Args:
            date: date string (YYYYMMDD); defaults to today

        Returns:
            news list [{"title": "", "text": ""}]
"""
url = self._build_url(date)
display_date = date if date else datetime.now().strftime("%Y%m%d")
        print(f"[Crawler] Fetching Xinwen Lianbo for {display_date}")
print(f"[Crawler] URL: {url}")
try:
            # Issue the request through cloudscraper
response = self.scraper.get(url, timeout=30)
response.raise_for_status()
response.encoding = 'utf-8'
            # Parse the HTML
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extract the news items
news_list = self._parse_content(soup)
if not news_list:
                print("[Crawler] No news content found; the site structure may have changed or today's transcript is not available yet")
                print(f"[Crawler] Page title: {soup.title.string if soup.title else 'No Title'}")
                # Fall back to cached data
cached = self._load_from_cache(display_date)
if cached:
                    print(f"[Crawler] Using cached data ({len(cached)} items)")
return cached
return []
            print(f"[Crawler] Extracted {len(news_list)} news items")
            # Cache the results
self._cache_news(news_list, display_date)
return news_list
except Exception as e:
            print(f"[Crawler] Crawl failed: {e}")
            # Fall back to cached data
cached = self._load_from_cache(display_date)
if cached:
                print(f"[Crawler] Using cached data ({len(cached)} items)")
return cached
return []
def _parse_content(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
"""
        Parse the page content and extract the news items.

        Site structure:
            - container: .content-section
            - heading:   .content-heading
            - body:      .content-body

        Args:
            soup: BeautifulSoup object

        Returns:
            news list
"""
news_list = []
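        # Assumed markup, sketched from the selectors above (illustrative only,
        # not verified against the live page):
        #   <div class="content-section">
        #     <h3 class="content-heading">Headline</h3>
        #     <div class="content-body"><p>Paragraph 1</p><p>Paragraph 2</p></div>
        #   </div>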
        # Find all news sections
sections = soup.select('.content-section')
if not sections:
            print("[Crawler] No .content-section elements found")
return []
for section in sections:
            # Extract the heading
title_tag = section.select_one('.content-heading')
if not title_tag:
continue
title = title_tag.get_text(strip=True)
            # Extract the body text
content_tag = section.select_one('.content-body')
if not content_tag:
continue
            # Prefer explicit paragraph tags so the text keeps its segmentation
paragraphs = content_tag.find_all('p')
if paragraphs:
                # Join multiple paragraphs with a single space
content = ' '.join([p.get_text(strip=True) for p in paragraphs])
else:
content = content_tag.get_text(strip=True)
            # Skip entries that are too short to be meaningful
if len(content) < 10:
continue
news_list.append({
"title": title,
"text": content
})
return news_list
    def _cache_news(self, news_list: List[Dict], date: str) -> None:
        """Cache the news list to a local JSON file."""
cache_file = self.cache_dir / f"xinwenlianbo_{date}.json"
try:
with open(cache_file, 'w', encoding='utf-8') as f:
json.dump({
"date": date,
"timestamp": datetime.now().isoformat(),
"count": len(news_list),
"news": news_list
}, f, ensure_ascii=False, indent=2)
            print(f"[Crawler] Cached to {cache_file}")
except Exception as e:
            print(f"[Crawler] Caching failed: {e}")
    def _load_from_cache(self, date: str) -> Optional[List[Dict]]:
        """Load cached news for the given date, if available."""
cache_file = self.cache_dir / f"xinwenlianbo_{date}.json"
if not cache_file.exists():
return None
try:
with open(cache_file, 'r', encoding='utf-8') as f:
data = json.load(f)
return data.get('news', [])
except Exception as e:
            print(f"[Crawler] Failed to read cache: {e}")
return None
def crawl_sync(
self,
date: Optional[str] = None
) -> List[Dict[str, str]]:
"""
        Synchronous crawl entry point (primary method).

        Args:
            date: date string (YYYYMMDD)

        Returns:
            news list
"""
return self.crawl_xinwenlianbo(date)
def crawl_date_range(
self,
start_date: str,
end_date: str
) -> Dict[str, List[Dict]]:
"""
        Crawl the news for every day in a date range.

        Args:
            start_date: start date (YYYYMMDD)
            end_date: end date (YYYYMMDD)

        Returns:
            dict of {date: news_list}
"""
start = datetime.strptime(start_date, "%Y%m%d")
end = datetime.strptime(end_date, "%Y%m%d")
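        # Illustrative call (hypothetical dates):
        #   crawler.crawl_date_range("20251225", "20251227")
        #   -> {"20251225": [...], "20251226": [...], "20251227": [...]}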
results = {}
current = start
while current <= end:
date_str = current.strftime("%Y%m%d")
            print(f"\n[Crawler] Processing {date_str}...")
news_list = self.crawl_sync(date_str)
results[date_str] = news_list
current += timedelta(days=1)
            # Polite delay to avoid hammering the site
time.sleep(2)
return results
# ==================== Test code ====================
def test_sync():
    """Synchronous smoke test (for running directly from the command line)."""
print("=" * 50)
    print("Skill F: Crawler - Xinwen Lianbo scraper test")
print("=" * 50)
crawler = XinwenLianboCrawler()
    # Crawl today's news synchronously
    print("\nTest 1: crawl today's news")
news_list = crawler.crawl_sync()
    print(f"\nFetched {len(news_list)} news items")
if news_list:
        print("\nPreview of the first 3 items:")
for i, news in enumerate(news_list[:3], 1):
print(f"\n[{i}] {news['title']}")
print(f" {news['text'][:150]}...")
    # Test a specific date
    print("\n" + "=" * 50)
    print("Test 2: crawl a specific date (20251229)")
specific_news = crawler.crawl_sync(date="20251229")
    print(f"Fetched {len(specific_news)} news items")
    print("\n✅ Crawler module test complete")
return news_list
if __name__ == "__main__":
test_sync()
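# Integration sketch (assumptions: the module is importable as skills.crawler and a
# downstream Skill A object exposes analyze(news_list); both names are hypothetical):
#
#     from skills.crawler import XinwenLianboCrawler
#
#     crawler = XinwenLianboCrawler()
#     news = crawler.crawl_sync()        # today's transcript
#     report = analyst.analyze(news)     # hand off to Skill A (hypothetical API)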