"""
|
||
Skill F: Crawler (新闻联播爬虫)
|
||
================================
|
||
职能:每日抓取新闻联播文字稿
|
||
|
||
数据源:https://cn.govopendata.com/xinwenlianbo/
|
||
输出:符合 Skill A 要求的新闻列表 [{"title": "", "text": ""}]
|
||
"""
|
||
|
||
import cloudscraper
from bs4 import BeautifulSoup
from datetime import datetime
from typing import List, Dict, Optional
from pathlib import Path
import json
import time


class XinwenLianboCrawler:
    """
    Xinwen Lianbo crawler - fetches the daily news broadcast transcript.

    Core features:
    1. Build the URL for the requested day automatically
    2. Use cloudscraper to get past the Cloudflare protection
    3. Parse the individual news items
    4. Output in the format required by the Analyst
    """

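    # Minimal usage sketch (the date below is only a placeholder):
    #
    #   crawler = XinwenLianboCrawler()
    #   news = crawler.crawl_sync("20240101")
    #   # news == [{"title": "...", "text": "..."}, ...]
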
    def __init__(self, base_url: str = "https://cn.govopendata.com/xinwenlianbo/"):
        """
        Args:
            base_url: base URL of the Xinwen Lianbo transcript site
        """
        self.base_url = base_url
        self.cache_dir = Path("data") / "news_cache"
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Create a cloudscraper session
        self.scraper = cloudscraper.create_scraper(
            browser={
                'browser': 'chrome',
                'platform': 'windows',
                'desktop': True
            }
        )

    def _build_url(self, date: Optional[str] = None) -> str:
        """
        Build the target URL.

        Args:
            date: date string (format: YYYYMMDD); defaults to today

        Returns:
            The full URL.
        """
        if date is None:
            date = datetime.now().strftime("%Y%m%d")

        return f"{self.base_url}{date}/"

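    # Example of the URL scheme this builds (the date is a placeholder):
    #   _build_url("20240101")
    #   -> "https://cn.govopendata.com/xinwenlianbo/20240101/"
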
    def crawl_xinwenlianbo(self, date: Optional[str] = None) -> List[Dict[str, str]]:
        """
        Crawl one day's Xinwen Lianbo transcript.

        Args:
            date: date string (YYYYMMDD); defaults to today

        Returns:
            News list in the form [{"title": "", "text": ""}]
        """
        url = self._build_url(date)
        display_date = date if date else datetime.now().strftime("%Y%m%d")

        print(f"[Crawler] Fetching Xinwen Lianbo for {display_date}")
        print(f"[Crawler] URL: {url}")

        try:
            # Issue the request through cloudscraper
            response = self.scraper.get(url, timeout=30)
            response.raise_for_status()
            response.encoding = 'utf-8'

            # Parse the HTML
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract the news items
            news_list = self._parse_content(soup)

            if not news_list:
                print("[Crawler] No news found; the site layout may have changed or today's page is not up yet")
                print(f"[Crawler] Page title: {soup.title.string if soup.title else 'No Title'}")

                # Fall back to the local cache
                cached = self._load_from_cache(display_date)
                if cached:
                    print(f"[Crawler] Using cached data ({len(cached)} items)")
                    return cached

                return []

            print(f"[Crawler] Extracted {len(news_list)} news items")

            # Cache the result
            self._cache_news(news_list, display_date)

            return news_list

        except Exception as e:
            print(f"[Crawler] Crawl failed: {e}")

            # Fall back to the local cache
            cached = self._load_from_cache(display_date)
            if cached:
                print(f"[Crawler] Using cached data ({len(cached)} items)")
                return cached

            return []

    def _parse_content(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
        """
        Parse the page and extract the news items.

        Site structure:
        - container: .content-section
        - title:     .content-heading
        - body:      .content-body

        Args:
            soup: BeautifulSoup object

        Returns:
            News list.
        """
        news_list = []

        # Find every news section
        sections = soup.select('.content-section')

        if not sections:
            print("[Crawler] No .content-section elements found")
            return []

        for section in sections:
            # Extract the title
            title_tag = section.select_one('.content-heading')
            if not title_tag:
                continue

            title = title_tag.get_text(strip=True)

            # Extract the body
            content_tag = section.select_one('.content-body')
            if not content_tag:
                continue

            # Prefer individual paragraphs to keep some structure
            paragraphs = content_tag.find_all('p')
            if paragraphs:
                # Join multiple paragraphs with a space
                content = ' '.join([p.get_text(strip=True) for p in paragraphs])
            else:
                content = content_tag.get_text(strip=True)

            # Skip entries that are too short
            if len(content) < 10:
                continue

            news_list.append({
                "title": title,
                "text": content
            })

        return news_list

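    # The selectors above assume markup roughly like the following (an
    # illustrative sketch, not a verbatim copy of the site's HTML):
    #
    #   <div class="content-section">
    #     <h2 class="content-heading">News title</h2>
    #     <div class="content-body"><p>Paragraph 1</p><p>Paragraph 2</p></div>
    #   </div>
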
    def _cache_news(self, news_list: List[Dict], date: str) -> None:
        """Cache the news list locally."""
        cache_file = self.cache_dir / f"xinwenlianbo_{date}.json"

        try:
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump({
                    "date": date,
                    "timestamp": datetime.now().isoformat(),
                    "count": len(news_list),
                    "news": news_list
                }, f, ensure_ascii=False, indent=2)

            print(f"[Crawler] Cached to {cache_file}")

        except Exception as e:
            print(f"[Crawler] Caching failed: {e}")

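    # Shape of the cache file written above (values are illustrative):
    #
    #   {
    #     "date": "20240101",
    #     "timestamp": "2024-01-01T20:30:00",
    #     "count": 1,
    #     "news": [{"title": "...", "text": "..."}]
    #   }
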
    def _load_from_cache(self, date: str) -> Optional[List[Dict]]:
        """Load news from the local cache."""
        cache_file = self.cache_dir / f"xinwenlianbo_{date}.json"

        if not cache_file.exists():
            return None

        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            return data.get('news', [])

        except Exception as e:
            print(f"[Crawler] Failed to read cache: {e}")
            return None

    def crawl_sync(
        self,
        date: Optional[str] = None
    ) -> List[Dict[str, str]]:
        """
        Synchronous crawl interface (main entry point).

        Args:
            date: date string (YYYYMMDD)

        Returns:
            News list.
        """
        return self.crawl_xinwenlianbo(date)

    def crawl_date_range(
        self,
        start_date: str,
        end_date: str
    ) -> Dict[str, List[Dict]]:
        """
        Crawl every day in a date range.

        Args:
            start_date: start date (YYYYMMDD)
            end_date: end date (YYYYMMDD)

        Returns:
            Dict mapping {date: news_list}.
        """
        from datetime import timedelta

        start = datetime.strptime(start_date, "%Y%m%d")
        end = datetime.strptime(end_date, "%Y%m%d")

        results = {}
        current = start

        while current <= end:
            date_str = current.strftime("%Y%m%d")
            print(f"\n[Crawler] Processing {date_str}...")

            news_list = self.crawl_sync(date_str)
            results[date_str] = news_list

            current += timedelta(days=1)

            # Polite delay to avoid hammering the site
            time.sleep(2)

        return results


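# Usage sketch for batch crawling (the dates are placeholders); each day's
# request is separated by the polite 2-second delay above:
#
#   crawler = XinwenLianboCrawler()
#   results = crawler.crawl_date_range("20240101", "20240107")
#   # results == {"20240101": [...], "20240102": [...], ...}
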
# ==================== Test code ====================


def test_sync():
    """Synchronous test (for running directly from the command line)."""
    print("=" * 50)
    print("Skill F: Crawler Xinwen Lianbo crawler test")
    print("=" * 50)

    crawler = XinwenLianboCrawler()

    # Crawl today's news synchronously
    print("\nTest 1: crawl today's news")
    news_list = crawler.crawl_sync()

    print(f"\nGot {len(news_list)} news items")

    if news_list:
        print("\nPreview of the first 3 items:")
        for i, news in enumerate(news_list[:3], 1):
            print(f"\n[{i}] {news['title']}")
            print(f"    {news['text'][:150]}...")

    # Test a specific date
    print("\n" + "=" * 50)
    print("Test 2: crawl a specific date (20251229)")
    specific_news = crawler.crawl_sync(date="20251229")
    print(f"Got {len(specific_news)} news items")

    print("\n✅ Crawler module test complete")
    return news_list


if __name__ == "__main__":
    test_sync()