Commit 3d742bf322 (parent 81ec0f0fc9), 2025-11-10 19:13:54 +08:00
7 changed files with 364 additions and 90 deletions

View File

@@ -2,7 +2,7 @@ from pydantic import BaseModel, Field, HttpUrl
from typing import Any, List, Optional
class ResultDomain(BaseModel):
code: int = Field(..., description="状态码")
code: int = Field(..., description="状态码",)
message: str = Field(..., description="消息")
success: bool = Field(..., description="是否成功")
data: Optional[Any] = Field(default=None, description="数据")
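Since code, message and success are declared with Field(...), they are required fields, so a bare ResultDomain() no longer validates; the hunks further down switch every such call to an explicit construction. A minimal sketch of the new pattern (the dataList attribute is assumed to be declared on the model, since the crawler code assigns it):

from core.ResultDomain import ResultDomain

result = ResultDomain(code=0, message="", success=True)  # required fields passed explicitly
result.dataList = []  # assumed field; the crawlers fill it with NewsItem entries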

View File

View File

@@ -1,5 +1,5 @@
# Base crawler class definition
from typing import Dict, Optional, List, Any
from typing import Callable, Dict, Optional, List, Any, Union
from abc import ABC, abstractmethod
import requests
from bs4 import BeautifulSoup
@@ -12,7 +12,7 @@ class UrlConfig(BaseModel):
url: str = Field(..., description="请求URL")
params: Optional[Dict[str, Any]] = Field(default=None, description="请求参数")
method: str = Field(default="GET", description="请求方法")
headers: Optional[Dict[str, str]] = Field(default=None, description="请求头")
class Config:
# 允许任意类型
arbitrary_types_allowed = True
@@ -123,15 +123,6 @@ class BaseCrawler(ABC):
logger.error(f"HTML解析失败: {str(e)}")
return None
@abstractmethod
def crawl(self) -> List[NewsItem]:
"""
爬取新闻(子类必须实现)
Returns:
新闻列表
"""
pass
@abstractmethod
def parse_news_detail(self, url: str) -> Optional[NewsItem]:

View File

@@ -1,11 +1,11 @@
# People's Daily (Renmin Ribao) crawler
from typing import List, Optional
from core import ResultDomain
from core.ResultDomain import ResultDomain
from crawler.BaseCrawler import BaseCrawler, CrawlerConfig, NewsItem, UrlConfig
from loguru import logger
import re
import chardet
from datetime import datetime
from datetime import datetime, timedelta
class RmrbCrawler(BaseCrawler):
@@ -50,7 +50,7 @@ class RmrbCrawler(BaseCrawler):
}
),
"one_day_trending_news": UrlConfig(
url=lambda date: f"http://www.people.com.cn/GB/59476/review/{date}.html", # date:YYYYMMdd
url= "http://www.people.com.cn/GB/59476/review/{date}.html", # date:YYYYMMdd
method="GET",
params={},
headers={
@@ -63,7 +63,7 @@ class RmrbCrawler(BaseCrawler):
)
super().__init__(config)
def search(self, key: str, total: int = 10, news_type: int = 0) -> ResultDomain:
def search(self, key: str, total: int, news_type: int = 0) -> ResultDomain:
"""
Search People's Daily news
@@ -76,7 +76,7 @@ class RmrbCrawler(BaseCrawler):
list of news items
"""
try:
resultDomain = ResultDomain()
resultDomain = ResultDomain(code=0, message="", success=True)
news_list = []
resultDomain.dataList = news_list
# 获取搜索配置
@@ -98,7 +98,7 @@ class RmrbCrawler(BaseCrawler):
while len(news_list) < total:
search_data["page"] = page
response = self.fetch(search_config.url, method=search_config.method, data=search_data, headers=search_config.headers)
response = self.fetch(search_config.url, method=search_config.method, json=search_data, headers=search_config.headers)
response_json = response.json()
if response_json.get("code") == 0:
records = response_json.get("data", {}).get("records", [])
@@ -130,7 +130,7 @@ class RmrbCrawler(BaseCrawler):
"""
try:
hot_point_rank_config = self.config.urls.get("hot_point_rank")
resultDomain = ResultDomain()
resultDomain = ResultDomain(code=0, message="", success=True)
news_list = []
resultDomain.dataList = news_list
@@ -169,7 +169,7 @@ class RmrbCrawler(BaseCrawler):
Fetch People's Daily trending news for a single day
"""
try:
resultDomain = ResultDomain()
resultDomain = ResultDomain(code=0, message="", success=True)
news_list = []
resultDomain.dataList = news_list
resultDomain.success = True
@@ -177,7 +177,8 @@ class RmrbCrawler(BaseCrawler):
logger.info(f"获取人民日报一天内的热点新闻成功")
date_str = date.strftime("%Y%m%d")
one_day_trending_news_config = self.config.urls.get("one_day_trending_news")
one_day_trending_news_config.url = one_day_trending_news_config.url(date_str)
one_day_trending_news_config.url = one_day_trending_news_config.url.format(date=date_str)
response = self.fetch(one_day_trending_news_config.url, method=one_day_trending_news_config.method, headers=one_day_trending_news_config.headers)
if not response:
logger.error(f"获取响应失败: {one_day_trending_news_config.url}")
@@ -194,12 +195,12 @@ class RmrbCrawler(BaseCrawler):
return resultDomain
all_doc_urls = []
all_doc_urls.extend(a_tags)
bg01 = soup.find('td', class_="bg01")
indexfont13 = bg01.find('td', class_='indexfont13')
# collect all <a> tags under that td
a_tags = indexfont13.find_all('a')
all_doc_urls.extend(a_tags)
bg02 = soup.find('td', class_="bg02")
p6 = bg02.find('td', class_='p6')
@@ -223,19 +224,21 @@ class RmrbCrawler(BaseCrawler):
Fetch People's Daily trending news over a range of days
"""
try:
resultDomain = ResultDomain()
resultDomain = ResultDomain(code=0, message="", success=True)
news_list = []
resultDomain.dataList = news_list
resultDomain.success = True
resultDomain.code = 0
resultDomain.message = "Fetched multi-day trending news successfully"
for date in range(start_date, end_date):
resultDomain = self.getOneDayTrendingNews(date)
if not resultDomain.success:
continue
news_list.extend(resultDomain.dataList)
logger.info(f"获取人民日报多天内的热点新闻成功")
current_date = start_date
while current_date <= end_date:
day_result = self.getOneDayTrendingNews(current_date)
if day_result.success and day_result.dataList:
news_list.extend(day_result.dataList)
current_date += timedelta(days=1)
logger.info(f"获取人民日报多天内的热点新闻成功,共 {len(news_list)}")
return resultDomain
except Exception as e:
logger.error(f"获取人民日报多天内的热点新闻失败: {str(e)}")
@@ -322,14 +325,18 @@ class RmrbCrawler(BaseCrawler):
if p.find('img'):
tag = "img"
src = p.find('img').get('src')
if not src.startswith("http") and src:
if src:
src = str(src)  # cast to string
if not src.startswith("http"):
src = self.config.base_url + src
content = f"<img style='{p_style}' src='{src}' />"
elif p.find('video'):
tag = "video"
src = p.find('video').get('src')
if not src.startswith("http") and src:
if src:
src = str(src)  # cast to string
if not src.startswith("http"):
src = self.config.base_url + src
content = f"<video style='align-items: center;' src='{src}' />"
else:
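For reference, the relative-to-absolute URL handling in the hunk above can also be expressed with urllib.parse.urljoin from the standard library; a self-contained sketch (absolutize is a hypothetical helper, not part of this commit):

from urllib.parse import urljoin

def absolutize(base_url: str, src) -> str:
    # leaves absolute URLs untouched, prefixes relative ones with base_url
    return urljoin(base_url, str(src)) if src else ""

# e.g. absolutize("http://www.people.com.cn", "/NMediaFile/pic.jpg")
# -> "http://www.people.com.cn/NMediaFile/pic.jpg"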
@@ -356,60 +363,3 @@ class RmrbCrawler(BaseCrawler):
except Exception as e:
logger.error(f"解析新闻详情失败 [{url}]: {str(e)}")
return None
"""
Parse a People's Daily news detail page and save it as an HTML file (UTF-8 encoded)
Args:
url: news detail page URL
output_file: output file path, defaults to "crawler/response.html"
"""
try:
response = self.fetch(url)
if not response:
logger.error(f"获取响应失败: {url}")
return
# BeautifulSoup can detect and decode the encoding on its own, so the raw bytes are passed in directly
# it picks the encoding up from the HTML <meta charset> tag or the response headers
soup = self.parse_html(response.content)
if not soup:
logger.error("解析HTML失败")
return
# save as a UTF-8 encoded file (BeautifulSoup has already decoded it to a Unicode string)
with open(output_file, "w", encoding="utf-8") as f:
f.write(soup.prettify())
logger.info(f"成功保存HTML文件: {output_file}")
except Exception as e:
logger.error(f"解析并保存新闻详情失败 [{url}]: {str(e)}")
import traceback
logger.error(traceback.format_exc())
"""
Parse a time string
Args:
time_text: raw time text
Returns:
normalized time string
"""
try:
# try to match common time formats
patterns = [
r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})',
r'(\d{4})年(\d{2})月(\d{2})日\s+(\d{2}):(\d{2})',
r'(\d{4})/(\d{2})/(\d{2})\s+(\d{2}):(\d{2})',
]
for pattern in patterns:
match = re.search(pattern, time_text)
if match:
return time_text
return time_text
except Exception as e:
logger.warning(f"时间解析失败: {str(e)}")
return None

View File

@@ -0,0 +1,72 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
People's Daily hot-point ranking crawler, command-line tool
Usage: python RmrbHotPoint.py
"""
import argparse
import json
import sys
from pathlib import Path
# Add parent directory to path to import crawler
sys.path.insert(0, str(Path(__file__).parent.parent))
from crawler.RmrbCrawler import RmrbCrawler
from loguru import logger
def main():
"""主函数"""
parser = argparse.ArgumentParser(
description="People's Daily hot-point ranking tool",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python RmrbHotPoint.py
"""
)
args = parser.parse_args()
try:
# create the crawler instance
logger.info("Fetching People's Daily hot-point ranking")
crawler = RmrbCrawler()
# fetch the hot-point ranking
result = crawler.hotPointRank()
# emit the result as JSON
output = {
"code": result.code,
"message": result.message,
"success": result.success,
"data": None,
"dataList": [item.dict() for item in result.dataList] if result.dataList else []
}
print(json.dumps(output, ensure_ascii=False, indent=2))
# close the crawler
crawler.close()
# exit code: 0 on success, 1 on failure
sys.exit(0 if result.success else 1)
except Exception as e:
logger.error(f"执行失败: {str(e)}")
error_output = {
"code": 500,
"message": f"执行失败: {str(e)}",
"success": False,
"data": None,
"dataList": []
}
print(json.dumps(error_output, ensure_ascii=False, indent=2))
sys.exit(1)
if __name__ == "__main__":
main()
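The CLI above is a thin wrapper; the same ranking can be fetched programmatically with the calls it uses. A minimal sketch:

from crawler.RmrbCrawler import RmrbCrawler

crawler = RmrbCrawler()
result = crawler.hotPointRank()  # returns a ResultDomain envelope
if result.success and result.dataList:
    for item in result.dataList:
        print(item.dict())  # each entry is a NewsItem
crawler.close()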

View File

@@ -0,0 +1,103 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
People's Daily search crawler, command-line tool
Usage: python RmrbSearch.py --key "keyword" --total 10 --type 0
"""
import argparse
import json
import sys
from pathlib import Path
# Add parent directory to path to import crawler
sys.path.insert(0, str(Path(__file__).parent.parent))
from crawler.RmrbCrawler import RmrbCrawler
from loguru import logger
def main():
"""主函数"""
parser = argparse.ArgumentParser(
description="People's Daily news search tool",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python RmrbSearch.py --key "教育改革" --total 20
python RmrbSearch.py -k "科技创新" -t 15 -n 1
News type codes:
0 - all types (default)
1 - news
2 - interactive
3 - newspapers
4 - images
5 - video
"""
)
parser.add_argument(
'--key', '-k',
type=str,
required=True,
help='search keyword (required)'
)
parser.add_argument(
'--total', '-t',
type=int,
default=10,
help='total number of news items to fetch (default: 10)'
)
parser.add_argument(
'--type', '-n',
type=int,
default=0,
choices=[0, 1, 2, 3, 4, 5],
help='news type: 0=all, 1=news, 2=interactive, 3=newspapers, 4=images, 5=video (default: 0)'
)
args = parser.parse_args()
try:
# create the crawler instance
logger.info(f"Starting search: key='{args.key}', total={args.total}, type={args.type}")
crawler = RmrbCrawler()
# run the search
result = crawler.search(key=args.key, total=args.total, news_type=args.type)
# emit the result as JSON
output = {
"code": result.code,
"message": result.message,
"success": result.success,
"data": None,
"dataList": [item.dict() for item in result.dataList] if result.dataList else []
}
print(json.dumps(output, ensure_ascii=False, indent=2))
# close the crawler
crawler.close()
# exit code: 0 on success, 1 on failure
sys.exit(0 if result.success else 1)
except Exception as e:
logger.error(f"执行失败: {str(e)}")
error_output = {
"code": 500,
"message": f"执行失败: {str(e)}",
"success": False,
"data": None,
"dataList": []
}
print(json.dumps(error_output, ensure_ascii=False, indent=2))
sys.exit(1)
if __name__ == "__main__":
main()
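Note that this commit removes the default value of total in RmrbCrawler.search (it was 10), so programmatic callers must now pass it explicitly; a minimal sketch:

from crawler.RmrbCrawler import RmrbCrawler

crawler = RmrbCrawler()
result = crawler.search(key="教育改革", total=10, news_type=0)  # total no longer has a default
if result.success and result.dataList:
    print(len(result.dataList))
crawler.close()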

View File

@@ -0,0 +1,158 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
People's Daily trending news crawler, command-line tool
Usage:
python RmrbTrending.py --date 20250110
python RmrbTrending.py --start-date 20250101 --end-date 20250110
"""
import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
# Add parent directory to path to import crawler
sys.path.insert(0, str(Path(__file__).parent.parent))
from crawler.RmrbCrawler import RmrbCrawler
from loguru import logger
def parse_date(date_str: str) -> datetime:
"""
Parse a date string into a datetime object
Args:
date_str: date string in YYYYMMDD format
Returns:
a datetime object
"""
try:
return datetime.strptime(date_str, "%Y%m%d")
except ValueError:
raise ValueError(f"日期格式错误: {date_str}正确格式为YYYYMMDD例如: 20250110")
def main():
"""主函数"""
parser = argparse.ArgumentParser(
description="People's Daily trending news tool",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# fetch trending news for a single day
python RmrbTrending.py --date 20250110
python RmrbTrending.py -d 20250110
# fetch trending news for a date range
python RmrbTrending.py --start-date 20250101 --end-date 20250110
python RmrbTrending.py -s 20250101 -e 20250110
# with no date given, fetch today's trending news
python RmrbTrending.py
"""
)
parser.add_argument(
'--date', '-d',
type=str,
help='target date (format: YYYYMMDD, e.g. 20250110)'
)
parser.add_argument(
'--start-date', '-s',
type=str,
help='start date (format: YYYYMMDD, use together with --end-date)'
)
parser.add_argument(
'--end-date', '-e',
type=str,
help='end date (format: YYYYMMDD, use together with --start-date)'
)
args = parser.parse_args()
try:
# create the crawler instance
crawler = RmrbCrawler()
# decide which mode to use
if args.date:
# single-day mode
if args.start_date or args.end_date:
raise ValueError("--date cannot be combined with --start-date/--end-date")
target_date = parse_date(args.date)
logger.info(f"Fetching single-day trending news: {args.date}")
result = crawler.getOneDayTrendingNews(target_date)
elif args.start_date and args.end_date:
# date-range mode
start_date = parse_date(args.start_date)
end_date = parse_date(args.end_date)
if start_date > end_date:
raise ValueError("start date must not be later than end date")
logger.info(f"Fetching trending news for date range: {args.start_date} to {args.end_date}")
result = crawler.getDaysTrendingNews(start_date, end_date)
elif args.start_date or args.end_date:
# only one of the two dates was provided
raise ValueError("--start-date and --end-date must be used together")
else:
# default to today's date
today = datetime.now()
today_str = today.strftime("%Y%m%d")
logger.info(f"Fetching today's trending news: {today_str}")
result = crawler.getOneDayTrendingNews(today)
# emit the result as JSON
output = {
"code": result.code,
"message": result.message,
"success": result.success,
"data": None,
"dataList": [item.dict() for item in result.dataList] if result.dataList else []
}
print(json.dumps(output, ensure_ascii=False, indent=2))
# close the crawler
crawler.close()
# exit code: 0 on success, 1 on failure
sys.exit(0 if result.success else 1)
except ValueError as e:
logger.error(f"参数错误: {str(e)}")
error_output = {
"code": 400,
"message": f"参数错误: {str(e)}",
"success": False,
"data": None,
"dataList": []
}
print(json.dumps(error_output, ensure_ascii=False, indent=2))
sys.exit(1)
except Exception as e:
logger.error(f"执行失败: {str(e)}")
error_output = {
"code": 500,
"message": f"执行失败: {str(e)}",
"success": False,
"data": None,
"dataList": []
}
print(json.dumps(error_output, ensure_ascii=False, indent=2))
sys.exit(1)
if __name__ == "__main__":
main()
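The --start-date/--end-date path maps onto getDaysTrendingNews, which this commit rewrites to walk the range day by day with timedelta; a minimal programmatic sketch of the same call:

from datetime import datetime
from crawler.RmrbCrawler import RmrbCrawler

crawler = RmrbCrawler()
start = datetime.strptime("20250101", "%Y%m%d")
end = datetime.strptime("20250110", "%Y%m%d")
result = crawler.getDaysTrendingNews(start, end)  # iterates from start to end inclusive
print(result.code, result.message, len(result.dataList or []))
crawler.close()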