Debug and modify the crawler

2025-11-12 19:16:50 +08:00
parent 675e6da7d7
commit e55a52f20b
27 changed files with 1023 additions and 601 deletions

View File

@@ -2,10 +2,10 @@
from typing import Callable, Dict, Optional, List, Any, Union
from abc import ABC, abstractmethod
import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, NavigableString
from loguru import logger
from pydantic import BaseModel, Field, HttpUrl
import json
class UrlConfig(BaseModel):
"""URL配置数据模型"""
@@ -49,6 +49,8 @@ class NewsItem(BaseModel):
author: Optional[str] = Field(default=None, description="作者")
source: Optional[str] = Field(default=None, description="来源")
category: Optional[str] = Field(default=None, description="分类")
executeStatus: Optional[int] = Field(default=0, description="执行状态")
executeMessage: Optional[str] = Field(default=None, description="执行消息")
class BaseCrawler(ABC):

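The two new fields let a parser report failures on the returned object instead of returning None. Below is a minimal sketch of how a caller might branch on them; the trimmed-down model is illustrative, only the field names are taken from the diff.

from typing import List, Optional
from pydantic import BaseModel, Field

class NewsItem(BaseModel):
    """Trimmed-down stand-in for the model in the diff (illustrative only)."""
    url: str
    title: str = ""
    contentRows: List[dict] = Field(default_factory=list)
    executeStatus: Optional[int] = Field(default=0, description="execution status")
    executeMessage: Optional[str] = Field(default=None, description="execution message")

def report(item: NewsItem) -> None:
    # In the diff, executeStatus == 1 marks a successful parse and 0 marks a failure.
    if item.executeStatus == 1:
        print(f"parsed ok: {item.title or item.url}")
    else:
        print(f"parse failed for {item.url}: {item.executeMessage}")

report(NewsItem(url="http://example.people.com.cn/demo.html",  # hypothetical URL
                executeStatus=0, executeMessage="unsupported news type"))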
View File

@@ -6,12 +6,15 @@ from loguru import logger
import re
import chardet
from datetime import datetime, timedelta
from bs4 import NavigableString
from urllib.parse import urlparse
import json
class RmrbCrawler(BaseCrawler):
"""人民日报新闻爬虫"""
def __init__(self):
"""初始化人民日报爬虫"""
config = CrawlerConfig(
base_url="http://www.people.com.cn",
@@ -62,6 +65,12 @@ class RmrbCrawler(BaseCrawler):
},
)
super().__init__(config)
self.detail_map = {
"gba": self.parse_base_news_detail,
"politics": self.parse_base_news_detail,
"finance": self.parse_base_news_detail,
"cpc": self.parse_cpc_news_detail,
}
def search(self, key: str, total: int, news_type: int = 0) -> ResultDomain:
"""
@@ -104,17 +113,25 @@ class RmrbCrawler(BaseCrawler):
records = response_json.get("data", {}).get("records", [])
for record in records:
news = self.parse_news_detail(record.get("url"))
if news['title'] == '':
news['title'] = record.get("title")
if news['contentRows'] == []:
news['contentRows'] = record.get("contentOriginal")
if news['publishTime'] == '':
news['publishTime'] = datetime.datetime.fromtimestamp(record.get("displayTime") / 1000).date()
if news['author'] == '':
news['author'] = record.get("author")
if news['source'] == '':
news['source'] = record.get("originName")
if news.title == '':
news.title = record.get("title")
if news.contentRows == []:
# If contentOriginal is a string, convert it to the list format
content_original = record.get("contentOriginal")
if isinstance(content_original, str):
news.contentRows = [{"type": "text", "content": content_original}]
elif isinstance(content_original, list):
news.contentRows = content_original
if not news.contentRows:
news.executeStatus = 1
news.executeMessage = "直接从接口响应获取"
if news.publishTime == '':
news.publishTime = str(datetime.fromtimestamp(record.get("displayTime", 0) / 1000).date())
if news.author == '':
news.author = record.get("author")
if news.source == '':
news.source = record.get("originName")
news_list.append(news)
else:
resultDomain.code = response_json.get("code")
@@ -259,6 +276,27 @@ class RmrbCrawler(BaseCrawler):
return resultDomain
def parse_news_detail(self, url: str) -> Optional[NewsItem]:
# Extract the category from the URL
netloc = urlparse(url).netloc
category = "gba"
if netloc.endswith('.people.com.cn'):
category = netloc.split('.')[0]
# Look up the corresponding parser function in detail_map
print(category)
parser_func = self.detail_map.get(category)
if parser_func is None:
logger.error(f"未找到对应解析器category={category}, url={url}")
return NewsItem(
url=url,
executeStatus=0,
executeMessage=f"不支持的新闻类型: {category}"
)
# Call the matching parser method (the entries in detail_map are instance methods already bound via self)
return parser_func(url)
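For reference, the category used for dispatch is just the left-most label of the people.com.cn host. A small sketch of that lookup with urlparse; the URLs below are made-up examples of the subdomain pattern, not taken from the commit.

from urllib.parse import urlparse

def category_from_url(url: str) -> str:
    # Default to "gba", mirroring the fallback in parse_news_detail above.
    netloc = urlparse(url).netloc
    if netloc.endswith(".people.com.cn"):
        return netloc.split(".")[0]
    return "gba"

# Hypothetical URLs, shown only to illustrate the subdomain -> category mapping.
print(category_from_url("http://politics.people.com.cn/n1/2025/0101/c1001-00000000.html"))  # politics
print(category_from_url("http://cpc.people.com.cn/n1/2025/0101/c64094-00000000.html"))      # cpc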
def parse_base_news_detail(self, url: str) -> Optional[NewsItem]:
"""
Parse a People's Daily news detail page
@@ -277,10 +315,14 @@ class RmrbCrawler(BaseCrawler):
publishTime="",
author="",
source="人民网",
category=""
category="",
executeStatus=1,
executeMessage="成功解析新闻"
)
if not response:
logger.error(f"获取响应失败: {url}")
news.executeStatus = 0
news.executeMessage = f"获取响应失败: {url}"
return news
# BeautifulSoup can auto-detect and decode the encoding, so the raw bytes can be passed in directly
@@ -288,18 +330,24 @@ class RmrbCrawler(BaseCrawler):
soup = self.parse_html(response.content)
if not soup:
logger.error("解析HTML失败")
news.executeStatus = 0
news.executeMessage = f"解析HTML失败"
return news
# Extract the main content area
main_div = soup.find("div", class_="layout rm_txt cf")
main_div = soup.select_one("div.layout.rm_txt.cf")
if not main_div:
logger.error("未找到主内容区域")
news.executeStatus = 0
news.executeMessage = f"未找到主内容区域"
return news
# Extract the article area
article_div = main_div.find("div", class_="col col-1")
article_div = main_div.select_one("div.col.col-1")
if not article_div:
logger.error("未找到文章区域")
news.executeStatus = 0
news.executeMessage = f"未找到文章区域"
return news
# Extract the title
@@ -380,4 +428,215 @@ class RmrbCrawler(BaseCrawler):
except Exception as e:
logger.error(f"解析新闻详情失败 [{url}]: {str(e)}")
return None
news.executeStatus = 0
news.executeMessage = f"解析新闻详情失败: {str(e)}"
return news
def parse_cpc_news_detail(self, url: str) -> Optional[NewsItem]:
"""
Parse a People's Daily news detail page (CPC channel)
"""
try:
response = self.fetch(url)
news = NewsItem(
title="",
contentRows=[],  # fix: use contentRows rather than content
url=url,
publishTime="",
author="",
source="人民网",
category="",
executeStatus=1,
executeMessage="成功解析新闻"
)
if not response:
logger.error(f"获取响应失败: {url}")
news.executeStatus = 0
news.executeMessage = f"获取响应失败: {url}"
return news
# BeautifulSoup can auto-detect and decode the encoding, so the raw bytes can be passed in directly
# It detects the encoding from the HTML <meta charset> tag or the response headers
soup = self.parse_html(response.content)
if not soup:
logger.error("解析HTML失败")
news.executeStatus = 0
news.executeMessage = f"解析HTML失败"
return news
# Extract the main content area
main_div = soup.select_one("div.text_con.text_con01")
if not main_div:
logger.error("未找到主内容区域")
news.executeStatus = 0
news.executeMessage = f"未找到主内容区域"
return news
# Extract the article area
article_div = main_div.select_one("div.text_c")
if not article_div:
logger.error("未找到文章区域")
news.executeStatus = 0
news.executeMessage = f"未找到文章区域"
return news
# Extract the title
title_tag = article_div.select_one("h1")
title = title_tag.get_text(strip=True) if title_tag else ""
# Extract the author
author_tag = article_div.select_one("div.author.cf")
author = author_tag.get_text(strip=True) if author_tag else ""
# Extract the publish time and source
channel_div = article_div.select_one("div.sou")
publish_time = ""
source = ""
if channel_div:
# Extract the time: take the first non-empty text node
for child in channel_div.children:
if isinstance(child, str) and child.strip():
publish_time = child.strip().split("来源:")[0].strip()
break
# Extract the source
a_tag = channel_div.find("a")
source = a_tag.get_text(strip=True) if a_tag else ""
# Clean up invisible whitespace
publish_time = publish_time.replace("\xa0", " ").replace(" ", " ").strip()
# Extract the body content
content_div = article_div.select_one('div.show_text')
contents = []  # build up rich-text content rows
pList = content_div.find_all("p")  # all <p> tags (currently unused)
# Turn the <p> tags into Quill-style rich text
# Iterate over the direct children of show_text (preserving order)
for child in content_div.children:
# Skip plain text nodes (e.g. newlines and spaces)
if isinstance(child, NavigableString):
continue
tag_name = child.name
if tag_name is None:
continue
# Case 1: check whether the child is a video container (based on id hints or inner structure)
video_tag = child.find('video') if tag_name != 'video' else child
if video_tag and video_tag.get('src'):
src = str(video_tag['src'])
p_style = video_tag.get("style", "")
if not src.startswith("http"):
src = self.config.base_url + src
contents.append({
"tag": "video",
"content": f"<video style='{p_style}' src='{src}'></video>"
})
continue
img_tag = child.find('img') if tag_name != 'img' else child
if img_tag and img_tag.get('src'):
src = str(img_tag['src'])
p_style = child.get("style", "")
if not src.startswith("http"):
src = self.config.base_url + src
contents.append({
"tag": "img",
"content": f"<img style='{p_style}' src='{src}' />"
})
continue
if tag_name == 'p':
p_style = child.get("style", "")
img_tag = child.find('img')
video_tag = child.find('video')
# Case 1: an <img> or <video> tag is present (static asset)
if img_tag or video_tag:
src = img_tag.get('src') if img_tag else video_tag.get('src')
if src:
src = str(src)
if not src.startswith(('http://', 'https://')):
src = self.config.base_url.rstrip('/') + '/' + src.lstrip('/')
tag_type = "img" if img_tag else "video"
if img_tag:
content_html = f"<img style='{p_style}' src='{src}' />"
else:
content_html = f"<video style='{p_style}' src='{src}' controls></video>"
contents.append({
"tag": tag_type,
"content": content_html
})
else:
# No src: treat it as a normal paragraph
contents.append({"tag": "p", "content": str(child)})
continue
# Case 2: check for the People's Daily Online showPlayer script (dynamic video)
script_tags = child.find_all('script', string=True)
video_src = None
poster_url = None
for script in script_tags:
script_text = script.string or ""
if "showPlayer" not in script_text:
continue
# Use regexes to extract src and posterUrl precisely (tolerating spaces and line breaks)
src_match = re.search(r"src\s*:\s*'([^']*)'", script_text)
poster_match = re.search(r"posterUrl\s*:\s*'([^']*)'", script_text)
if src_match:
video_src = src_match.group(1)
if poster_match:
poster_url = poster_match.group(1)
if video_src:
break  # stop once a video source is found
if video_src:
# Complete the URL (make sure it is absolute)
if not video_src.startswith(('http://', 'https://')):
video_src = self.config.base_url.rstrip('/') + '/' + video_src.lstrip('/')
if poster_url and not poster_url.startswith(('http://', 'https://')):
poster_url = self.config.base_url.rstrip('/') + '/' + poster_url.lstrip('/')
# Build the <video> tag attributes
attrs_parts = []
if p_style:
attrs_parts.append(f"style='{p_style}'")
if poster_url:
attrs_parts.append(f"poster='{poster_url}'")
attrs_parts.append("controls")
attrs = " ".join(attrs_parts)
contents.append({
"tag": "video",
"content": f"<video {attrs} src='{video_src}'></video>"
})
else:
# Plain paragraph text
contents.append({
"tag": "p",
"content": str(child)
})
continue
news.title = title
news.contentRows = contents  # fix: store the parsed rows in contentRows rather than content
news.url = url
news.publishTime = publish_time
news.author = author
news.source = source or "人民网"
news.category = ""
logger.info(f"成功解析新闻: {title}")
return news
except Exception as e:
logger.error(f"解析新闻详情失败 [{url}]: {str(e)}")

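The showPlayer branch above reduces to two regex searches over inline script text. A standalone sketch of that extraction, run against a made-up script snippet (the snippet is illustrative, not copied from people.com.cn):

import re
from typing import Optional, Tuple

def extract_show_player(script_text: str) -> Tuple[Optional[str], Optional[str]]:
    """Pull src and posterUrl out of a showPlayer(...) call, tolerating whitespace."""
    if "showPlayer" not in script_text:
        return None, None
    src_match = re.search(r"src\s*:\s*'([^']*)'", script_text)
    poster_match = re.search(r"posterUrl\s*:\s*'([^']*)'", script_text)
    return (src_match.group(1) if src_match else None,
            poster_match.group(1) if poster_match else None)

# Illustrative script body only; the real pages may differ.
sample = "showPlayer({ src : '/mediafile/video/demo.mp4',\n posterUrl: '/mediafile/pic/demo.jpg' });"
print(extract_show_player(sample))  # ('/mediafile/video/demo.mp4', '/mediafile/pic/demo.jpg')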
View File

@@ -51,7 +51,7 @@ def main():
"message": result.message,
"success": result.success,
"data": None,
"dataList": [item.dict() for item in result.dataList] if result.dataList else []
"dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
}
if output_file:

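The item.dict() → item.model_dump() change here (and in the other scripts below) follows the Pydantic v2 API, where dict() is deprecated in favor of model_dump(). A minimal sketch with a stand-in model:

from typing import Optional
from pydantic import BaseModel

class Item(BaseModel):  # stand-in model, not from the repo
    title: str
    source: Optional[str] = None

item = Item(title="demo")
print(item.model_dump())                   # {'title': 'demo', 'source': None}
print(item.model_dump(exclude_none=True))  # {'title': 'demo'}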
View File

@@ -81,20 +81,19 @@ def main():
try:
logger.info(f"开始搜索: 关键词='{key}', 数量={total}, 类型={news_type}")
crawler = RmrbCrawler()
# result = crawler.search(key=key.strip(), total=total, news_type=news_type)
result = crawler.search(key=key.strip(), total=total, news_type=news_type)
output = {
"code": result.code,
"message": result.message,
"success": result.success,
"data": None,
"dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
}
result = None
with open("../output/output.json", "r", encoding="utf-8") as f:
with open("F:\Project\schoolNews\schoolNewsCrawler\output\output.json", "r", encoding="utf-8") as f:
result = json.load(f)
print(result)
output = result
# output = {
# "code": result["code"],
# "message": result["message"],
# "success": result["success"],
# "data": None,
# "dataList": [item.model_dump() for item in result["dataList"]] if result["dataList"] else []
# }
if output_file:
output_path = Path(output_file)
output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -102,8 +101,11 @@ def main():
json.dump(output, f, ensure_ascii=False, indent=2)
logger.info(f"结果已保存到: {output_file}")
print(json.dumps(output, ensure_ascii=False, indent=2))
crawler.close()
# sys.exit(0 if result.success else 1)
# print(json.dumps(output, ensure_ascii=False, indent=2))
sys.exit(0 if result["success"] else 1)
except Exception as e:

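The hard-coded F:\... path above pins the debug run to one machine. A portable alternative (a sketch, assuming output/output.json lives one directory above the script, as the earlier relative path ../output/output.json implied) resolves the path from the script's own location:

import json
from pathlib import Path

# Assumed layout: <project>/<scripts dir>/<this file> and <project>/output/output.json.
output_path = Path(__file__).resolve().parent.parent / "output" / "output.json"
with output_path.open("r", encoding="utf-8") as f:
    result = json.load(f)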
View File

@@ -132,7 +132,7 @@ def main():
"message": result.message,
"success": result.success,
"data": None,
"dataList": [item.dict() for item in result.dataList] if result.dataList else []
"dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
}
# Save to file

File diff suppressed because one or more lines are too long