Debug and tweak the crawler
@@ -2,10 +2,10 @@
from typing import Callable, Dict, Optional, List, Any, Union
from abc import ABC, abstractmethod
import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, NavigableString
from loguru import logger
from pydantic import BaseModel, Field, HttpUrl

import json

class UrlConfig(BaseModel):
    """URL configuration data model"""
@@ -49,6 +49,8 @@ class NewsItem(BaseModel):
    author: Optional[str] = Field(default=None, description="作者")
    source: Optional[str] = Field(default=None, description="来源")
    category: Optional[str] = Field(default=None, description="分类")
    executeStatus: Optional[int] = Field(default=0, description="执行状态")
    executeMessage: Optional[str] = Field(default=None, description="执行消息")


class BaseCrawler(ABC):
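
The two new fields give each NewsItem a per-item execution status: elsewhere in this diff, executeStatus=1 marks a successfully handled item and executeStatus=0 a failure, with executeMessage holding the reason. A trimmed-down sketch of that usage, assuming pydantic v2; the shortened model and the URL are made up for illustration:

from typing import Optional
from pydantic import BaseModel, Field

class NewsItem(BaseModel):
    url: str
    title: str = ""
    executeStatus: Optional[int] = Field(default=0, description="执行状态")
    executeMessage: Optional[str] = Field(default=None, description="执行消息")

# A parser can still return a NewsItem on failure, carrying the reason with it:
failed = NewsItem(
    url="http://politics.people.com.cn/some-article.html",
    executeStatus=0,
    executeMessage="获取响应失败",
)
print(failed.model_dump())
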
@@ -6,12 +6,15 @@ from loguru import logger
import re
import chardet
from datetime import datetime, timedelta

from bs4 import NavigableString
from urllib.parse import urlparse
import json


class RmrbCrawler(BaseCrawler):
    """People's Daily news crawler"""

    def __init__(self):
        """Initialize the People's Daily crawler"""
        config = CrawlerConfig(
            base_url="http://www.people.com.cn",
@@ -62,6 +65,12 @@ class RmrbCrawler(BaseCrawler):
            },
        )
        super().__init__(config)
        self.detail_map = {
            "gba": self.parse_base_news_detail,
            "politics": self.parse_base_news_detail,
            "finance": self.parse_base_news_detail,
            "cpc": self.parse_cpc_news_detail,
        }

    def search(self, key: str, total: int, news_type: int = 0) -> ResultDomain:
        """
@@ -104,17 +113,25 @@ class RmrbCrawler(BaseCrawler):
            records = response_json.get("data", {}).get("records", [])
            for record in records:
                news = self.parse_news_detail(record.get("url"))
                if news['title'] == '':
                    news['title'] = record.get("title")
                if news['contentRows'] == []:
                    news['contentRows'] = record.get("contentOriginal")
                if news['publishTime'] == '':
                    news['publishTime'] = datetime.datetime.fromtimestamp(record.get("displayTime") / 1000).date()
                if news['author'] == '':
                    news['author'] = record.get("author")
                if news['source'] == '':
                    news['source'] = record.get("originName")

                if news.title == '':
                    news.title = record.get("title")
                if news.contentRows == []:
                    # If contentOriginal is a string, convert it to the list format
                    content_original = record.get("contentOriginal")
                    if isinstance(content_original, str):
                        news.contentRows = [{"type": "text", "content": content_original}]
                    elif isinstance(content_original, list):
                        news.contentRows = content_original
                    if not news.contentRows:
                        news.executeStatus = 1
                        news.executeMessage = "直接从接口响应获取"
                if news.publishTime == '':
                    news.publishTime = str(datetime.fromtimestamp(record.get("displayTime", 0) / 1000).date())
                if news.author == '':
                    news.author = record.get("author")
                if news.source == '':
                    news.source = record.get("originName")

                news_list.append(news)
        else:
            resultDomain.code = response_json.get("code")
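
A standalone sketch of the fallback logic above: contentOriginal may arrive as a bare string or as a list of rows, and displayTime is a millisecond timestamp, hence the division by 1000 (the sample values are made up):

from datetime import datetime

def normalize_content(content_original):
    # A bare string becomes a single text row; a list is used as-is; anything else yields [].
    if isinstance(content_original, str):
        return [{"type": "text", "content": content_original}]
    if isinstance(content_original, list):
        return content_original
    return []

print(normalize_content("纯文本正文"))
print(normalize_content([{"type": "text", "content": "已是列表"}]))
# displayTime is in milliseconds, hence the division by 1000 before fromtimestamp()
print(str(datetime.fromtimestamp(1700000000000 / 1000).date()))
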
@@ -259,6 +276,27 @@ class RmrbCrawler(BaseCrawler):
        return resultDomain

    def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        # Extract the category from the URL
        netloc = urlparse(url).netloc
        category = "gba"
        if netloc.endswith('.people.com.cn'):
            category = netloc.split('.')[0]
        # Look up the matching parser function in detail_map
        print(category)
        parser_func = self.detail_map.get(category)

        if parser_func is None:
            logger.error(f"未找到对应解析器,category={category}, url={url}")
            return NewsItem(
                url=url,
                executeStatus=0,
                executeMessage=f"不支持的新闻类型: {category}"
            )

        # Call the matching parse method (note: these are instance methods, called via self)
        return parser_func(url)

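A standalone sketch of the subdomain-to-category lookup used by parse_news_detail, runnable without the crawler classes (the example URLs are made up):

from urllib.parse import urlparse

def category_from_url(url: str, default: str = "gba") -> str:
    # For hosts like politics.people.com.cn the first label is the channel name.
    netloc = urlparse(url).netloc
    if netloc.endswith(".people.com.cn"):
        return netloc.split(".")[0]
    return default

print(category_from_url("http://politics.people.com.cn/n1/2024/0101/c1001-00000000.html"))  # politics
print(category_from_url("http://cpc.people.com.cn/n1/2024/0101/c64094-00000000.html"))      # cpc
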
    def parse_base_news_detail(self, url: str) -> Optional[NewsItem]:
        """
        Parse a People's Daily news detail page

@@ -277,10 +315,14 @@ class RmrbCrawler(BaseCrawler):
            publishTime="",
            author="",
            source="人民网",
            category=""
            category="",
            executeStatus=1,
            executeMessage="成功解析新闻"
        )
        if not response:
            logger.error(f"获取响应失败: {url}")
            news.executeStatus = 0
            news.executeMessage = f"获取响应失败: {url}"
            return news

        # BeautifulSoup can detect and decode the encoding automatically; just pass in the raw bytes
@@ -288,18 +330,24 @@ class RmrbCrawler(BaseCrawler):
            soup = self.parse_html(response.content)
            if not soup:
                logger.error("解析HTML失败")
                news.executeStatus = 0
                news.executeMessage = f"解析HTML失败"
                return news

            # Extract the main content area
            main_div = soup.find("div", class_="layout rm_txt cf")
            main_div = soup.select_one("div.layout.rm_txt.cf")
            if not main_div:
                logger.error("未找到主内容区域")
                news.executeStatus = 0
                news.executeMessage = f"未找到主内容区域"
                return news

            # Extract the article area
            article_div = main_div.find("div", class_="col col-1")
            article_div = main_div.select_one("div.col.col-1")
            if not article_div:
                logger.error("未找到文章区域")
                news.executeStatus = 0
                news.executeMessage = f"未找到文章区域"
                return news

            # Extract the title
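
As the comment above notes, BeautifulSoup can be handed raw response bytes and work out the encoding on its own, for example from the <meta charset> declaration. A minimal sketch (the GB2312 snippet is invented for illustration):

from bs4 import BeautifulSoup

# Raw bytes with a <meta charset> declaration; BeautifulSoup decodes them itself.
raw = '<html><head><meta charset="gb2312"></head><body><p>人民网</p></body></html>'.encode("gb2312")
soup = BeautifulSoup(raw, "html.parser")
print(soup.original_encoding)       # e.g. 'gb2312'
print(soup.p.get_text(strip=True))  # 人民网
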
@@ -380,4 +428,215 @@ class RmrbCrawler(BaseCrawler):

        except Exception as e:
            logger.error(f"解析新闻详情失败 [{url}]: {str(e)}")
            return None
            news.executeStatus = 0
            news.executeMessage = f"解析新闻详情失败: {str(e)}"
            return news

    def parse_cpc_news_detail(self, url: str) -> Optional[NewsItem]:
        """
        Parse a People's Daily news detail page
        """
        try:
            response = self.fetch(url)
            news = NewsItem(
                title="",
                contentRows=[],  # fix: use contents rather than content
                url=url,
                publishTime="",
                author="",
                source="人民网",
                category="",
                executeStatus=1,
                executeMessage="成功解析新闻"
            )
            if not response:
                logger.error(f"获取响应失败: {url}")
                news.executeStatus = 0
                news.executeMessage = f"获取响应失败: {url}"
                return news

            # BeautifulSoup can detect and decode the encoding automatically; just pass in the raw bytes
            # It detects the encoding from the HTML <meta charset> tag or the response headers
            soup = self.parse_html(response.content)
            if not soup:
                logger.error("解析HTML失败")
                news.executeStatus = 0
                news.executeMessage = f"解析HTML失败"
                return news

            # Extract the main content area
            main_div = soup.select_one("div.text_con.text_con01")
            if not main_div:
                logger.error("未找到主内容区域")
                news.executeStatus = 0
                news.executeMessage = f"未找到主内容区域"
                return news

            # Extract the article area
            article_div = main_div.select_one("div.text_c")
            if not article_div:
                logger.error("未找到文章区域")
                news.executeStatus = 0
                news.executeMessage = f"未找到文章区域"
                return news

            # Extract the title
            title_tag = article_div.select_one("h1")
            title = title_tag.get_text(strip=True) if title_tag else ""

            # Extract the author
            author_tag = article_div.select_one("div.author.cf")
            author = author_tag.get_text(strip=True) if author_tag else ""

            # Extract the publish time and source
            channel_div = article_div.select_one("div.sou")
            publish_time = ""
            source = ""

            if channel_div:
                # Extract the time: take the first non-empty text node
                for child in channel_div.children:
                    if isinstance(child, str) and child.strip():
                        publish_time = child.strip().split("来源:")[0].strip()
                        break

                # Extract the source
                a_tag = channel_div.find("a")
                source = a_tag.get_text(strip=True) if a_tag else ""

            # Clean up invisible whitespace characters
            publish_time = publish_time.replace("\xa0", " ").replace(" ", " ").strip()

            # Extract the body content
            content_div = article_div.select_one('div.show_text')
            contents = []  # build up the rich-text content
            pList = content_div.find_all("p")  # all <p> tags
            # Parse the <p> tags into Quill rich text

            # Iterate over all direct children of show_text (preserving document order)
            for child in content_div.children:
                # Skip plain text nodes (e.g. newlines and spaces)
                if isinstance(child, NavigableString):
                    continue

                tag_name = child.name
                if tag_name is None:
                    continue

                # Case 1: check whether this is a video container (based on id pattern or inner structure)
                video_tag = child.find('video') if tag_name != 'video' else child
                if video_tag and video_tag.get('src'):
                    src = str(video_tag['src'])
                    p_style = video_tag.get("style", "")
                    if not src.startswith("http"):
                        src = self.config.base_url + src
                    contents.append({
                        "tag": "video",
                        "content": f"<video style='{p_style}' src='{src}'></video>"
                    })
                    continue
                img_tag = child.find('img') if tag_name != 'img' else child
                if img_tag and img_tag.get('src'):
                    src = str(img_tag['src'])
                    p_style = child.get("style", "")

                    if not src.startswith("http"):
                        src = self.config.base_url + src
                    contents.append({
                        "tag": "img",
                        "content": f"<img style='{p_style}' src='{src}' />"
                    })
                    continue

                if tag_name == 'p':
                    p_style = child.get("style", "")
                    img_tag = child.find('img')
                    video_tag = child.find('video')

                    # Case 1: an <img> or <video> tag is present (static asset)
                    if img_tag or video_tag:
                        src = img_tag.get('src') if img_tag else video_tag.get('src')
                        if src:
                            src = str(src)
                            if not src.startswith(('http://', 'https://')):
                                src = self.config.base_url.rstrip('/') + '/' + src.lstrip('/')
                            tag_type = "img" if img_tag else "video"
                            if img_tag:
                                content_html = f"<img style='{p_style}' src='{src}' />"
                            else:
                                content_html = f"<video style='{p_style}' src='{src}' controls></video>"
                            contents.append({
                                "tag": tag_type,
                                "content": content_html
                            })
                        else:
                            # No src, treat it as a regular paragraph
                            contents.append({"tag": "p", "content": str(child)})
                        continue

                    # Case 2: check for People's Daily Online's showPlayer script (dynamic video)
                    script_tags = child.find_all('script', string=True)
                    video_src = None
                    poster_url = None

                    for script in script_tags:
                        script_text = script.string or ""
                        if "showPlayer" not in script_text:
                            continue

                        # Use a regex to extract src and posterUrl precisely (tolerates spaces and line breaks)
                        src_match = re.search(r"src\s*:\s*'([^']*)'", script_text)
                        poster_match = re.search(r"posterUrl\s*:\s*'([^']*)'", script_text)

                        if src_match:
                            video_src = src_match.group(1)
                        if poster_match:
                            poster_url = poster_match.group(1)

                        if video_src:
                            break  # stop once a video source is found

                    if video_src:
                        # Complete the URL (make sure it is an absolute path)
                        if not video_src.startswith(('http://', 'https://')):
                            video_src = self.config.base_url.rstrip('/') + '/' + video_src.lstrip('/')
                        if poster_url and not poster_url.startswith(('http://', 'https://')):
                            poster_url = self.config.base_url.rstrip('/') + '/' + poster_url.lstrip('/')

                        # Build the video tag attributes
                        attrs_parts = []
                        if p_style:
                            attrs_parts.append(f"style='{p_style}'")
                        if poster_url:
                            attrs_parts.append(f"poster='{poster_url}'")
                        attrs_parts.append("controls")
                        attrs = " ".join(attrs_parts)

                        contents.append({
                            "tag": "video",
                            "content": f"<video {attrs} src='{video_src}'></video>"
                        })
                    else:
                        # Plain paragraph text
                        contents.append({
                            "tag": "p",
                            "content": str(child)
                        })
                    continue

            news.title = title
            news.contentRows = contents  # fix: use contents rather than content
            news.url = url
            news.publishTime = publish_time
            news.author = author
            news.source = source or "人民网"
            news.category = ""

            logger.info(f"成功解析新闻: {title}")
            return news

        except Exception as e:
            logger.error(f"解析新闻详情失败 [{url}]: {str(e)}")

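A standalone sketch of the showPlayer extraction used in parse_cpc_news_detail; the inline script text is a made-up stand-in for what such a page embeds:

import re

script_text = """
    createPlayer = new showPlayer({
        containerId: 'playerCont',
        src : 'mediafile/sample/video.mp4',
        posterUrl: 'mediafile/sample/poster.jpg',
        autoplay: false
    });
"""

src_match = re.search(r"src\s*:\s*'([^']*)'", script_text)
poster_match = re.search(r"posterUrl\s*:\s*'([^']*)'", script_text)
print(src_match.group(1) if src_match else None)       # mediafile/sample/video.mp4
print(poster_match.group(1) if poster_match else None) # mediafile/sample/poster.jpg
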
@@ -51,7 +51,7 @@ def main():
        "message": result.message,
        "success": result.success,
        "data": None,
        "dataList": [item.dict() for item in result.dataList] if result.dataList else []
        "dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
    }

    if output_file:

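The dataList line swaps item.dict() for item.model_dump(), following the pydantic v2 API in which .dict() is deprecated. A minimal sketch, assuming pydantic v2 is installed:

import json
from pydantic import BaseModel

class Item(BaseModel):
    title: str = ""
    url: str = ""

item = Item(title="示例新闻", url="http://www.people.com.cn")
# model_dump() is the pydantic v2 replacement for the deprecated dict()
print(json.dumps(item.model_dump(), ensure_ascii=False))
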
@@ -81,20 +81,19 @@ def main():
    try:
        logger.info(f"开始搜索: 关键词='{key}', 数量={total}, 类型={news_type}")
        crawler = RmrbCrawler()
        # result = crawler.search(key=key.strip(), total=total, news_type=news_type)
        result = crawler.search(key=key.strip(), total=total, news_type=news_type)
        output = {
            "code": result.code,
            "message": result.message,
            "success": result.success,
            "data": None,
            "dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
        }
        result = None
        with open("../output/output.json", "r", encoding="utf-8") as f:
        with open("F:\Project\schoolNews\schoolNewsCrawler\output\output.json", "r", encoding="utf-8") as f:
            result = json.load(f)

        print(result)
        output = result
        # output = {
        #     "code": result["code"],
        #     "message": result["message"],
        #     "success": result["success"],
        #     "data": None,
        #     "dataList": [item.model_dump() for item in result["dataList"]] if result["dataList"] else []
        # }

        if output_file:
            output_path = Path(output_file)
            output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -102,8 +101,11 @@ def main():
                json.dump(output, f, ensure_ascii=False, indent=2)
            logger.info(f"结果已保存到: {output_file}")

        print(json.dumps(output, ensure_ascii=False, indent=2))

        crawler.close()
        # sys.exit(0 if result.success else 1)
        # print(json.dumps(output, ensure_ascii=False, indent=2))

        sys.exit(0 if result["success"] else 1)

    except Exception as e:

@@ -132,7 +132,7 @@ def main():
            "message": result.message,
            "success": result.success,
            "data": None,
            "dataList": [item.dict() for item in result.dataList] if result.dataList else []
            "dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
        }

        # Save to file

File diff suppressed because one or more lines are too long