Study commentary + comprehensive news + Publicity Department releases (new xuexishiping / zongheshiping / zhongxuanbu columns)
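The commit registers three new column feeds in the XxqgCrawler URL config and threads each column's content path through UrlConfig.params, so the shared crawl method can pick the matching JSON response out of the intercepted browser traffic instead of hard-coding one path. A minimal, self-contained sketch of that selection step follows; the captured URLs are invented for illustration, only the path hash appears in the diff:

# Illustrative sketch of matching an intercepted request against the configured
# content path (crawl_base does this with config.params.get("path")).
params = {"path": "98d5ae483720f701144e4dabf99a4a34/5957f69bffab66811b99940516ec8784"}

# Invented example URLs standing in for the driver.requests captured by the crawler.
captured_urls = [
    "https://www.xuexi.cn/static/app.js",
    "https://example.invalid/data/98d5ae483720f701144e4dabf99a4a34/5957f69bffab66811b99940516ec8784.json",
]

target_path = params.get("path")
target_request = next((url for url in captured_urls if target_path and target_path in url), None)
print(target_request)  # -> the .json URL containing the configured path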
@@ -61,7 +61,9 @@ class XxqgCrawler(BaseCrawler):
             "important": UrlConfig(
                 url="https://www.xuexi.cn/98d5ae483720f701144e4dabf99a4a34/5957f69bffab66811b99940516ec8784.html",
                 method="GET",
-                params={},
+                params={
+                    "path": "98d5ae483720f701144e4dabf99a4a34/5957f69bffab66811b99940516ec8784"
+                },
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
@@ -79,10 +81,12 @@ class XxqgCrawler(BaseCrawler):
                     'sec-ch-ua-platform': '"Windows"'
                 }
             ),
-            "home": UrlConfig(
-                url="https://www.xuexi.cn/",
+            "xuexishiping": UrlConfig(
+                url="https://www.xuexi.cn/d05cad69216e688d304bb91ef3aac4c6/9a3668c13f6e303932b5e0e100fc248b.html",
                 method="GET",
-                params={},
+                params={
+                    "path": "d05cad69216e688d304bb91ef3aac4c6/9a3668c13f6e303932b5e0e100fc248b"
+                },
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
@@ -99,7 +103,53 @@ class XxqgCrawler(BaseCrawler):
                     'sec-ch-ua-mobile': '?0',
                     'sec-ch-ua-platform': '"Windows"'
                 }
-            )
+            ),
+            "zongheshiping": UrlConfig(
+                url="https://www.xuexi.cn/7097477a9643eacffe4cc101e4906fdb/9a3668c13f6e303932b5e0e100fc248b.html",
+                method="GET",
+                params={
+                    "path": "7097477a9643eacffe4cc101e4906fdb/9a3668c13f6e303932b5e0e100fc248b"
+                },
+                headers={
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+                    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+                    'Accept-Encoding': 'gzip, deflate, br',
+                    'Connection': 'keep-alive',
+                    'Upgrade-Insecure-Requests': '1',
+                    'Sec-Fetch-Dest': 'document',
+                    'Sec-Fetch-Mode': 'navigate',
+                    'Sec-Fetch-Site': 'none',
+                    'Cache-Control': 'max-age=0',
+                    'Referer': 'https://www.xuexi.cn/',
+                    'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+                    'sec-ch-ua-mobile': '?0',
+                    'sec-ch-ua-platform': '"Windows"'
+                }
+            ),
+            "zhongxuanbu": UrlConfig(
+                url="https://www.xuexi.cn/105c2fa2843fa9e6d17440e172115c92/9a3668c13f6e303932b5e0e100fc248b.html",
+                method="GET",
+                params={
+                    "path": "105c2fa2843fa9e6d17440e172115c92/9a3668c13f6e303932b5e0e100fc248b"
+                },
+                headers={
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+                    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+                    'Accept-Encoding': 'gzip, deflate, br',
+                    'Connection': 'keep-alive',
+                    'Upgrade-Insecure-Requests': '1',
+                    'Sec-Fetch-Dest': 'document',
+                    'Sec-Fetch-Mode': 'navigate',
+                    'Sec-Fetch-Site': 'none',
+                    'Cache-Control': 'max-age=0',
+                    'Referer': 'https://www.xuexi.cn/',
+                    'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+                    'sec-ch-ua-mobile': '?0',
+                    'sec-ch-ua-platform': '"Windows"'
+                }
+            ),
 
         },
     )
@@ -229,7 +279,7 @@ class XxqgCrawler(BaseCrawler):
             logger.warning(f"提取发布时间失败: {e}")
 
         try:
-            source_div = article_area_div.find_element(By.CSS_SELECTOR, "span.render-detail-source")
+            source_div = article_area_div.find_element(By.CSS_SELECTOR, "span.render-detail-resource")
             news_item.source = source_div.text.strip().split(":")[1]
         except Exception as e:
             logger.warning(f"提取来源失败: {e}")
@@ -472,21 +522,11 @@ class XxqgCrawler(BaseCrawler):
                     continue
 
         resultDomain.dataList = news_list
-        with open("Xxqg_news_list.json", "w", encoding="utf-8") as f:
-            json.dump([item.model_dump() for item in resultDomain.dataList] if resultDomain.dataList else [], f, ensure_ascii=False, indent=4)
+        # with open("Xxqg_news_list.json", "w", encoding="utf-8") as f:
+        #     json.dump([item.model_dump() for item in resultDomain.dataList] if resultDomain.dataList else [], f, ensure_ascii=False, indent=4)
         return resultDomain
 
-    def crawl_important(self, total=10) -> ResultDomain:
-        """
-        爬取重要新闻栏目
-        参考旧版myQiangguo爬虫方式,使用requests获取文章列表,然后用Selenium解析详情
-
-        Args:
-            total: 最多爬取的文章数量,默认10
-
-        Returns:
-            ResultDomain: 包含新闻列表的结果对象
-        """
+    def crawl_base(self, config: CrawlerConfig, yesterday=True, start: Optional[str] = None, end: Optional[str] = None) -> ResultDomain:
         news_list = []
         resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
 
@@ -497,16 +537,7 @@ class XxqgCrawler(BaseCrawler):
             resultDomain.message = "WebDriver未初始化"
             return resultDomain
 
-        # 获取important配置
-        important_config = self.config.urls.get("important")
-        if not important_config:
-            logger.error("未找到important配置")
-            resultDomain.code = 1
-            resultDomain.success = False
-            resultDomain.message = "未找到important配置"
-            return resultDomain
-
-        self.driver.get(important_config.url)
+        self.driver.get(config.url)
         try:
             if self.driver is None:
                 resultDomain.message="driver未初始化"
@@ -525,7 +556,7 @@ class XxqgCrawler(BaseCrawler):
             time.sleep(3) # 等待所有请求完成
             request_list = self.driver.requests
             json_request = []
-            target_path = "98d5ae483720f701144e4dabf99a4a34/5957f69bffab66811b99940516ec8784"
+            target_path = config.params.get("path")
             target_request = None
             logger.info(f"开始查找目标JSON请求,共有 {len(request_list)} 个请求")
 
@@ -602,29 +633,123 @@ class XxqgCrawler(BaseCrawler):
                 resultDomain.message = f"解析文章数据失败: {str(e)}"
                 return resultDomain
 
+            # 确定时间筛选范围(在循环外计算,避免重复)
+            if not yesterday and start and end:
+                # 自定义时间范围
+                start_date = start
+                end_date = end
+                logger.info(f"使用自定义时间范围: {start_date} 到 {end_date}")
+            else:
+                # 默认昨天
+                yesterday_str = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
+                start_date = yesterday_str
+                end_date = yesterday_str
+                logger.info(f"使用默认时间范围(昨天): {yesterday_str}")
+
+            # 计算起始日期的前一天,用于提前终止循环(优化性能)
+            day_before_start = (datetime.strptime(start_date, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d')
+
             for article in article_data:
-                # 判断是昨天的新闻 "publishTime": "2025-11-21 10:04:20",
+                # 提取发布日期 "publishTime": "2025-11-21 10:04:20"
                 publish_date = article['publishTime'].split(" ")[0]
-                yesterday = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
-                day_before_yesterday = (datetime.now() - timedelta(days=2)).strftime('%Y-%m-%d')
 
-                if publish_date == yesterday:
-                    news_item = self.parse_news_detail(article['url'])
-                    news_item.title = article['title']
-                    news_item.publishTime = article['publishTime']
-                    news_item.source = article['source'].split("_")[1]
-                    news_item.url = article['url']
-                    news_list.append(news_item)
-                    logger.info(f"添加昨日新闻: {news_item.title}")
-                elif publish_date == day_before_yesterday:
-                    # 遇到前天的新闻就停止,因为数据是按时间倒序排列的
-                    logger.info("已到达前天的新闻,停止遍历")
+                # 跳过未来的新闻(如果有)
+                if publish_date > end_date:
+                    continue
+
+                # 在时间范围内的新闻
+                if publish_date >= start_date and publish_date <= end_date:
+                    try:
+                        # 提取来源,安全处理
+                        source = article['source'].split("_")[1] if "_" in article.get('source', '') else article.get('source', '')
+
+                        news_item = self.parse_news_detail(article['url'])
+                        news_item.title = article['title']
+                        news_item.publishTime = article['publishTime']
+                        news_item.source = source
+                        news_item.url = article['url']
+                        news_list.append(news_item)
+                        logger.info(f"添加新闻: {news_item.title} ({publish_date})")
+                    except Exception as e:
+                        logger.warning(f"解析文章详情失败: {article.get('title', 'unknown')} - {str(e)}")
+                        continue
+
+                # 如果遇到比起始日期还早的新闻,提前终止(数据按时间倒序)
+                elif publish_date < day_before_start:
+                    logger.info(f"已到达时间范围之前的新闻({publish_date}),停止遍历")
                     break
 
         resultDomain.dataList = news_list
         # with open("Xxqg_important_news_list.json", "w", encoding="utf-8") as f:
         #     json.dump([item.model_dump() for item in resultDomain.dataList] if resultDomain.dataList else [], f, ensure_ascii=False, indent=4)
         return resultDomain
 
+    def crawl_important(self, total=10) -> ResultDomain:
+        """
+        爬取重要新闻栏目
+        参考旧版myQiangguo爬虫方式,使用requests获取文章列表,然后用Selenium解析详情
+
+        Args:
+            total: 最多爬取的文章数量,默认10
+
+        Returns:
+            ResultDomain: 包含新闻列表的结果对象
+        """
+        news_list = []
+        resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
+
+        if self.driver is None:
+            logger.error("WebDriver未初始化,无法继续爬取")
+            resultDomain.code = 1
+            resultDomain.success = False
+            resultDomain.message = "WebDriver未初始化"
+            return resultDomain
+
+        # 获取important配置
+        important_config = self.config.urls.get("important")
+        if not important_config:
+            logger.error("未找到important配置")
+            resultDomain.code = 1
+            resultDomain.success = False
+            resultDomain.message = "未找到important配置"
+            return resultDomain
+
+        resultDomain = self.crawl_base(important_config)
+
+        return resultDomain
+
+    def crawl_xuexishiping(self, total=10) -> ResultDomain:
+        """
+        爬取学习时评栏目
+
+        Args:
+            total: 最多爬取的文章数量,默认10
+
+        Returns:
+            ResultDomain: 包含新闻列表的结果对象
+        """
+        news_list = []
+        resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
+
+        if self.driver is None:
+            logger.error("WebDriver未初始化,无法继续爬取")
+            resultDomain.code = 1
+            resultDomain.success = False
+            resultDomain.message = "WebDriver未初始化"
+            return resultDomain
+
+        # 获取xuexishiping配置
+        xuexishiping_config = self.config.urls.get("xuexishiping")
+        if not xuexishiping_config:
+            logger.error("未找到xuexishiping配置")
+            resultDomain.code = 1
+            resultDomain.success = False
+            resultDomain.message = "未找到xuexishiping配置"
+            return resultDomain
+
+        resultDomain = self.crawl_base(xuexishiping_config)
+
+        return resultDomain
+
     def home(self, type="") -> ResultDomain:
         """获取首页数据"""
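The time filtering added to crawl_base works because 'YYYY-MM-DD' strings sort in calendar order, so the window check and the early break on the newest-first feed are plain string comparisons. A distilled, self-contained sketch of just that logic; the helper name and sample records are illustrative, not from the repository:

from datetime import datetime, timedelta

def filter_window(articles, yesterday=True, start=None, end=None):
    """Keep articles whose publish date falls inside [start, end]; input is newest-first."""
    if not yesterday and start and end:
        start_date, end_date = start, end                     # custom range
    else:
        day = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
        start_date = end_date = day                           # default: yesterday only
    # One day before the window start, used to stop early on a descending feed.
    day_before_start = (datetime.strptime(start_date, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d')

    kept = []
    for article in articles:
        publish_date = article['publishTime'].split(" ")[0]
        if publish_date > end_date:        # future item: skip but keep scanning
            continue
        if start_date <= publish_date <= end_date:
            kept.append(article)
        elif publish_date < day_before_start:
            break                          # feed is newest-first, nothing older matters
    return kept

# Invented sample data, newest-first, mimicking the article JSON fields.
sample = [
    {"publishTime": "2025-11-22 09:00:00"},
    {"publishTime": "2025-11-21 10:04:20"},
    {"publishTime": "2025-11-19 08:00:00"},
]
print(filter_window(sample, yesterday=False, start="2025-11-20", end="2025-11-21"))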