diff --git a/output/out.json b/output/out.json
new file mode 100644
index 0000000..f1515f1
--- /dev/null
+++ b/output/out.json
@@ -0,0 +1,7 @@
+{
+ "code": "0",
+ "message": "获取搜索结果失败",
+ "success": false,
+ "data": null,
+ "dataList": []
+}
\ No newline at end of file
diff --git a/output/output.json b/output/output.json
new file mode 100644
index 0000000..c943667
--- /dev/null
+++ b/output/output.json
@@ -0,0 +1,324 @@
+{
+ "code": 0,
+ "message": "",
+ "success": true,
+ "data": null,
+ "dataList": [
+ {
+ "title": "",
+ "contentRows": [],
+ "url": "http://cpc.people.com.cn/n1/2025/1109/c435113-40599647.html",
+ "publishTime": "",
+ "author": "",
+ "source": "人民网",
+ "category": ""
+ },
+ {
+ "title": "习近平在广东考察",
+ "contentRows": [
+ {
+ "tag": "p",
+ "content": "\n"
+ },
+ {
+ "tag": "img",
+ "content": "\n"
+ },
+ {
+ "tag": "p",
+ "content": " 11月7日至8日,中共中央总书记、国家主席、中央军委主席习近平在广东考察。这是7日下午,习近平在位于梅州市梅县区雁洋镇的叶剑英纪念馆,参观叶剑英生平事迹陈列。\n"
+ },
+ {
+ "tag": "p",
+ "content": " 新华社记者 谢环驰 摄\n"
+ },
+ {
+ "tag": "img",
+ "content": "\n"
+ }
+ ],
+ "url": "http://pic.people.com.cn/n1/2025/1108/c426981-40599554.html",
+ "publishTime": "2025年11月08日17:22",
+ "author": "",
+ "source": "新华社",
+ "category": ""
+ },
+ {
+ "title": "",
+ "contentRows": [],
+ "url": "http://cpc.people.com.cn/n1/2025/1031/c64094-40593715.html",
+ "publishTime": "",
+ "author": "",
+ "source": "人民网",
+ "category": ""
+ },
+ {
+ "title": "习近平抵达韩国",
+ "contentRows": [
+ {
+ "tag": "p",
+ "content": ""
+ },
+ {
+ "tag": "img",
+ "content": "\n"
+ },
+ {
+ "tag": "p",
+ "content": "当地时间十月三十日上午,国家主席习近平乘专机抵达韩国,应大韩民国总统李在明邀请,出席亚太经合组织第三十二次领导人非正式会议并对韩国进行国事访问。这是习近平抵达釜山金海国际机场时,韩国外长赵显等高级官员热情迎接。新华社记者 黄敬文摄\n"
+ },
+ {
+ "tag": "p",
+ "content": " 本报韩国釜山10月30日电 (记者莽九晨、杨翘楚)当地时间10月30日上午,国家主席习近平乘专机抵达韩国,应大韩民国总统李在明邀请,出席亚太经合组织第三十二次领导人非正式会议并对韩国进行国事访问。\n"
+ },
+ {
+ "tag": "p",
+ "content": " 习近平抵达釜山金海国际机场时,韩国外长赵显等高级官员热情迎接。礼兵分列红地毯两侧致敬,军乐团演奏行进乐,机场鸣放21响礼炮。\n"
+ },
+ {
+ "tag": "p",
+ "content": " 蔡奇、王毅、何立峰等陪同人员同机抵达。\n"
+ },
+ {
+ "tag": "p",
+ "content": " 先期抵达的香港特别行政区行政长官李家超、中国驻韩国大使戴兵也到机场迎接。\n"
+ },
+ {
+ "tag": "p",
+ "content": " 中国留学生和中资企业代表挥舞中韩两国国旗,热烈欢迎习近平到访。\n"
+ },
+ {
+ "tag": "p",
+ "content": " 本报北京10月30日电 10月30日上午,国家主席习近平乘专机离开北京,应大韩民国总统李在明邀请,赴韩国庆州出席亚太经合组织第三十二次领导人非正式会议并对韩国进行国事访问。\n"
+ },
+ {
+ "tag": "p",
+ "content": " 陪同习近平出访的有:中共中央政治局常委、中央办公厅主任蔡奇,中共中央政治局委员、外交部部长王毅,中共中央政治局委员、国务院副总理何立峰等。\n"
+ },
+ {
+ "tag": "p",
+ "content": " 《人民日报》(2025年10月31日 第01版)\n"
+ },
+ {
+ "tag": "img",
+ "content": "\n"
+ }
+ ],
+ "url": "http://korea.people.com.cn/n1/2025/1031/c407366-40594082.html",
+ "publishTime": "2025年10月31日13:38",
+ "author": "",
+ "source": "人民网-人民日报",
+ "category": ""
+ },
+ {
+ "title": "习近平抵达韩国",
+ "contentRows": [
+ {
+ "tag": "p",
+ "content": ""
+ },
+ {
+ "tag": "p",
+ "content": " 当地时间十月三十日上午,国家主席习近平乘专机抵达韩国,应大韩民国总统李在明邀请,出席亚太经合组织第三十二次领导人非正式会议并对韩国进行国事访问。这是习近平抵达釜山金海国际机场时,韩国外长赵显等高级官员热情迎接。\n新华社记者 黄敬文摄\n"
+ },
+ {
+ "tag": "p",
+ "content": " 本报韩国釜山10月30日电 (记者莽九晨、杨翘楚)当地时间10月30日上午,国家主席习近平乘专机抵达韩国,应大韩民国总统李在明邀请,出席亚太经合组织第三十二次领导人非正式会议并对韩国进行国事访问。\n"
+ },
+ {
+ "tag": "p",
+ "content": " 习近平抵达釜山金海国际机场时,韩国外长赵显等高级官员热情迎接。礼兵分列红地毯两侧致敬,军乐团演奏行进乐,机场鸣放21响礼炮。\n"
+ },
+ {
+ "tag": "p",
+ "content": " 蔡奇、王毅、何立峰等陪同人员同机抵达。\n"
+ },
+ {
+ "tag": "p",
+ "content": " 先期抵达的香港特别行政区行政长官李家超、中国驻韩国大使戴兵也到机场迎接。\n"
+ },
+ {
+ "tag": "p",
+ "content": " 中国留学生和中资企业代表挥舞中韩两国国旗,热烈欢迎习近平到访。\n"
+ },
+ {
+ "tag": "p",
+ "content": " 本报北京10月30日电 10月30日上午,国家主席习近平乘专机离开北京,应大韩民国总统李在明邀请,赴韩国庆州出席亚太经合组织第三十二次领导人非正式会议并对韩国进行国事访问。\n"
+ },
+ {
+ "tag": "p",
+ "content": " 陪同习近平出访的有:中共中央政治局常委、中央办公厅主任蔡奇,中共中央政治局委员、外交部部长王毅,中共中央政治局委员、国务院副总理何立峰等。\n"
+ },
+ {
+ "tag": "p",
+ "content": ""
+ },
+ {
+ "tag": "p",
+ "content": " 《 人民日报 》( 2025年10月31日 01 版)\n"
+ },
+ {
+ "tag": "img",
+ "content": "\n"
+ }
+ ],
+ "url": "http://politics.people.com.cn/n1/2025/1031/c1024-40593454.html",
+ "publishTime": "2025年10月31日06:10",
+ "author": "",
+ "source": "人民网-人民日报",
+ "category": ""
+ },
+ {
+ "title": "习近平回到北京",
+ "contentRows": [
+ {
+ "tag": "p",
+ "content": ""
+ },
+ {
+ "tag": "p",
+ "content": "本报北京11月1日电 11月1日晚,国家主席习近平结束出席亚太经合组织第三十二次领导人非正式会议和对韩国的国事访问后回到北京。\n"
+ },
+ {
+ "tag": "p",
+ "content": "中共中央政治局常委、中央办公厅主任蔡奇,中共中央政治局委员、外交部部长王毅等陪同人员同机返回。\n"
+ },
+ {
+ "tag": "p",
+ "content": "本报韩国釜山11月1日电 (记者王嵘、朱笑熺)当地时间11月1日晚,国家主席习近平结束出席亚太经合组织第三十二次领导人非正式会议和对韩国的国事访问返回北京。\n"
+ },
+ {
+ "tag": "p",
+ "content": "离开釜山时,韩国外长赵显等高级官员到机场送行。\n"
+ },
+ {
+ "tag": "p",
+ "content": "前往机场途中,中国留学生和中资企业代表在道路两旁挥舞中韩两国国旗,热烈祝贺习近平主席访问圆满成功。\n"
+ },
+ {
+ "tag": "img",
+ "content": "\n"
+ }
+ ],
+ "url": "http://gd.people.com.cn/n2/2025/1102/c123932-41398959.html",
+ "publishTime": "2025年11月02日11:15",
+ "author": "",
+ "source": "人民网-人民日报",
+ "category": ""
+ },
+ {
+ "title": "习近平回到北京",
+ "contentRows": [
+ {
+ "tag": "p",
+ "content": ""
+ },
+ {
+ "tag": "p",
+ "content": " 本报北京11月1日电 11月1日晚,国家主席习近平结束出席亚太经合组织第三十二次领导人非正式会议和对韩国的国事访问后回到北京。\n"
+ },
+ {
+ "tag": "p",
+ "content": " 中共中央政治局常委、中央办公厅主任蔡奇,中共中央政治局委员、外交部部长王毅等陪同人员同机返回。\n"
+ },
+ {
+ "tag": "p",
+ "content": " 本报韩国釜山11月1日电 (记者王嵘、朱笑熺)当地时间11月1日晚,国家主席习近平结束出席亚太经合组织第三十二次领导人非正式会议和对韩国的国事访问返回北京。\n"
+ },
+ {
+ "tag": "p",
+ "content": " 离开釜山时,韩国外长赵显等高级官员到机场送行。\n"
+ },
+ {
+ "tag": "p",
+ "content": " 前往机场途中,中国留学生和中资企业代表在道路两旁挥舞中韩两国国旗,热烈祝贺习近平主席访问圆满成功。\n"
+ },
+ {
+ "tag": "p",
+ "content": ""
+ },
+ {
+ "tag": "p",
+ "content": " 《 人民日报 》( 2025年11月02日 01 版)\n"
+ },
+ {
+ "tag": "img",
+ "content": "\n"
+ }
+ ],
+ "url": "http://politics.people.com.cn/n1/2025/1102/c1024-40594763.html",
+ "publishTime": "2025年11月02日05:46",
+ "author": "",
+ "source": "人民网-人民日报",
+ "category": ""
+ },
+ {
+ "title": "",
+ "contentRows": [],
+ "url": "http://cpc.people.com.cn/n1/2025/1102/c64094-40594809.html",
+ "publishTime": "",
+ "author": "",
+ "source": "人民网",
+ "category": ""
+ },
+ {
+ "title": "《习近平的文化情缘》《习近平经济思想系列讲读》在澳门启播",
+ "contentRows": [
+ {
+ "tag": "p",
+ "content": ""
+ },
+ {
+ "tag": "p",
+ "content": "人民网澳门9月28日电 (记者富子梅)《习近平的文化情缘》及《习近平经济思想系列讲读》两部专题片在澳门启播仪式28日举行。澳门特区行政长官岑浩辉,中宣部副部长、中央广播电视总台台长兼总编辑慎海雄,中央政府驻澳门特区联络办公室主任郑新聪出席活动并致辞。\n"
+ },
+ {
+ "tag": "img",
+ "content": "\n"
+ },
+ {
+ "tag": "p",
+ "content": "《习近平的文化情缘》《习近平经济思想系列讲读》澳门启播仪式。(澳门特区政府新闻局供图)\n"
+ },
+ {
+ "tag": "p",
+ "content": "岑浩辉表示,《习近平的文化情缘》《习近平经济思想系列讲读》在澳门落地启播,高度契合澳门中西荟萃、内联外通的优势和功能,具有重大而且深远的意义。期待以此为契机,持续深化推动广大澳门同胞和海内外人士对习近平新时代中国特色社会主义思想的关注、理解和实践,共同讲好中国故事、促进国际交流、不断扩大“朋友圈”\n"
+ },
+ {
+ "tag": "p",
+ "content": "慎海雄指出,两部精品节目是助力澳门各界更好学习领会领袖思想的一次生动实践,是让澳门居民深切感悟中华文明深厚底蕴和新时代伟大成就的一场文化盛宴。\n"
+ },
+ {
+ "tag": "p",
+ "content": "郑新聪表示,两部精品节目在澳门播出,有力促进习近平文化思想、习近平经济思想的宣传普及、落地生根,将为澳门打造中西文明交流互鉴的重要窗口、推动经济适度多元发展提供精神动力和科学指引。\n"
+ },
+ {
+ "tag": "p",
+ "content": "9月28日起,电视专题片《习近平的文化情缘》在澳门广播电视股份有限公司的澳视澳门频道、澳门有线电视股份有限公司互动新闻台、澳门莲花卫视传媒有限公司网站,以及《澳门日报》《大众报》《市民日报》《濠江日报》《正报》《澳门商报》《澳门焦点报》《莲花时报》等媒体的新媒体平台陆续上线。大型专题节目《习近平经济思想系列讲读》9月28日起在澳广视旗下电视频道及新媒体平台上线播出。\n"
+ },
+ {
+ "tag": "p",
+ "content": "启播仪式后举行的“盛世莲开颂华章 - 中央广播电视总台与澳门各界深化合作仪式”上,双方代表分别交换《中央广播电视总台与澳门特别行政区政府深化战略合作框架协议》、《国家电影局与澳门特别行政区政府社会文化司关于电影产业合作框架协议》、《十五运会和残特奥会澳门赛区筹备办公室与中央广播电视总台合作意向书》、《中央广播电视总台与澳门广播电视股份有限公司关于整频道转播央视CCTV-5体育频道的协议》、《中央广播电视总台亚太总站与澳门大学深化战略合作框架协议》等5份合作文件。\n"
+ },
+ {
+ "tag": "img",
+ "content": "\n"
+ }
+ ],
+ "url": "http://gba.people.cn/n1/2025/0928/c42272-40573895.html",
+ "publishTime": "2025年09月28日16:44",
+ "author": "",
+ "source": "人民网-大湾区频道",
+ "category": ""
+ },
+ {
+ "title": "",
+ "contentRows": [],
+ "url": "http://cpc.people.com.cn/n1/2025/0926/c64094-40572435.html",
+ "publishTime": "",
+ "author": "",
+ "source": "人民网",
+ "category": ""
+ }
+ ]
+}
\ No newline at end of file
diff --git a/schoolNewsCrawler/crawler/BaseCrawler.py b/schoolNewsCrawler/crawler/BaseCrawler.py
index 6046c15..f20d190 100644
--- a/schoolNewsCrawler/crawler/BaseCrawler.py
+++ b/schoolNewsCrawler/crawler/BaseCrawler.py
@@ -66,7 +66,7 @@ class BaseCrawler(ABC):
self.session.headers.update(config.headers)
logger.info(f"初始化爬虫: {self.__class__.__name__}")
- def fetch(self, url: str, method: str = "GET", data: Optional[Dict[str, Any]] = None, **kwargs) -> Optional[requests.Response]:
+ def fetch(self, url: str, method: str = "GET", data: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None, **kwargs) -> Optional[requests.Response]:
"""
发送HTTP请求
@@ -74,6 +74,7 @@ class BaseCrawler(ABC):
url: 请求URL
method: 请求方法
data: 请求数据
+ headers: 额外的请求头,将与默认请求头合并(额外的优先)
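+                 Example (illustrative): self.fetch(url, headers={"Referer": "http://www.people.com.cn"})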
**kwargs: 其他请求参数
Returns:
@@ -82,11 +83,20 @@ class BaseCrawler(ABC):
for attempt in range(self.config.retry_times):
try:
logger.info(f"请求URL: {url} (尝试 {attempt + 1}/{self.config.retry_times})")
+
+ # 合并默认headers与调用方headers(调用方覆盖默认)
+ request_headers = dict(self.config.headers or {})
+ if headers:
+ request_headers.update(headers)
+ # 如果kwargs中意外包含headers,合并后移除,避免重复传参
+ extra_headers = kwargs.pop("headers", None)
+ if extra_headers:
+ request_headers.update(extra_headers)
response = self.session.request(
method=method,
url=url,
- headers=self.config.headers,
+ headers=request_headers,
data=data,
timeout=self.config.timeout,
proxies={'http': self.config.proxy, 'https': self.config.proxy} if self.config.proxy else None,
diff --git a/schoolNewsCrawler/crawler/RmrbCrawler.py b/schoolNewsCrawler/crawler/RmrbCrawler.py
index bb15c19..b8f4fa7 100644
--- a/schoolNewsCrawler/crawler/RmrbCrawler.py
+++ b/schoolNewsCrawler/crawler/RmrbCrawler.py
@@ -100,14 +100,25 @@ class RmrbCrawler(BaseCrawler):
search_data["page"] = page
response = self.fetch(search_config.url, method=search_config.method, json=search_data, headers=search_config.headers)
response_json = response.json()
- if response_json.get("code") == 0:
+ if response_json.get("code") == '0':
records = response_json.get("data", {}).get("records", [])
for record in records:
news = self.parse_news_detail(record.get("url"))
+                    # Backfill fields the article page did not yield from the search API record.
+                    # NewsItem is a model object, so use attribute access, not subscripts.
+                    if not news.title:
+                        news.title = record.get("title")
+                    if not news.contentRows:
+                        news.contentRows = record.get("contentOriginal")
+                    if not news.publishTime and record.get("displayTime"):
+                        # displayTime is a millisecond timestamp; format matches the site's publishTime strings
+                        news.publishTime = datetime.datetime.fromtimestamp(
+                            record.get("displayTime") / 1000).strftime("%Y年%m月%d日%H:%M")
+                    if not news.author:
+                        news.author = record.get("author")
+                    if not news.source:
+                        news.source = record.get("originName")
+
news_list.append(news)
else:
resultDomain.code = response_json.get("code")
- resultDomain.message = "获取搜索结果失败" + response_json.get("message")
+ resultDomain.message = f"获取搜索结果失败{response_json.get('message') or ''}"
resultDomain.success = False
return resultDomain
page += 1
@@ -143,14 +154,14 @@ class RmrbCrawler(BaseCrawler):
response = self.fetch(hot_point_rank_config.url, method=hot_point_rank_config.method, headers=hot_point_rank_config.headers)
response_json = response.json()
- if response_json.get("code") == 0:
+ if response_json.get("code") == '0':
records = response_json.get("data", [])
for record in records:
news = self.parse_news_detail(record.get("url"))
news_list.append(news)
else:
resultDomain.code = response_json.get("code")
- resultDomain.message = "获取人民日报热点排行失败" + response_json.get("message")
+ resultDomain.message = f"获取人民日报热点排行失败{response_json.get('message') or ''}"
resultDomain.success = False
return resultDomain
resultDomain.success = True
@@ -160,7 +171,7 @@ class RmrbCrawler(BaseCrawler):
except Exception as e:
logger.error(f"获取人民日报热点排行失败: {str(e)}")
resultDomain.code = 0
- resultDomain.message = "获取人民日报热点排行失败" + str(e)
+ resultDomain.message = f"获取人民日报热点排行失败{str(e)}"
resultDomain.success = False
return resultDomain
@@ -178,19 +189,19 @@ class RmrbCrawler(BaseCrawler):
date_str = date.strftime("%Y%m%d")
one_day_trending_news_config = self.config.urls.get("one_day_trending_news")
-        one_day_trending_news_config.url = one_day_trending_news_config.url.format(date_str)
+        # Format into a local variable; overwriting the config would drop the
+        # {date} placeholder, so every later call would reuse the first day's URL
+        url = one_day_trending_news_config.url.format(date=date_str)
-        response = self.fetch(one_day_trending_news_config.url, method=one_day_trending_news_config.method, headers=one_day_trending_news_config.headers)
+        response = self.fetch(url, method=one_day_trending_news_config.method, headers=one_day_trending_news_config.headers)
         if not response:
-            logger.error(f"获取响应失败: {one_day_trending_news_config.url}")
+            logger.error(f"获取响应失败: {url}")
             resultDomain.code = 0
-            resultDomain.message = "获取响应失败" + one_day_trending_news_config.url
+            resultDomain.message = f"获取响应失败: {url}"
             resultDomain.success = False
             return resultDomain
         soup = self.parse_html(response.content)
         if not soup:
-            logger.error(f"解析HTML失败: {one_day_trending_news_config.url}")
+            logger.error(f"解析HTML失败: {url}")
             resultDomain.code = 0
-            resultDomain.message = "解析HTML失败" + one_day_trending_news_config.url
+            resultDomain.message = f"解析HTML失败: {url}"
             resultDomain.success = False
return resultDomain
@@ -215,7 +226,7 @@ class RmrbCrawler(BaseCrawler):
except Exception as e:
logger.error(f"获取人民日报一天内的热点新闻失败: {str(e)}")
resultDomain.code = 0
- resultDomain.message = "获取人民日报一天内的热点新闻失败" + str(e)
+ resultDomain.message = f"获取人民日报一天内的热点新闻失败{str(e)}"
resultDomain.success = False
return resultDomain
@@ -243,7 +254,7 @@ class RmrbCrawler(BaseCrawler):
except Exception as e:
logger.error(f"获取人民日报多天内的热点新闻失败: {str(e)}")
resultDomain.code = 0
- resultDomain.message = "获取人民日报多天内的热点新闻失败" + str(e)
+ resultDomain.message = f"获取人民日报多天内的热点新闻失败{str(e)}"
resultDomain.success = False
return resultDomain
@@ -259,29 +270,37 @@ class RmrbCrawler(BaseCrawler):
"""
try:
response = self.fetch(url)
-
+        # Build a placeholder NewsItem up front so every failure path below
+        # returns a partially filled item instead of None
+        news = NewsItem(
+            title="",
+            contentRows=[],
+            url=url,
+            publishTime="",
+            author="",
+            source="人民网",
+            category=""
+        )
if not response:
logger.error(f"获取响应失败: {url}")
- return None
+ return news
# BeautifulSoup 可以自动检测并解码编码,直接传入字节数据即可
# 它会从 HTML 的 标签或响应头自动检测编码
soup = self.parse_html(response.content)
if not soup:
logger.error("解析HTML失败")
- return None
+ return news
# 提取主内容区域
main_div = soup.find("div", class_="layout rm_txt cf")
if not main_div:
logger.error("未找到主内容区域")
- return None
+ return news
# 提取文章区域
article_div = main_div.find("div", class_="col col-1")
if not article_div:
logger.error("未找到文章区域")
- return None
+ return news
# 提取标题
title_tag = article_div.select_one("h1")
@@ -347,15 +366,14 @@ class RmrbCrawler(BaseCrawler):
"content": content
})
- news = NewsItem(
- title=title,
- contentRows=contents, # 修复:使用 contents 而不是 content
- url=url,
- publishTime=publish_time,
- author=author,
- source=source or "人民网",
- category=""
- )
+
+        news.title = title
+        news.contentRows = contents
+        news.url = url
+        news.publishTime = publish_time
+        news.author = author
+        news.source = source or "人民网"
+        news.category = ""
logger.info(f"成功解析新闻: {title}")
return news
diff --git a/schoolNewsCrawler/crawler/RmrbHotPoint.py b/schoolNewsCrawler/crawler/RmrbHotPoint.py
index ff3794d..bbd93c7 100644
--- a/schoolNewsCrawler/crawler/RmrbHotPoint.py
+++ b/schoolNewsCrawler/crawler/RmrbHotPoint.py
@@ -25,20 +25,27 @@ def main():
epilog="""
示例:
python RmrbHotPoint.py
+ python RmrbHotPoint.py --output "output/hotpoint.json"
"""
)
+ # 添加输出文件参数
+ parser.add_argument(
+ '--output', '-o',
+ type=str,
+ help='输出文件路径'
+ )
+
args = parser.parse_args()
+ output_file = args.output
+
try:
- # 创建爬虫实例
logger.info("开始获取人民日报热点排行")
crawler = RmrbCrawler()
-
- # 执行获取热点排行
result = crawler.hotPointRank()
- # 输出JSON结果
output = {
"code": result.code,
"message": result.message,
@@ -47,12 +54,15 @@ def main():
"dataList": [item.dict() for item in result.dataList] if result.dataList else []
}
+ if output_file:
+ output_path = Path(output_file)
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ with open(output_path, 'w', encoding='utf-8') as f:
+ json.dump(output, f, ensure_ascii=False, indent=2)
+ logger.info(f"结果已保存到: {output_file}")
+
print(json.dumps(output, ensure_ascii=False, indent=2))
-
- # 关闭爬虫
crawler.close()
-
- # 退出码: 成功=0, 失败=1
sys.exit(0 if result.success else 1)
except Exception as e:
@@ -67,7 +77,6 @@ def main():
print(json.dumps(error_output, ensure_ascii=False, indent=2))
sys.exit(1)
-" "
+
if __name__ == "__main__":
- main()
-
+ main()
\ No newline at end of file
diff --git a/schoolNewsCrawler/crawler/RmrbSearch.py b/schoolNewsCrawler/crawler/RmrbSearch.py
index 6e4fbfe..2e67a2c 100644
--- a/schoolNewsCrawler/crawler/RmrbSearch.py
+++ b/schoolNewsCrawler/crawler/RmrbSearch.py
@@ -25,7 +25,8 @@ def main():
epilog="""
示例:
-    python RmrbSearch.py --key "教育改革" --total 20
+    python RmrbSearch.py --query "教育改革" --total 20
-    python RmrbSearch.py -k "科技创新" -t 15 -n 1
+    python RmrbSearch.py -q "科技创新" -t 15 --type 1
+    python RmrbSearch.py --query "AI" --total 5 --output "out.json"
新闻类型说明:
0 - 所有类型 (默认)
@@ -38,53 +39,72 @@ def main():
)
parser.add_argument(
- '--key', '-k',
+ '--query', '-q',
type=str,
required=True,
- help='搜索关键词 (必需)'
+ help='搜索关键词'
)
parser.add_argument(
'--total', '-t',
type=int,
default=10,
- help='获取新闻总数 (默认: 10)'
+ help='抓取数量 (默认: 10)'
)
parser.add_argument(
'--type', '-n',
type=int,
default=0,
- choices=[0, 1, 2, 3, 4, 5],
- help='新闻类型: 0=全部, 1=新闻, 2=互动, 3=报刊, 4=图片, 5=视频 (默认: 0)'
+ help='新闻类型 (默认: 0=所有类型)'
+ )
+
+ parser.add_argument(
+ '--output', '-o',
+ type=str,
+ help='输出文件路径'
)
args = parser.parse_args()
+ # 获取参数
+ key = args.query
+ total = args.total
+ news_type = args.type
+ output_file = args.output
+
+ # 关键校验:key 必须存在
+ if not key or not key.strip():
+ parser.error("搜索关键词不能为空!")
try:
- # 创建爬虫实例
- logger.info(f"开始搜索: 关键词='{args.key}', 数量={args.total}, 类型={args.type}")
+ logger.info(f"开始搜索: 关键词='{key}', 数量={total}, 类型={news_type}")
crawler = RmrbCrawler()
-        # 执行搜索
-        result = crawler.search(key=args.key, total=args.total, news_type=args.type)
+        result = crawler.search(key=key.strip(), total=total, news_type=news_type)

-        # 输出JSON结果
        output = {
            "code": result.code,
            "message": result.message,
            "success": result.success,
            "data": None,
            "dataList": [item.dict() for item in result.dataList] if result.dataList else []
        }
+ if output_file:
+ output_path = Path(output_file)
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ with open(output_path, 'w', encoding='utf-8') as f:
+ json.dump(output, f, ensure_ascii=False, indent=2)
+ logger.info(f"结果已保存到: {output_file}")
print(json.dumps(output, ensure_ascii=False, indent=2))
-
- # 关闭爬虫
crawler.close()
-
- # 退出码: 成功=0, 失败=1
        sys.exit(0 if result.success else 1)
except Exception as e:
logger.error(f"执行失败: {str(e)}")
@@ -100,4 +120,4 @@ def main():
if __name__ == "__main__":
- main()
+ main()
\ No newline at end of file
diff --git a/schoolNewsCrawler/crawler/RmrbTrending.py b/schoolNewsCrawler/crawler/RmrbTrending.py
index 98c0658..b2dc3bb 100644
--- a/schoolNewsCrawler/crawler/RmrbTrending.py
+++ b/schoolNewsCrawler/crawler/RmrbTrending.py
@@ -10,7 +10,7 @@
import argparse
import json
import sys
-from datetime import datetime
+from datetime import datetime, timedelta
from pathlib import Path
# Add parent directory to path to import crawler
@@ -20,20 +20,29 @@ from crawler.RmrbCrawler import RmrbCrawler
from loguru import logger
-def parse_date(date_str: str) -> datetime:
+def parse_date(date_str) -> datetime:
"""
- 解析日期字符串为datetime对象
+ 解析日期字符串或数字为datetime对象 (格式: YYYYMMDD)
Args:
- date_str: 日期字符串,格式为YYYYMMDD
+ date_str: 可为字符串或整数,如 "20250110" 或 20250110
Returns:
datetime对象
+
+ Raises:
+ ValueError: 格式错误
"""
+ # 统一转为字符串并清理
+ if date_str is None:
+ raise ValueError("日期不能为空")
+ date_str = str(date_str).strip()
+ if len(date_str) != 8 or not date_str.isdigit():
+ raise ValueError(f"日期格式错误: '{date_str}',正确格式为YYYYMMDD,例如: '20250110'")
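+    # e.g. parse_date("20250110") -> datetime(2025, 1, 10); parse_date(20250110) is also accepted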
try:
return datetime.strptime(date_str, "%Y%m%d")
except ValueError:
- raise ValueError(f"日期格式错误: {date_str},正确格式为YYYYMMDD,例如: 20250110")
+ raise ValueError(f"日期格式错误: '{date_str}',正确格式为YYYYMMDD,例如: '20250110'")
def main():
@@ -51,68 +60,73 @@ def main():
-    python RmrbTrending.py --start-date 20250101 --end-date 20250110
+    python RmrbTrending.py --startDate 20250101 --endDate 20250110
python RmrbTrending.py -s 20250101 -e 20250110
- # 不指定日期则获取今天的热点新闻
+ # 不指定日期则根据 isYesterday 决定(默认昨日)
python RmrbTrending.py
"""
)
- parser.add_argument(
- '--date', '-d',
- type=str,
- help='指定日期 (格式: YYYYMMDD,例如: 20250110)'
- )
-
- parser.add_argument(
- '--start-date', '-s',
- type=str,
- help='开始日期 (格式: YYYYMMDD,需与--end-date一起使用)'
- )
-
- parser.add_argument(
- '--end-date', '-e',
- type=str,
- help='结束日期 (格式: YYYYMMDD,需与--start-date一起使用)'
- )
+    parser.add_argument('--date', '-d', type=str, help='指定日期 (格式: YYYYMMDD)')
+    parser.add_argument('--startDate', '-s', type=str, help='开始日期 (需与--endDate一起使用)')
+    parser.add_argument('--endDate', '-e', type=str, help='结束日期 (需与--startDate一起使用)')
+    # BooleanOptionalAction (Python 3.9+) also generates a --no-yesterday switch
+    parser.add_argument('--yesterday', '-y', action=argparse.BooleanOptionalAction, default=True, help='查询昨日 (默认: 是)')
+    parser.add_argument('--output', '-o', type=str, help='输出文件路径')
args = parser.parse_args()
+ # 初始化变量
+ output_file = args.output
+ date = args.date
+ start_date = args.startDate
+ end_date = args.endDate
+    is_yesterday = args.yesterday  # True unless --no-yesterday was given
+
+
+ # 辅助函数:清理空字符串
+ def clean(s):
+ return s.strip() if s and isinstance(s, str) and s.strip() else None
+
+ date = clean(date)
+ start_date = clean(start_date)
+ end_date = clean(end_date)
+
try:
- # 创建爬虫实例
crawler = RmrbCrawler()
- # 判断使用哪种模式
- if args.date:
- # 单日模式
- if args.start_date or args.end_date:
- raise ValueError("不能同时使用--date和--start-date/--end-date参数")
-
- target_date = parse_date(args.date)
- logger.info(f"获取单日热点新闻: {args.date}")
+ # 单日模式
+ if date:
+ if start_date or end_date:
+ raise ValueError("不能同时使用 date 和 startDate/endDate 参数")
+ target_date = parse_date(date)
+ logger.info(f"获取单日热点新闻: {target_date.strftime('%Y-%m-%d')}")
result = crawler.getOneDayTrendingNews(target_date)
- elif args.start_date and args.end_date:
- # 日期范围模式
- start_date = parse_date(args.start_date)
- end_date = parse_date(args.end_date)
-
- if start_date > end_date:
+ # 日期范围模式
+ elif start_date and end_date:
+ start_dt = parse_date(start_date)
+ end_dt = parse_date(end_date)
+ if start_dt > end_dt:
raise ValueError("开始日期不能晚于结束日期")
+ logger.info(f"获取日期范围热点新闻: {start_dt.strftime('%Y-%m-%d')} 至 {end_dt.strftime('%Y-%m-%d')}")
+ result = crawler.getDaysTrendingNews(start_dt, end_dt)
- logger.info(f"获取日期范围热点新闻: {args.start_date} 至 {args.end_date}")
- result = crawler.getDaysTrendingNews(start_date, end_date)
-
- elif args.start_date or args.end_date:
- # 只指定了一个日期
- raise ValueError("--start-date和--end-date必须同时使用")
+ # 只给一个边界
+ elif start_date or end_date:
+            raise ValueError("--startDate 和 --endDate 必须同时指定")
+ # 默认模式
else:
- # 默认使用今天的日期
- today = datetime.now()
- today_str = today.strftime("%Y%m%d")
- logger.info(f"获取今日热点新闻: {today_str}")
- result = crawler.getOneDayTrendingNews(today)
+ if is_yesterday:
+ target_date = datetime.now() - timedelta(days=1)
+ logger.info(f"获取昨日热点新闻: {target_date.strftime('%Y-%m-%d')}")
+ else:
+ target_date = datetime.now()
+ logger.info(f"获取今日热点新闻: {target_date.strftime('%Y-%m-%d')}")
+ result = crawler.getOneDayTrendingNews(target_date)
- # 输出JSON结果
+ # 构造输出
output = {
"code": result.code,
"message": result.message,
@@ -121,12 +135,16 @@ def main():
"dataList": [item.dict() for item in result.dataList] if result.dataList else []
}
+ # 保存到文件
+ if output_file:
+ output_path = Path(output_file)
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ with open(output_path, 'w', encoding='utf-8') as f:
+ json.dump(output, f, ensure_ascii=False, indent=2)
+ logger.info(f"结果已保存到: {output_file}")
+
print(json.dumps(output, ensure_ascii=False, indent=2))
-
- # 关闭爬虫
crawler.close()
-
- # 退出码: 成功=0, 失败=1
sys.exit(0 if result.success else 1)
except ValueError as e:
@@ -155,4 +173,4 @@ def main():
if __name__ == "__main__":
- main()
+ main()
\ No newline at end of file
diff --git a/schoolNewsCrawler/lxml b/schoolNewsCrawler/lxml
new file mode 100644
index 0000000..e69de29
diff --git a/schoolNewsCrawler/main.py b/schoolNewsCrawler/main.py
index dc9103f..6400b05 100644
--- a/schoolNewsCrawler/main.py
+++ b/schoolNewsCrawler/main.py
@@ -5,7 +5,9 @@
import sys
import json
+import argparse
from typing import List
+from pathlib import Path
from loguru import logger
from crawler.RmrbCrawler import RmrbCrawler
from crawler.BaseCrawler import NewsItem
@@ -83,36 +85,81 @@ def save_to_json(news_list: List[dict], output_file: str = "output/news.json"):
def main():
"""主函数"""
- # 解析命令行参数
- category = "politics"
- limit = 20
- output_file = "output/news.json"
-
- if len(sys.argv) > 1:
- category = sys.argv[1]
- if len(sys.argv) > 2:
- limit = int(sys.argv[2])
- if len(sys.argv) > 3:
- output_file = sys.argv[3]
-
+ # 创建参数解析器
+ parser = argparse.ArgumentParser(
+ description='人民日报新闻爬虫主程序',
+ formatter_class=argparse.RawDescriptionHelpFormatter
+ )
+
+ # 添加位置参数(保持向后兼容)
+ parser.add_argument(
+ 'category',
+ nargs='?',
+ default='politics',
+ help='新闻分类 (默认: politics)'
+ )
+
+ parser.add_argument(
+ 'limit',
+ nargs='?',
+ type=int,
+ default=20,
+ help='爬取数量 (默认: 20)'
+ )
+
+ parser.add_argument(
+ 'output_file',
+ nargs='?',
+ default='output/news.json',
+ help='输出文件路径 (默认: output/news.json)'
+ )
+
+ # 添加JSON参数支持
+ parser.add_argument(
+ '--json', '-j',
+ type=str,
+ help='JSON格式参数 (优先级高于其他参数)'
+ )
+
+ args = parser.parse_args()
+
+ # 解析参数: JSON参数优先
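+    # Expected payload (illustrative): {"params": {"category": "politics", "limit": 20}, "outputFile": "output/news.json"}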
+ if args.json:
+ try:
+ json_data = json.loads(args.json)
+ params = json_data.get('params', {})
+ category = params.get('category', 'politics')
+ limit = params.get('limit', 20)
+ output_file = json_data.get('outputFile', 'output/news.json')
+ logger.info("使用JSON参数模式")
+ except Exception as e:
+ logger.error(f"JSON参数解析失败: {e}")
+ sys.exit(1)
+ else:
+ # 使用命令行参数
+ category = args.category
+ limit = args.limit
+ output_file = args.output_file
+ logger.info("使用命令行参数模式")
+
logger.info("=" * 60)
logger.info("新闻爬虫程序启动")
logger.info("=" * 60)
-
+
# 爬取新闻
news_list = crawl_rmrb_news(category=category, limit=limit)
-
+
# 保存结果
if news_list:
save_to_json(news_list, output_file)
-
+
# 输出统计信息
logger.info(f"爬取统计:")
logger.info(f" - 成功: {len(news_list)} 条")
logger.info(f" - 失败: {limit - len(news_list)} 条")
else:
logger.warning("未获取到任何新闻")
-
+
logger.info("=" * 60)
logger.info("新闻爬虫程序结束")
logger.info("=" * 60)
diff --git a/schoolNewsServ/.bin/mysql/sql/createTableCrontab.sql b/schoolNewsServ/.bin/mysql/sql/createTableCrontab.sql
index 041e048..81fb13b 100644
--- a/schoolNewsServ/.bin/mysql/sql/createTableCrontab.sql
+++ b/schoolNewsServ/.bin/mysql/sql/createTableCrontab.sql
@@ -66,7 +66,7 @@ CREATE TABLE `tb_data_collection_item` (
`id` VARCHAR(64) NOT NULL COMMENT '主键ID',
`task_id` VARCHAR(64) NOT NULL COMMENT '关联任务ID',
`log_id` VARCHAR(64) NOT NULL COMMENT '关联执行日志ID',
- `title` VARCHAR(255) NOT NULL COMMENT '文章标题',
+ `title` VARCHAR(255) DEFAULT NULL COMMENT '文章标题',
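+    -- Allow NULL: some crawled pages have no parsable title (the crawler emits empty placeholders)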
`content` LONGTEXT DEFAULT NULL COMMENT '文章内容(HTML)',
`summary` VARCHAR(500) DEFAULT NULL COMMENT '文章摘要',
`source` VARCHAR(255) DEFAULT NULL COMMENT '来源(如 人民日报)',
diff --git a/schoolNewsServ/admin/src/main/resources/application.yml b/schoolNewsServ/admin/src/main/resources/application.yml
index b3c2d10..e2d73dd 100644
--- a/schoolNewsServ/admin/src/main/resources/application.yml
+++ b/schoolNewsServ/admin/src/main/resources/application.yml
@@ -114,35 +114,49 @@ school-news:
crawler:
- python:
- path: F:\Environment\Conda\envs\shoolNewsCrewer
- base:
- path: F:/Project/schoolNews/schoolNewsCrawler
-
+ # Python 可执行文件路径(Windows 建议指向 python.exe;如已在 PATH,可直接用 "python")
+ pythonPath: F:/Environment/Conda/envs/schoolNewsCrawler/python.exe
+ # 爬虫脚本根目录(NewsCrawlerTask 的工作目录)
+ basePath: F:/Project/schoolNews/schoolNewsCrawler
crontab:
items: #可供前端选择的定时任务列表
- name: 人民日报新闻爬取
methods: #爬取方式
- name: 关键字搜索爬取
- class: org.xyzh.crontab.task.newsTask.NewsCrawlerTask
+ clazz: newsCrewerTask
+ excuete_method: execute
path: crawler/RmrbSearch.py
params:
- query: String #搜索关键字
- total: Integer #总新闻数量
+ - name: query
+ description: 搜索关键字
+ type: String
+ value: ""
+ - name: total
+ description: 总新闻数量
+ type: Integer
+ value: 10
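+              # Resulting CLI call (illustrative): python crawler/RmrbSearch.py --query "教育改革" --total 10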
- name: 排行榜爬取
- class: org.xyzh.crontab.task.newsTask.NewsCrawlerTask
+ clazz: newsCrewerTask
+ excuete_method: execute
path: crawler/RmrbHotPoint.py
- name: 往日精彩头条爬取
- class: org.xyzh.crontab.task.newsTask.NewsCrawlerTask
+ clazz: newsCrewerTask
+ excuete_method: execute
path: crawler/RmrbTrending.py
params:
- startDate: String #开始日期
- endDate: String #结束日期
- isYestoday: Boolean #是否是昨天
-
-
-
+ - name: startDate
+ description: 开始日期
+ type: String
+ value: ""
+ - name: endDate
+ description: 结束日期
+ type: String
+ value: ""
+ - name: yesterday
+ description: 是否是昨天
+ type: Boolean
+ value: true
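+              # Resulting CLI call (illustrative): python crawler/RmrbTrending.py --startDate 20250101 --endDate 20250110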
# 文件存储配置
file:
diff --git a/schoolNewsServ/admin/src/main/resources/log4j2-spring.xml b/schoolNewsServ/admin/src/main/resources/log4j2-spring.xml
index fa73629..8d50c49 100644
--- a/schoolNewsServ/admin/src/main/resources/log4j2-spring.xml
+++ b/schoolNewsServ/admin/src/main/resources/log4j2-spring.xml
@@ -111,6 +111,9 @@
+
+
+
@@ -162,6 +165,15 @@
+
+
+
+
+
+
+
+
+
diff --git a/schoolNewsServ/api/api-crontab/src/main/java/org/xyzh/api/crontab/DataCollectionItemService.java b/schoolNewsServ/api/api-crontab/src/main/java/org/xyzh/api/crontab/DataCollectionItemService.java
index 0747fdb..138bfaa 100644
--- a/schoolNewsServ/api/api-crontab/src/main/java/org/xyzh/api/crontab/DataCollectionItemService.java
+++ b/schoolNewsServ/api/api-crontab/src/main/java/org/xyzh/api/crontab/DataCollectionItemService.java
@@ -1,5 +1,7 @@
package org.xyzh.api.crontab;
+import java.util.List;
+
import org.xyzh.common.core.domain.ResultDomain;
import org.xyzh.common.core.page.PageParam;
import org.xyzh.common.dto.crontab.TbDataCollectionItem;
@@ -30,7 +32,7 @@ public interface DataCollectionItemService {
* @author yslg
* @since 2025-11-08
*/
-    ResultDomain batchCreateItems(java.util.List<TbDataCollectionItem> itemList);
+    ResultDomain batchCreateItems(List<TbDataCollectionItem> itemList);
/**
* @description 更新采集项
diff --git a/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/vo/DataCollectionItemVO.java b/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/vo/DataCollectionItemVO.java
index 052ba8b..82cff8b 100644
--- a/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/vo/DataCollectionItemVO.java
+++ b/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/vo/DataCollectionItemVO.java
@@ -1,68 +1,427 @@
package org.xyzh.common.vo;
-import org.xyzh.common.dto.crontab.TbDataCollectionItem;
-import org.xyzh.common.dto.crontab.TbCrontabTask;
-
import java.io.Serializable;
+import java.util.Date;
/**
- * @description 数据采集项VO
+ * @description 数据采集项VO (平铺结构,包含关联的任务和日志信息)
* @filename DataCollectionItemVO.java
* @author yslg
* @copyright xyzh
* @since 2025-11-08
*/
public class DataCollectionItemVO implements Serializable {
-
+
private static final long serialVersionUID = 1L;
-
+
+ // ==================== 采集项基本信息 ====================
+
/**
- * @description 采集项数据
+ * 采集项ID
*/
- private TbDataCollectionItem item;
-
+ private String id;
+
/**
- * @description 关联的定时任务信息
+ * 任务ID
*/
- private TbCrontabTask task;
-
+ private String taskId;
+
/**
- * @description 状态文本(用于前端显示)
+ * 日志ID
*/
- private String statusText;
-
+ private String logId;
+
/**
- * @description 是否可以编辑(未处理和已忽略的可以编辑)
+ * 文章标题
+ */
+ private String title;
+
+ /**
+ * 文章内容(HTML)
+ */
+ private String content;
+
+ /**
+ * 文章摘要
+ */
+ private String summary;
+
+ /**
+ * 来源
+ */
+ private String source;
+
+ /**
+ * 来源URL
+ */
+ private String sourceUrl;
+
+ /**
+ * 分类
+ */
+ private String category;
+
+ /**
+ * 作者
+ */
+ private String author;
+
+ /**
+ * 发布时间
+ */
+ private Date publishTime;
+
+ /**
+ * 封面图片URL
+ */
+ private String coverImage;
+
+ /**
+ * 图片列表(JSON)
+ */
+ private String images;
+
+ /**
+ * 标签
+ */
+ private String tags;
+
+ /**
+ * 状态(0:未处理 1:已转换为资源 2:已忽略)
+ */
+ private Integer status;
+
+ /**
+ * 转换后的资源ID
+ */
+ private String resourceId;
+
+ /**
+ * 爬取时间
+ */
+ private Date crawlTime;
+
+ /**
+ * 处理时间
+ */
+ private Date processTime;
+
+ /**
+ * 处理人
+ */
+ private String processor;
+
+ /**
+ * 创建时间
+ */
+ private Date createTime;
+
+ /**
+ * 更新时间
+ */
+ private Date updateTime;
+
+ // ==================== 关联的任务信息 ====================
+
+ /**
+ * 任务名称
+ */
+ private String taskName;
+
+ /**
+ * 任务分组
+ */
+ private String taskGroup;
+
+ /**
+ * Bean名称
+ */
+ private String beanName;
+
+ /**
+ * 方法名称
+ */
+ private String methodName;
+
+ /**
+ * 方法参数
+ */
+ private String methodParams;
+
+ // ==================== 关联的日志信息 ====================
+
+ /**
+ * 执行状态(0:失败 1:成功)
+ */
+ private Integer executeStatus;
+
+ /**
+ * 执行时长(ms)
+ */
+ private Long executeDuration;
+
+ /**
+ * 开始时间
+ */
+ private Date startTime;
+
+ /**
+ * 结束时间
+ */
+ private Date endTime;
+
+ // ==================== 扩展字段 ====================
+
+ /**
+ * 是否可以编辑(未处理和已忽略的可以编辑)
*/
private Boolean canEdit;
-
+
/**
- * @description 是否可以转换为资源(未处理的可以转换)
+ * 是否可以转换为资源(未处理的可以转换)
*/
private Boolean canConvert;
- public TbDataCollectionItem getItem() {
- return item;
+ // ==================== Getter/Setter ====================
+
+ public String getId() {
+ return id;
}
- public void setItem(TbDataCollectionItem item) {
- this.item = item;
+ public void setId(String id) {
+ this.id = id;
}
- public TbCrontabTask getTask() {
- return task;
+ public String getTaskId() {
+ return taskId;
}
- public void setTask(TbCrontabTask task) {
- this.task = task;
+ public void setTaskId(String taskId) {
+ this.taskId = taskId;
}
- public String getStatusText() {
- return statusText;
+ public String getLogId() {
+ return logId;
}
- public void setStatusText(String statusText) {
- this.statusText = statusText;
+ public void setLogId(String logId) {
+ this.logId = logId;
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ public void setTitle(String title) {
+ this.title = title;
+ }
+
+ public String getContent() {
+ return content;
+ }
+
+ public void setContent(String content) {
+ this.content = content;
+ }
+
+ public String getSummary() {
+ return summary;
+ }
+
+ public void setSummary(String summary) {
+ this.summary = summary;
+ }
+
+ public String getSource() {
+ return source;
+ }
+
+ public void setSource(String source) {
+ this.source = source;
+ }
+
+ public String getSourceUrl() {
+ return sourceUrl;
+ }
+
+ public void setSourceUrl(String sourceUrl) {
+ this.sourceUrl = sourceUrl;
+ }
+
+ public String getCategory() {
+ return category;
+ }
+
+ public void setCategory(String category) {
+ this.category = category;
+ }
+
+ public String getAuthor() {
+ return author;
+ }
+
+ public void setAuthor(String author) {
+ this.author = author;
+ }
+
+ public Date getPublishTime() {
+ return publishTime;
+ }
+
+ public void setPublishTime(Date publishTime) {
+ this.publishTime = publishTime;
+ }
+
+ public String getCoverImage() {
+ return coverImage;
+ }
+
+ public void setCoverImage(String coverImage) {
+ this.coverImage = coverImage;
+ }
+
+ public String getImages() {
+ return images;
+ }
+
+ public void setImages(String images) {
+ this.images = images;
+ }
+
+ public String getTags() {
+ return tags;
+ }
+
+ public void setTags(String tags) {
+ this.tags = tags;
+ }
+
+ public Integer getStatus() {
+ return status;
+ }
+
+ public void setStatus(Integer status) {
+ this.status = status;
+ }
+
+ public String getResourceId() {
+ return resourceId;
+ }
+
+ public void setResourceId(String resourceId) {
+ this.resourceId = resourceId;
+ }
+
+ public Date getCrawlTime() {
+ return crawlTime;
+ }
+
+ public void setCrawlTime(Date crawlTime) {
+ this.crawlTime = crawlTime;
+ }
+
+ public Date getProcessTime() {
+ return processTime;
+ }
+
+ public void setProcessTime(Date processTime) {
+ this.processTime = processTime;
+ }
+
+ public String getProcessor() {
+ return processor;
+ }
+
+ public void setProcessor(String processor) {
+ this.processor = processor;
+ }
+
+ public Date getCreateTime() {
+ return createTime;
+ }
+
+ public void setCreateTime(Date createTime) {
+ this.createTime = createTime;
+ }
+
+ public Date getUpdateTime() {
+ return updateTime;
+ }
+
+ public void setUpdateTime(Date updateTime) {
+ this.updateTime = updateTime;
+ }
+
+ public String getTaskName() {
+ return taskName;
+ }
+
+ public void setTaskName(String taskName) {
+ this.taskName = taskName;
+ }
+
+ public String getTaskGroup() {
+ return taskGroup;
+ }
+
+ public void setTaskGroup(String taskGroup) {
+ this.taskGroup = taskGroup;
+ }
+
+ public String getBeanName() {
+ return beanName;
+ }
+
+ public void setBeanName(String beanName) {
+ this.beanName = beanName;
+ }
+
+ public String getMethodName() {
+ return methodName;
+ }
+
+ public void setMethodName(String methodName) {
+ this.methodName = methodName;
+ }
+
+ public String getMethodParams() {
+ return methodParams;
+ }
+
+ public void setMethodParams(String methodParams) {
+ this.methodParams = methodParams;
+ }
+
+ public Integer getExecuteStatus() {
+ return executeStatus;
+ }
+
+ public void setExecuteStatus(Integer executeStatus) {
+ this.executeStatus = executeStatus;
+ }
+
+ public Long getExecuteDuration() {
+ return executeDuration;
+ }
+
+ public void setExecuteDuration(Long executeDuration) {
+ this.executeDuration = executeDuration;
+ }
+
+ public Date getStartTime() {
+ return startTime;
+ }
+
+ public void setStartTime(Date startTime) {
+ this.startTime = startTime;
+ }
+
+ public Date getEndTime() {
+ return endTime;
+ }
+
+ public void setEndTime(Date endTime) {
+ this.endTime = endTime;
}
public Boolean getCanEdit() {
diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/config/CrawlerProperties.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/config/CrawlerProperties.java
index 768b984..72960ee 100644
--- a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/config/CrawlerProperties.java
+++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/config/CrawlerProperties.java
@@ -1,5 +1,6 @@
package org.xyzh.crontab.config;
+import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.context.properties.ConfigurationProperties;
import lombok.Data;
import org.springframework.stereotype.Component;
@@ -9,8 +10,10 @@ import org.springframework.stereotype.Component;
@Component
public class CrawlerProperties {
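+    // Bound from application.yml: school-news.crawler.pythonPath / school-news.crawler.basePath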
+    @Value("${school-news.crawler.pythonPath}")
private String pythonPath;
+    @Value("${school-news.crawler.basePath}")
private String basePath;
}
diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/controller/CrontabController.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/controller/CrontabController.java
index 0f3ae18..835eb97 100644
--- a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/controller/CrontabController.java
+++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/controller/CrontabController.java
@@ -12,6 +12,10 @@ import org.xyzh.common.dto.crontab.TbCrontabTask;
import org.xyzh.common.dto.crontab.TbCrontabLog;
import org.xyzh.common.utils.IDUtils;
import org.xyzh.crontab.pojo.CrontabItem;
+
+import com.alibaba.fastjson2.JSON;
+import com.alibaba.fastjson2.JSONObject;
+
import org.xyzh.common.utils.spring.SpringContextUtil;
import org.xyzh.crontab.config.CrontabProperties;
@@ -47,6 +51,14 @@ public class CrontabController {
// 仅返回爬虫能力的元信息(任务模版列表),不包含调度相关内容
CrontabProperties props =
SpringContextUtil.getBean(CrontabProperties.class);
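+            // JSON round-trip deep-copies the singleton so the field-nulling below
+            // does not leak back into the shared configuration bean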
+ String jString = JSON.toJSONString(props);
+ props = JSON.parseObject(jString, CrontabProperties.class);
+ props.getItems().forEach(item->item.getMethods().forEach(
+ method->{
+ method.setClazz(null);
+ method.setExcuete_method(null);
+ method.setPath(null);
+ }));
rd.success("ok", props.getItems());
} catch (Exception e) {
rd.fail("获取可创建定时任务失败: " + e.getMessage());
@@ -63,6 +75,25 @@ public class CrontabController {
public ResultDomain createCrontab(@RequestBody TbCrontabTask crontabItem) {
ResultDomain rd = new ResultDomain<>();
try {
+ // 根据taskGroup和methodName查找配置并填充beanName和methodName
+ if (crontabItem.getBeanName() == null || crontabItem.getBeanName().isEmpty()) {
+ CrontabItem.CrontabMethod method = findMethodByTaskGroupAndMethodName(
+ crontabItem.getTaskGroup(),
+ crontabItem.getMethodName()
+ );
+ if (method != null) {
+ crontabItem.setBeanName(method.getClazz()); // 设置Bean名称
+ crontabItem.setMethodName(method.getExcuete_method()); // 设置执行方法名
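+                // methodParams example (illustrative): {"query":"教育改革","total":10}
+                // scriptPath from the matched config (e.g. crawler/RmrbSearch.py) is merged in below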
+                JSONObject methodParams = JSON.parseObject(crontabItem.getMethodParams());
+                if (methodParams == null) {
+                    methodParams = new JSONObject(); // incoming task may carry no params
+                }
+                methodParams.put("scriptPath", method.getPath());
+                crontabItem.setMethodParams(methodParams.toJSONString());
+
+ } else {
+ rd.fail("未找到对应的配置: taskGroup=" + crontabItem.getTaskGroup()
+ + ", methodName=" + crontabItem.getMethodName());
+ return rd;
+ }
+ }
return crontabService.createTask(crontabItem);
} catch (Exception e) {
logger.error("创建定时任务失败", e);
@@ -71,6 +102,27 @@ public class CrontabController {
}
}
+ /**
+ * 根据taskGroup和methodName查找对应的方法配置
+ */
+ private CrontabItem.CrontabMethod findMethodByTaskGroupAndMethodName(String taskGroup, String methodName) {
+ CrontabProperties props = SpringContextUtil.getBean(CrontabProperties.class);
+ if (props == null || props.getItems() == null) {
+ return null;
+ }
+
+ for (CrontabItem item : props.getItems()) {
+ if (item.getName().equals(taskGroup)) {
+ for (CrontabItem.CrontabMethod method : item.getMethods()) {
+ if (method.getName().equals(methodName)) {
+ return method;
+ }
+ }
+ }
+ }
+ return null;
+ }
+
/**
* 更新定时任务
* @param crontabItem
@@ -80,6 +132,21 @@ public class CrontabController {
public ResultDomain updateCrontab(@RequestBody TbCrontabTask crontabItem) {
ResultDomain rd = new ResultDomain<>();
try {
+ // 根据taskGroup和methodName查找配置并填充beanName和methodName
+ if (crontabItem.getBeanName() == null || crontabItem.getBeanName().isEmpty()) {
+ CrontabItem.CrontabMethod method = findMethodByTaskGroupAndMethodName(
+ crontabItem.getTaskGroup(),
+ crontabItem.getMethodName()
+ );
+ if (method != null) {
+ crontabItem.setBeanName(method.getClazz()); // 设置Bean名称
+ crontabItem.setMethodName(method.getExcuete_method()); // 设置执行方法名
+ } else {
+ rd.fail("未找到对应的配置: taskGroup=" + crontabItem.getTaskGroup()
+ + ", methodName=" + crontabItem.getMethodName());
+ return rd;
+ }
+ }
return crontabService.updateTask(crontabItem);
} catch (Exception e) {
logger.error("更新定时任务失败", e);
@@ -146,6 +213,88 @@ public class CrontabController {
return rd;
}
}
+
+ /**
+ * 根据ID查询日志详情
+ * @param logId 日志ID
+ * @return ResultDomain
+ */
+ @GetMapping("/log/{logId}")
+ public ResultDomain getLogById(@PathVariable(required = true, name="logId") String logId) {
+ ResultDomain rd = new ResultDomain<>();
+ try {
+ return crontabService.getLogById(logId);
+ } catch (Exception e) {
+ logger.error("获取日志详情失败", e);
+ rd.fail("获取日志详情失败: " + e.getMessage());
+ return rd;
+ }
+ }
+
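+    /**
+     * Validate a cron expression.
+     * Example (illustrative): GET /crontab/task/validate?cronExpression=0 0 2 * * ?
+     */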
+ @GetMapping("/task/validate")
+ public ResultDomain validateCronExpression(@RequestParam(required = true, name="cronExpression") String cronExpression) {
+ ResultDomain rd = new ResultDomain<>();
+ try {
+ return crontabService.validateCronExpression(cronExpression);
+ } catch (Exception e) {
+ logger.error("验证Cron表达式失败", e);
+ rd.fail("验证Cron表达式失败: " + e.getMessage());
+ return rd;
+ }
+ }
-
+
+ /**
+ * @description 启动定时任务
+ * @param
+ * @author yslg
+ * @since 2025-11-11
+ */
+ @PostMapping("/task/start/{taskId}")
+ public ResultDomain startTask(@PathVariable(required = true, name="taskId") String taskId) {
+ ResultDomain rd = new ResultDomain<>();
+ try {
+ return crontabService.startTask(taskId);
+ } catch (Exception e) {
+ logger.error("启动定时任务失败", e);
+ rd.fail("启动定时任务失败: " + e.getMessage());
+ return rd;
+ }
+ }
+
+ /**
+ * @description 暂停定时任务
+ * @param
+ * @author yslg
+ * @since 2025-11-11
+ */
+ @PostMapping("/task/pause/{taskId}")
+ public ResultDomain pauseTask(@PathVariable(required = true, name="taskId") String taskId) {
+ ResultDomain rd = new ResultDomain<>();
+ try {
+ return crontabService.pauseTask(taskId);
+ } catch (Exception e) {
+ logger.error("暂停定时任务失败", e);
+ rd.fail("暂停定时任务失败: " + e.getMessage());
+ return rd;
+ }
+ }
+
+ /**
+ * @description 立即执行一次任务
+ * @param
+ * @author yslg
+ * @since 2025-11-11
+ */
+ @PostMapping("/task/execute/{taskId}")
+ public ResultDomain executeTaskOnce(@PathVariable(required = true, name="taskId") String taskId) {
+ ResultDomain rd = new ResultDomain<>();
+ try {
+ return crontabService.executeTaskOnce(taskId);
+ } catch (Exception e) {
+ logger.error("执行定时任务失败", e);
+ rd.fail("执行定时任务失败: " + e.getMessage());
+ return rd;
+ }
+ }
}
diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/mapper/DataCollectionItemMapper.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/mapper/DataCollectionItemMapper.java
index 304a23b..b5eed75 100644
--- a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/mapper/DataCollectionItemMapper.java
+++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/mapper/DataCollectionItemMapper.java
@@ -5,6 +5,7 @@ import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import org.xyzh.common.core.page.PageParam;
import org.xyzh.common.dto.crontab.TbDataCollectionItem;
+import org.xyzh.common.vo.DataCollectionItemVO;
import java.util.List;
@@ -82,5 +83,45 @@ public interface DataCollectionItemMapper extends BaseMapper<TbDataCollectionItem> {
+
+    /**
+     * @description 查询采集项VO列表(包含关联的任务和日志信息)
+     * @param filter 过滤条件
+     * @return List<DataCollectionItemVO> 采集项VO列表
+     * @author yslg
+     * @since 2025-11-08
+     */
+    List<DataCollectionItemVO> selectVOList(TbDataCollectionItem filter);
+
+ /**
+ * @description 分页查询采集项VO列表(包含关联的任务和日志信息)
+ * @param filter 过滤条件
+ * @param pageParam 分页参数
+     * @return List<DataCollectionItemVO> 采集项VO列表
+ * @author yslg
+ * @since 2025-11-08
+ */
+    List<DataCollectionItemVO> selectVOPage(@Param("filter") TbDataCollectionItem filter, @Param("pageParam") PageParam pageParam);
+
+ /**
+ * @description 根据任务ID查询采集项VO列表(包含关联的任务和日志信息)
+ * @param taskId 任务ID
+     * @return List<DataCollectionItemVO> 采集项VO列表
+ * @author yslg
+ * @since 2025-11-08
+ */
+    List<DataCollectionItemVO> selectVOByTaskId(@Param("taskId") String taskId);
}
+
diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/pojo/CrontabItem.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/pojo/CrontabItem.java
index b753729..05fade7 100644
--- a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/pojo/CrontabItem.java
+++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/pojo/CrontabItem.java
@@ -16,9 +16,17 @@ public class CrontabItem {
@Data
public static class CrontabMethod {
private String name;
- @JSONField(name = "class")
private String clazz;
+ private String excuete_method;
private String path;
-    private Map<String, Object> params;
+    private List<CrontabParam> params;
+ }
+
+ @Data
+ public static class CrontabParam {
+ private String name;
+ private String description;
+ private String type;
+ private Object value;
}
}
diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/scheduler/TaskExecutor.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/scheduler/TaskExecutor.java
index 15c2c8e..96a8f61 100644
--- a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/scheduler/TaskExecutor.java
+++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/scheduler/TaskExecutor.java
@@ -11,9 +11,13 @@ import org.xyzh.common.utils.IDUtils;
import org.xyzh.crontab.mapper.CrontabLogMapper;
import org.xyzh.crontab.pojo.TaskParams;
+import com.alibaba.fastjson2.JSON;
+import com.alibaba.fastjson2.TypeReference;
+
import java.lang.reflect.Method;
import java.util.Date;
import java.util.HashMap;
+import java.util.Map;
/**
* @description 任务执行器
@@ -138,25 +142,29 @@ public class TaskExecutor {
private String injectTaskContext(Object bean, TbCrontabTask task, TbCrontabLog log) {
String methodParams = task.getMethodParams();
- // 如果Bean是BaseTask的子类,注入taskId和logId到JSON参数中
if (bean instanceof org.xyzh.crontab.task.BaseTask) {
try {
- TaskParams taskParams = TaskParams.fromJson(methodParams);
- if (taskParams != null) {
- // 注入taskId和logId
- if (taskParams.getParams() == null) {
- taskParams.setParams(new HashMap<>());
- }
- taskParams.getParams().put("taskId", task.getTaskId());
- taskParams.getParams().put("logId", log.getID());
- methodParams = taskParams.toJson();
- logger.debug("已注入任务上下文: taskId={}, logId={}", task.getTaskId(), log.getID());
- }
+ // 从task对象构建完整的TaskParams
+ TaskParams taskParams = new TaskParams();
+ taskParams.setTaskGroup(task.getTaskGroup()); // 从task表获取
+ taskParams.setMethodName(task.getMethodName()); // 从task表获取
+
+ // 将methodParams解析为Map并设置到params字段
+            Map<String, Object> params = JSON.parseObject(methodParams,
+ new TypeReference