From 9adbd6d3651b1eba7b4a6a64b664f304878d6fc0 Mon Sep 17 00:00:00 2001 From: wangys <3401275564@qq.com> Date: Mon, 10 Nov 2025 16:03:50 +0800 Subject: [PATCH] =?UTF-8?q?temp=E5=AE=9A=E6=97=B6=E4=BB=BB=E5=8A=A1?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../sql/add_embedding_model_provider.sql | 18 - .../.bin/mysql/sql/createTableCrontab.sql | 34 + .../.bin/mysql/sql/createTableResource.sql | 39 +- .../crontab/DataCollectionItemService.java | 129 ++++ .../collection/DataCollectionService.java | 163 ----- .../dto/crontab/TbDataCollectionItem.java | 239 +++++++ .../dto/resource/TbDataCollectionConfig.java | 146 ---- .../dto/resource/TbDataCollectionLog.java | 120 ---- .../xyzh/common/vo/DataCollectionItemVO.java | 84 +++ schoolNewsServ/crontab/Java调用Python详解.md | 652 ++++++++++++++++++ schoolNewsServ/crontab/pom.xml | 9 + .../crontab/config/CrawlerProperties.java | 21 + .../crontab/config/CrontabPrpperties.java | 8 + .../crontab/controller/CrontabController.java | 221 ++---- .../DataCollectionItemController.java | 62 ++ .../org/xyzh/crontab/enums/TaskEnums.java | 15 +- .../mapper/DataCollectionItemMapper.java | 86 +++ .../org/xyzh/crontab/pojo/CrontabItem.java | 22 + .../xyzh/crontab/scheduler/TaskExecutor.java | 7 +- .../service/NCDataCollectionItemService.java | 15 + .../impl/DataCollectionItemServiceImpl.java | 479 +++++++++++++ .../org/xyzh/crontab/task/DataBackupTask.java | 98 +-- .../org/xyzh/crontab/task/LogCleanTask.java | 112 +-- .../crontab/task/SystemStatisticsTask.java | 90 +-- .../crontab/task/newsTask/ArticleStruct.java | 46 ++ .../task/newsTask/NewsCrawlerTask.java | 328 +++++++++ .../xyzh/crontab/task/newsTask/NewsTask.java | 5 +- .../task/newsTask/PythonExecutorExample.java | 234 +++++++ .../crontab/task/newsTask/ScriptDomain.java | 15 + .../src/main/resources/appliaction.yml | 34 + .../controller/DataCollectionController.java | 120 ---- 
.../ResourceManagementController.java | 253 ------- .../mapper/DataCollectionConfigMapper.java | 147 ---- .../news/mapper/DataCollectionLogMapper.java | 147 ---- .../news/service/NCDataCollectionService.java | 14 - .../impl/NCDataCollectionServiceImpl.java | 126 ---- .../mapper/DataCollectionConfigMapper.xml | 216 ------ .../mapper/DataCollectionLogMapper.xml | 188 ----- 38 files changed, 2710 insertions(+), 2032 deletions(-) delete mode 100644 schoolNewsServ/.bin/mysql/sql/add_embedding_model_provider.sql create mode 100644 schoolNewsServ/api/api-crontab/src/main/java/org/xyzh/api/crontab/DataCollectionItemService.java delete mode 100644 schoolNewsServ/api/api-news/src/main/java/org/xyzh/api/news/collection/DataCollectionService.java create mode 100644 schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/dto/crontab/TbDataCollectionItem.java delete mode 100644 schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/dto/resource/TbDataCollectionConfig.java delete mode 100644 schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/dto/resource/TbDataCollectionLog.java create mode 100644 schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/vo/DataCollectionItemVO.java create mode 100644 schoolNewsServ/crontab/Java调用Python详解.md create mode 100644 schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/config/CrawlerProperties.java create mode 100644 schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/config/CrontabPrpperties.java create mode 100644 schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/controller/DataCollectionItemController.java create mode 100644 schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/mapper/DataCollectionItemMapper.java create mode 100644 schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/pojo/CrontabItem.java create mode 100644 schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/service/NCDataCollectionItemService.java create mode 100644 
schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/service/impl/DataCollectionItemServiceImpl.java create mode 100644 schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/ArticleStruct.java create mode 100644 schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/NewsCrawlerTask.java create mode 100644 schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/PythonExecutorExample.java create mode 100644 schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/ScriptDomain.java create mode 100644 schoolNewsServ/crontab/src/main/resources/appliaction.yml delete mode 100644 schoolNewsServ/news/src/main/java/org/xyzh/news/controller/DataCollectionController.java delete mode 100644 schoolNewsServ/news/src/main/java/org/xyzh/news/controller/ResourceManagementController.java delete mode 100644 schoolNewsServ/news/src/main/java/org/xyzh/news/mapper/DataCollectionConfigMapper.java delete mode 100644 schoolNewsServ/news/src/main/java/org/xyzh/news/mapper/DataCollectionLogMapper.java delete mode 100644 schoolNewsServ/news/src/main/java/org/xyzh/news/service/NCDataCollectionService.java delete mode 100644 schoolNewsServ/news/src/main/java/org/xyzh/news/service/impl/NCDataCollectionServiceImpl.java delete mode 100644 schoolNewsServ/news/src/main/resources/mapper/DataCollectionConfigMapper.xml delete mode 100644 schoolNewsServ/news/src/main/resources/mapper/DataCollectionLogMapper.xml diff --git a/schoolNewsServ/.bin/mysql/sql/add_embedding_model_provider.sql b/schoolNewsServ/.bin/mysql/sql/add_embedding_model_provider.sql deleted file mode 100644 index 2f96330..0000000 --- a/schoolNewsServ/.bin/mysql/sql/add_embedding_model_provider.sql +++ /dev/null @@ -1,18 +0,0 @@ --- ======================================== --- 添加 embedding_model_provider 字段 --- ======================================== --- 用途:在 tb_ai_knowledge 表中添加向量模型提供商字段 --- 执行时间:2025-11-06 --- 注意:如果该字段已存在,请忽略此脚本 --- ======================================== - -USE 
`school_news`; - --- 检查并添加 embedding_model_provider 字段 -ALTER TABLE `tb_ai_knowledge` -ADD COLUMN `embedding_model_provider` VARCHAR(100) DEFAULT NULL COMMENT '向量模型提供商' -AFTER `embedding_model`; - --- 完成 -SELECT 'embedding_model_provider 字段添加成功!' AS message; - diff --git a/schoolNewsServ/.bin/mysql/sql/createTableCrontab.sql b/schoolNewsServ/.bin/mysql/sql/createTableCrontab.sql index f993e78..3d50d7e 100644 --- a/schoolNewsServ/.bin/mysql/sql/createTableCrontab.sql +++ b/schoolNewsServ/.bin/mysql/sql/createTableCrontab.sql @@ -57,3 +57,37 @@ CREATE TABLE `tb_crontab_log` ( KEY `idx_start_time` (`start_time`), KEY `idx_deleted` (`deleted`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci COMMENT='定时任务执行日志表'; + +-- ==================================================== +-- 数据采集项表(定时任务采集结果存储) +-- ==================================================== +DROP TABLE IF EXISTS `tb_data_collection_item`; +CREATE TABLE `tb_data_collection_item` ( + `id` VARCHAR(64) NOT NULL COMMENT '主键ID', + `task_id` VARCHAR(64) NOT NULL COMMENT '关联任务ID', + `title` VARCHAR(255) NOT NULL COMMENT '文章标题', + `content` LONGTEXT DEFAULT NULL COMMENT '文章内容(HTML)', + `summary` VARCHAR(500) DEFAULT NULL COMMENT '文章摘要', + `source` VARCHAR(255) DEFAULT NULL COMMENT '来源(如 人民日报)', + `source_url` VARCHAR(500) DEFAULT NULL COMMENT '来源URL(用于去重)', + `category` VARCHAR(100) DEFAULT NULL COMMENT '分类(politics/society等)', + `author` VARCHAR(100) DEFAULT NULL COMMENT '作者', + `publish_time` DATETIME DEFAULT NULL COMMENT '发布时间', + `cover_image` VARCHAR(500) DEFAULT NULL COMMENT '封面图片URL', + `images` TEXT DEFAULT NULL COMMENT '图片列表(JSON)', + `tags` VARCHAR(500) DEFAULT NULL COMMENT '标签(逗号分隔)', + `status` TINYINT(1) NOT NULL DEFAULT 0 COMMENT '状态(0未处理 1已转换为资源 2已忽略)', + `resource_id` VARCHAR(64) DEFAULT NULL COMMENT '转换后的资源ID', + `crawl_time` DATETIME DEFAULT NULL COMMENT '爬取时间', + `process_time` DATETIME DEFAULT NULL COMMENT '处理时间', + `processor` VARCHAR(64) DEFAULT NULL COMMENT '处理人', + `create_time` 
DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', + `update_time` DATETIME DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间', + `delete_time` DATETIME DEFAULT NULL COMMENT '删除时间', + `deleted` TINYINT(1) NOT NULL DEFAULT 0 COMMENT '是否删除(0:否 1:是)', + PRIMARY KEY (`id`), + KEY `idx_task_id` (`task_id`), + KEY `idx_status` (`status`), + KEY `idx_publish_time` (`publish_time`), + KEY `idx_source_url` (`source_url`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci COMMENT='数据采集项表'; \ No newline at end of file diff --git a/schoolNewsServ/.bin/mysql/sql/createTableResource.sql b/schoolNewsServ/.bin/mysql/sql/createTableResource.sql index 55f46cd..ab4458d 100644 --- a/schoolNewsServ/.bin/mysql/sql/createTableResource.sql +++ b/schoolNewsServ/.bin/mysql/sql/createTableResource.sql @@ -110,41 +110,4 @@ CREATE TABLE `tb_resource_tag` ( UNIQUE KEY `uk_resource_tag` (`resource_id`, `tag_id`), KEY `idx_resource` (`resource_id`), KEY `idx_tag` (`tag_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci COMMENT='资源标签关联表'; - --- 数据采集配置表 -DROP TABLE IF EXISTS `tb_data_collection_config`; -CREATE TABLE `tb_data_collection_config` ( - `id` VARCHAR(50) NOT NULL COMMENT '配置ID', - `name` VARCHAR(100) NOT NULL COMMENT '配置名称', - `source_url` VARCHAR(500) NOT NULL COMMENT '采集源URL', - `source_type` VARCHAR(50) DEFAULT NULL COMMENT '采集源类型', - `frequency` VARCHAR(20) DEFAULT 'daily' COMMENT '采集频率(daily每天 weekly每周)', - `tag_id` VARCHAR(50) DEFAULT NULL COMMENT '默认标签ID(文章分类标签,tagType=1)', - `status` INT(4) DEFAULT 1 COMMENT '状态(0禁用 1启用)', - `last_collect_time` TIMESTAMP NULL DEFAULT NULL COMMENT '最后采集时间', - `creator` VARCHAR(50) DEFAULT NULL COMMENT '创建者', - `updater` VARCHAR(50) DEFAULT NULL COMMENT '更新者', - `create_time` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', - `update_time` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间', - `delete_time` TIMESTAMP NULL DEFAULT NULL COMMENT 
'删除时间', - `deleted` TINYINT(1) NOT NULL DEFAULT 0 COMMENT '是否删除', - PRIMARY KEY (`id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci COMMENT='数据采集配置表'; - --- 数据采集记录表 -DROP TABLE IF EXISTS `tb_data_collection_log`; -CREATE TABLE `tb_data_collection_log` ( - `id` VARCHAR(50) NOT NULL COMMENT '记录ID', - `config_id` VARCHAR(50) NOT NULL COMMENT '配置ID', - `collect_count` INT(11) DEFAULT 0 COMMENT '采集数量', - `success_count` INT(11) DEFAULT 0 COMMENT '成功数量', - `fail_count` INT(11) DEFAULT 0 COMMENT '失败数量', - `status` INT(4) DEFAULT 1 COMMENT '状态(0失败 1成功 2部分成功)', - `message` TEXT COMMENT '采集消息', - `collect_time` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '采集时间', - PRIMARY KEY (`id`), - KEY `idx_config` (`config_id`), - KEY `idx_collect_time` (`collect_time`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci COMMENT='数据采集记录表'; - +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci COMMENT='资源标签关联表'; \ No newline at end of file diff --git a/schoolNewsServ/api/api-crontab/src/main/java/org/xyzh/api/crontab/DataCollectionItemService.java b/schoolNewsServ/api/api-crontab/src/main/java/org/xyzh/api/crontab/DataCollectionItemService.java new file mode 100644 index 0000000..0747fdb --- /dev/null +++ b/schoolNewsServ/api/api-crontab/src/main/java/org/xyzh/api/crontab/DataCollectionItemService.java @@ -0,0 +1,129 @@ +package org.xyzh.api.crontab; + +import org.xyzh.common.core.domain.ResultDomain; +import org.xyzh.common.core.page.PageParam; +import org.xyzh.common.dto.crontab.TbDataCollectionItem; +import org.xyzh.common.vo.DataCollectionItemVO; + +/** + * @description 数据采集项服务接口 + * @filename DataCollectionItemService.java + * @author yslg + * @copyright xyzh + * @since 2025-11-08 + */ +public interface DataCollectionItemService { + + /** + * @description 创建采集项 + * @param item 采集项对象 + * @return ResultDomain 创建结果 + * @author yslg + * @since 2025-11-08 + */ + ResultDomain createItem(TbDataCollectionItem item); + + /** + * 
@description 批量创建采集项 + * @param itemList 采集项列表 + * @return ResultDomain 创建数量 + * @author yslg + * @since 2025-11-08 + */ + ResultDomain batchCreateItems(java.util.List itemList); + + /** + * @description 更新采集项 + * @param item 采集项对象 + * @return ResultDomain 更新结果 + * @author yslg + * @since 2025-11-08 + */ + ResultDomain updateItem(TbDataCollectionItem item); + + /** + * @description 删除采集项 + * @param itemId 采集项ID + * @return ResultDomain 删除结果 + * @author yslg + * @since 2025-11-08 + */ + ResultDomain deleteItem(String itemId); + + /** + * @description 根据ID查询采集项 + * @param itemId 采集项ID + * @return ResultDomain 查询结果 + * @author yslg + * @since 2025-11-08 + */ + ResultDomain getItemById(String itemId); + + /** + * @description 查询采集项列表 + * @param filter 过滤条件 + * @return ResultDomain 查询结果 + * @author yslg + * @since 2025-11-08 + */ + ResultDomain getItemList(TbDataCollectionItem filter); + + /** + * @description 分页查询采集项列表 + * @param filter 过滤条件 + * @param pageParam 分页参数 + * @return ResultDomain 查询结果 + * @author yslg + * @since 2025-11-08 + */ + ResultDomain getItemPage(TbDataCollectionItem filter, PageParam pageParam); + + /** + * @description 根据任务ID查询采集项列表 + * @param taskId 任务ID + * @return ResultDomain 查询结果 + * @author yslg + * @since 2025-11-08 + */ + ResultDomain getItemsByTaskId(String taskId); + + /** + * @description 将采集项转换为资源 + * @param itemId 采集项ID + * @param tagId 标签ID(文章分类) + * @return ResultDomain 转换后的资源ID + * @author yslg + * @since 2025-11-08 + */ + ResultDomain convertToResource(String itemId, String tagId); + + /** + * @description 批量转换为资源 + * @param itemIds 采集项ID列表 + * @param tagId 标签ID(文章分类) + * @return ResultDomain 转换数量 + * @author yslg + * @since 2025-11-08 + */ + ResultDomain batchConvertToResource(java.util.List itemIds, String tagId); + + /** + * @description 忽略采集项 + * @param itemId 采集项ID + * @return ResultDomain 操作结果 + * @author yslg + * @since 2025-11-08 + */ + ResultDomain ignoreItem(String itemId); + + /** + * @description 统计采集项数量(按状态) + * 
@param taskId 任务ID(可选) + * @param status 状态 + * @return ResultDomain 数量 + * @author yslg + * @since 2025-11-08 + */ + ResultDomain countByStatus(String taskId, Integer status); +} + diff --git a/schoolNewsServ/api/api-news/src/main/java/org/xyzh/api/news/collection/DataCollectionService.java b/schoolNewsServ/api/api-news/src/main/java/org/xyzh/api/news/collection/DataCollectionService.java deleted file mode 100644 index f3c00ff..0000000 --- a/schoolNewsServ/api/api-news/src/main/java/org/xyzh/api/news/collection/DataCollectionService.java +++ /dev/null @@ -1,163 +0,0 @@ -package org.xyzh.api.news.collection; - -import org.xyzh.common.core.domain.ResultDomain; -import org.xyzh.common.dto.resource.TbDataCollectionConfig; -import org.xyzh.common.dto.resource.TbDataCollectionLog; - -import java.util.Date; -import java.util.List; - -/** - * @description 数据采集服务接口 - * @filename DataCollectionService.java - * @author yslg - * @copyright xyzh - * @since 2025-10-15 - */ -public interface DataCollectionService { - - // ----------------采集配置相关-------------------------------- - - /** - * @description 获取采集配置列表 - * @param status 状态(可选) - * @return ResultDomain 配置列表 - * @author yslg - * @since 2025-10-15 - */ - ResultDomain getConfigList(Integer status); - - /** - * @description 根据ID获取配置详情 - * @param configID 配置ID - * @return ResultDomain 配置详情 - * @author yslg - * @since 2025-10-15 - */ - ResultDomain getConfigById(String configID); - - /** - * @description 创建采集配置 - * @param config 配置信息 - * @return ResultDomain 创建结果 - * @author yslg - * @since 2025-10-15 - */ - ResultDomain createConfig(TbDataCollectionConfig config); - - /** - * @description 更新采集配置 - * @param config 配置信息 - * @return ResultDomain 更新结果 - * @author yslg - * @since 2025-10-15 - */ - ResultDomain updateConfig(TbDataCollectionConfig config); - - /** - * @description 删除采集配置 - * @param configID 配置ID - * @return ResultDomain 删除结果 - * @author yslg - * @since 2025-10-15 - */ - ResultDomain deleteConfig(String configID); - - 
/** - * @description 更新配置状态 - * @param configID 配置ID - * @param status 状态 - * @return ResultDomain 更新结果 - * @author yslg - * @since 2025-10-15 - */ - ResultDomain updateConfigStatus(String configID, Integer status); - - /** - * @description 更新最后采集时间 - * @param configID 配置ID - * @param lastCollectTime 最后采集时间 - * @return ResultDomain 更新结果 - * @author yslg - * @since 2025-10-15 - */ - ResultDomain updateLastCollectTime(String configID, Date lastCollectTime); - - // ----------------采集日志相关-------------------------------- - - /** - * @description 获取采集日志列表 - * @param configID 配置ID(可选) - * @param startDate 开始日期(可选) - * @param endDate 结束日期(可选) - * @return ResultDomain 日志列表 - * @author yslg - * @since 2025-10-15 - */ - ResultDomain getLogList(String configID, Date startDate, Date endDate); - - /** - * @description 根据ID获取日志详情 - * @param logID 日志ID - * @return ResultDomain 日志详情 - * @author yslg - * @since 2025-10-15 - */ - ResultDomain getLogById(String logID); - - /** - * @description 创建采集日志 - * @param log 日志信息 - * @return ResultDomain 创建结果 - * @author yslg - * @since 2025-10-15 - */ - ResultDomain createLog(TbDataCollectionLog log); - - /** - * @description 获取配置的采集统计 - * @param configID 配置ID - * @return ResultDomain 采集统计 - * @author yslg - * @since 2025-10-15 - */ - ResultDomain getConfigStatistics(String configID); - - // ----------------采集操作相关-------------------------------- - - /** - * @description 执行数据采集 - * @param configID 配置ID - * @return ResultDomain 采集结果 - * @author yslg - * @since 2025-10-15 - */ - ResultDomain executeCollection(String configID); - - /** - * @description 批量执行数据采集 - * @param configIDs 配置ID列表 - * @return ResultDomain 采集结果列表 - * @author yslg - * @since 2025-10-15 - */ - ResultDomain batchExecuteCollection(List configIDs); - - /** - * @description 停止采集任务 - * @param configID 配置ID - * @return ResultDomain 停止结果 - * @author yslg - * @since 2025-10-15 - */ - ResultDomain stopCollection(String configID); - - /** - * @description 获取采集任务状态 - * @param configID 
配置ID - * @return ResultDomain 任务状态 - * @author yslg - * @since 2025-10-15 - */ - ResultDomain getCollectionStatus(String configID); -} diff --git a/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/dto/crontab/TbDataCollectionItem.java b/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/dto/crontab/TbDataCollectionItem.java new file mode 100644 index 0000000..9a197ff --- /dev/null +++ b/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/dto/crontab/TbDataCollectionItem.java @@ -0,0 +1,239 @@ +package org.xyzh.common.dto.crontab; + +import org.xyzh.common.dto.BaseDTO; + +import java.util.Date; + +/** + * @description 数据采集项表(存储爬取的文章数据) + * @filename TbDataCollectionItem.java + * @author yslg + * @copyright xyzh + * @since 2025-11-08 + */ +public class TbDataCollectionItem extends BaseDTO { + + private static final long serialVersionUID = 1L; + + /** + * @description 关联的定时任务ID + */ + private String taskId; + + /** + * @description 文章标题 + */ + private String title; + + /** + * @description 文章内容(HTML格式) + */ + private String content; + + /** + * @description 文章摘要 + */ + private String summary; + + /** + * @description 来源(人民日报、新华社等) + */ + private String source; + + /** + * @description 来源URL(用于去重) + */ + private String sourceUrl; + + /** + * @description 分类(政治、社会、国际等) + */ + private String category; + + /** + * @description 作者 + */ + private String author; + + /** + * @description 发布时间 + */ + private Date publishTime; + + /** + * @description 封面图片URL + */ + private String coverImage; + + /** + * @description 图片列表(JSON格式,存储图片URL数组) + */ + private String images; + + /** + * @description 标签(多个用逗号分隔) + */ + private String tags; + + /** + * @description 状态(0:未处理 1:已转换为资源 2:已忽略) + */ + private Integer status; + + /** + * @description 转换后的资源ID(如果已转换为资源) + */ + private String resourceId; + + /** + * @description 爬取时间 + */ + private Date crawlTime; + + /** + * @description 处理时间 + */ + private Date processTime; + + /** + * @description 处理人 + 
*/ + private String processor; + + public String getTaskId() { + return taskId; + } + + public void setTaskId(String taskId) { + this.taskId = taskId; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + } + + public String getSummary() { + return summary; + } + + public void setSummary(String summary) { + this.summary = summary; + } + + public String getSource() { + return source; + } + + public void setSource(String source) { + this.source = source; + } + + public String getSourceUrl() { + return sourceUrl; + } + + public void setSourceUrl(String sourceUrl) { + this.sourceUrl = sourceUrl; + } + + public String getCategory() { + return category; + } + + public void setCategory(String category) { + this.category = category; + } + + public String getAuthor() { + return author; + } + + public void setAuthor(String author) { + this.author = author; + } + + public Date getPublishTime() { + return publishTime; + } + + public void setPublishTime(Date publishTime) { + this.publishTime = publishTime; + } + + public String getCoverImage() { + return coverImage; + } + + public void setCoverImage(String coverImage) { + this.coverImage = coverImage; + } + + public String getImages() { + return images; + } + + public void setImages(String images) { + this.images = images; + } + + public String getTags() { + return tags; + } + + public void setTags(String tags) { + this.tags = tags; + } + + public Integer getStatus() { + return status; + } + + public void setStatus(Integer status) { + this.status = status; + } + + public String getResourceId() { + return resourceId; + } + + public void setResourceId(String resourceId) { + this.resourceId = resourceId; + } + + public Date getCrawlTime() { + return crawlTime; + } + + public void setCrawlTime(Date crawlTime) { + this.crawlTime = 
crawlTime; + } + + public Date getProcessTime() { + return processTime; + } + + public void setProcessTime(Date processTime) { + this.processTime = processTime; + } + + public String getProcessor() { + return processor; + } + + public void setProcessor(String processor) { + this.processor = processor; + } +} + diff --git a/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/dto/resource/TbDataCollectionConfig.java b/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/dto/resource/TbDataCollectionConfig.java deleted file mode 100644 index 97c0d4c..0000000 --- a/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/dto/resource/TbDataCollectionConfig.java +++ /dev/null @@ -1,146 +0,0 @@ -package org.xyzh.common.dto.resource; - -import org.xyzh.common.dto.BaseDTO; -import java.util.Date; - -/** - * @description 数据采集配置表 - * @filename TbDataCollectionConfig.java - * @author yslg - * @copyright xyzh - * @since 2025-10-15 - */ -public class TbDataCollectionConfig extends BaseDTO { - - private static final long serialVersionUID = 1L; - - /** - * @description 配置名称 - */ - private String name; - - /** - * @description 采集源URL - */ - private String sourceUrl; - - /** - * @description 采集源类型 - */ - private String sourceType; - - /** - * @description 采集频率(daily每天 weekly每周) - */ - private String frequency; - - /** - * @description 默认标签ID(文章分类标签,tagType=1) - */ - private String tagID; - - /** - * @description 状态(0禁用 1启用) - */ - private Integer status; - - /** - * @description 最后采集时间 - */ - private Date lastCollectTime; - - /** - * @description 创建者 - */ - private String creator; - - /** - * @description 更新者 - */ - private String updater; - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public String getSourceUrl() { - return sourceUrl; - } - - public void setSourceUrl(String sourceUrl) { - this.sourceUrl = sourceUrl; - } - - public String getSourceType() { - return sourceType; - } - - 
public void setSourceType(String sourceType) { - this.sourceType = sourceType; - } - - public String getFrequency() { - return frequency; - } - - public void setFrequency(String frequency) { - this.frequency = frequency; - } - - public String getTagID() { - return tagID; - } - - public void setTagID(String tagID) { - this.tagID = tagID; - } - - public Integer getStatus() { - return status; - } - - public void setStatus(Integer status) { - this.status = status; - } - - public Date getLastCollectTime() { - return lastCollectTime; - } - - public void setLastCollectTime(Date lastCollectTime) { - this.lastCollectTime = lastCollectTime; - } - - public String getCreator() { - return creator; - } - - public void setCreator(String creator) { - this.creator = creator; - } - - public String getUpdater() { - return updater; - } - - public void setUpdater(String updater) { - this.updater = updater; - } - - @Override - public String toString() { - return "TbDataCollectionConfig{" + - "id=" + getID() + - ", name='" + name + '\'' + - ", sourceUrl='" + sourceUrl + '\'' + - ", sourceType='" + sourceType + '\'' + - ", frequency='" + frequency + '\'' + - ", status=" + status + - ", lastCollectTime=" + lastCollectTime + - '}'; - } -} diff --git a/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/dto/resource/TbDataCollectionLog.java b/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/dto/resource/TbDataCollectionLog.java deleted file mode 100644 index 81354f7..0000000 --- a/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/dto/resource/TbDataCollectionLog.java +++ /dev/null @@ -1,120 +0,0 @@ -package org.xyzh.common.dto.resource; - -import org.xyzh.common.dto.BaseDTO; -import java.util.Date; - -/** - * @description 数据采集记录表 - * @filename TbDataCollectionLog.java - * @author yslg - * @copyright xyzh - * @since 2025-10-15 - */ -public class TbDataCollectionLog extends BaseDTO { - - private static final long serialVersionUID = 1L; - - /** - * 
@description 配置ID - */ - private String configID; - - /** - * @description 采集数量 - */ - private Integer collectCount; - - /** - * @description 成功数量 - */ - private Integer successCount; - - /** - * @description 失败数量 - */ - private Integer failCount; - - /** - * @description 状态(0失败 1成功 2部分成功) - */ - private Integer status; - - /** - * @description 采集消息 - */ - private String message; - - /** - * @description 采集时间 - */ - private Date collectTime; - - public String getConfigID() { - return configID; - } - - public void setConfigID(String configID) { - this.configID = configID; - } - - public Integer getCollectCount() { - return collectCount; - } - - public void setCollectCount(Integer collectCount) { - this.collectCount = collectCount; - } - - public Integer getSuccessCount() { - return successCount; - } - - public void setSuccessCount(Integer successCount) { - this.successCount = successCount; - } - - public Integer getFailCount() { - return failCount; - } - - public void setFailCount(Integer failCount) { - this.failCount = failCount; - } - - public Integer getStatus() { - return status; - } - - public void setStatus(Integer status) { - this.status = status; - } - - public String getMessage() { - return message; - } - - public void setMessage(String message) { - this.message = message; - } - - public Date getCollectTime() { - return collectTime; - } - - public void setCollectTime(Date collectTime) { - this.collectTime = collectTime; - } - - @Override - public String toString() { - return "TbDataCollectionLog{" + - "id=" + getID() + - ", configID='" + configID + '\'' + - ", collectCount=" + collectCount + - ", successCount=" + successCount + - ", failCount=" + failCount + - ", status=" + status + - ", collectTime=" + collectTime + - '}'; - } -} diff --git a/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/vo/DataCollectionItemVO.java b/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/vo/DataCollectionItemVO.java new file mode 100644 index 
0000000..052ba8b --- /dev/null +++ b/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/vo/DataCollectionItemVO.java @@ -0,0 +1,84 @@ +package org.xyzh.common.vo; + +import org.xyzh.common.dto.crontab.TbDataCollectionItem; +import org.xyzh.common.dto.crontab.TbCrontabTask; + +import java.io.Serializable; + +/** + * @description 数据采集项VO + * @filename DataCollectionItemVO.java + * @author yslg + * @copyright xyzh + * @since 2025-11-08 + */ +public class DataCollectionItemVO implements Serializable { + + private static final long serialVersionUID = 1L; + + /** + * @description 采集项数据 + */ + private TbDataCollectionItem item; + + /** + * @description 关联的定时任务信息 + */ + private TbCrontabTask task; + + /** + * @description 状态文本(用于前端显示) + */ + private String statusText; + + /** + * @description 是否可以编辑(未处理和已忽略的可以编辑) + */ + private Boolean canEdit; + + /** + * @description 是否可以转换为资源(未处理的可以转换) + */ + private Boolean canConvert; + + public TbDataCollectionItem getItem() { + return item; + } + + public void setItem(TbDataCollectionItem item) { + this.item = item; + } + + public TbCrontabTask getTask() { + return task; + } + + public void setTask(TbCrontabTask task) { + this.task = task; + } + + public String getStatusText() { + return statusText; + } + + public void setStatusText(String statusText) { + this.statusText = statusText; + } + + public Boolean getCanEdit() { + return canEdit; + } + + public void setCanEdit(Boolean canEdit) { + this.canEdit = canEdit; + } + + public Boolean getCanConvert() { + return canConvert; + } + + public void setCanConvert(Boolean canConvert) { + this.canConvert = canConvert; + } +} + diff --git a/schoolNewsServ/crontab/Java调用Python详解.md b/schoolNewsServ/crontab/Java调用Python详解.md new file mode 100644 index 0000000..76d3c44 --- /dev/null +++ b/schoolNewsServ/crontab/Java调用Python详解.md @@ -0,0 +1,652 @@ +# Java调用Python并获取返回结果详解 + +## 一、核心原理 + +Java通过 `ProcessBuilder` 或 `Runtime.exec()` 创建操作系统进程来执行Python脚本,然后通过进程的标准输入/输出流进行通信。 + +## 
二、当前实现详解 + +### 1. 构建命令 + +```java +// 步骤1: 构建命令列表 +List<String> command = new ArrayList<>(); + +// 步骤2: 处理Windows/Linux系统差异 +String os = System.getProperty("os.name").toLowerCase(); +if (os.contains("win")) { + // Windows系统需要通过cmd执行 + command.add("cmd"); // 命令解释器 + command.add("/c"); // /c表示执行后关闭 + command.add(pythonPath); // python或python3 +} else { + // Linux/Mac系统直接执行 + command.add(pythonPath); +} + +// 步骤3: 添加Python脚本和参数 +command.add("main.py"); // Python脚本 +command.add(category); // 参数1: 分类 +command.add(limit); // 参数2: 数量 +command.add(outputFile); // 参数3: 输出文件 +``` + +**命令示例:** +- Windows: `cmd /c python main.py politics 20 output/news.json` +- Linux: `python3 main.py politics 20 output/news.json` + +### 2. 创建进程 + +```java +// 创建进程构建器 +ProcessBuilder processBuilder = new ProcessBuilder(command); + +// 设置工作目录(Python脚本所在目录) +processBuilder.directory(scriptDir.toFile()); + +// 合并标准输出和错误输出(便于统一读取) +processBuilder.redirectErrorStream(true); + +// 启动进程 +Process process = processBuilder.start(); +``` + +**关键点:** +- `directory()`: 设置工作目录,确保Python脚本能找到相对路径的资源 +- `redirectErrorStream(true)`: 将stderr合并到stdout,方便统一读取 +- `start()`: 异步启动进程,不会阻塞 + +### 3. 读取输出流 + +```java +// 读取标准输出(Python的print输出) +StringBuilder output = new StringBuilder(); +try (BufferedReader reader = new BufferedReader( + new InputStreamReader(process.getInputStream(), "UTF-8"))) { + String line; + while ((line = reader.readLine()) != null) { + output.append(line).append("\n"); + logger.debug("Python输出: {}", line); + } +} +``` + +**重要说明:** +- `process.getInputStream()`: 获取Python进程的标准输出 +- 必须读取输出流,否则缓冲区满会导致进程阻塞 +- 使用UTF-8编码避免中文乱码 + +### 4. 等待进程结束 + +```java +// 方式1: 带超时的等待(推荐) +boolean finished = process.waitFor(timeout, TimeUnit.SECONDS); + +if (!finished) { + // 超时后强制终止进程 + process.destroy(); // 或 process.destroyForcibly() 强制终止 + throw new RuntimeException("任务超时"); +} + +// 方式2: 无限等待(不推荐,可能导致死锁) +int exitCode = process.waitFor(); +``` + +**退出码说明:** +- `0`: 执行成功 +- `非0`: 执行失败(通常是错误码) + +### 5. 
获取返回结果 + +当前实现通过**文件传递**方式获取结果: + +```java +// Python脚本将结果写入JSON文件 +Path outputPath = scriptDir.resolve(outputFile); + +// Java读取文件内容 +String jsonContent = Files.readString(outputPath); + +// 解析JSON +ObjectMapper mapper = new ObjectMapper(); +List<Map<String, Object>> newsList = mapper.readValue( + jsonContent, + List.class +); +``` + +## 三、三种数据传递方式对比 + +### 方式1: 文件传递(当前实现) + +**优点:** +- ✅ 适合大数据量 +- ✅ 数据持久化,便于调试 +- ✅ 实现简单 + +**缺点:** +- ⚠️ 需要文件I/O操作 +- ⚠️ 需要管理临时文件 +- ⚠️ 可能有并发问题(文件名冲突) + +**实现示例:** + +```java +// Java端 +String outputFile = "output/result_" + System.currentTimeMillis() + ".json"; +command.add(outputFile); + +// Python端 +import json +import sys + +result = {"status": "success", "data": [...]} +with open(sys.argv[1], 'w', encoding='utf-8') as f: + json.dump(result, f, ensure_ascii=False) +``` + +### 方式2: 标准输出传递(适合小数据) + +**优点:** +- ✅ 实时传输,无需文件 +- ✅ 适合小数据量(< 1MB) +- ✅ 无文件管理开销 + +**缺点:** +- ⚠️ 大数据量可能阻塞 +- ⚠️ 不能传递二进制数据 +- ⚠️ 需要与日志输出区分 + +**实现示例:** + +```java +// Java端:读取标准输出 +StringBuilder result = new StringBuilder(); +try (BufferedReader reader = new BufferedReader( + new InputStreamReader(process.getInputStream(), "UTF-8"))) { + String line; + while ((line = reader.readLine()) != null) { + // 约定:以特定标记区分结果和日志 + if (line.startsWith("RESULT:")) { + result.append(line.substring(7)); // 去掉"RESULT:"前缀 + } else { + logger.info("Python日志: {}", line); + } + } +} + +// 解析JSON结果 +String jsonResult = result.toString(); +ObjectMapper mapper = new ObjectMapper(); +Map<String, Object> data = mapper.readValue(jsonResult, Map.class); +``` + +```python +# Python端:输出结果 +import json +import sys + +# 日志输出到stderr +print("开始爬取...", file=sys.stderr) + +# 结果输出到stdout(带标记) +result = {"status": "success", "data": [...]} +print("RESULT:" + json.dumps(result, ensure_ascii=False)) +``` + +### 方式3: 标准输入传递参数(双向通信) + +**优点:** +- ✅ 可以传递复杂参数 +- ✅ 支持交互式通信 + +**缺点:** +- ⚠️ 实现复杂 +- ⚠️ 需要处理流关闭时机 + +**实现示例:** + +```java +// Java端:通过标准输入传递参数 +ProcessBuilder pb = new ProcessBuilder("python", "script.py"); +Process process = 
pb.start(); + +// 写入参数到标准输入 +try (BufferedWriter writer = new BufferedWriter( + new OutputStreamWriter(process.getOutputStream(), "UTF-8"))) { + String params = "{\"category\":\"politics\",\"limit\":20}"; + writer.write(params); + writer.newLine(); + writer.flush(); +} + +// 关闭输入流(告诉Python输入结束) +process.getOutputStream().close(); + +// 读取输出 +// ... 同方式2 +``` + +```python +# Python端:从标准输入读取参数 +import json +import sys + +# 读取参数 +params_json = sys.stdin.readline().strip() +params = json.loads(params_json) + +category = params.get("category", "politics") +limit = params.get("limit", 20) + +# 执行爬取 +result = crawl_news(category, limit) + +# 输出结果 +print(json.dumps(result, ensure_ascii=False)) +``` + +## 四、完整优化实现 + +### 改进版实现(支持多种方式) + +```java +package org.xyzh.crontab.task.newsTask; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Component; + +import java.io.*; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.*; + +/** + * Java调用Python的完整实现 + */ +@Component("newsCrewerTask") +public class NewsCrewerTask { + + private static final Logger logger = LoggerFactory.getLogger(NewsCrewerTask.class); + private final ObjectMapper objectMapper = new ObjectMapper(); + + @Value("${crewer.python.path:python}") + private String pythonPath; + + @Value("${crewer.script.path:../schoolNewsCrewer}") + private String scriptPath; + + @Value("${crewer.timeout:300}") + private int timeout; + + /** + * 方式1: 通过文件传递结果(当前实现,适合大数据) + */ + public List> executeByFile(String category, int limit) { + logger.info("执行爬虫任务 - 文件方式"); + + try { + // 1. 构建命令 + List command = buildCommand("main.py", category, String.valueOf(limit)); + + // 2. 
生成输出文件 + String timestamp = String.valueOf(System.currentTimeMillis()); + String outputFile = String.format("output/news_%s_%s.json", category, timestamp); + command.add(outputFile); + + // 3. 执行进程 + ProcessResult result = executeProcess(command); + + if (result.getExitCode() != 0) { + throw new RuntimeException("Python执行失败: " + result.getOutput()); + } + + // 4. 读取结果文件 + Path outputPath = Paths.get(scriptPath).resolve(outputFile); + if (!Files.exists(outputPath)) { + throw new RuntimeException("输出文件不存在: " + outputFile); + } + + String jsonContent = Files.readString(outputPath, StandardCharsets.UTF_8); + List> newsList = objectMapper.readValue( + jsonContent, + objectMapper.getTypeFactory().constructCollectionType(List.class, Map.class) + ); + + // 5. 清理临时文件(可选) + // Files.deleteIfExists(outputPath); + + return newsList; + + } catch (Exception e) { + logger.error("执行失败", e); + throw new RuntimeException("爬虫任务执行失败", e); + } + } + + /** + * 方式2: 通过标准输出传递结果(适合小数据) + */ + public List> executeByStdout(String category, int limit) { + logger.info("执行爬虫任务 - 标准输出方式"); + + try { + // 1. 构建命令(使用特殊脚本,输出JSON到stdout) + List command = buildCommand("main_stdout.py", category, String.valueOf(limit)); + + // 2. 执行进程 + ProcessResult result = executeProcess(command); + + if (result.getExitCode() != 0) { + throw new RuntimeException("Python执行失败: " + result.getOutput()); + } + + // 3. 从输出中提取JSON(约定:最后一行是JSON结果) + String output = result.getOutput(); + String[] lines = output.split("\n"); + + // 查找JSON行(以{或[开头) + String jsonLine = null; + for (int i = lines.length - 1; i >= 0; i--) { + String line = lines[i].trim(); + if (line.startsWith("{") || line.startsWith("[")) { + jsonLine = line; + break; + } + } + + if (jsonLine == null) { + throw new RuntimeException("未找到JSON结果"); + } + + // 4. 
解析JSON + List> newsList = objectMapper.readValue( + jsonLine, + objectMapper.getTypeFactory().constructCollectionType(List.class, Map.class) + ); + + return newsList; + + } catch (Exception e) { + logger.error("执行失败", e); + throw new RuntimeException("爬虫任务执行失败", e); + } + } + + /** + * 方式3: 通过标准输入传递参数(双向通信) + */ + public List> executeByStdin(String category, int limit) { + logger.info("执行爬虫任务 - 标准输入方式"); + + Process process = null; + try { + // 1. 构建命令 + List command = buildCommand("main_stdin.py"); + ProcessBuilder pb = new ProcessBuilder(command); + pb.directory(Paths.get(scriptPath).toFile()); + pb.redirectErrorStream(true); + + // 2. 启动进程 + process = pb.start(); + + // 3. 写入参数到标准输入 + try (BufferedWriter writer = new BufferedWriter( + new OutputStreamWriter(process.getOutputStream(), StandardCharsets.UTF_8))) { + + Map params = Map.of( + "category", category, + "limit", limit + ); + + String paramsJson = objectMapper.writeValueAsString(params); + writer.write(paramsJson); + writer.newLine(); + writer.flush(); + } + + // 4. 关闭输入流(重要!) + process.getOutputStream().close(); + + // 5. 读取输出 + StringBuilder output = new StringBuilder(); + try (BufferedReader reader = new BufferedReader( + new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) { + String line; + while ((line = reader.readLine()) != null) { + output.append(line).append("\n"); + } + } + + // 6. 等待进程结束 + boolean finished = process.waitFor(timeout, TimeUnit.SECONDS); + if (!finished) { + process.destroyForcibly(); + throw new RuntimeException("任务超时"); + } + + int exitCode = process.exitValue(); + if (exitCode != 0) { + throw new RuntimeException("Python执行失败,退出码: " + exitCode); + } + + // 7. 
解析结果 + String jsonResult = output.toString().trim(); + List> newsList = objectMapper.readValue( + jsonResult, + objectMapper.getTypeFactory().constructCollectionType(List.class, Map.class) + ); + + return newsList; + + } catch (Exception e) { + logger.error("执行失败", e); + throw new RuntimeException("爬虫任务执行失败", e); + } finally { + if (process != null && process.isAlive()) { + process.destroyForcibly(); + } + } + } + + /** + * 通用进程执行方法 + */ + private ProcessResult executeProcess(List command) throws IOException, InterruptedException { + long startTime = System.currentTimeMillis(); + + // 创建进程构建器 + ProcessBuilder pb = new ProcessBuilder(command); + pb.directory(Paths.get(scriptPath).toFile()); + pb.redirectErrorStream(true); + + logger.info("执行命令: {}", String.join(" ", command)); + + // 启动进程 + Process process = pb.start(); + + // 读取输出(必须在单独线程中,避免阻塞) + StringBuilder output = new StringBuilder(); + StringBuilder error = new StringBuilder(); + + // 使用CompletableFuture异步读取,避免死锁 + CompletableFuture outputFuture = CompletableFuture.supplyAsync(() -> { + try (BufferedReader reader = new BufferedReader( + new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) { + String line; + while ((line = reader.readLine()) != null) { + output.append(line).append("\n"); + logger.debug("Python输出: {}", line); + } + return output.toString(); + } catch (IOException e) { + logger.error("读取输出失败", e); + return ""; + } + }); + + // 等待进程结束(带超时) + boolean finished = process.waitFor(timeout, TimeUnit.SECONDS); + + if (!finished) { + process.destroyForcibly(); + throw new RuntimeException("任务超时(超过" + timeout + "秒)"); + } + + // 获取输出 + String outputStr = outputFuture.get(5, TimeUnit.SECONDS); + + int exitCode = process.exitValue(); + long duration = System.currentTimeMillis() - startTime; + + logger.info("进程执行完成 - 退出码: {}, 耗时: {}ms", exitCode, duration); + + return new ProcessResult(exitCode, outputStr, duration); + } + + /** + * 构建命令列表 + */ + private List buildCommand(String... 
args) { + List command = new ArrayList<>(); + + String os = System.getProperty("os.name").toLowerCase(); + if (os.contains("win")) { + command.add("cmd"); + command.add("/c"); + command.add(pythonPath); + } else { + command.add(pythonPath); + } + + for (String arg : args) { + command.add(arg); + } + + return command; + } + + /** + * 进程执行结果 + */ + private static class ProcessResult { + private final int exitCode; + private final String output; + private final long duration; + + public ProcessResult(int exitCode, String output, long duration) { + this.exitCode = exitCode; + this.output = output; + this.duration = duration; + } + + public int getExitCode() { + return exitCode; + } + + public String getOutput() { + return output; + } + + public long getDuration() { + return duration; + } + } +} +``` + +## 五、关键注意事项 + +### 1. 必须读取输出流 + +**错误示例:** +```java +Process process = pb.start(); +int exitCode = process.waitFor(); // 可能永远阻塞! +``` + +**原因:** 如果Java端不读取输出,管道缓冲区写满后Python进程会阻塞在写操作上等待Java读取,而Java又阻塞在`waitFor()`上等待进程退出,双方互相等待,形成死锁。 + +**正确做法:** +```java +Process process = pb.start(); + +// 必须读取输出流 +Thread outputThread = new Thread(() -> { + try (BufferedReader reader = ...) { + // 读取输出 + } +}); +outputThread.start(); + +process.waitFor(); +``` + +### 2. 处理编码问题 + +```java +// 指定UTF-8编码,避免中文乱码 +new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8) +new OutputStreamWriter(process.getOutputStream(), StandardCharsets.UTF_8) +``` + +### 3. 超时控制 + +```java +// 使用带超时的waitFor +boolean finished = process.waitFor(timeout, TimeUnit.SECONDS); +if (!finished) { + process.destroyForcibly(); // 强制终止 +} +``` + +### 4. 资源清理 + +```java +try { + // 执行逻辑 +} finally { + if (process != null && process.isAlive()) { + process.destroyForcibly(); + } + // 关闭流 + process.getInputStream().close(); + process.getOutputStream().close(); + process.getErrorStream().close(); +} +``` + +### 5. 
错误处理 + +```java +// 检查退出码 +if (exitCode != 0) { + // 读取错误输出 + String error = readErrorStream(process); + throw new RuntimeException("执行失败: " + error); +} +``` + +## 六、性能优化建议 + +1. **使用线程池**:如果频繁调用,使用线程池管理进程 +2. **服务化复用**:频繁调用时可将Python改为常驻服务(HTTP/gRPC),避免每次调用都重新启动进程 +3. **异步执行**:使用CompletableFuture异步执行 +4. **缓存结果**:对相同参数的请求缓存结果 + +## 七、总结 + +- **文件传递**:适合大数据量,当前实现方式 +- **标准输出**:适合小数据量,实时传输 +- **标准输入**:适合复杂参数,双向通信 + +根据实际需求选择合适的方式,当前的文件传递方式已经足够好! + diff --git a/schoolNewsServ/crontab/pom.xml b/schoolNewsServ/crontab/pom.xml index 0f0dbee..d11794f 100644 --- a/schoolNewsServ/crontab/pom.xml +++ b/schoolNewsServ/crontab/pom.xml @@ -25,6 +25,11 @@ api-crontab ${school-news.version} + + org.xyzh + api-news + ${school-news.version} + @@ -38,6 +43,10 @@ system ${school-news.version} + + org.projectlombok + lombok + diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/config/CrawlerProperties.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/config/CrawlerProperties.java new file mode 100644 index 0000000..a25355e --- /dev/null +++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/config/CrawlerProperties.java @@ -0,0 +1,21 @@ +package org.xyzh.crontab.config; + +import org.springframework.boot.context.properties.ConfigurationProperties; +import org.xyzh.crontab.task.newsTask.ScriptDomain; + +import lombok.Data; + +import org.springframework.beans.factory.annotation.Value; +import java.util.List; + +@Data +@ConfigurationProperties(prefix = "crawler") +public class CrawlerProperties { + + @Value("${crawler.base.path}") + private String basePath; + + @Value("${crawler.script}") + private List scripts; + +} diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/config/CrontabPrpperties.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/config/CrontabPrpperties.java new file mode 100644 index 0000000..fe33e6f --- /dev/null +++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/config/CrontabPrpperties.java @@ -0,0 +1,8 @@ +package 
org.xyzh.crontab.config; + +import org.springframework.boot.context.properties.ConfigurationProperties; + +@ConfigurationProperties(prefix = "crontab") +public class CrontabPrpperties { + +} diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/controller/CrontabController.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/controller/CrontabController.java index 59d6ab3..3753d13 100644 --- a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/controller/CrontabController.java +++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/controller/CrontabController.java @@ -10,6 +10,13 @@ import org.xyzh.common.core.page.PageParam; import org.xyzh.common.core.page.PageRequest; import org.xyzh.common.dto.crontab.TbCrontabTask; import org.xyzh.common.dto.crontab.TbCrontabLog; +import org.xyzh.common.utils.IDUtils; +import org.xyzh.crontab.pojo.CrontabItem; + +import java.util.Date; +import org.springframework.web.bind.annotation.GetMapping; +import org.springframework.web.bind.annotation.RequestParam; + /** * @description 定时任务控制器 @@ -27,204 +34,64 @@ public class CrontabController { @Autowired private CrontabService crontabService; - // ----------------定时任务管理-------------------------------- - /** - * @description 创建定时任务 - * @param task 任务对象 - * @return ResultDomain - * @author yslg - * @since 2025-10-25 + * 获取可创建的定时任务 + * @return */ - @PostMapping("/task") - public ResultDomain createTask(@RequestBody TbCrontabTask task) { - return crontabService.createTask(task); + @GetMapping("/getEnabledCrontabList") + public ResultDomain getEnabledCrontabList(@RequestParam String param) { + return null; } /** - * @description 更新定时任务 - * @param task 任务对象 - * @return ResultDomain - * @author yslg - * @since 2025-10-25 + * 创建定时任务 + * @param crontabItem + * @return */ - @PutMapping("/task") - public ResultDomain updateTask(@RequestBody TbCrontabTask task) { - return crontabService.updateTask(task); + @PostMapping("/crontabTask") + public ResultDomain 
createCrontab(@RequestBody TbCrontabTask crontabItem) { + return null; } /** - * @description 删除定时任务 - * @param task 任务对象 - * @return ResultDomain - * @author yslg - * @since 2025-10-25 + * 更新定时任务 + * @param crontabItem + * @return */ - @DeleteMapping("/task") - public ResultDomain deleteTask(@RequestBody TbCrontabTask task) { - return crontabService.deleteTask(task.getID()); + @PutMapping("/crontabTask") + public ResultDomain updateCrontab(@RequestBody TbCrontabTask crontabItem) { + return null; } /** - * @description 根据ID查询任务 - * @param taskId 任务ID - * @return ResultDomain - * @author yslg - * @since 2025-10-25 + * 删除定时任务 + * @param crontabItem + * @return */ - @GetMapping("/task/{taskId}") - public ResultDomain getTaskById(@PathVariable(value = "taskId") String taskId) { - return crontabService.getTaskById(taskId); + @DeleteMapping("/crontabTask") + public ResultDomain deleteCrontab(@RequestBody TbCrontabTask crontabItem) { + return null; } /** - * @description 查询任务列表 - * @param filter 过滤条件 - * @return ResultDomain - * @author yslg - * @since 2025-10-25 + * 获取定时任务分页列表 + * @param pageParam + * @return */ - @PostMapping("/task/list") - public ResultDomain getTaskList(@RequestBody TbCrontabTask filter) { - return crontabService.getTaskList(filter); + @PostMapping("/crontabTaskPage") + public ResultDomain getCrontabTask(@RequestBody PageRequest pageRequest) { + return null; } /** - * @description 分页查询任务列表 - * @param pageRequest 分页请求对象 - * @return ResultDomain - * @author yslg - * @since 2025-10-25 + * 获取定时任务日志分页列表 + * @param pageRequest + * @return */ - @PostMapping("/task/page") - public ResultDomain getTaskPage(@RequestBody PageRequest pageRequest) { - TbCrontabTask filter = pageRequest.getFilter(); - PageParam pageParam = pageRequest.getPageParam(); - return crontabService.getTaskPage(filter, pageParam); - } - - /** - * @description 启动定时任务 - * @param taskId 任务ID - * @return ResultDomain - * @author yslg - * @since 2025-10-25 - */ - 
@PostMapping("/task/start/{taskId}") - public ResultDomain startTask(@PathVariable(value = "taskId") String taskId) { - return crontabService.startTask(taskId); - } - - /** - * @description 暂停定时任务 - * @param taskId 任务ID - * @return ResultDomain - * @author yslg - * @since 2025-10-25 - */ - @PostMapping("/task/pause/{taskId}") - public ResultDomain pauseTask(@PathVariable(value = "taskId") String taskId) { - return crontabService.pauseTask(taskId); - } - - /** - * @description 立即执行一次任务 - * @param taskId 任务ID - * @return ResultDomain - * @author yslg - * @since 2025-10-25 - */ - @PostMapping("/task/execute/{taskId}") - public ResultDomain executeTaskOnce(@PathVariable(value = "taskId") String taskId) { - return crontabService.executeTaskOnce(taskId); - } - - /** - * @description 验证Cron表达式 - * @param cronExpression Cron表达式 - * @return ResultDomain - * @author yslg - * @since 2025-10-25 - */ - @GetMapping("/task/validate") - public ResultDomain validateCronExpression(@RequestParam String cronExpression) { - return crontabService.validateCronExpression(cronExpression); - } - - // ----------------定时任务日志-------------------------------- - - /** - * @description 根据任务ID查询日志 - * @param taskId 任务ID - * @return ResultDomain - * @author yslg - * @since 2025-10-25 - */ - @GetMapping("/log/task/{taskId}") - public ResultDomain getLogsByTaskId(@PathVariable(value = "taskId") String taskId) { - return crontabService.getLogsByTaskId(taskId); - } - - /** - * @description 查询日志列表 - * @param filter 过滤条件 - * @return ResultDomain - * @author yslg - * @since 2025-10-25 - */ - @PostMapping("/log/list") - public ResultDomain getLogList(@RequestBody TbCrontabLog filter) { - return crontabService.getLogList(filter); - } - - /** - * @description 分页查询日志列表 - * @param pageRequest 分页请求对象 - * @return ResultDomain - * @author yslg - * @since 2025-10-25 - */ - @PostMapping("/log/page") - public ResultDomain getLogPage(@RequestBody PageRequest pageRequest) { - TbCrontabLog filter = 
pageRequest.getFilter(); - PageParam pageParam = pageRequest.getPageParam(); - return crontabService.getLogPage(filter, pageParam); - } - - /** - * @description 根据ID查询日志详情 - * @param logId 日志ID - * @return ResultDomain - * @author yslg - * @since 2025-10-25 - */ - @GetMapping("/log/{logId}") - public ResultDomain getLogById(@PathVariable(value = "logId") String logId) { - return crontabService.getLogById(logId); - } - - /** - * @description 清理指定天数之前的日志 - * @param days 天数 - * @return ResultDomain - * @author yslg - * @since 2025-10-25 - */ - @DeleteMapping("/log/clean/{days}") - public ResultDomain cleanLogs(@PathVariable(value = "days") Integer days) { - return crontabService.cleanLogs(days); - } - - /** - * @description 删除日志 - * @param log 日志对象 - * @return ResultDomain - * @author yslg - * @since 2025-10-25 - */ - @DeleteMapping("/log") - public ResultDomain deleteLog(@RequestBody TbCrontabLog log) { - return crontabService.deleteLog(log.getID()); + @PostMapping("/crontabTaskLogPage") + public ResultDomain getCrontabTaskLog(@RequestBody PageRequest pageRequest) { + return null; } + + } - diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/controller/DataCollectionItemController.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/controller/DataCollectionItemController.java new file mode 100644 index 0000000..8368546 --- /dev/null +++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/controller/DataCollectionItemController.java @@ -0,0 +1,62 @@ +package org.xyzh.crontab.controller; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.web.bind.annotation.*; +import org.xyzh.api.crontab.DataCollectionItemService; +import org.xyzh.common.core.domain.ResultDomain; +import org.xyzh.common.core.page.PageParam; +import org.xyzh.common.core.page.PageRequest; +import org.xyzh.common.dto.crontab.TbDataCollectionItem; +import 
org.xyzh.common.vo.DataCollectionItemVO; + +import java.util.List; + +/** + * @description 数据采集项控制器 + * @filename DataCollectionItemController.java + * @author yslg + * @copyright xyzh + * @since 2025-11-08 + */ +@RestController +@RequestMapping("/crontab/collection/item") +public class DataCollectionItemController { + + private static final Logger logger = LoggerFactory.getLogger(DataCollectionItemController.class); + + @Autowired + private DataCollectionItemService itemService; + + /** + * @description 查看一个任务日志对应创建的所有数据采集项 + * @param taskLogId + * @return + */ + @GetMapping("/task/{taskLogId}") + public ResultDomain getTaskLogDataCollectionItemList(@PathVariable String taskLogId) { + return null; + } + + /** + * @description 获取数据采集项分页列表 + * @param pageRequest + * @return + */ + @PostMapping("/page") + public ResultDomain getCollectionItemPage(@RequestBody PageRequest pageRequest) { + return null; + } + + /** + * @description 转换成文章 + * @param dataCollectionItem + * @return + */ + @PostMapping("/resource") + public ResultDomain convertToArticle(@RequestBody DataCollectionItemVO dataCollectionItem) { + return null; + } +} + diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/enums/TaskEnums.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/enums/TaskEnums.java index 4c9ea3b..9c4ccd2 100644 --- a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/enums/TaskEnums.java +++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/enums/TaskEnums.java @@ -2,14 +2,17 @@ package org.xyzh.crontab.enums; import java.util.Arrays; -import org.xyzh.crontab.task.DataBackupTask; -import org.xyzh.crontab.task.LogCleanTask; -import org.xyzh.crontab.task.SystemStatisticsTask; +import org.xyzh.crontab.task.newsTask.NewsCrawlerTask; + +// import org.xyzh.crontab.task.DataBackupTask; +// import org.xyzh.crontab.task.LogCleanTask; +// import org.xyzh.crontab.task.SystemStatisticsTask; public enum TaskEnums { - DATA_BACKUP("dataBackup", DataBackupTask.class), 
- LOG_CLEAN("logClean", LogCleanTask.class), - SystemStatistics("systemStatistics", SystemStatisticsTask.class); + // DATA_BACKUP("dataBackup", DataBackupTask.class), + // LOG_CLEAN("logClean", LogCleanTask.class), + // SystemStatistics("systemStatistics", SystemStatisticsTask.class); + NEWS_CRAWLER("newsCrawler", NewsCrawlerTask.class); private String name; diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/mapper/DataCollectionItemMapper.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/mapper/DataCollectionItemMapper.java new file mode 100644 index 0000000..304a23b --- /dev/null +++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/mapper/DataCollectionItemMapper.java @@ -0,0 +1,86 @@ +package org.xyzh.crontab.mapper; + +import com.baomidou.mybatisplus.core.mapper.BaseMapper; +import org.apache.ibatis.annotations.Mapper; +import org.apache.ibatis.annotations.Param; +import org.xyzh.common.core.page.PageParam; +import org.xyzh.common.dto.crontab.TbDataCollectionItem; + +import java.util.List; + +/** + * @description 数据采集项数据访问层 + * @filename DataCollectionItemMapper.java + * @author yslg + * @copyright xyzh + * @since 2025-11-08 + */ +@Mapper +public interface DataCollectionItemMapper extends BaseMapper { + + /** + * @description 根据来源URL查询采集项(用于去重) + * @param sourceUrl 来源URL + * @return TbDataCollectionItem 采集项 + * @author yslg + * @since 2025-11-08 + */ + TbDataCollectionItem selectBySourceUrl(@Param("sourceUrl") String sourceUrl); + + /** + * @description 根据任务ID查询采集项列表 + * @param taskId 任务ID + * @return List 采集项列表 + * @author yslg + * @since 2025-11-08 + */ + List selectByTaskId(@Param("taskId") String taskId); + + /** + * @description 查询采集项列表 + * @param filter 过滤条件 + * @return List 采集项列表 + * @author yslg + * @since 2025-11-08 + */ + List selectItemList(TbDataCollectionItem filter); + + /** + * @description 分页查询采集项列表 + * @param filter 过滤条件 + * @param pageParam 分页参数 + * @return List 采集项列表 + * @author yslg + * @since 2025-11-08 + 
*/ + List selectItemPage(@Param("filter") TbDataCollectionItem filter, @Param("pageParam") PageParam pageParam); + + /** + * @description 统计采集项总数 + * @param filter 过滤条件 + * @return long 总数 + * @author yslg + * @since 2025-11-08 + */ + long countItems(@Param("filter") TbDataCollectionItem filter); + + /** + * @description 批量插入采集项 + * @param itemList 采集项列表 + * @return int 影响行数 + * @author yslg + * @since 2025-11-08 + */ + int batchInsertItems(@Param("itemList") List itemList); + + /** + * @description 根据状态统计数量 + * @param taskId 任务ID(可选) + * @param status 状态 + * @return long 数量 + * @author yslg + * @since 2025-11-08 + */ + long countByStatus(@Param("taskId") String taskId, @Param("status") Integer status); +} + diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/pojo/CrontabItem.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/pojo/CrontabItem.java new file mode 100644 index 0000000..83b6923 --- /dev/null +++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/pojo/CrontabItem.java @@ -0,0 +1,22 @@ +package org.xyzh.crontab.pojo; + +import lombok.Data; +import lombok.NoArgsConstructor; +import java.util.List; +import java.util.Map; + +@Data +@NoArgsConstructor +public class CrontabItem { + + private String name; + private List methods; + + @Data + public class CrontabMethod { + private String name; + private String clazz; + private String path; + private Map params; + } +} diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/scheduler/TaskExecutor.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/scheduler/TaskExecutor.java index abd6c59..015a072 100644 --- a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/scheduler/TaskExecutor.java +++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/scheduler/TaskExecutor.java @@ -65,7 +65,12 @@ public class TaskExecutor { if (task.getMethodParams() != null && !task.getMethodParams().isEmpty()) { // 如果有参数,需要解析参数类型 method = 
bean.getClass().getMethod(task.getMethodName(), String.class); - method.invoke(bean, task.getMethodParams()); + // 如果是newsCrewerTask,将taskId添加到参数前面 + String methodParams = task.getMethodParams(); + if ("newsCrewerTask".equals(task.getBeanName()) && task.getTaskId() != null) { + methodParams = task.getTaskId() + "|" + methodParams; + } + method.invoke(bean, methodParams); } else { // 无参方法 method = bean.getClass().getMethod(task.getMethodName()); diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/service/NCDataCollectionItemService.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/service/NCDataCollectionItemService.java new file mode 100644 index 0000000..51287ec --- /dev/null +++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/service/NCDataCollectionItemService.java @@ -0,0 +1,15 @@ +package org.xyzh.crontab.service; + +import org.xyzh.api.crontab.DataCollectionItemService; + +/** + * @description 数据采集项服务接口(继承API接口) + * @filename DataCollectionItemService.java + * @author yslg + * @copyright xyzh + * @since 2025-11-08 + */ +public interface NCDataCollectionItemService extends DataCollectionItemService { + +} + diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/service/impl/DataCollectionItemServiceImpl.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/service/impl/DataCollectionItemServiceImpl.java new file mode 100644 index 0000000..766d435 --- /dev/null +++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/service/impl/DataCollectionItemServiceImpl.java @@ -0,0 +1,479 @@ +package org.xyzh.crontab.service.impl; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; +import org.xyzh.api.crontab.DataCollectionItemService; +import 
org.xyzh.api.news.resource.ResourceService; +import org.xyzh.common.core.domain.ResultDomain; +import org.xyzh.common.core.page.PageDomain; +import org.xyzh.common.core.page.PageParam; +import org.xyzh.common.dto.crontab.TbDataCollectionItem; +import org.xyzh.common.dto.resource.TbResource; +import org.xyzh.common.utils.IDUtils; +import org.xyzh.common.vo.DataCollectionItemVO; +import org.xyzh.common.vo.ResourceVO; +import org.xyzh.crontab.mapper.DataCollectionItemMapper; +import org.xyzh.crontab.mapper.CrontabTaskMapper; +import org.xyzh.common.dto.crontab.TbCrontabTask; +import org.xyzh.system.utils.LoginUtil; + +import java.util.Date; +import java.util.List; +import java.util.stream.Collectors; + +/** + * @description 数据采集项服务实现类 + * @filename DataCollectionItemServiceImpl.java + * @author yslg + * @copyright xyzh + * @since 2025-11-08 + */ +@Service +public class DataCollectionItemServiceImpl implements DataCollectionItemService { + + private static final Logger logger = LoggerFactory.getLogger(DataCollectionItemServiceImpl.class); + + @Autowired + private DataCollectionItemMapper itemMapper; + + @Autowired + private CrontabTaskMapper taskMapper; + + @Autowired + private ResourceService resourceService; + + private final ObjectMapper objectMapper = new ObjectMapper(); + + @Override + @Transactional(rollbackFor = Exception.class) + public ResultDomain createItem(TbDataCollectionItem item) { + ResultDomain resultDomain = new ResultDomain<>(); + try { + // 检查URL是否已存在(去重) + if (item.getSourceUrl() != null && !item.getSourceUrl().isEmpty()) { + TbDataCollectionItem existing = itemMapper.selectBySourceUrl(item.getSourceUrl()); + if (existing != null) { + resultDomain.fail("该文章已存在,URL: " + item.getSourceUrl()); + return resultDomain; + } + } + + // 生成ID + item.setID(IDUtils.generateID()); + item.setCreateTime(new Date()); + item.setDeleted(false); + + // 默认值 + if (item.getStatus() == null) { + item.setStatus(0); // 默认未处理 + } + if (item.getCrawlTime() == null) { + 
item.setCrawlTime(new Date()); + } + + int result = itemMapper.insert(item); + if (result > 0) { + logger.info("创建采集项成功: {}", item.getTitle()); + resultDomain.success("创建采集项成功", item); + } else { + resultDomain.fail("创建采集项失败"); + } + } catch (Exception e) { + logger.error("创建采集项异常: ", e); + resultDomain.fail("创建采集项异常: " + e.getMessage()); + } + return resultDomain; + } + + @Override + @Transactional(rollbackFor = Exception.class) + public ResultDomain batchCreateItems(List itemList) { + ResultDomain resultDomain = new ResultDomain<>(); + try { + if (itemList == null || itemList.isEmpty()) { + resultDomain.fail("采集项列表为空"); + return resultDomain; + } + + int successCount = 0; + Date now = new Date(); + + for (TbDataCollectionItem item : itemList) { + // 检查URL是否已存在(去重) + if (item.getSourceUrl() != null && !item.getSourceUrl().isEmpty()) { + TbDataCollectionItem existing = itemMapper.selectBySourceUrl(item.getSourceUrl()); + if (existing != null) { + logger.debug("跳过已存在的采集项: {}", item.getSourceUrl()); + continue; + } + } + + // 设置默认值 + item.setID(IDUtils.generateID()); + item.setCreateTime(now); + item.setDeleted(false); + if (item.getStatus() == null) { + item.setStatus(0); + } + if (item.getCrawlTime() == null) { + item.setCrawlTime(now); + } + + itemMapper.insert(item); + successCount++; + } + + logger.info("批量创建采集项成功,共{}条,成功{}条", itemList.size(), successCount); + resultDomain.success("批量创建采集项成功", successCount); + } catch (Exception e) { + logger.error("批量创建采集项异常: ", e); + resultDomain.fail("批量创建采集项异常: " + e.getMessage()); + } + return resultDomain; + } + + @Override + @Transactional(rollbackFor = Exception.class) + public ResultDomain updateItem(TbDataCollectionItem item) { + ResultDomain resultDomain = new ResultDomain<>(); + try { + if (item.getID() == null) { + resultDomain.fail("采集项ID不能为空"); + return resultDomain; + } + + item.setUpdateTime(new Date()); + int result = itemMapper.updateById(item); + + if (result > 0) { + logger.info("更新采集项成功: {}", item.getID()); 
+ resultDomain.success("更新采集项成功", item); + } else { + resultDomain.fail("更新采集项失败"); + } + } catch (Exception e) { + logger.error("更新采集项异常: ", e); + resultDomain.fail("更新采集项异常: " + e.getMessage()); + } + return resultDomain; + }
+
+    /**
+     * @description Delete a collection item by ID
+     * @param itemId ID of the item to delete
+     * @return failure when the ID is empty, no row was deleted or an exception occurred;
+     *         otherwise success without data
+     */
+    @Override
+    @Transactional(rollbackFor = Exception.class)
+    public ResultDomain<TbDataCollectionItem> deleteItem(String itemId) {
+        ResultDomain<TbDataCollectionItem> resultDomain = new ResultDomain<>();
+        try {
+            if (itemId == null || itemId.isEmpty()) {
+                resultDomain.fail("采集项ID不能为空");
+                return resultDomain;
+            }
+
+            int result = itemMapper.deleteById(itemId);
+            if (result > 0) {
+                logger.info("删除采集项成功,ID: {}", itemId);
+                resultDomain.success("删除采集项成功", (TbDataCollectionItem) null);
+            } else {
+                resultDomain.fail("删除采集项失败");
+            }
+        } catch (Exception e) {
+            logger.error("删除采集项异常: ", e);
+            resultDomain.fail("删除采集项异常: " + e.getMessage());
+        }
+        return resultDomain;
+    }
+
+    /**
+     * @description Load one collection item and wrap it in a VO
+     * @param itemId ID of the item
+     * @return the VO on success; failure when the ID is empty or the item does not exist
+     */
+    @Override
+    public ResultDomain<DataCollectionItemVO> getItemById(String itemId) {
+        ResultDomain<DataCollectionItemVO> resultDomain = new ResultDomain<>();
+        try {
+            if (itemId == null || itemId.isEmpty()) {
+                resultDomain.fail("采集项ID不能为空");
+                return resultDomain;
+            }
+
+            TbDataCollectionItem item = itemMapper.selectById(itemId);
+            if (item != null) {
+                DataCollectionItemVO vo = buildVO(item);
+                resultDomain.success("查询成功", vo);
+            } else {
+                resultDomain.fail("采集项不存在");
+            }
+        } catch (Exception e) {
+            logger.error("查询采集项异常: ", e);
+            resultDomain.fail("查询采集项异常: " + e.getMessage());
+        }
+        return resultDomain;
+    }
+
+    /**
+     * @description List the non-deleted collection items matching the filter
+     * @param filter query conditions; null means no conditions
+     * @return the matching items as VOs
+     */
+    @Override
+    public ResultDomain<DataCollectionItemVO> getItemList(TbDataCollectionItem filter) {
+        ResultDomain<DataCollectionItemVO> resultDomain = new ResultDomain<>();
+        try {
+            if (filter == null) {
+                filter = new TbDataCollectionItem();
+            }
+            filter.setDeleted(false);
+
+            List<TbDataCollectionItem> list = itemMapper.selectItemList(filter);
+            List<DataCollectionItemVO> voList = list.stream()
+                    .map(this::buildVO)
+                    .collect(Collectors.toList());
+
+            resultDomain.success("查询成功", voList);
+        } catch (Exception e) {
+            logger.error("查询采集项列表异常: ", e);
+            resultDomain.fail("查询采集项列表异常: " + e.getMessage());
+        }
+        return resultDomain;
+    }
+
+    /**
+     * @description Page through the non-deleted collection items matching the filter
+     * @param filter query conditions; null means no conditions
+     * @param pageParam paging parameters; null means a default PageParam. Its total fields
+     *                  are filled in before it is returned inside the page domain
+     * @return one page of VOs together with the paging information
+     */
+    @Override
+    public ResultDomain<DataCollectionItemVO> getItemPage(TbDataCollectionItem filter, PageParam pageParam) {
+        ResultDomain<DataCollectionItemVO> resultDomain = new ResultDomain<>();
+        try {
+            if (filter == null) {
+                filter = new TbDataCollectionItem();
+            }
+            filter.setDeleted(false);
+
+            if (pageParam == null) {
+                pageParam = new PageParam();
+            }
+
+            List<TbDataCollectionItem> list = itemMapper.selectItemPage(filter, pageParam);
+            long total = itemMapper.countItems(filter);
+
+            List<DataCollectionItemVO> voList = list.stream()
+                    .map(this::buildVO)
+                    .collect(Collectors.toList());
+
+            // Guard the division: a non-positive page size would produce a meaningless page count.
+            long pageSize = pageParam.getPageSize();
+            int totalPages = pageSize > 0 ? (int) Math.ceil((double) total / pageSize) : 0;
+
+            PageDomain<DataCollectionItemVO> pageDomain = new PageDomain<>();
+            pageDomain.setDataList(voList);
+            pageParam.setTotalElements(total);
+            pageParam.setTotalPages(totalPages);
+            pageDomain.setPageParam(pageParam);
+
+            resultDomain.success("查询成功", pageDomain);
+        } catch (Exception e) {
+            logger.error("分页查询采集项异常: ", e);
+            resultDomain.fail("分页查询采集项异常: " + e.getMessage());
+        }
+        return resultDomain;
+    }
+
+    /**
+     * @description List the collection items produced by one scheduled task
+     * @param taskId ID of the scheduled task
+     * @return the task's items as VOs; failure when the ID is empty
+     */
+    @Override
+    public ResultDomain<DataCollectionItemVO> getItemsByTaskId(String taskId) {
+        ResultDomain<DataCollectionItemVO> resultDomain = new ResultDomain<>();
+        try {
+            if (taskId == null || taskId.isEmpty()) {
+                resultDomain.fail("任务ID不能为空");
+                return resultDomain;
+            }
+
+            List<TbDataCollectionItem> list = itemMapper.selectByTaskId(taskId);
+            List<DataCollectionItemVO> voList = list.stream()
+                    .map(this::buildVO)
+                    .collect(Collectors.toList());
+
+            resultDomain.success("查询成功", voList);
+        } catch (Exception e) {
+            logger.error("根据任务ID查询采集项异常: ", e);
+            resultDomain.fail("根据任务ID查询采集项异常: " + e.getMessage());
+        }
+        return resultDomain;
+    }
+
+    /**
+     * @description Create a published resource from a collection item and mark the item as converted
+     * @param itemId ID of the collection item
+     * @param tagId tag assigned to the new resource
+     * @return the new resource ID on success; failure when an argument is empty, the item is
+     *         missing or already converted, or resource creation fails
+     */
+    @Override
+    @Transactional(rollbackFor = Exception.class)
+    public ResultDomain<String> convertToResource(String itemId, String tagId) {
+        ResultDomain<String> resultDomain = new ResultDomain<>();
+        try {
+            if (itemId == null || itemId.isEmpty()) {
+                resultDomain.fail("采集项ID不能为空");
+                return resultDomain;
+            }
+            if (tagId == null || tagId.isEmpty()) {
+                resultDomain.fail("标签ID不能为空");
+                return resultDomain;
+            }
+
+            // Load the collection item
+            TbDataCollectionItem item = itemMapper.selectById(itemId);
+            if (item == null) {
+                resultDomain.fail("采集项不存在");
+                return resultDomain;
+            }
+
+            // Null-safe comparison: buildVO treats a null status as convertible, so a null
+            // status must not fail here on unboxing.
+            if (Integer.valueOf(1).equals(item.getStatus())) {
+                resultDomain.fail("该采集项已转换为资源");
+                return resultDomain;
+            }
+
+            // Build the resource
+            Date now = new Date();
+            TbResource resource = new TbResource();
+            resource.setResourceID(IDUtils.generateID());
+            resource.setTitle(item.getTitle());
+            resource.setContent(item.getContent());
+            resource.setSummary(item.getSummary());
+            resource.setCoverImage(item.getCoverImage());
+            resource.setTagID(tagId);
+            resource.setAuthor(item.getAuthor());
+            resource.setSource(item.getSource());
+            resource.setSourceUrl(item.getSourceUrl());
+            resource.setPublishTime(item.getPublishTime() != null ? item.getPublishTime() : now);
+            resource.setStatus(1); // published
+            resource.setViewCount(0);
+            resource.setLikeCount(0);
+            resource.setCollectCount(0);
+            resource.setIsRecommend(false);
+            resource.setIsBanner(false);
+            resource.setCreateTime(now);
+            resource.setDeleted(false);
+
+            ResourceVO resourceVO = new ResourceVO();
+            resourceVO.setResource(resource);
+
+            ResultDomain<?> createResult = resourceService.createResource(resourceVO);
+            if (!createResult.isSuccess()) {
+                resultDomain.fail("转换为资源失败: " + createResult.getMessage());
+                return resultDomain;
+            }
+
+            // Mark the collection item as converted
+            item.setStatus(1); // converted to a resource
+            item.setResourceId(resource.getResourceID());
+            item.setProcessTime(now);
+            item.setProcessor(LoginUtil.getCurrentUserId());
+            itemMapper.updateById(item);
+
+            logger.info("采集项转换为资源成功,采集项ID: {}, 资源ID: {}", itemId, resource.getResourceID());
+            resultDomain.success("转换为资源成功", resource.getResourceID());
+        } catch (Exception e) {
+            // NOTE(review): the exception is swallowed here, so rollbackFor never sees it. If
+            // updateById throws after createResource succeeded, the resource presumably stays
+            // committed - confirm whether a rollback is wanted.
+            logger.error("转换为资源异常: ", e);
+            resultDomain.fail("转换为资源异常: " + e.getMessage());
+        }
+        return resultDomain;
+    }
+
+    /**
+     * @description Convert several collection items to resources, continuing past individual failures
+     * @param itemIds IDs of the collection items
+     * @param tagId tag assigned to every new resource
+     * @return the number of items converted successfully
+     */
+    @Override
+    @Transactional(rollbackFor = Exception.class)
+    public ResultDomain<Integer> batchConvertToResource(List<String> itemIds, String tagId) {
+        ResultDomain<Integer> resultDomain = new ResultDomain<>();
+        try {
+            if (itemIds == null || itemIds.isEmpty()) {
+                resultDomain.fail("采集项ID列表为空");
+                return resultDomain;
+            }
+            if (tagId == null || tagId.isEmpty()) {
+                resultDomain.fail("标签ID不能为空");
+                return resultDomain;
+            }
+
+            int successCount = 0;
+            for (String itemId : itemIds) {
+                ResultDomain<String> convertResult = convertToResource(itemId, tagId);
+                if (convertResult.isSuccess()) {
+                    successCount++;
+                }
+            }
+
+            logger.info("批量转换为资源完成,共{}条,成功{}条", itemIds.size(), successCount);
+            resultDomain.success("批量转换为资源完成", successCount);
+        } catch (Exception e) {
+            logger.error("批量转换为资源异常: ", e);
+            resultDomain.fail("批量转换为资源异常: " + e.getMessage());
+        }
+        return resultDomain;
+    }
+
+    /**
+     * @description Mark a collection item as ignored and record who processed it and when
+     * @param itemId ID of the collection item
+     * @return the updated item; failure when the ID is empty or the item does not exist
+     */
+    @Override
+    @Transactional(rollbackFor = Exception.class)
+    public ResultDomain<TbDataCollectionItem> ignoreItem(String itemId) {
+        ResultDomain<TbDataCollectionItem> resultDomain = new ResultDomain<>();
+        try {
+            if (itemId == null || itemId.isEmpty()) {
+                resultDomain.fail("采集项ID不能为空");
+                return resultDomain;
+            }
+
+            TbDataCollectionItem item = itemMapper.selectById(itemId);
+            if (item == null) {
+                resultDomain.fail("采集项不存在");
+                return resultDomain;
+            }
+
+            item.setStatus(2); // ignored
+            item.setProcessTime(new Date());
+            item.setProcessor(LoginUtil.getCurrentUserId());
+            itemMapper.updateById(item);
+
+            logger.info("忽略采集项成功,ID: {}", itemId);
+            resultDomain.success("忽略采集项成功", item);
+        } catch (Exception e) {
+            logger.error("忽略采集项异常: ", e);
+            resultDomain.fail("忽略采集项异常: " + e.getMessage());
+        }
+        return resultDomain;
+    }
+
+    /**
+     * @description Count collection items by task and status
+     * @param taskId ID of the scheduled task, passed to the mapper unchanged
+     * @param status item status, passed to the mapper unchanged
+     * @return the count reported by the mapper
+     */
+    @Override
+    public ResultDomain<Long> countByStatus(String taskId, Integer status) {
+        ResultDomain<Long> resultDomain = new ResultDomain<>();
+        try {
+            long count = itemMapper.countByStatus(taskId, status);
+            resultDomain.success("统计成功", count);
+        } catch (Exception e) {
+            logger.error("统计采集项数量异常: ", e);
+            resultDomain.fail("统计采集项数量异常: " + e.getMessage());
+        }
+        return resultDomain;
+    }
+
+    /**
+     * @description Build the VO for a collection item: attach its scheduled task, a status text
+     *              and the edit/convert permissions derived from the status
+     * @param item collection item
+     * @return DataCollectionItemVO
+     * @author yslg
+     * @since 2025-11-08
+     */
+    private DataCollectionItemVO buildVO(TbDataCollectionItem item) {
+        DataCollectionItemVO vo = new DataCollectionItemVO();
+        vo.setItem(item);
+
+        // Look up the associated scheduled task
+        if (item.getTaskId() != null && !item.getTaskId().isEmpty()) {
+            TbCrontabTask task = taskMapper.selectTaskById(item.getTaskId());
+            vo.setTask(task);
+        }
+
+        // Status text; a null status is shown as unprocessed
+        String statusText = "未处理";
+        if (item.getStatus() != null) {
+            switch (item.getStatus()) {
+                case 0:
+                    statusText = "未处理";
+                    break;
+                case 1:
+                    statusText = "已转换为资源";
+                    break;
+                case 2:
+                    statusText = "已忽略";
+                    break;
+                default:
+                    statusText = "未知";
+            }
+        }
+        vo.setStatusText(statusText);
+
+        // Permissions: unprocessed or ignored items can be edited, only unprocessed ones converted
+        vo.setCanEdit(item.getStatus() == null || item.getStatus() == 0 || item.getStatus() == 2);
+        vo.setCanConvert(item.getStatus() == null || item.getStatus() == 0);
+
+        return vo;
+    }
+}
+
diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/DataBackupTask.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/DataBackupTask.java index 18126ed..2249940 100644 --- a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/DataBackupTask.java +++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/DataBackupTask.java @@ -1,60 +1,60 @@ -package org.xyzh.crontab.task; +// package org.xyzh.crontab.task; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.stereotype.Component; +// import org.slf4j.Logger; +// import org.slf4j.LoggerFactory; +// import org.springframework.stereotype.Component; -import java.text.SimpleDateFormat; -import java.util.Date; +// import java.text.SimpleDateFormat; +// import java.util.Date; -/** - * @description 数据备份任务 - * @filename DataBackupTask.java - * @author yslg - * @copyright xyzh - * @since 2025-10-25 - */ -@Component("dataBackupTask") -public class DataBackupTask { +// /** +// * @description 数据备份任务 +// * @filename DataBackupTask.java +// * @author yslg +// * @copyright xyzh +// * @since 2025-10-25 +// */ +// @Component("dataBackupTask") +// public class DataBackupTask { - private static final Logger logger =
LoggerFactory.getLogger(DataBackupTask.class); +// private static final Logger logger = LoggerFactory.getLogger(DataBackupTask.class); - /** - * @description 执行数据备份 - * @author yslg - * @since 2025-10-25 - */ - public void execute() { - logger.info("开始执行数据备份任务..."); +// /** +// * @description 执行数据备份 +// * @author yslg +// * @since 2025-10-25 +// */ +// public void execute() { +// logger.info("开始执行数据备份任务..."); - try { - SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd_HHmmss"); - String backupTime = sdf.format(new Date()); +// try { +// SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd_HHmmss"); +// String backupTime = sdf.format(new Date()); - // TODO: 实现数据备份逻辑 - // 1. 备份数据库 - // 2. 备份文件 - // 3. 压缩备份文件 - // 4. 上传到备份服务器或云存储 +// // TODO: 实现数据备份逻辑 +// // 1. 备份数据库 +// // 2. 备份文件 +// // 3. 压缩备份文件 +// // 4. 上传到备份服务器或云存储 - Thread.sleep(2000); // 模拟执行 +// Thread.sleep(2000); // 模拟执行 - logger.info("数据备份任务执行完成,备份标识: {}", backupTime); - } catch (Exception e) { - logger.error("数据备份任务执行失败: ", e); - throw new RuntimeException("数据备份任务执行失败", e); - } - } +// logger.info("数据备份任务执行完成,备份标识: {}", backupTime); +// } catch (Exception e) { +// logger.error("数据备份任务执行失败: ", e); +// throw new RuntimeException("数据备份任务执行失败", e); +// } +// } - /** - * @description 执行带参数的备份任务 - * @param params 参数(备份类型:full-全量,incremental-增量) - * @author yslg - * @since 2025-10-25 - */ - public void execute(String params) { - logger.info("开始执行数据备份任务,备份类型: {}", params); - execute(); - } -} +// /** +// * @description 执行带参数的备份任务 +// * @param params 参数(备份类型:full-全量,incremental-增量) +// * @author yslg +// * @since 2025-10-25 +// */ +// public void execute(String params) { +// logger.info("开始执行数据备份任务,备份类型: {}", params); +// execute(); +// } +// } diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/LogCleanTask.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/LogCleanTask.java index d20f007..fdad085 100644 --- 
a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/LogCleanTask.java +++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/LogCleanTask.java @@ -1,68 +1,68 @@ -package org.xyzh.crontab.task; +// package org.xyzh.crontab.task; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.stereotype.Component; -import org.xyzh.crontab.mapper.CrontabLogMapper; +// import org.slf4j.Logger; +// import org.slf4j.LoggerFactory; +// import org.springframework.beans.factory.annotation.Autowired; +// import org.springframework.stereotype.Component; +// import org.xyzh.crontab.mapper.CrontabLogMapper; -import java.util.Calendar; -import java.util.Date; +// import java.util.Calendar; +// import java.util.Date; -/** - * @description 清理过期日志任务 - * @filename LogCleanTask.java - * @author yslg - * @copyright xyzh - * @since 2025-10-25 - */ -@Component("logCleanTask") -public class LogCleanTask { +// /** +// * @description 清理过期日志任务 +// * @filename LogCleanTask.java +// * @author yslg +// * @copyright xyzh +// * @since 2025-10-25 +// */ +// @Component("logCleanTask") +// public class LogCleanTask { - private static final Logger logger = LoggerFactory.getLogger(LogCleanTask.class); +// private static final Logger logger = LoggerFactory.getLogger(LogCleanTask.class); - @Autowired - private CrontabLogMapper logMapper; +// @Autowired +// private CrontabLogMapper logMapper; - /** - * @description 执行日志清理,默认清理30天前的日志 - * @author yslg - * @since 2025-10-25 - */ - public void execute() { - execute("30"); - } +// /** +// * @description 执行日志清理,默认清理30天前的日志 +// * @author yslg +// * @since 2025-10-25 +// */ +// public void execute() { +// execute("30"); +// } - /** - * @description 执行日志清理 - * @param params 天数参数 - * @author yslg - * @since 2025-10-25 - */ - public void execute(String params) { - logger.info("开始执行日志清理任务..."); +// /** +// * @description 执行日志清理 +// * @param params 天数参数 +// * 
@author yslg +// * @since 2025-10-25 +// */ +// public void execute(String params) { +// logger.info("开始执行日志清理任务..."); - try { - int days = 30; // 默认30天 - if (params != null && !params.isEmpty()) { - try { - days = Integer.parseInt(params); - } catch (NumberFormatException e) { - logger.warn("参数格式错误,使用默认值30天"); - } - } +// try { +// int days = 30; // 默认30天 +// if (params != null && !params.isEmpty()) { +// try { +// days = Integer.parseInt(params); +// } catch (NumberFormatException e) { +// logger.warn("参数格式错误,使用默认值30天"); +// } +// } - Calendar calendar = Calendar.getInstance(); - calendar.add(Calendar.DAY_OF_MONTH, -days); - Date beforeDate = calendar.getTime(); +// Calendar calendar = Calendar.getInstance(); +// calendar.add(Calendar.DAY_OF_MONTH, -days); +// Date beforeDate = calendar.getTime(); - int count = logMapper.cleanLogsByDate(beforeDate); +// int count = logMapper.cleanLogsByDate(beforeDate); - logger.info("日志清理任务执行完成,共清理{}条日志", count); - } catch (Exception e) { - logger.error("日志清理任务执行失败: ", e); - throw new RuntimeException("日志清理任务执行失败", e); - } - } -} +// logger.info("日志清理任务执行完成,共清理{}条日志", count); +// } catch (Exception e) { +// logger.error("日志清理任务执行失败: ", e); +// throw new RuntimeException("日志清理任务执行失败", e); +// } +// } +// } diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/SystemStatisticsTask.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/SystemStatisticsTask.java index 65547fe..20bddb6 100644 --- a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/SystemStatisticsTask.java +++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/SystemStatisticsTask.java @@ -1,54 +1,54 @@ -package org.xyzh.crontab.task; +// package org.xyzh.crontab.task; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.stereotype.Component; +// import org.slf4j.Logger; +// import org.slf4j.LoggerFactory; +// import org.springframework.stereotype.Component; -/** - * @description 
系统数据统计任务 - * @filename SystemStatisticsTask.java - * @author yslg - * @copyright xyzh - * @since 2025-10-25 - */ -@Component("systemStatisticsTask") -public class SystemStatisticsTask { +// /** +// * @description 系统数据统计任务 +// * @filename SystemStatisticsTask.java +// * @author yslg +// * @copyright xyzh +// * @since 2025-10-25 +// */ +// @Component("systemStatisticsTask") +// public class SystemStatisticsTask { - private static final Logger logger = LoggerFactory.getLogger(SystemStatisticsTask.class); +// private static final Logger logger = LoggerFactory.getLogger(SystemStatisticsTask.class); - /** - * @description 执行系统数据统计 - * @author yslg - * @since 2025-10-25 - */ - public void execute() { - logger.info("开始执行系统数据统计任务..."); +// /** +// * @description 执行系统数据统计 +// * @author yslg +// * @since 2025-10-25 +// */ +// public void execute() { +// logger.info("开始执行系统数据统计任务..."); - try { - // TODO: 实现系统数据统计逻辑 - // 1. 统计用户数据 - // 2. 统计资源数据 - // 3. 统计访问数据 - // 4. 生成统计报告 +// try { +// // TODO: 实现系统数据统计逻辑 +// // 1. 统计用户数据 +// // 2. 统计资源数据 +// // 3. 统计访问数据 +// // 4. 
生成统计报告 - Thread.sleep(1000); // 模拟执行 +// Thread.sleep(1000); // 模拟执行 - logger.info("系统数据统计任务执行完成"); - } catch (Exception e) { - logger.error("系统数据统计任务执行失败: ", e); - throw new RuntimeException("系统数据统计任务执行失败", e); - } - } +// logger.info("系统数据统计任务执行完成"); +// } catch (Exception e) { +// logger.error("系统数据统计任务执行失败: ", e); +// throw new RuntimeException("系统数据统计任务执行失败", e); +// } +// } - /** - * @description 执行带参数的统计任务 - * @param params 参数 - * @author yslg - * @since 2025-10-25 - */ - public void execute(String params) { - logger.info("开始执行系统数据统计任务,参数: {}", params); - execute(); - } -} +// /** +// * @description 执行带参数的统计任务 +// * @param params 参数 +// * @author yslg +// * @since 2025-10-25 +// */ +// public void execute(String params) { +// logger.info("开始执行系统数据统计任务,参数: {}", params); +// execute(); +// } +// }
diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/ArticleStruct.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/ArticleStruct.java
new file mode 100644
index 0000000..0bcd312
--- /dev/null
+++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/ArticleStruct.java
@@ -0,0 +1,53 @@
+package org.xyzh.crontab.task.newsTask;
+
+import java.util.List;
+
+import org.xyzh.common.dto.resource.TbResource;
+
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+/**
+ * @description Article structure returned by the crawler
+ * @filename ArticleStruct.java
+ * @author yslg
+ * @copyright xyzh
+ * @since 2025-11-10
+ */
+@Data
+@NoArgsConstructor
+public class ArticleStruct {
+
+    private String title;
+    private String url;
+    private String publishTime;
+    private String author;
+    private String source;
+    private List<RowStruct> contentRows;
+
+    /**
+     * One content row of an article. Declared static so it can be instantiated without an
+     * enclosing ArticleStruct (a plain inner class has no usable no-arg constructor).
+     */
+    @Data
+    @NoArgsConstructor
+    public static class RowStruct {
+        // private String tag;
+        // private String style; // text-indent: 2em;->\t\t
+        private String content; // complete p tag including its style
+    }
+
+    /**
+     * @description Map the crawled article onto a new TbResource
+     * @return TbResource with title, author, source and source URL copied over
+     */
+    public TbResource toTbResource() {
+        TbResource tbResource = new TbResource();
+        tbResource.setTitle(this.title);
+        tbResource.setAuthor(this.author);
+        tbResource.setSource(this.source);
+        tbResource.setSourceUrl(this.url);
+        // TODO: publishTime (String) and contentRows are not mapped yet.
+        return tbResource;
+    }
+}
diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/NewsCrawlerTask.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/NewsCrawlerTask.java
new file mode 100644
index 0000000..5ce62d8
--- /dev/null
+++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/NewsCrawlerTask.java
@@ -0,0 +1,390 @@
+package org.xyzh.crontab.task.newsTask;
+
+import com.fasterxml.jackson.core.type.TypeReference;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Component;
+import org.xyzh.api.crontab.DataCollectionItemService;
+import org.xyzh.common.core.domain.ResultDomain;
+import org.xyzh.common.dto.crontab.TbDataCollectionItem;
+
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * @description Scheduled task that runs the Python news crawler and stores the results
+ * @filename NewsCrawlerTask.java
+ * @author yslg
+ * @copyright xyzh
+ * @since 2025-11-08
+ */
+@Component("newsCrewerTask")
+public class NewsCrawlerTask {
+
+    private static final Logger logger = LoggerFactory.getLogger(NewsCrawlerTask.class);
+
+    /** Parameters used when the scheduler supplies none. */
+    private static final String DEFAULT_PARAMS = "rmrb,politics,20";
+
+    /** source/category end up on a command line and in a file name, so only plain tokens are accepted. */
+    private static final Pattern SAFE_TOKEN = Pattern.compile("[\\p{L}\\p{N}_-]+");
+
+    /** Expected shape of the crawler's JSON output: an array of objects. */
+    private static final TypeReference<List<Map<String, Object>>> NEWS_LIST_TYPE =
+            new TypeReference<List<Map<String, Object>>>() { };
+
+    // NOTE(review): these keys use the "crewer" prefix, while the YAML added in this change
+    // defines "crawler.*" - confirm which prefix is intended.
+    @Value("${crewer.python.path:python}")
+    private String pythonPath;
+
+    @Value("${crewer.script.path:../schoolNewsCrewer}")
+    private String scriptPath;
+
+    @Value("${crewer.timeout:300}")
+    private int timeout;
+
+    @Autowired
+    private DataCollectionItemService itemService;
+
+    private final ObjectMapper objectMapper = new ObjectMapper();
+
+    /**
+     * @description Run the crawler with the default parameters (People's Daily, politics, 20 articles)
+     * @author yslg
+     * @since 2025-11-08
+     */
+    public void execute() {
+        execute(DEFAULT_PARAMS);
+    }
+
+    /**
+     * @description Run the crawler script, wait for it (bounded by the configured timeout) and
+     *              persist the articles it wrote when a task ID was supplied
+     * @param params "source,category,limit" or "taskId|source,category,limit"; null or blank
+     *               falls back to the defaults. source: news source (rmrb = People's Daily),
+     *               category: e.g. politics, society; limit: number of articles
+     * @throws RuntimeException when the parameters are invalid, the script directory is missing,
+     *                          the process times out, is interrupted or exits with a non-zero code
+     * @author yslg
+     * @since 2025-11-08
+     */
+    public void execute(String params) {
+        logger.info("开始执行新闻爬虫任务,参数: {}", params);
+
+        Process process = null;
+        Path processLog = null;
+        try {
+            String effectiveParams = (params == null || params.trim().isEmpty())
+                    ? DEFAULT_PARAMS : params.trim();
+
+            // Optional "taskId|" prefix in front of the comma-separated crawler arguments.
+            String taskId = null;
+            String actualParams = effectiveParams;
+            if (effectiveParams.contains("|")) {
+                String[] parts = effectiveParams.split("\\|", 2);
+                taskId = parts[0].trim();
+                actualParams = parts[1];
+            }
+
+            String[] paramArray = actualParams.split(",");
+            String source = tokenOrDefault(paramArray, 0, "rmrb");
+            String category = tokenOrDefault(paramArray, 1, "politics");
+            String limit = tokenOrDefault(paramArray, 2, "20");
+            requireSafeToken("source", source);
+            requireSafeToken("category", category);
+            requirePositiveInt("limit", limit);
+
+            logger.info("爬虫参数 - 来源: {}, 分类: {}, 数量: {}", source, category, limit);
+
+            Path scriptDir = Paths.get(scriptPath);
+            if (!Files.exists(scriptDir)) {
+                throw new RuntimeException("爬虫脚本目录不存在: " + scriptPath);
+            }
+
+            List<String> command = new ArrayList<>();
+            // On Windows the interpreter is started through cmd.
+            String os = System.getProperty("os.name").toLowerCase();
+            if (os.contains("win")) {
+                command.add("cmd");
+                command.add("/c");
+            }
+            command.add(pythonPath);
+            command.add("main.py");
+            command.add(category);
+            command.add(limit);
+
+            // Result file, relative to the script directory.
+            String timestamp = String.valueOf(System.currentTimeMillis());
+            String outputFile = String.format("output/news_%s_%s_%s.json", source, category, timestamp);
+            command.add(outputFile);
+
+            logger.info("执行命令: {}", String.join(" ", command));
+
+            // Send stdout+stderr to a file: draining the pipe on this thread would block until the
+            // process exits and the timeout below would never fire.
+            processLog = Files.createTempFile("news-crawler-", ".log");
+            ProcessBuilder processBuilder = new ProcessBuilder(command);
+            processBuilder.directory(scriptDir.toFile());
+            processBuilder.redirectErrorStream(true);
+            processBuilder.redirectOutput(processLog.toFile());
+
+            process = processBuilder.start();
+
+            boolean finished = process.waitFor(timeout, TimeUnit.SECONDS);
+            if (!finished) {
+                killProcessTree(process);
+                throw new RuntimeException("爬虫任务超时(超过" + timeout + "秒)");
+            }
+
+            // new String(...) replaces malformed bytes instead of failing on non-UTF-8 console output.
+            String output = new String(Files.readAllBytes(processLog), StandardCharsets.UTF_8);
+            logger.debug("Python输出:\n{}", output);
+
+            int exitCode = process.exitValue();
+            if (exitCode != 0) {
+                logger.error("新闻爬虫任务执行失败,退出码: {}", exitCode);
+                logger.error("输出内容:\n{}", output);
+                throw new RuntimeException("爬虫任务执行失败,退出码: " + exitCode);
+            }
+            logger.info("新闻爬虫任务执行成功");
+
+            Path outputPath = scriptDir.resolve(outputFile);
+            if (!Files.exists(outputPath)) {
+                logger.warn("输出文件不存在: {}", outputFile);
+                return;
+            }
+
+            List<Map<String, Object>> newsList = objectMapper.readValue(outputPath.toFile(), NEWS_LIST_TYPE);
+            logger.info("成功爬取 {} 条新闻", newsList.size());
+
+            if (taskId != null && !taskId.isEmpty()) {
+                saveNewsToDatabase(newsList, taskId, source, category);
+            } else {
+                logger.warn("未提供任务ID,跳过数据保存");
+            }
+        } catch (InterruptedException e) {
+            // Restore the flag so the scheduler can observe the interruption.
+            Thread.currentThread().interrupt();
+            if (process != null) {
+                killProcessTree(process);
+            }
+            logger.error("新闻爬虫任务执行异常: ", e);
+            throw new RuntimeException("新闻爬虫任务执行异常", e);
+        } catch (Exception e) {
+            logger.error("新闻爬虫任务执行异常: ", e);
+            throw new RuntimeException("新闻爬虫任务执行异常", e);
+        } finally {
+            deleteQuietly(processLog);
+        }
+    }
+
+    /**
+     * @description Check that the configured Python interpreter can be started and log its version
+     * @author yslg
+     * @since 2025-11-08
+     */
+    public void testPythonEnvironment() {
+        logger.info("测试Python环境...");
+
+        Process process = null;
+        try {
+            ProcessBuilder pb = new ProcessBuilder(pythonPath, "--version");
+            // Older Python versions print the version on stderr.
+            pb.redirectErrorStream(true);
+            process = pb.start();
+
+            String version;
+            try (BufferedReader reader = new BufferedReader(
+                    new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
+                version = reader.readLine();
+            }
+
+            if (!process.waitFor(10, TimeUnit.SECONDS)) {
+                logger.error("Python环境异常");
+                return;
+            }
+
+            if (process.exitValue() == 0) {
+                logger.info("Python环境正常: {}", version);
+            } else {
+                logger.error("Python环境异常");
+            }
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            logger.error("测试Python环境失败: ", e);
+        } catch (Exception e) {
+            logger.error("测试Python环境失败: ", e);
+        } finally {
+            if (process != null && process.isAlive()) {
+                process.destroyForcibly();
+            }
+        }
+    }
+
+    /**
+     * @description Convert the crawled articles to collection items and store them in one batch
+     * @param newsList articles as parsed from the crawler's JSON output
+     * @param taskId ID of the scheduled task the items belong to
+     * @param source news source code
+     * @param category category
+     * @author yslg
+     * @since 2025-11-08
+     */
+    private void saveNewsToDatabase(List<Map<String, Object>> newsList, String taskId, String source, String category) {
+        logger.info("开始保存 {} 条新闻到数据库,任务ID: {}", newsList.size(), taskId);
+
+        try {
+            List<TbDataCollectionItem> itemList = new ArrayList<>();
+            Date now = new Date();
+            SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+
+            for (Map<String, Object> news : newsList) {
+                try {
+                    TbDataCollectionItem item = new TbDataCollectionItem();
+
+                    // Basic fields
+                    item.setTaskId(taskId);
+                    item.setTitle(getStringValue(news, "title"));
+                    item.setContent(getStringValue(news, "content"));
+                    item.setSummary(getStringValue(news, "summary"));
+                    item.setSource("rmrb".equals(source) ? "人民日报" : source);
+                    item.setSourceUrl(getStringValue(news, "url"));
+                    item.setCategory(category);
+                    item.setAuthor(getStringValue(news, "author"));
+
+                    // Publish time; falls back to the crawl time when missing or unparsable
+                    String publishTimeStr = getStringValue(news, "publish_time");
+                    if (publishTimeStr != null && !publishTimeStr.isEmpty()) {
+                        try {
+                            item.setPublishTime(dateFormat.parse(publishTimeStr));
+                        } catch (Exception e) {
+                            logger.warn("解析发布时间失败: {}", publishTimeStr);
+                            item.setPublishTime(now);
+                        }
+                    } else {
+                        item.setPublishTime(now);
+                    }
+
+                    // Cover image
+                    item.setCoverImage(getStringValue(news, "cover_image"));
+
+                    // Image list, stored as a JSON string
+                    Object imagesObj = news.get("images");
+                    if (imagesObj instanceof List) {
+                        item.setImages(objectMapper.writeValueAsString(imagesObj));
+                    } else if (imagesObj instanceof String) {
+                        item.setImages((String) imagesObj);
+                    }
+
+                    // Tags, stored comma-separated; elements are stringified so non-string
+                    // entries do not abort the item
+                    Object tagsObj = news.get("tags");
+                    if (tagsObj instanceof List) {
+                        item.setTags(((List<?>) tagsObj).stream()
+                                .map(tag -> String.valueOf(tag))
+                                .collect(Collectors.joining(",")));
+                    } else if (tagsObj instanceof String) {
+                        item.setTags((String) tagsObj);
+                    }
+
+                    // Status and crawl time
+                    item.setStatus(0); // unprocessed
+                    item.setCrawlTime(now);
+
+                    itemList.add(item);
+                } catch (Exception e) {
+                    logger.error("转换新闻数据失败: ", e);
+                }
+            }
+
+            if (itemList.isEmpty()) {
+                logger.warn("没有有效的新闻数据需要保存");
+                return;
+            }
+
+            ResultDomain<?> result = itemService.batchCreateItems(itemList);
+            if (result.isSuccess()) {
+                logger.info("成功保存 {} 条新闻到数据库", result.getData());
+            } else {
+                logger.error("保存新闻到数据库失败: {}", result.getMessage());
+            }
+        } catch (Exception e) {
+            logger.error("保存新闻数据到数据库异常: ", e);
+        }
+    }
+
+    /**
+     * @description Null-safe string lookup in a map
+     * @param map source map
+     * @param key key to look up
+     * @return String the value's toString(), or null when the key is absent or mapped to null
+     * @author yslg
+     * @since 2025-11-08
+     */
+    private String getStringValue(Map<String, Object> map, String key) {
+        Object value = map.get(key);
+        return value == null ? null : value.toString();
+    }
+
+    /** Returns the trimmed token at {@code index}, or {@code fallback} when it is missing or blank. */
+    private static String tokenOrDefault(String[] tokens, int index, String fallback) {
+        if (index >= tokens.length) {
+            return fallback;
+        }
+        String token = tokens[index].trim();
+        return token.isEmpty() ? fallback : token;
+    }
+
+    /** Rejects values that could alter the command line or escape the output directory. */
+    private static void requireSafeToken(String name, String value) {
+        if (!SAFE_TOKEN.matcher(value).matches()) {
+            throw new IllegalArgumentException("爬虫参数非法: " + name + "=" + value);
+        }
+    }
+
+    /** Requires {@code value} to be a positive decimal integer. */
+    private static void requirePositiveInt(String name, String value) {
+        try {
+            if (Integer.parseInt(value) > 0) {
+                return;
+            }
+        } catch (NumberFormatException ignored) {
+            // handled by the throw below
+        }
+        throw new IllegalArgumentException("爬虫参数非法: " + name + "=" + value);
+    }
+
+    /** Forcibly stops the process and any children it spawned (e.g. python started via cmd). */
+    private static void killProcessTree(Process process) {
+        process.descendants().forEach(ProcessHandle::destroyForcibly);
+        process.destroyForcibly();
+    }
+
+    /** Best-effort removal of the temporary process log. */
+    private void deleteQuietly(Path path) {
+        if (path == null) {
+            return;
+        }
+        try {
+            Files.deleteIfExists(path);
+        } catch (IOException e) {
+            logger.debug("删除临时日志文件失败: {}", path, e);
+        }
+    }
+}
+
diff --git
a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/NewsTask.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/NewsTask.java index 2b66d94..4c24f95 100644 --- a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/NewsTask.java +++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/NewsTask.java @@ -5,8 +5,9 @@ abstract public class NewsTask { // 爬取网站目标 private String target; - // 爬取标题 - private String title; + + // 爬取搜索 + private String query; }
diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/PythonExecutorExample.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/PythonExecutorExample.java
new file mode 100644
index 0000000..e27ab69
--- /dev/null
+++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/PythonExecutorExample.java
@@ -0,0 +1,243 @@
+package org.xyzh.crontab.task.newsTask;
+
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Simplified examples of calling Python from Java.
+ * Meant for learning and understanding the core mechanics.
+ */
+public class PythonExecutorExample {
+
+    /**
+     * Example 1: the simplest way to run a script.
+     */
+    public static void example1_Simple() throws Exception {
+        // 1. Build the command; stderr is merged so an unread error pipe cannot stall the script
+        ProcessBuilder pb = new ProcessBuilder("python", "script.py", "arg1", "arg2");
+        pb.redirectErrorStream(true);
+
+        // 2. Start the process
+        Process process = pb.start();
+
+        // 3. Read the output; try-with-resources closes the pipe
+        try (BufferedReader reader = new BufferedReader(
+                new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
+            String line;
+            while ((line = reader.readLine()) != null) {
+                System.out.println("Python输出: " + line);
+            }
+        }
+
+        // 4. Wait for the process to finish
+        int exitCode = process.waitFor();
+        System.out.println("退出码: " + exitCode);
+    }
+
+    /**
+     * Example 2: get the result back through standard output.
+     *
+     * @return everything the script printed to stdout, with line breaks removed
+     */
+    public static String example2_GetResult() throws Exception {
+        ProcessBuilder pb = new ProcessBuilder("python", "script.py");
+        // stderr goes to this JVM's stderr: it cannot fill up unread and stays out of the result
+        pb.redirectError(ProcessBuilder.Redirect.INHERIT);
+        Process process = pb.start();
+
+        // Read all output
+        StringBuilder result = new StringBuilder();
+        try (BufferedReader reader = new BufferedReader(
+                new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
+            String line;
+            while ((line = reader.readLine()) != null) {
+                result.append(line);
+            }
+        }
+
+        process.waitFor();
+        return result.toString();
+    }
+
+    /**
+     * Example 3: timeout control.
+     */
+    public static void example3_WithTimeout() throws Exception {
+        ProcessBuilder pb = new ProcessBuilder("python", "script.py");
+        // The output is not read here; inheriting it keeps the script from blocking on a full pipe
+        pb.inheritIO();
+        Process process = pb.start();
+
+        // Wait with a timeout (5 seconds)
+        boolean finished = process.waitFor(5, TimeUnit.SECONDS);
+
+        if (!finished) {
+            // Timed out: terminate forcibly
+            process.destroyForcibly();
+            System.out.println("任务超时");
+        } else {
+            int exitCode = process.exitValue();
+            System.out.println("执行完成,退出码: " + exitCode);
+        }
+    }
+
+    /**
+     * Example 4: pass arguments on the command line.
+     */
+    public static void example4_PassArgs() throws Exception {
+        List<String> command = new ArrayList<>();
+        command.add("python");
+        command.add("script.py");
+        command.add("category=politics");
+        command.add("limit=20");
+
+        ProcessBuilder pb = new ProcessBuilder(command);
+        pb.inheritIO();
+        Process process = pb.start();
+
+        process.waitFor();
+    }
+
+    /**
+     * Example 5: pass arguments through standard input.
+     */
+    public static void example5_PassArgsByStdin() throws Exception {
+        ProcessBuilder pb = new ProcessBuilder("python", "script.py");
+        pb.redirectError(ProcessBuilder.Redirect.INHERIT);
+        Process process = pb.start();
+
+        // Write the arguments to stdin; closing the writer tells Python the input has ended
+        try (BufferedWriter writer = new BufferedWriter(
+                new OutputStreamWriter(process.getOutputStream(), StandardCharsets.UTF_8))) {
+            writer.write("{\"category\":\"politics\",\"limit\":20}");
+            writer.newLine();
+        }
+
+        // Read the first line of output
+        try (BufferedReader reader = new BufferedReader(
+                new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
+            String result = reader.readLine();
+            System.out.println("结果: " + result);
+        }
+
+        process.waitFor();
+    }
+
+    /**
+     * Example 6: handle Windows/Linux differences.
+     */
+    public static void example6_CrossPlatform() throws Exception {
+        List<String> command = new ArrayList<>();
+
+        String os = System.getProperty("os.name").toLowerCase();
+        if (os.contains("win")) {
+            // On Windows, run through cmd
+            command.add("cmd");
+            command.add("/c");
+            command.add("python");
+        } else {
+            // On Linux/Mac, run directly
+            command.add("python3");
+        }
+
+        command.add("script.py");
+
+        ProcessBuilder pb = new ProcessBuilder(command);
+        pb.inheritIO();
+        Process process = pb.start();
+        process.waitFor();
+    }
+
+    /**
+     * Example 7: complete error handling.
+     */
+    public static void example7_Complete() throws Exception {
+        ProcessBuilder pb = new ProcessBuilder("python", "script.py");
+
+        // Merge stdout and stderr
+        pb.redirectErrorStream(true);
+
+        // Set the working directory
+        pb.directory(new File("/path/to/script"));
+
+        // Capture the output in a file: reading the pipe on this thread would block until the
+        // script exits, and the timeout below would never take effect
+        File outputFile = File.createTempFile("python-output-", ".log");
+        pb.redirectOutput(outputFile);
+
+        Process process = null;
+        try {
+            process = pb.start();
+
+            // Wait for the process to finish (with a timeout)
+            boolean finished = process.waitFor(30, TimeUnit.SECONDS);
+
+            if (!finished) {
+                process.destroyForcibly();
+                throw new RuntimeException("任务超时");
+            }
+
+            String output = new String(Files.readAllBytes(outputFile.toPath()), StandardCharsets.UTF_8);
+            int exitCode = process.exitValue();
+
+            if (exitCode == 0) {
+                System.out.println("执行成功");
+                System.out.println("输出: " + output);
+            } else {
+                System.err.println("执行失败,退出码: " + exitCode);
+                System.err.println("错误输出: " + output);
+                throw new RuntimeException("Python执行失败");
+            }
+
+        } catch (InterruptedException e) {
+            // Keep the interrupt visible to the caller's thread
+            Thread.currentThread().interrupt();
+            throw new RuntimeException("执行异常", e);
+        } catch (Exception e) {
+            throw new RuntimeException("执行异常", e);
+        } finally {
+            // Clean up
+            if (process != null && process.isAlive()) {
+                process.destroyForcibly();
+            }
+            outputFile.delete();
+        }
+    }
+
+    /**
+     * Example 8: asynchronous execution (non-blocking).
+     */
+    public static void example8_Async() {
+        new Thread(() -> {
+            try {
+                ProcessBuilder pb = new ProcessBuilder("python", "script.py");
+                pb.redirectErrorStream(true);
+                Process process = pb.start();
+
+                // Read the output on the background thread
+                try (BufferedReader reader = new BufferedReader(
+                        new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
+                    String line;
+                    while ((line = reader.readLine()) != null) {
+                        System.out.println("后台输出: " + line);
+                    }
+                }
+
+                process.waitFor();
+                System.out.println("后台任务完成");
+
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            } catch (Exception e) {
+                System.err.println("后台任务失败: " + e);
+            }
+        }).start();
+
+        System.out.println("主线程继续执行...");
+    }
+}
+
diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/ScriptDomain.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/ScriptDomain.java new file mode 100644 index 0000000..4ec2584 --- /dev/null +++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/ScriptDomain.java @@ -0,0 +1,15 @@ +package org.xyzh.crontab.task.newsTask; + +import lombok.Data; + +@Data +public class ScriptDomain { + + private String name; + private String path; + private String method; + private String param; + private String output; + + +} diff --git a/schoolNewsServ/crontab/src/main/resources/appliaction.yml b/schoolNewsServ/crontab/src/main/resources/appliaction.yml new file mode 100644 index 0000000..abff1c8 --- /dev/null +++
b/schoolNewsServ/crontab/src/main/resources/appliaction.yml @@ -0,0 +1,34 @@ +crawler: + python: + path: C:/Python312/python.exe + base: + path: F:/Project/schoolNews/schoolNewsCrawler + script: + - name: xxx爬虫 + path: crawler/xxx.py + method: xxx + param: xxx + output: xxx + +crontab: + items: #可供前端选择的定时任务列表 + - name: 人民日报新闻爬取 + methods: #爬取方式 + - name: 关键字搜索爬取 + class: org.xyzh.crontab.task.newsTask.NewsCrawlerTask + path: crawler/xxx.py + params: + query: String #搜索关键字 + total: Integer #总新闻数量 + - name: 排行榜爬取 + class: org.xyzh.crontab.task.newsTask.NewsCrawlerTask + path: crawler/xxx.py + - name: 往日精彩头条爬取 + class: org.xyzh.crontab.task.newsTask.NewsCrawlerTask + path: crawler/xxx.py + params: + startDate: String #开始日期 + endDate: String #结束日期 + isYestoday: Boolean #是否是昨天 + + \ No newline at end of file diff --git a/schoolNewsServ/news/src/main/java/org/xyzh/news/controller/DataCollectionController.java b/schoolNewsServ/news/src/main/java/org/xyzh/news/controller/DataCollectionController.java deleted file mode 100644 index 8a5509c..0000000 --- a/schoolNewsServ/news/src/main/java/org/xyzh/news/controller/DataCollectionController.java +++ /dev/null @@ -1,120 +0,0 @@ -package org.xyzh.news.controller; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.web.bind.annotation.*; -import org.xyzh.api.news.collection.DataCollectionService; -import org.xyzh.common.core.domain.ResultDomain; -import org.xyzh.common.dto.resource.TbDataCollectionConfig; -import org.xyzh.common.dto.resource.TbDataCollectionLog; - -/** - * @description 数据采集控制器 - * @filename DataCollectionController.java - * @author yslg - * @copyright xyzh - * @since 2025-10-15 - */ -@RestController -@RequestMapping("/news/collection") -public class DataCollectionController { - private static final Logger logger = LoggerFactory.getLogger(DataCollectionController.class); - - @Autowired - private 
DataCollectionService dataCollectionService; - - /** - * 获取配置列表 - */ - @GetMapping("/config/list") - public ResultDomain getConfigList(TbDataCollectionConfig filter) { - return null; - // return dataCollectionService.getConfigList(filter); - } - - /** - * 根据ID获取配置详情 - */ - @GetMapping("/config/{configID}") - public ResultDomain getConfigById(@PathVariable String configID) { - return dataCollectionService.getConfigById(configID); - } - - /** - * 创建配置 - */ - @PostMapping("/config/create") - public ResultDomain createConfig(@RequestBody TbDataCollectionConfig config) { - return dataCollectionService.createConfig(config); - } - - /** - * 更新配置 - */ - @PutMapping("/config/update") - public ResultDomain updateConfig(@RequestBody TbDataCollectionConfig config) { - return dataCollectionService.updateConfig(config); - } - - /** - * 删除配置 - */ - @DeleteMapping("/config/{configID}") - public ResultDomain deleteConfig(@PathVariable String configID) { - return dataCollectionService.deleteConfig(configID); - } - - /** - * 更新配置状态 - */ - @PutMapping("/config/{configID}/status") - public ResultDomain updateConfigStatus( - @PathVariable String configID, - @RequestParam Integer status) { - return dataCollectionService.updateConfigStatus(configID, status); - } - - /** - * 获取日志列表 - */ - @GetMapping("/log/list") - public ResultDomain getLogList(TbDataCollectionLog filter) { - return null; - // return dataCollectionService.getLogList(filter); - } - - /** - * 根据ID获取日志详情 - */ - @GetMapping("/log/{logID}") - public ResultDomain getLogById(@PathVariable String logID) { - return dataCollectionService.getLogById(logID); - } - - /** - * 创建日志 - */ - @PostMapping("/log/create") - public ResultDomain createLog(@RequestBody TbDataCollectionLog log) { - return dataCollectionService.createLog(log); - } - - /** - * 删除日志 - */ - @DeleteMapping("/log/{logID}") - public ResultDomain deleteLog(@PathVariable String logID) { - return null; - // return dataCollectionService.deleteLog(logID); - } - - /** - * 
获取活跃配置 - */ - @GetMapping("/active") - public ResultDomain getActiveConfigs() { - return null; - // return dataCollectionService.getActiveConfigs(); - } -} diff --git a/schoolNewsServ/news/src/main/java/org/xyzh/news/controller/ResourceManagementController.java b/schoolNewsServ/news/src/main/java/org/xyzh/news/controller/ResourceManagementController.java deleted file mode 100644 index 331d62d..0000000 --- a/schoolNewsServ/news/src/main/java/org/xyzh/news/controller/ResourceManagementController.java +++ /dev/null @@ -1,253 +0,0 @@ -package org.xyzh.news.controller; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.web.bind.annotation.*; -import org.xyzh.common.core.domain.ResultDomain; -import org.xyzh.common.dto.resource.TbResource; -import org.xyzh.common.dto.resource.TbDataCollectionConfig; - -import java.util.Date; -import java.util.Map; - -/** - * @description 资源管理控制器 - * @filename ResourceManagementController.java - * @author yslg - * @copyright xyzh - * @since 2025-10-15 - */ -@RestController -@RequestMapping("/news/management") -public class ResourceManagementController { - private static final Logger logger = LoggerFactory.getLogger(ResourceManagementController.class); - - // ==================== 数据采集管理 ==================== - - /** - * 配置采集来源 - */ - @PostMapping("/collection/config-source") - public ResultDomain configCollectionSource(@RequestBody Map configData) { - // TODO: 实现配置采集来源 - return null; - } - - /** - * 设置采集频率 - */ - @PutMapping("/collection/frequency") - public ResultDomain setCollectionFrequency(@RequestBody Map params) { - // TODO: 实现设置采集频率(天/周) - return null; - } - - /** - * 手动触发采集 - */ - @PostMapping("/collection/manual-trigger") - public ResultDomain manualTriggerCollection(@RequestParam String configID) { - // TODO: 实现手动触发采集 - return null; - } - - /** - * 获取采集配置列表 - */ - @GetMapping("/collection/config-list") - public ResultDomain getCollectionConfigList() { - // TODO: 实现获取采集配置列表 - return null; - } - 
- /** - * 更新采集配置 - */ - @PutMapping("/collection/config-update") - public ResultDomain updateCollectionConfig(@RequestBody TbDataCollectionConfig config) { - // TODO: 实现更新采集配置 - return null; - } - - /** - * 删除采集配置 - */ - @DeleteMapping("/collection/config/{configID}") - public ResultDomain deleteCollectionConfig(@PathVariable String configID) { - // TODO: 实现删除采集配置 - return null; - } - - // ==================== 文章编辑管理 ==================== - - /** - * 手动新建文章 - */ - @PostMapping("/article/create") - public ResultDomain createArticle(@RequestBody Map articleData) { - // TODO: 实现手动新建文章(富文本编辑器,插入图片/链接) - return null; - } - - /** - * 编辑文章内容 - */ - @PutMapping("/article/edit") - public ResultDomain editArticle(@RequestBody TbResource article) { - // TODO: 实现编辑文章内容 - return null; - } - - /** - * 删除文章 - */ - @DeleteMapping("/article/{articleID}") - public ResultDomain deleteArticle(@PathVariable String articleID) { - // TODO: 实现删除文章 - return null; - } - - /** - * 设置文章状态 - */ - @PutMapping("/article/status") - public ResultDomain setArticleStatus(@RequestBody Map params) { - // TODO: 实现设置文章状态(草稿/已发布) - return null; - } - - /** - * 上传文章图片 - */ - @PostMapping("/article/upload-image") - public ResultDomain uploadArticleImage(@RequestParam("file") String file) { - // TODO: 实现上传文章图片 - return null; - } - - /** - * 插入文章链接 - */ - @PutMapping("/article/insert-link") - public ResultDomain insertArticleLink(@RequestBody Map params) { - // TODO: 实现插入文章链接 - return null; - } - - /** - * 获取文章编辑历史 - */ - @GetMapping("/article/edit-history/{articleID}") - public ResultDomain> getArticleEditHistory(@PathVariable String articleID) { - // TODO: 实现获取文章编辑历史 - return null; - } - - // ==================== 数据记录管理 ==================== - - /** - * 记录数据采集信息 - */ - @PostMapping("/record/collection") - public ResultDomain recordCollectionData(@RequestBody Map recordData) { - // TODO: 实现记录数据采集时间、采集数量、采集状态 - return null; - } - - /** - * 记录文章发布信息 - */ - @PostMapping("/record/publish") - public ResultDomain 
recordPublishData(@RequestBody Map publishData) { - // TODO: 实现记录文章发布时间、发布人、修改记录 - return null; - } - - /** - * 获取采集记录列表 - */ - @GetMapping("/record/collection-list") - public ResultDomain> getCollectionRecordList( - @RequestParam(required = false) Date startDate, - @RequestParam(required = false) Date endDate) { - // TODO: 实现获取采集记录列表 - return null; - } - - /** - * 获取发布记录列表 - */ - @GetMapping("/record/publish-list") - public ResultDomain> getPublishRecordList( - @RequestParam(required = false) String publisher, - @RequestParam(required = false) Date startDate, - @RequestParam(required = false) Date endDate) { - // TODO: 实现获取发布记录列表 - return null; - } - - // ==================== 自动发布管理 ==================== - - /** - * 配置文章自动发布时间 - */ - @PutMapping("/auto-publish/schedule") - public ResultDomain scheduleAutoPublish(@RequestBody Map scheduleData) { - // TODO: 实现配置文章自动发布时间 - return null; - } - - /** - * 设置发布前核验规则 - */ - @PutMapping("/auto-publish/verification") - public ResultDomain setVerificationRules(@RequestBody Map rules) { - // TODO: 实现设置发布前核验规则(如内容审核) - return null; - } - - /** - * 配置通知方式 - */ - @PutMapping("/auto-publish/notification") - public ResultDomain configNotification(@RequestBody Map notificationConfig) { - // TODO: 实现设置通知方式(邮件/站内信)、提醒格式 - return null; - } - - /** - * 开启/关闭自动发布 - */ - @PutMapping("/auto-publish/toggle") - public ResultDomain toggleAutoPublish(@RequestBody Map params) { - // TODO: 实现支持关闭自动发布 - return null; - } - - /** - * 获取自动发布配置 - */ - @GetMapping("/auto-publish/config") - public ResultDomain> getAutoPublishConfig() { - // TODO: 实现获取自动发布配置 - return null; - } - - /** - * 获取自动发布任务列表 - */ - @GetMapping("/auto-publish/task-list") - public ResultDomain> getAutoPublishTaskList() { - // TODO: 实现获取自动发布任务列表 - return null; - } - - /** - * 手动执行自动发布任务 - */ - @PostMapping("/auto-publish/execute") - public ResultDomain executeAutoPublishTask(@RequestParam String taskID) { - // TODO: 实现手动执行自动发布任务 - return null; - } -} diff --git 
a/schoolNewsServ/news/src/main/java/org/xyzh/news/mapper/DataCollectionConfigMapper.java b/schoolNewsServ/news/src/main/java/org/xyzh/news/mapper/DataCollectionConfigMapper.java deleted file mode 100644 index 9f202e3..0000000 --- a/schoolNewsServ/news/src/main/java/org/xyzh/news/mapper/DataCollectionConfigMapper.java +++ /dev/null @@ -1,147 +0,0 @@ -package org.xyzh.news.mapper; - -import com.baomidou.mybatisplus.core.mapper.BaseMapper; -import org.apache.ibatis.annotations.Mapper; -import org.apache.ibatis.annotations.Param; -import org.xyzh.common.core.page.PageParam; -import org.xyzh.common.dto.resource.TbDataCollectionConfig; - -import java.util.List; - -/** - * @description DataCollectionConfigMapper.java文件描述 数据采集配置数据访问层 - * @filename DataCollectionConfigMapper.java - * @author yslg - * @copyright xyzh - * @since 2025-10-15 - */ -@Mapper -public interface DataCollectionConfigMapper extends BaseMapper { - - /** - * @description 查询数据采集配置列表 - * @param filter 过滤条件 - * @return List 数据采集配置列表 - * @author yslg - * @since 2025-10-15 - */ - List selectDataCollectionConfigs(TbDataCollectionConfig filter); - - /** - * @description 根据配置ID查询配置信息 - * @param configId 配置ID - * @return TbDataCollectionConfig 配置信息 - * @author yslg - * @since 2025-10-15 - */ - TbDataCollectionConfig selectByConfigId(@Param("configId") String configId); - - /** - * @description 根据名称查询配置 - * @param name 配置名称 - * @return TbDataCollectionConfig 配置信息 - * @author yslg - * @since 2025-10-15 - */ - TbDataCollectionConfig selectByName(@Param("name") String name); - - /** - * @description 根据状态查询配置列表 - * @param status 状态 - * @return List 配置列表 - * @author yslg - * @since 2025-10-15 - */ - List selectByStatus(@Param("status") Integer status); - - /** - * @description 根据类型查询配置列表 - * @param type 类型 - * @return List 配置列表 - * @author yslg - * @since 2025-10-15 - */ - List selectByType(@Param("type") Integer type); - - /** - * @description 查询启用的配置列表 - * @return List 配置列表 - * @author yslg - * @since 2025-10-15 - */ 
- List selectActiveConfigs(); - - /** - * @description 检查配置名称是否存在 - * @param name 配置名称 - * @param excludeId 排除的配置ID(用于更新时排除自身) - * @return int 存在的数量 - * @author yslg - * @since 2025-10-15 - */ - int countByName(@Param("name") String name, @Param("excludeId") String excludeId); - - /** - * @description 插入数据采集配置 - * @param dataCollectionConfig 数据采集配置 - * @return int 影响行数 - * @author yslg - * @since 2025-10-15 - */ - int insertDataCollectionConfig(TbDataCollectionConfig dataCollectionConfig); - - /** - * @description 更新数据采集配置 - * @param dataCollectionConfig 数据采集配置 - * @return int 影响行数 - * @author yslg - * @since 2025-10-15 - */ - int updateDataCollectionConfig(TbDataCollectionConfig dataCollectionConfig); - - /** - * @description 删除数据采集配置 - * @param dataCollectionConfig 数据采集配置 - * @return int 影响行数 - * @author yslg - * @since 2025-10-15 - */ - int deleteDataCollectionConfig(TbDataCollectionConfig dataCollectionConfig); - - /** - * @description 批量插入数据采集配置 - * @param dataCollectionConfigList 数据采集配置列表 - * @return int 影响行数 - * @author yslg - * @since 2025-10-15 - */ - int batchInsertDataCollectionConfigs(@Param("dataCollectionConfigList") List dataCollectionConfigList); - - /** - * @description 批量删除数据采集配置 - * @param ids 配置ID列表 - * @return int 影响行数 - * @author yslg - * @since 2025-10-15 - */ - int batchDeleteDataCollectionConfigs(@Param("ids") List ids); - - /** - * @description 分页查询数据采集配置 - * @param filter 过滤条件 - * @param pageParam 分页参数 - * @return List 数据采集配置列表 - * @author yslg - * @since 2025-10-15 - */ - List selectDataCollectionConfigsPage(@Param("filter") TbDataCollectionConfig filter, @Param("pageParam") PageParam pageParam); - - /** - * @description 统计数据采集配置总数 - * @param filter 过滤条件 - * @return long 总数 - * @author yslg - * @since 2025-10-15 - */ - long countDataCollectionConfigs(@Param("filter") TbDataCollectionConfig filter); -} diff --git a/schoolNewsServ/news/src/main/java/org/xyzh/news/mapper/DataCollectionLogMapper.java 
b/schoolNewsServ/news/src/main/java/org/xyzh/news/mapper/DataCollectionLogMapper.java deleted file mode 100644 index 640005b..0000000 --- a/schoolNewsServ/news/src/main/java/org/xyzh/news/mapper/DataCollectionLogMapper.java +++ /dev/null @@ -1,147 +0,0 @@ -package org.xyzh.news.mapper; - -import com.baomidou.mybatisplus.core.mapper.BaseMapper; -import org.apache.ibatis.annotations.Mapper; -import org.apache.ibatis.annotations.Param; -import org.xyzh.common.core.page.PageParam; -import org.xyzh.common.dto.resource.TbDataCollectionLog; - -import java.util.List; - -/** - * @description DataCollectionLogMapper.java文件描述 数据采集记录数据访问层 - * @filename DataCollectionLogMapper.java - * @author yslg - * @copyright xyzh - * @since 2025-10-15 - */ -@Mapper -public interface DataCollectionLogMapper extends BaseMapper { - - /** - * @description 查询数据采集记录列表 - * @param filter 过滤条件 - * @return List 数据采集记录列表 - * @author yslg - * @since 2025-10-15 - */ - List selectDataCollectionLogs(TbDataCollectionLog filter); - - /** - * @description 根据记录ID查询记录信息 - * @param logId 记录ID - * @return TbDataCollectionLog 记录信息 - * @author yslg - * @since 2025-10-15 - */ - TbDataCollectionLog selectByLogId(@Param("logId") String logId); - - /** - * @description 根据配置ID查询记录列表 - * @param configId 配置ID - * @return List 记录列表 - * @author yslg - * @since 2025-10-15 - */ - List selectByConfigId(@Param("configId") String configId); - - /** - * @description 根据状态查询记录列表 - * @param status 状态 - * @return List 记录列表 - * @author yslg - * @since 2025-10-15 - */ - List selectByStatus(@Param("status") Integer status); - - /** - * @description 根据类型查询记录列表 - * @param type 类型 - * @return List 记录列表 - * @author yslg - * @since 2025-10-15 - */ - List selectByType(@Param("type") Integer type); - - /** - * @description 查询最新的记录列表 - * @param limit 限制数量 - * @return List 记录列表 - * @author yslg - * @since 2025-10-15 - */ - List selectLatestLogs(@Param("limit") Integer limit); - - /** - * @description 查询采集统计信息 - * @param configId 配置ID - * 
@return TbDataCollectionLog 统计信息 - * @author yslg - * @since 2025-10-15 - */ - TbDataCollectionLog selectCollectionStatistics(@Param("configId") String configId); - - /** - * @description 插入数据采集记录 - * @param dataCollectionLog 数据采集记录 - * @return int 影响行数 - * @author yslg - * @since 2025-10-15 - */ - int insertDataCollectionLog(TbDataCollectionLog dataCollectionLog); - - /** - * @description 更新数据采集记录 - * @param dataCollectionLog 数据采集记录 - * @return int 影响行数 - * @author yslg - * @since 2025-10-15 - */ - int updateDataCollectionLog(TbDataCollectionLog dataCollectionLog); - - /** - * @description 删除数据采集记录 - * @param dataCollectionLog 数据采集记录 - * @return int 影响行数 - * @author yslg - * @since 2025-10-15 - */ - int deleteDataCollectionLog(TbDataCollectionLog dataCollectionLog); - - /** - * @description 批量插入数据采集记录 - * @param dataCollectionLogList 数据采集记录列表 - * @return int 影响行数 - * @author yslg - * @since 2025-10-15 - */ - int batchInsertDataCollectionLogs(@Param("dataCollectionLogList") List dataCollectionLogList); - - /** - * @description 批量删除数据采集记录 - * @param ids 记录ID列表 - * @return int 影响行数 - * @author yslg - * @since 2025-10-15 - */ - int batchDeleteDataCollectionLogs(@Param("ids") List ids); - - /** - * @description 分页查询数据采集记录 - * @param filter 过滤条件 - * @param pageParam 分页参数 - * @return List 数据采集记录列表 - * @author yslg - * @since 2025-10-15 - */ - List selectDataCollectionLogsPage(@Param("filter") TbDataCollectionLog filter, @Param("pageParam") PageParam pageParam); - - /** - * @description 统计数据采集记录总数 - * @param filter 过滤条件 - * @return long 总数 - * @author yslg - * @since 2025-10-15 - */ - long countDataCollectionLogs(@Param("filter") TbDataCollectionLog filter); -} diff --git a/schoolNewsServ/news/src/main/java/org/xyzh/news/service/NCDataCollectionService.java b/schoolNewsServ/news/src/main/java/org/xyzh/news/service/NCDataCollectionService.java deleted file mode 100644 index fb7d09d..0000000 --- 
a/schoolNewsServ/news/src/main/java/org/xyzh/news/service/NCDataCollectionService.java +++ /dev/null @@ -1,14 +0,0 @@ -package org.xyzh.news.service; - -import org.xyzh.api.news.collection.DataCollectionService; - -/** - * @description 数据采集服务接口 - * @filename NCDataCollectionService.java - * @author yslg - * @copyright xyzh - * @since 2025-10-15 - */ -public interface NCDataCollectionService extends DataCollectionService { - -} diff --git a/schoolNewsServ/news/src/main/java/org/xyzh/news/service/impl/NCDataCollectionServiceImpl.java b/schoolNewsServ/news/src/main/java/org/xyzh/news/service/impl/NCDataCollectionServiceImpl.java deleted file mode 100644 index b0f9095..0000000 --- a/schoolNewsServ/news/src/main/java/org/xyzh/news/service/impl/NCDataCollectionServiceImpl.java +++ /dev/null @@ -1,126 +0,0 @@ -package org.xyzh.news.service.impl; - -import java.util.Date; -import java.util.List; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.stereotype.Service; -import org.xyzh.common.core.domain.ResultDomain; -import org.xyzh.common.dto.resource.TbDataCollectionConfig; -import org.xyzh.common.dto.resource.TbDataCollectionLog; -import org.xyzh.news.mapper.DataCollectionConfigMapper; -import org.xyzh.news.mapper.DataCollectionLogMapper; -import org.xyzh.api.news.collection.DataCollectionService; - -/** - * @description 数据采集服务实现类 - * @filename NCDataCollectionServiceImpl.java - * @author yslg - * @copyright xyzh - * @since 2025-10-15 - */ -@Service -public class NCDataCollectionServiceImpl implements DataCollectionService { - - private static final Logger logger = LoggerFactory.getLogger(NCDataCollectionServiceImpl.class); - - @Autowired - private DataCollectionConfigMapper dataCollectionConfigMapper; - - @Autowired - private DataCollectionLogMapper dataCollectionLogMapper; - - @Override - public ResultDomain batchExecuteCollection(List configIDs) { - // TODO 
Auto-generated method stub - return null; - } - - @Override - public ResultDomain createConfig(TbDataCollectionConfig config) { - // TODO Auto-generated method stub - return null; - } - - @Override - public ResultDomain createLog(TbDataCollectionLog log) { - // TODO Auto-generated method stub - return null; - } - - @Override - public ResultDomain deleteConfig(String configID) { - // TODO Auto-generated method stub - return null; - } - - @Override - public ResultDomain executeCollection(String configID) { - // TODO Auto-generated method stub - return null; - } - - @Override - public ResultDomain getCollectionStatus(String configID) { - // TODO Auto-generated method stub - return null; - } - - @Override - public ResultDomain getConfigById(String configID) { - // TODO Auto-generated method stub - return null; - } - - @Override - public ResultDomain getConfigList(Integer status) { - // TODO Auto-generated method stub - return null; - } - - @Override - public ResultDomain getConfigStatistics(String configID) { - // TODO Auto-generated method stub - return null; - } - - @Override - public ResultDomain getLogById(String logID) { - // TODO Auto-generated method stub - return null; - } - - @Override - public ResultDomain getLogList(String configID, Date startDate, Date endDate) { - // TODO Auto-generated method stub - return null; - } - - @Override - public ResultDomain stopCollection(String configID) { - // TODO Auto-generated method stub - return null; - } - - @Override - public ResultDomain updateConfig(TbDataCollectionConfig config) { - // TODO Auto-generated method stub - return null; - } - - @Override - public ResultDomain updateConfigStatus(String configID, Integer status) { - // TODO Auto-generated method stub - return null; - } - - @Override - public ResultDomain updateLastCollectTime(String configID, Date lastCollectTime) { - // TODO Auto-generated method stub - return null; - } - - -} diff --git 
a/schoolNewsServ/news/src/main/resources/mapper/DataCollectionConfigMapper.xml b/schoolNewsServ/news/src/main/resources/mapper/DataCollectionConfigMapper.xml deleted file mode 100644 index 6f52676..0000000 --- a/schoolNewsServ/news/src/main/resources/mapper/DataCollectionConfigMapper.xml +++ /dev/null @@ -1,216 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - id, name, source_url, source_type, frequency, tag_id, status, - last_collect_time, creator, updater, create_time, update_time, - delete_time, deleted - - - - - - deleted = 0 - - AND name LIKE CONCAT('%', #{name}, '%') - - - AND source_type = #{sourceType} - - - AND frequency = #{frequency} - - - AND tag_id = #{tagID} - - - AND status = #{status} - - - - - - - - - - - - - - - - - - - - - - - - - - - - INSERT INTO tb_data_collection_config ( - id, name, source_url, source_type, frequency, tag_id, status, - last_collect_time, creator, updater, create_time, update_time, - delete_time, deleted - ) VALUES ( - #{id}, #{name}, #{sourceUrl}, #{sourceType}, #{frequency}, #{tagID}, #{status}, - #{lastCollectTime}, #{creator}, #{updater}, #{createTime}, #{updateTime}, - #{deleteTime}, #{deleted} - ) - - - - - UPDATE tb_data_collection_config - - - name = #{name}, - - - source_url = #{sourceUrl}, - - - source_type = #{sourceType}, - - - frequency = #{frequency}, - - - tag_id = #{tagID}, - - - status = #{status}, - - - last_collect_time = #{lastCollectTime}, - - - updater = #{updater}, - - - update_time = #{updateTime}, - - - delete_time = #{deleteTime}, - - - deleted = #{deleted}, - - - WHERE id = #{id} - - - - - DELETE FROM tb_data_collection_config - WHERE id = #{id} - - - - - INSERT INTO tb_data_collection_config ( - id, name, source_url, source_type, frequency, tag_id, status, - last_collect_time, creator, updater, create_time, update_time, - delete_time, deleted - ) VALUES - - ( - #{item.id}, #{item.name}, #{item.sourceUrl}, #{item.sourceType}, #{item.frequency}, - #{item.tagID}, #{item.status}, 
#{item.lastCollectTime}, #{item.creator}, - #{item.updater}, #{item.createTime}, #{item.updateTime}, #{item.deleteTime}, #{item.deleted} - ) - - - - - - DELETE FROM tb_data_collection_config - WHERE id IN - - #{id} - - - - - - - - - - diff --git a/schoolNewsServ/news/src/main/resources/mapper/DataCollectionLogMapper.xml b/schoolNewsServ/news/src/main/resources/mapper/DataCollectionLogMapper.xml deleted file mode 100644 index b5926c2..0000000 --- a/schoolNewsServ/news/src/main/resources/mapper/DataCollectionLogMapper.xml +++ /dev/null @@ -1,188 +0,0 @@ - - - - - - - - - - - - - - - - - - - id, config_id, collect_count, success_count, fail_count, status, - message, collect_time - - - - - - - AND config_id = #{configID} - - - AND status = #{status} - - - - - - - - - - - - - - - - - - - - - - - - - - - - INSERT INTO tb_data_collection_log ( - id, config_id, collect_count, success_count, fail_count, status, - message, collect_time - ) VALUES ( - #{id}, #{configID}, #{collectCount}, #{successCount}, #{failCount}, #{status}, - #{message}, #{collectTime} - ) - - - - - UPDATE tb_data_collection_log - - - config_id = #{configID}, - - - collect_count = #{collectCount}, - - - success_count = #{successCount}, - - - fail_count = #{failCount}, - - - status = #{status}, - - - message = #{message}, - - - collect_time = #{collectTime}, - - - WHERE id = #{id} - - - - - DELETE FROM tb_data_collection_log - WHERE id = #{id} - - - - - INSERT INTO tb_data_collection_log ( - id, config_id, collect_count, success_count, fail_count, status, - message, collect_time - ) VALUES - - ( - #{item.id}, #{item.configID}, #{item.collectCount}, #{item.successCount}, - #{item.failCount}, #{item.status}, #{item.message}, #{item.collectTime} - ) - - - - - - DELETE FROM tb_data_collection_log - WHERE id IN - - #{id} - - - - - - - - - -