调试修改爬虫

This commit is contained in:
2025-11-12 19:16:50 +08:00
parent 675e6da7d7
commit e55a52f20b
27 changed files with 1023 additions and 601 deletions

View File

@@ -63,6 +63,11 @@ public class DataCollectionItemController {
return itemService.convertToResource(request.getItemId(), request.getTagId());
}
/**
 * @description Update the status of a single collection item
 * @param itemId collection item ID, bound from the {itemId} path segment
 * @param status new status value, bound from the {status} path segment
 * @return ResultDomain result returned by itemService.updateItemStatus
 */
@PutMapping("/{itemId}/status/{status}")
public ResultDomain<String> updateItemStatus(
        @PathVariable(name = "itemId") String itemId,
        @PathVariable(name = "status") int status) {
    // Pure delegation: the service builds the success / failure result.
    ResultDomain<String> outcome = itemService.updateItemStatus(itemId, status);
    return outcome;
}
/**
* @description 转换请求
*/

View File

@@ -28,6 +28,15 @@ public interface CrontabLogMapper extends BaseMapper<TbCrontabLog> {
*/
int insertLog(@Param("log") TbCrontabLog log);
/**
 * @description Update an existing execution log record.
 *              The matching "updateLog" mapper XML statement selects the row by log.ID
 *              (non-deleted rows only), writes executeStatus, executeMessage, exceptionInfo,
 *              endTime and executeDuration only when they are non-null, and always refreshes
 *              update_time.
 * @param log log data; its ID identifies the row, null fields are left unchanged
 * @return int number of affected rows
 * @author yslg
 * @since 2025-11-12
 */
int updateLog(@Param("log") TbCrontabLog log);
/**
* @description 根据ID查询日志
* @param logId 日志ID

View File

@@ -84,6 +84,16 @@ public interface DataCollectionItemMapper extends BaseMapper<TbDataCollectionIte
*/
long countByStatus(@Param("taskId") String taskId, @Param("status") Integer status);
/**
 * @description Update the status of a collection item.
 *              The matching "updateItemStatus" mapper XML statement only touches
 *              the row whose id equals itemId and whose deleted flag is 0.
 * @param itemId collection item ID
 * @param status new status value
 * @return int number of affected rows
 * @author yslg
 * @since 2025-11-08
 */
int updateItemStatus(@Param("itemId") String itemId, @Param("status") Integer status);
// ==================== VO查询方法(使用JOIN返回完整VO) ====================
/**

View File

@@ -58,6 +58,10 @@ public class TaskExecutor {
log.setDeleted(false);
try {
log.setExecuteStatus(0);
log.setExecuteMessage("执行中");
int i = logMapper.insertLog(log);
// 检查是否允许并发执行
if (task.getConcurrent() == 0) {
// TODO: 可以添加分布式锁来防止并发执行
@@ -84,7 +88,7 @@ public class TaskExecutor {
log.setEndTime(endTime);
log.setExecuteDuration((int) (endTime.getTime() - startTime.getTime()));
log.setExecuteStatus(1);
log.setExecuteMessage("执行成功");
log.setExecuteMessage(null);
logger.info("任务执行成功: {} [{}ms]", task.getTaskName(), log.getExecuteDuration());
} catch (Exception e) {
@@ -100,7 +104,7 @@ public class TaskExecutor {
} finally {
// 保存执行日志
try {
logMapper.insertLog(log);
logMapper.updateLog(log);
} catch (Exception e) {
logger.error("保存任务执行日志失败: {}", task.getTaskName(), e);
}

View File

@@ -17,10 +17,13 @@ import org.xyzh.common.utils.IDUtils;
import org.xyzh.common.vo.DataCollectionItemVO;
import org.xyzh.common.vo.ResourceVO;
import org.xyzh.crontab.mapper.DataCollectionItemMapper;
import org.xyzh.crontab.mapper.CrontabLogMapper;
import org.xyzh.crontab.mapper.CrontabTaskMapper;
import org.xyzh.common.dto.crontab.TbCrontabLog;
import org.xyzh.common.dto.crontab.TbCrontabTask;
import org.xyzh.system.utils.LoginUtil;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
@@ -42,6 +45,9 @@ public class DataCollectionItemServiceImpl implements DataCollectionItemService
@Autowired
private CrontabTaskMapper taskMapper;
@Autowired
private CrontabLogMapper logMapper;
@Autowired
private ResourceService resourceService;
@@ -100,11 +106,23 @@ public class DataCollectionItemServiceImpl implements DataCollectionItemService
int successCount = 0;
Date now = new Date();
int result = itemMapper.batchInsertItems(itemList);
if (result > 0) {
successCount = result;
List<TbDataCollectionItem> newItems = new ArrayList<>();
for (TbDataCollectionItem it : itemList) {
TbDataCollectionItem existing = itemMapper.selectBySourceUrl(it.getSourceUrl());
if (existing == null) {
newItems.add(it);
}
}
if (!newItems.isEmpty()) {
successCount = itemMapper.batchInsertItems(newItems);
}
String logId = itemList.get(0).getLogId();
TbCrontabLog log = new TbCrontabLog();
log.setID(logId);
log.setExecuteStatus(1);
log.setExecuteMessage("爬取成功,共" + itemList.size() + "条,新增" + successCount + "");
int i = logMapper.updateLog(log);
logger.info("批量创建采集项成功,共{}条,成功{}条", itemList.size(), successCount);
resultDomain.success("批量创建采集项成功", successCount);
@@ -404,5 +422,21 @@ public class DataCollectionItemServiceImpl implements DataCollectionItemService
return resultDomain;
}
/**
 * @description Update the status of a collection item and wrap the outcome in a ResultDomain
 * @param itemId collection item ID
 * @param status new status value
 * @return ResultDomain success carrying itemId when at least one row was updated,
 *         otherwise a failure; mapper exceptions are logged and reported as a failure
 */
@Override
public ResultDomain<String> updateItemStatus(String itemId, int status) {
    ResultDomain<String> response = new ResultDomain<>();
    try {
        boolean rowUpdated = itemMapper.updateItemStatus(itemId, status) > 0;
        if (!rowUpdated) {
            // No row matched the id (or it is deleted): report a plain failure.
            response.fail("更新采集项状态失败");
            return response;
        }
        response.success("更新采集项状态成功", itemId);
    } catch (Exception ex) {
        logger.error("更新采集项状态异常: ", ex);
        response.fail("更新采集项状态异常: " + ex.getMessage());
    }
    return response;
}
}

View File

@@ -23,6 +23,9 @@ public class ArticleStruct {
private String publishTime;
private String author;
private String source;
private String logId;
private Integer executeStatus;
private String executeMessage;
private List<RowStruct> contentRows;
@Data

View File

@@ -158,7 +158,8 @@ public class NewsCrawlerTask extends PythonCommandTask {
item.setTaskId(taskId);
item.setLogId(logId);
item.setTitle(news.getTitle());
item.setExecuteStatus(news.getExecuteStatus());
item.setExecuteMessage(news.getExecuteMessage());
// 拼接HTML内容
if (news.getContentRows() != null && !news.getContentRows().isEmpty()) {
StringBuilder html = new StringBuilder();

View File

@@ -99,6 +99,20 @@
</trim>
</insert>
<!--
  updateLog: partial update of one tb_crontab_log row, selected by id = log.ID
  and restricted to rows with deleted = 0.
  execute_status, execute_message, exception_info, end_time and execute_duration
  are written only when the corresponding property is non-null, so a null
  property leaves the stored value untouched.
  update_time is always set to NOW(); being unconditional and last, it also keeps
  the SET clause syntactically valid when every optional field is null.
-->
<update id="updateLog">
UPDATE tb_crontab_log
SET
<if test="log.executeStatus != null">execute_status = #{log.executeStatus},</if>
<if test="log.executeMessage != null">execute_message = #{log.executeMessage},</if>
<if test="log.exceptionInfo != null">exception_info = #{log.exceptionInfo},</if>
<if test="log.endTime != null">end_time = #{log.endTime},</if>
<if test="log.executeDuration != null">execute_duration = #{log.executeDuration},</if>
update_time = NOW()
WHERE id = #{log.ID} AND deleted = 0
</update>
<!-- 根据ID查询日志 -->
<select id="selectLogById" resultMap="BaseResultMap">
SELECT

View File

@@ -25,6 +25,8 @@
<result column="crawl_time" property="crawlTime" />
<result column="process_time" property="processTime" />
<result column="processor" property="processor" />
<result column="execute_status" property="executeStatus" />
<result column="execute_message" property="executeMessage" />
<result column="create_time" property="createTime" />
<result column="update_time" property="updateTime" />
<result column="delete_time" property="deleteTime" />
@@ -53,6 +55,8 @@
<result column="crawl_time" property="crawlTime" />
<result column="process_time" property="processTime" />
<result column="processor" property="processor" />
<result column="item_execute_status" property="itemExecuteStatus" />
<result column="item_execute_message" property="itemExecuteMessage" />
<result column="item_create_time" property="createTime" />
<result column="item_update_time" property="updateTime" />
@@ -74,7 +78,7 @@
<sql id="Base_Column_List">
id, task_id, log_id, title, content, summary, source, source_url, category, author,
publish_time, cover_image, images, tags, status, resource_id, crawl_time, process_time,
processor, create_time, update_time, delete_time, deleted
processor, execute_status, execute_message, create_time, update_time, delete_time, deleted
</sql>
<!-- VO查询字段列表(包含关联表) -->
@@ -98,6 +102,8 @@
i.crawl_time,
i.process_time,
i.processor,
i.execute_status as item_execute_status,
i.execute_message as item_execute_message,
i.create_time as item_create_time,
i.update_time as item_update_time,
t.task_name,
@@ -259,7 +265,7 @@
INSERT INTO tb_data_collection_item (
id, task_id, log_id, title, content, summary, source, source_url,
category, author, publish_time, cover_image, images, tags, status,
resource_id, crawl_time, process_time, processor,
resource_id, crawl_time, process_time, processor, execute_status, execute_message,
create_time, update_time, deleted
)
VALUES
@@ -269,7 +275,7 @@
#{item.summary}, #{item.source}, #{item.sourceUrl}, #{item.category},
#{item.author}, #{item.publishTime}, #{item.coverImage}, #{item.images},
#{item.tags}, #{item.status}, #{item.resourceId}, #{item.crawlTime},
#{item.processTime}, #{item.processor},
#{item.processTime}, #{item.processor}, #{item.executeStatus}, #{item.executeMessage},
NOW(), NOW(), 0
)
</foreach>
@@ -397,4 +403,12 @@
ORDER BY i.create_time DESC
</select>
<!--
  updateItemStatus: set the status of one collection item selected by id,
  restricted to rows with deleted = 0.
  update_time is refreshed in the same statement so the row's modification
  time reflects the status change.
-->
<update id="updateItemStatus">
    UPDATE tb_data_collection_item
    SET status = #{status},
        update_time = NOW()
    WHERE id = #{itemId}
      AND deleted = 0
</update>
</mapper>