搜索关键字爬虫
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
package org.xyzh.crontab.config;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
import lombok.Data;
|
||||
import org.springframework.stereotype.Component;
|
||||
@@ -9,8 +10,10 @@ import org.springframework.stereotype.Component;
|
||||
@Component
|
||||
public class CrawlerProperties {
|
||||
|
||||
@Value("${crawler.pythonPath}")
|
||||
private String pythonPath;
|
||||
|
||||
@Value("${crawler.basePath}")
|
||||
private String basePath;
|
||||
|
||||
}
|
||||
|
||||
@@ -12,6 +12,10 @@ import org.xyzh.common.dto.crontab.TbCrontabTask;
|
||||
import org.xyzh.common.dto.crontab.TbCrontabLog;
|
||||
import org.xyzh.common.utils.IDUtils;
|
||||
import org.xyzh.crontab.pojo.CrontabItem;
|
||||
|
||||
import com.alibaba.fastjson2.JSON;
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
|
||||
import org.xyzh.common.utils.spring.SpringContextUtil;
|
||||
import org.xyzh.crontab.config.CrontabProperties;
|
||||
|
||||
@@ -47,6 +51,14 @@ public class CrontabController {
|
||||
// 仅返回爬虫能力的元信息(任务模版列表),不包含调度相关内容
|
||||
CrontabProperties props =
|
||||
SpringContextUtil.getBean(CrontabProperties.class);
|
||||
String jString = JSON.toJSONString(props);
|
||||
props = JSON.parseObject(jString, CrontabProperties.class);
|
||||
props.getItems().forEach(item->item.getMethods().forEach(
|
||||
method->{
|
||||
method.setClazz(null);
|
||||
method.setExcuete_method(null);
|
||||
method.setPath(null);
|
||||
}));
|
||||
rd.success("ok", props.getItems());
|
||||
} catch (Exception e) {
|
||||
rd.fail("获取可创建定时任务失败: " + e.getMessage());
|
||||
@@ -63,6 +75,25 @@ public class CrontabController {
|
||||
public ResultDomain<TbCrontabTask> createCrontab(@RequestBody TbCrontabTask crontabItem) {
|
||||
ResultDomain<TbCrontabTask> rd = new ResultDomain<>();
|
||||
try {
|
||||
// 根据taskGroup和methodName查找配置并填充beanName和methodName
|
||||
if (crontabItem.getBeanName() == null || crontabItem.getBeanName().isEmpty()) {
|
||||
CrontabItem.CrontabMethod method = findMethodByTaskGroupAndMethodName(
|
||||
crontabItem.getTaskGroup(),
|
||||
crontabItem.getMethodName()
|
||||
);
|
||||
if (method != null) {
|
||||
crontabItem.setBeanName(method.getClazz()); // 设置Bean名称
|
||||
crontabItem.setMethodName(method.getExcuete_method()); // 设置执行方法名
|
||||
JSONObject methodParams = JSON.parseObject(crontabItem.getMethodParams());
|
||||
methodParams.put("scriptPath", method.getPath());
|
||||
crontabItem.setMethodParams(methodParams.toJSONString());
|
||||
|
||||
} else {
|
||||
rd.fail("未找到对应的配置: taskGroup=" + crontabItem.getTaskGroup()
|
||||
+ ", methodName=" + crontabItem.getMethodName());
|
||||
return rd;
|
||||
}
|
||||
}
|
||||
return crontabService.createTask(crontabItem);
|
||||
} catch (Exception e) {
|
||||
logger.error("创建定时任务失败", e);
|
||||
@@ -71,6 +102,27 @@ public class CrontabController {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 根据taskGroup和methodName查找对应的方法配置
|
||||
*/
|
||||
private CrontabItem.CrontabMethod findMethodByTaskGroupAndMethodName(String taskGroup, String methodName) {
|
||||
CrontabProperties props = SpringContextUtil.getBean(CrontabProperties.class);
|
||||
if (props == null || props.getItems() == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
for (CrontabItem item : props.getItems()) {
|
||||
if (item.getName().equals(taskGroup)) {
|
||||
for (CrontabItem.CrontabMethod method : item.getMethods()) {
|
||||
if (method.getName().equals(methodName)) {
|
||||
return method;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* 更新定时任务
|
||||
* @param crontabItem
|
||||
@@ -80,6 +132,21 @@ public class CrontabController {
|
||||
public ResultDomain<TbCrontabTask> updateCrontab(@RequestBody TbCrontabTask crontabItem) {
|
||||
ResultDomain<TbCrontabTask> rd = new ResultDomain<>();
|
||||
try {
|
||||
// 根据taskGroup和methodName查找配置并填充beanName和methodName
|
||||
if (crontabItem.getBeanName() == null || crontabItem.getBeanName().isEmpty()) {
|
||||
CrontabItem.CrontabMethod method = findMethodByTaskGroupAndMethodName(
|
||||
crontabItem.getTaskGroup(),
|
||||
crontabItem.getMethodName()
|
||||
);
|
||||
if (method != null) {
|
||||
crontabItem.setBeanName(method.getClazz()); // 设置Bean名称
|
||||
crontabItem.setMethodName(method.getExcuete_method()); // 设置执行方法名
|
||||
} else {
|
||||
rd.fail("未找到对应的配置: taskGroup=" + crontabItem.getTaskGroup()
|
||||
+ ", methodName=" + crontabItem.getMethodName());
|
||||
return rd;
|
||||
}
|
||||
}
|
||||
return crontabService.updateTask(crontabItem);
|
||||
} catch (Exception e) {
|
||||
logger.error("更新定时任务失败", e);
|
||||
@@ -146,6 +213,88 @@ public class CrontabController {
|
||||
return rd;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 根据ID查询日志详情
|
||||
* @param logId 日志ID
|
||||
* @return ResultDomain<TbCrontabLog>
|
||||
*/
|
||||
@GetMapping("/log/{logId}")
|
||||
public ResultDomain<TbCrontabLog> getLogById(@PathVariable(required = true, name="logId") String logId) {
|
||||
ResultDomain<TbCrontabLog> rd = new ResultDomain<>();
|
||||
try {
|
||||
return crontabService.getLogById(logId);
|
||||
} catch (Exception e) {
|
||||
logger.error("获取日志详情失败", e);
|
||||
rd.fail("获取日志详情失败: " + e.getMessage());
|
||||
return rd;
|
||||
}
|
||||
}
|
||||
|
||||
@GetMapping("/task/validate")
|
||||
public ResultDomain<String> validateCronExpression(@RequestParam(required = true, name="cronExpression") String cronExpression) {
|
||||
ResultDomain<String> rd = new ResultDomain<>();
|
||||
try {
|
||||
return crontabService.validateCronExpression(cronExpression);
|
||||
} catch (Exception e) {
|
||||
logger.error("验证Cron表达式失败", e);
|
||||
rd.fail("验证Cron表达式失败: " + e.getMessage());
|
||||
return rd;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* @description 启动定时任务
|
||||
* @param
|
||||
* @author yslg
|
||||
* @since 2025-11-11
|
||||
*/
|
||||
@PostMapping("/task/start/{taskId}")
|
||||
public ResultDomain<TbCrontabTask> startTask(@PathVariable(required = true, name="taskId") String taskId) {
|
||||
ResultDomain<TbCrontabTask> rd = new ResultDomain<>();
|
||||
try {
|
||||
return crontabService.startTask(taskId);
|
||||
} catch (Exception e) {
|
||||
logger.error("启动定时任务失败", e);
|
||||
rd.fail("启动定时任务失败: " + e.getMessage());
|
||||
return rd;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @description 暂停定时任务
|
||||
* @param
|
||||
* @author yslg
|
||||
* @since 2025-11-11
|
||||
*/
|
||||
@PostMapping("/task/pause/{taskId}")
|
||||
public ResultDomain<TbCrontabTask> pauseTask(@PathVariable(required = true, name="taskId") String taskId) {
|
||||
ResultDomain<TbCrontabTask> rd = new ResultDomain<>();
|
||||
try {
|
||||
return crontabService.pauseTask(taskId);
|
||||
} catch (Exception e) {
|
||||
logger.error("暂停定时任务失败", e);
|
||||
rd.fail("暂停定时任务失败: " + e.getMessage());
|
||||
return rd;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @description 立即执行一次任务
|
||||
* @param
|
||||
* @author yslg
|
||||
* @since 2025-11-11
|
||||
*/
|
||||
@PostMapping("/task/execute/{taskId}")
|
||||
public ResultDomain<TbCrontabTask> executeTaskOnce(@PathVariable(required = true, name="taskId") String taskId) {
|
||||
ResultDomain<TbCrontabTask> rd = new ResultDomain<>();
|
||||
try {
|
||||
return crontabService.executeTaskOnce(taskId);
|
||||
} catch (Exception e) {
|
||||
logger.error("执行定时任务失败", e);
|
||||
rd.fail("执行定时任务失败: " + e.getMessage());
|
||||
return rd;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ import org.apache.ibatis.annotations.Mapper;
|
||||
import org.apache.ibatis.annotations.Param;
|
||||
import org.xyzh.common.core.page.PageParam;
|
||||
import org.xyzh.common.dto.crontab.TbDataCollectionItem;
|
||||
import org.xyzh.common.vo.DataCollectionItemVO;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@@ -82,5 +83,45 @@ public interface DataCollectionItemMapper extends BaseMapper<TbDataCollectionIte
|
||||
* @since 2025-11-08
|
||||
*/
|
||||
long countByStatus(@Param("taskId") String taskId, @Param("status") Integer status);
|
||||
|
||||
// ==================== VO查询方法(使用JOIN返回完整VO) ====================
|
||||
|
||||
/**
|
||||
* @description 根据ID查询采集项VO(包含关联的任务和日志信息)
|
||||
* @param itemId 采集项ID
|
||||
* @return DataCollectionItemVO 采集项VO
|
||||
* @author yslg
|
||||
* @since 2025-11-08
|
||||
*/
|
||||
DataCollectionItemVO selectVOById(@Param("itemId") String itemId);
|
||||
|
||||
/**
|
||||
* @description 查询采集项VO列表(包含关联的任务和日志信息)
|
||||
* @param filter 过滤条件
|
||||
* @return List<DataCollectionItemVO> 采集项VO列表
|
||||
* @author yslg
|
||||
* @since 2025-11-08
|
||||
*/
|
||||
List<DataCollectionItemVO> selectVOList(TbDataCollectionItem filter);
|
||||
|
||||
/**
|
||||
* @description 分页查询采集项VO列表(包含关联的任务和日志信息)
|
||||
* @param filter 过滤条件
|
||||
* @param pageParam 分页参数
|
||||
* @return List<DataCollectionItemVO> 采集项VO列表
|
||||
* @author yslg
|
||||
* @since 2025-11-08
|
||||
*/
|
||||
List<DataCollectionItemVO> selectVOPage(@Param("filter") TbDataCollectionItem filter, @Param("pageParam") PageParam pageParam);
|
||||
|
||||
/**
|
||||
* @description 根据任务ID查询采集项VO列表(包含关联的任务和日志信息)
|
||||
* @param taskId 任务ID
|
||||
* @return List<DataCollectionItemVO> 采集项VO列表
|
||||
* @author yslg
|
||||
* @since 2025-11-08
|
||||
*/
|
||||
List<DataCollectionItemVO> selectVOByTaskId(@Param("taskId") String taskId);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -16,9 +16,17 @@ public class CrontabItem {
|
||||
@Data
|
||||
public static class CrontabMethod {
|
||||
private String name;
|
||||
@JSONField(name = "class")
|
||||
private String clazz;
|
||||
private String excuete_method;
|
||||
private String path;
|
||||
private Map<String, Object> params;
|
||||
private List<CrontabParam> params;
|
||||
}
|
||||
|
||||
@Data
|
||||
public static class CrontabParam {
|
||||
private String name;
|
||||
private String description;
|
||||
private String type;
|
||||
private Object value;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,9 +11,13 @@ import org.xyzh.common.utils.IDUtils;
|
||||
import org.xyzh.crontab.mapper.CrontabLogMapper;
|
||||
import org.xyzh.crontab.pojo.TaskParams;
|
||||
|
||||
import com.alibaba.fastjson2.JSON;
|
||||
import com.alibaba.fastjson2.TypeReference;
|
||||
|
||||
import java.lang.reflect.Method;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @description 任务执行器
|
||||
@@ -138,25 +142,29 @@ public class TaskExecutor {
|
||||
private String injectTaskContext(Object bean, TbCrontabTask task, TbCrontabLog log) {
|
||||
String methodParams = task.getMethodParams();
|
||||
|
||||
// 如果Bean是BaseTask的子类,注入taskId和logId到JSON参数中
|
||||
if (bean instanceof org.xyzh.crontab.task.BaseTask) {
|
||||
try {
|
||||
TaskParams taskParams = TaskParams.fromJson(methodParams);
|
||||
if (taskParams != null) {
|
||||
// 注入taskId和logId
|
||||
if (taskParams.getParams() == null) {
|
||||
taskParams.setParams(new HashMap<>());
|
||||
}
|
||||
taskParams.getParams().put("taskId", task.getTaskId());
|
||||
taskParams.getParams().put("logId", log.getID());
|
||||
methodParams = taskParams.toJson();
|
||||
logger.debug("已注入任务上下文: taskId={}, logId={}", task.getTaskId(), log.getID());
|
||||
}
|
||||
// 从task对象构建完整的TaskParams
|
||||
TaskParams taskParams = new TaskParams();
|
||||
taskParams.setTaskGroup(task.getTaskGroup()); // 从task表获取
|
||||
taskParams.setMethodName(task.getMethodName()); // 从task表获取
|
||||
|
||||
// 将methodParams解析为Map并设置到params字段
|
||||
Map<String, Object> params = JSON.parseObject(methodParams,
|
||||
new TypeReference<Map<String, Object>>(){});
|
||||
|
||||
// 注入taskId和logId
|
||||
params.put("taskId", task.getTaskId());
|
||||
params.put("logId", log.getID());
|
||||
|
||||
taskParams.setParams(params);
|
||||
|
||||
methodParams = taskParams.toJson();
|
||||
} catch (Exception e) {
|
||||
logger.warn("注入任务上下文失败,使用原始参数: {}", e.getMessage());
|
||||
logger.warn("构建TaskParams失败: {}", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return methodParams;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,7 +23,6 @@ import org.xyzh.system.utils.LoginUtil;
|
||||
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* @description 数据采集项服务实现类
|
||||
@@ -102,29 +101,9 @@ public class DataCollectionItemServiceImpl implements DataCollectionItemService
|
||||
int successCount = 0;
|
||||
Date now = new Date();
|
||||
|
||||
for (TbDataCollectionItem item : itemList) {
|
||||
// 检查URL是否已存在(去重)
|
||||
if (item.getSourceUrl() != null && !item.getSourceUrl().isEmpty()) {
|
||||
TbDataCollectionItem existing = itemMapper.selectBySourceUrl(item.getSourceUrl());
|
||||
if (existing != null) {
|
||||
logger.debug("跳过已存在的采集项: {}", item.getSourceUrl());
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// 设置默认值
|
||||
item.setID(IDUtils.generateID());
|
||||
item.setCreateTime(now);
|
||||
item.setDeleted(false);
|
||||
if (item.getStatus() == null) {
|
||||
item.setStatus(0);
|
||||
}
|
||||
if (item.getCrawlTime() == null) {
|
||||
item.setCrawlTime(now);
|
||||
}
|
||||
|
||||
itemMapper.insert(item);
|
||||
successCount++;
|
||||
int result = itemMapper.batchInsertItems(itemList);
|
||||
if (result > 0) {
|
||||
successCount = result;
|
||||
}
|
||||
|
||||
logger.info("批量创建采集项成功,共{}条,成功{}条", itemList.size(), successCount);
|
||||
@@ -195,9 +174,8 @@ public class DataCollectionItemServiceImpl implements DataCollectionItemService
|
||||
return resultDomain;
|
||||
}
|
||||
|
||||
TbDataCollectionItem item = itemMapper.selectById(itemId);
|
||||
if (item != null) {
|
||||
DataCollectionItemVO vo = buildVO(item);
|
||||
DataCollectionItemVO vo = itemMapper.selectVOById(itemId);
|
||||
if (vo != null) {
|
||||
resultDomain.success("查询成功", vo);
|
||||
} else {
|
||||
resultDomain.fail("采集项不存在");
|
||||
@@ -218,10 +196,8 @@ public class DataCollectionItemServiceImpl implements DataCollectionItemService
|
||||
}
|
||||
filter.setDeleted(false);
|
||||
|
||||
List<TbDataCollectionItem> list = itemMapper.selectItemList(filter);
|
||||
List<DataCollectionItemVO> voList = list.stream()
|
||||
.map(this::buildVO)
|
||||
.collect(Collectors.toList());
|
||||
List<DataCollectionItemVO> voList = itemMapper.selectVOList(filter);
|
||||
|
||||
|
||||
resultDomain.success("查询成功", voList);
|
||||
} catch (Exception e) {
|
||||
@@ -244,12 +220,9 @@ public class DataCollectionItemServiceImpl implements DataCollectionItemService
|
||||
pageParam = new PageParam();
|
||||
}
|
||||
|
||||
List<TbDataCollectionItem> list = itemMapper.selectItemPage(filter, pageParam);
|
||||
long total = itemMapper.countItems(filter);
|
||||
List<DataCollectionItemVO> voList = itemMapper.selectVOPage(filter, pageParam);
|
||||
|
||||
List<DataCollectionItemVO> voList = list.stream()
|
||||
.map(this::buildVO)
|
||||
.collect(Collectors.toList());
|
||||
long total = itemMapper.countItems(filter);
|
||||
|
||||
PageDomain<DataCollectionItemVO> pageDomain = new PageDomain<>();
|
||||
pageDomain.setDataList(voList);
|
||||
@@ -274,10 +247,8 @@ public class DataCollectionItemServiceImpl implements DataCollectionItemService
|
||||
return resultDomain;
|
||||
}
|
||||
|
||||
List<TbDataCollectionItem> list = itemMapper.selectByTaskId(taskId);
|
||||
List<DataCollectionItemVO> voList = list.stream()
|
||||
.map(this::buildVO)
|
||||
.collect(Collectors.toList());
|
||||
List<DataCollectionItemVO> voList = itemMapper.selectVOByTaskId(taskId);
|
||||
|
||||
|
||||
resultDomain.success("查询成功", voList);
|
||||
} catch (Exception e) {
|
||||
@@ -433,47 +404,5 @@ public class DataCollectionItemServiceImpl implements DataCollectionItemService
|
||||
return resultDomain;
|
||||
}
|
||||
|
||||
/**
|
||||
* @description 构建VO对象
|
||||
* @param item 采集项
|
||||
* @return DataCollectionItemVO
|
||||
* @author yslg
|
||||
* @since 2025-11-08
|
||||
*/
|
||||
private DataCollectionItemVO buildVO(TbDataCollectionItem item) {
|
||||
DataCollectionItemVO vo = new DataCollectionItemVO();
|
||||
vo.setItem(item);
|
||||
|
||||
// 查询关联的定时任务
|
||||
if (item.getTaskId() != null && !item.getTaskId().isEmpty()) {
|
||||
TbCrontabTask task = taskMapper.selectTaskById(item.getTaskId());
|
||||
vo.setTask(task);
|
||||
}
|
||||
|
||||
// 设置状态文本
|
||||
String statusText = "未处理";
|
||||
if (item.getStatus() != null) {
|
||||
switch (item.getStatus()) {
|
||||
case 0:
|
||||
statusText = "未处理";
|
||||
break;
|
||||
case 1:
|
||||
statusText = "已转换为资源";
|
||||
break;
|
||||
case 2:
|
||||
statusText = "已忽略";
|
||||
break;
|
||||
default:
|
||||
statusText = "未知";
|
||||
}
|
||||
}
|
||||
vo.setStatusText(statusText);
|
||||
|
||||
// 设置操作权限
|
||||
vo.setCanEdit(item.getStatus() == null || item.getStatus() == 0 || item.getStatus() == 2);
|
||||
vo.setCanConvert(item.getStatus() == null || item.getStatus() == 0);
|
||||
|
||||
return vo;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/**
|
||||
@@ -41,6 +42,11 @@ public abstract class CommandTask extends BaseTask {
|
||||
processBuilder.directory(workDir.toFile());
|
||||
processBuilder.redirectErrorStream(true);
|
||||
|
||||
// 设置环境变量强制Python使用UTF-8编码(解决Windows GBK编码问题)
|
||||
Map<String, String> env = processBuilder.environment();
|
||||
env.put("PYTHONIOENCODING", "utf-8"); // Python I/O编码
|
||||
env.put("PYTHONUTF8", "1"); // Python 3.7+ UTF-8模式
|
||||
|
||||
// 启动进程
|
||||
Process process = processBuilder.start();
|
||||
|
||||
|
||||
@@ -18,7 +18,6 @@ public abstract class PythonCommandTask extends CommandTask {
|
||||
|
||||
@Autowired
|
||||
protected CrawlerProperties crawlerProperties;
|
||||
|
||||
/**
|
||||
* 获取Python可执行文件路径
|
||||
*/
|
||||
@@ -47,18 +46,16 @@ public abstract class PythonCommandTask extends CommandTask {
|
||||
|
||||
/**
|
||||
* 构建Python命令
|
||||
*
|
||||
* 注意: 不使用 cmd /c 或 bash -c,直接调用Python可执行文件
|
||||
* 这样可以避免shell对JSON参数中的引号进行错误处理
|
||||
* ProcessBuilder可以直接启动exe文件,参数会正确传递
|
||||
*/
|
||||
@Override
|
||||
protected List<String> buildCommand(TaskParams taskParams) throws Exception {
|
||||
List<String> command = new ArrayList<>();
|
||||
|
||||
// 检查操作系统
|
||||
String os = System.getProperty("os.name").toLowerCase();
|
||||
if (os.contains("win")) {
|
||||
command.add("cmd");
|
||||
command.add("/c");
|
||||
}
|
||||
|
||||
// 直接调用Python可执行文件,不使用shell
|
||||
command.add(getPythonPath());
|
||||
|
||||
// 添加Python脚本和参数(由子类实现)
|
||||
|
||||
@@ -7,6 +7,7 @@ import org.springframework.stereotype.Component;
|
||||
import org.xyzh.api.crontab.DataCollectionItemService;
|
||||
import org.xyzh.common.core.domain.ResultDomain;
|
||||
import org.xyzh.common.dto.crontab.TbDataCollectionItem;
|
||||
import org.xyzh.common.utils.IDUtils;
|
||||
import org.xyzh.crontab.config.CrontabProperties;
|
||||
import org.xyzh.crontab.pojo.TaskParams;
|
||||
import org.xyzh.crontab.task.PythonCommandTask;
|
||||
@@ -17,7 +18,9 @@ import java.nio.file.Paths;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @description 新闻爬虫定时任务
|
||||
@@ -42,43 +45,58 @@ public class NewsCrawlerTask extends PythonCommandTask {
|
||||
protected List<String> buildPythonArgs(TaskParams taskParams) throws Exception {
|
||||
List<String> args = new ArrayList<>();
|
||||
|
||||
String methodName = taskParams.getMethodName();
|
||||
String source = "rmrb";
|
||||
String category = "politics";
|
||||
String limit = "20";
|
||||
|
||||
// 根据不同的方法名称构建不同的参数
|
||||
if ("关键字搜索爬取".equals(methodName)) {
|
||||
String query = taskParams.getParamAsString("query");
|
||||
Integer total = taskParams.getParamAsInt("total");
|
||||
category = query != null ? query : "politics";
|
||||
limit = total != null ? total.toString() : "20";
|
||||
|
||||
} else if ("排行榜爬取".equals(methodName)) {
|
||||
category = "ranking";
|
||||
|
||||
} else if ("往日精彩头条爬取".equals(methodName)) {
|
||||
String startDate = taskParams.getParamAsString("startDate");
|
||||
String endDate = taskParams.getParamAsString("endDate");
|
||||
Boolean isYesterday = taskParams.getParamAsBoolean("isYesterday");
|
||||
category = "history";
|
||||
// 这里可以将日期参数传递给Python脚本
|
||||
// 1. 从params获取scriptPath
|
||||
String scriptPath = taskParams.getParamAsString("scriptPath");
|
||||
if (scriptPath == null || scriptPath.isEmpty()) {
|
||||
throw new Exception("scriptPath参数缺失");
|
||||
}
|
||||
|
||||
// 生成输出文件名
|
||||
// 2. 生成输出文件名
|
||||
String timestamp = String.valueOf(System.currentTimeMillis());
|
||||
String outputFile = String.format("output/news_%s_%s_%s.json", source, category, timestamp);
|
||||
String outputFile = String.format("output/news_%s.json", timestamp);
|
||||
|
||||
// 保存输出文件路径到params中,供handleResult使用
|
||||
taskParams.setParam("_outputFile", outputFile);
|
||||
|
||||
// 添加脚本和参数
|
||||
args.add("main.py");
|
||||
args.add(category);
|
||||
args.add(limit);
|
||||
|
||||
|
||||
// 4. 构建命令参数
|
||||
args.add(scriptPath); // 动态脚本路径
|
||||
|
||||
// 5. 遍历params,动态构建命令行参数
|
||||
if (taskParams.getParams() != null) {
|
||||
for (Map.Entry<String, Object> entry : taskParams.getParams().entrySet()) {
|
||||
String key = entry.getKey();
|
||||
Object value = entry.getValue();
|
||||
|
||||
// 跳过特殊参数
|
||||
if (key.startsWith("_") || key.equals("scriptPath") ||
|
||||
key.equals("taskId") || key.equals("logId")) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// 获取对应的Python参数名
|
||||
String pythonArg = "--"+key;
|
||||
if (pythonArg != null && value != null) {
|
||||
if (value instanceof Boolean) {
|
||||
// Boolean类型: true时只传参数名,false时不传
|
||||
if ((Boolean) value) {
|
||||
args.add(pythonArg);
|
||||
}
|
||||
} else {
|
||||
// String/Integer类型: 传参数名+值
|
||||
args.add(pythonArg);
|
||||
args.add(value.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 6. 统一添加output参数
|
||||
args.add("--output");
|
||||
args.add(outputFile);
|
||||
|
||||
logger.info("爬虫参数 - 来源: {}, 分类: {}, 数量: {}", source, category, limit);
|
||||
logger.info("Python脚本: {}, 命令行参数: {}", scriptPath, String.join(" ", args.subList(1, args.size())));
|
||||
|
||||
return args;
|
||||
}
|
||||
@@ -98,11 +116,12 @@ public class NewsCrawlerTask extends PythonCommandTask {
|
||||
|
||||
// 读取并解析结果文件
|
||||
String jsonContent = Files.readString(outputPath);
|
||||
List<ArticleStruct> newsList = JSON.parseObject(
|
||||
jsonContent,
|
||||
new TypeReference<List<ArticleStruct>>() {}
|
||||
);
|
||||
|
||||
ResultDomain<ArticleStruct> result = JSON.parseObject(jsonContent, new TypeReference<ResultDomain<ArticleStruct>>(){});
|
||||
if (!result.isSuccess()) {
|
||||
logger.error("爬取新闻失败: {}", result.getMessage());
|
||||
return;
|
||||
}
|
||||
List<ArticleStruct> newsList = result.getDataList();
|
||||
logger.info("成功爬取 {} 条新闻", newsList.size());
|
||||
|
||||
// 获取taskId和logId
|
||||
@@ -126,6 +145,8 @@ public class NewsCrawlerTask extends PythonCommandTask {
|
||||
try {
|
||||
List<TbDataCollectionItem> itemList = new ArrayList<>();
|
||||
Date now = new Date();
|
||||
SimpleDateFormat parser = new SimpleDateFormat("yyyy年MM月dd日HH:mm");
|
||||
|
||||
SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
|
||||
|
||||
for (ArticleStruct news : newsList) {
|
||||
@@ -133,6 +154,7 @@ public class NewsCrawlerTask extends PythonCommandTask {
|
||||
TbDataCollectionItem item = new TbDataCollectionItem();
|
||||
|
||||
// 基本信息
|
||||
item.setID(IDUtils.generateID());
|
||||
item.setTaskId(taskId);
|
||||
item.setLogId(logId);
|
||||
item.setTitle(news.getTitle());
|
||||
@@ -156,7 +178,7 @@ public class NewsCrawlerTask extends PythonCommandTask {
|
||||
String publishTimeStr = news.getPublishTime();
|
||||
if (publishTimeStr != null && !publishTimeStr.isEmpty()) {
|
||||
try {
|
||||
item.setPublishTime(dateFormat.parse(publishTimeStr));
|
||||
item.setPublishTime(dateFormat.parse(dateFormat.format(parser.parse(publishTimeStr))));
|
||||
} catch (Exception e) {
|
||||
logger.warn("解析发布时间失败: {}", publishTimeStr);
|
||||
item.setPublishTime(now);
|
||||
|
||||
Reference in New Issue
Block a user