搜索关键字爬虫

This commit is contained in:
2025-11-12 16:10:34 +08:00
parent 7be02fe396
commit 675e6da7d7
37 changed files with 3382 additions and 572 deletions

View File

@@ -1,5 +1,6 @@
package org.xyzh.crontab.config;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.context.properties.ConfigurationProperties;
import lombok.Data;
import org.springframework.stereotype.Component;
@@ -9,8 +10,10 @@ import org.springframework.stereotype.Component;
@Component
public class CrawlerProperties {
@Value("${crawler.pythonPath}")
private String pythonPath;
@Value("${crawler.basePath}")
private String basePath;
}

View File

@@ -12,6 +12,10 @@ import org.xyzh.common.dto.crontab.TbCrontabTask;
import org.xyzh.common.dto.crontab.TbCrontabLog;
import org.xyzh.common.utils.IDUtils;
import org.xyzh.crontab.pojo.CrontabItem;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONObject;
import org.xyzh.common.utils.spring.SpringContextUtil;
import org.xyzh.crontab.config.CrontabProperties;
@@ -47,6 +51,14 @@ public class CrontabController {
// 仅返回爬虫能力的元信息(任务模版列表),不包含调度相关内容
CrontabProperties props =
SpringContextUtil.getBean(CrontabProperties.class);
String jString = JSON.toJSONString(props);
props = JSON.parseObject(jString, CrontabProperties.class);
props.getItems().forEach(item->item.getMethods().forEach(
method->{
method.setClazz(null);
method.setExcuete_method(null);
method.setPath(null);
}));
rd.success("ok", props.getItems());
} catch (Exception e) {
rd.fail("获取可创建定时任务失败: " + e.getMessage());
@@ -63,6 +75,25 @@ public class CrontabController {
public ResultDomain<TbCrontabTask> createCrontab(@RequestBody TbCrontabTask crontabItem) {
ResultDomain<TbCrontabTask> rd = new ResultDomain<>();
try {
// 根据taskGroup和methodName查找配置并填充beanName和methodName
if (crontabItem.getBeanName() == null || crontabItem.getBeanName().isEmpty()) {
CrontabItem.CrontabMethod method = findMethodByTaskGroupAndMethodName(
crontabItem.getTaskGroup(),
crontabItem.getMethodName()
);
if (method != null) {
crontabItem.setBeanName(method.getClazz()); // 设置Bean名称
crontabItem.setMethodName(method.getExcuete_method()); // 设置执行方法名
JSONObject methodParams = JSON.parseObject(crontabItem.getMethodParams());
methodParams.put("scriptPath", method.getPath());
crontabItem.setMethodParams(methodParams.toJSONString());
} else {
rd.fail("未找到对应的配置: taskGroup=" + crontabItem.getTaskGroup()
+ ", methodName=" + crontabItem.getMethodName());
return rd;
}
}
return crontabService.createTask(crontabItem);
} catch (Exception e) {
logger.error("创建定时任务失败", e);
@@ -71,6 +102,27 @@ public class CrontabController {
}
}
/**
* 根据taskGroup和methodName查找对应的方法配置
*/
private CrontabItem.CrontabMethod findMethodByTaskGroupAndMethodName(String taskGroup, String methodName) {
CrontabProperties props = SpringContextUtil.getBean(CrontabProperties.class);
if (props == null || props.getItems() == null) {
return null;
}
for (CrontabItem item : props.getItems()) {
if (item.getName().equals(taskGroup)) {
for (CrontabItem.CrontabMethod method : item.getMethods()) {
if (method.getName().equals(methodName)) {
return method;
}
}
}
}
return null;
}
/**
* 更新定时任务
* @param crontabItem
@@ -80,6 +132,21 @@ public class CrontabController {
public ResultDomain<TbCrontabTask> updateCrontab(@RequestBody TbCrontabTask crontabItem) {
ResultDomain<TbCrontabTask> rd = new ResultDomain<>();
try {
// 根据taskGroup和methodName查找配置并填充beanName和methodName
if (crontabItem.getBeanName() == null || crontabItem.getBeanName().isEmpty()) {
CrontabItem.CrontabMethod method = findMethodByTaskGroupAndMethodName(
crontabItem.getTaskGroup(),
crontabItem.getMethodName()
);
if (method != null) {
crontabItem.setBeanName(method.getClazz()); // 设置Bean名称
crontabItem.setMethodName(method.getExcuete_method()); // 设置执行方法名
} else {
rd.fail("未找到对应的配置: taskGroup=" + crontabItem.getTaskGroup()
+ ", methodName=" + crontabItem.getMethodName());
return rd;
}
}
return crontabService.updateTask(crontabItem);
} catch (Exception e) {
logger.error("更新定时任务失败", e);
@@ -146,6 +213,88 @@ public class CrontabController {
return rd;
}
}
/**
* 根据ID查询日志详情
* @param logId 日志ID
* @return ResultDomain<TbCrontabLog>
*/
@GetMapping("/log/{logId}")
public ResultDomain<TbCrontabLog> getLogById(@PathVariable(required = true, name="logId") String logId) {
ResultDomain<TbCrontabLog> rd = new ResultDomain<>();
try {
return crontabService.getLogById(logId);
} catch (Exception e) {
logger.error("获取日志详情失败", e);
rd.fail("获取日志详情失败: " + e.getMessage());
return rd;
}
}
@GetMapping("/task/validate")
public ResultDomain<String> validateCronExpression(@RequestParam(required = true, name="cronExpression") String cronExpression) {
ResultDomain<String> rd = new ResultDomain<>();
try {
return crontabService.validateCronExpression(cronExpression);
} catch (Exception e) {
logger.error("验证Cron表达式失败", e);
rd.fail("验证Cron表达式失败: " + e.getMessage());
return rd;
}
}
/**
* @description 启动定时任务
* @param
* @author yslg
* @since 2025-11-11
*/
@PostMapping("/task/start/{taskId}")
public ResultDomain<TbCrontabTask> startTask(@PathVariable(required = true, name="taskId") String taskId) {
ResultDomain<TbCrontabTask> rd = new ResultDomain<>();
try {
return crontabService.startTask(taskId);
} catch (Exception e) {
logger.error("启动定时任务失败", e);
rd.fail("启动定时任务失败: " + e.getMessage());
return rd;
}
}
/**
* @description 暂停定时任务
* @param
* @author yslg
* @since 2025-11-11
*/
@PostMapping("/task/pause/{taskId}")
public ResultDomain<TbCrontabTask> pauseTask(@PathVariable(required = true, name="taskId") String taskId) {
ResultDomain<TbCrontabTask> rd = new ResultDomain<>();
try {
return crontabService.pauseTask(taskId);
} catch (Exception e) {
logger.error("暂停定时任务失败", e);
rd.fail("暂停定时任务失败: " + e.getMessage());
return rd;
}
}
/**
* @description 立即执行一次任务
* @param
* @author yslg
* @since 2025-11-11
*/
@PostMapping("/task/execute/{taskId}")
public ResultDomain<TbCrontabTask> executeTaskOnce(@PathVariable(required = true, name="taskId") String taskId) {
ResultDomain<TbCrontabTask> rd = new ResultDomain<>();
try {
return crontabService.executeTaskOnce(taskId);
} catch (Exception e) {
logger.error("执行定时任务失败", e);
rd.fail("执行定时任务失败: " + e.getMessage());
return rd;
}
}
}

View File

@@ -5,6 +5,7 @@ import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import org.xyzh.common.core.page.PageParam;
import org.xyzh.common.dto.crontab.TbDataCollectionItem;
import org.xyzh.common.vo.DataCollectionItemVO;
import java.util.List;
@@ -82,5 +83,45 @@ public interface DataCollectionItemMapper extends BaseMapper<TbDataCollectionIte
* @since 2025-11-08
*/
long countByStatus(@Param("taskId") String taskId, @Param("status") Integer status);
// ==================== VO查询方法(使用JOIN返回完整VO) ====================
/**
* @description 根据ID查询采集项VO包含关联的任务和日志信息
* @param itemId 采集项ID
* @return DataCollectionItemVO 采集项VO
* @author yslg
* @since 2025-11-08
*/
DataCollectionItemVO selectVOById(@Param("itemId") String itemId);
/**
* @description 查询采集项VO列表包含关联的任务和日志信息
* @param filter 过滤条件
* @return List<DataCollectionItemVO> 采集项VO列表
* @author yslg
* @since 2025-11-08
*/
List<DataCollectionItemVO> selectVOList(TbDataCollectionItem filter);
/**
* @description 分页查询采集项VO列表包含关联的任务和日志信息
* @param filter 过滤条件
* @param pageParam 分页参数
* @return List<DataCollectionItemVO> 采集项VO列表
* @author yslg
* @since 2025-11-08
*/
List<DataCollectionItemVO> selectVOPage(@Param("filter") TbDataCollectionItem filter, @Param("pageParam") PageParam pageParam);
/**
* @description 根据任务ID查询采集项VO列表包含关联的任务和日志信息
* @param taskId 任务ID
* @return List<DataCollectionItemVO> 采集项VO列表
* @author yslg
* @since 2025-11-08
*/
List<DataCollectionItemVO> selectVOByTaskId(@Param("taskId") String taskId);
}

View File

@@ -16,9 +16,17 @@ public class CrontabItem {
@Data
public static class CrontabMethod {
private String name;
@JSONField(name = "class")
private String clazz;
private String excuete_method;
private String path;
private Map<String, Object> params;
private List<CrontabParam> params;
}
@Data
public static class CrontabParam {
private String name;
private String description;
private String type;
private Object value;
}
}

View File

@@ -11,9 +11,13 @@ import org.xyzh.common.utils.IDUtils;
import org.xyzh.crontab.mapper.CrontabLogMapper;
import org.xyzh.crontab.pojo.TaskParams;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.TypeReference;
import java.lang.reflect.Method;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
/**
* @description 任务执行器
@@ -138,25 +142,29 @@ public class TaskExecutor {
private String injectTaskContext(Object bean, TbCrontabTask task, TbCrontabLog log) {
String methodParams = task.getMethodParams();
// 如果Bean是BaseTask的子类注入taskId和logId到JSON参数中
if (bean instanceof org.xyzh.crontab.task.BaseTask) {
try {
TaskParams taskParams = TaskParams.fromJson(methodParams);
if (taskParams != null) {
// 注入taskId和logId
if (taskParams.getParams() == null) {
taskParams.setParams(new HashMap<>());
}
taskParams.getParams().put("taskId", task.getTaskId());
taskParams.getParams().put("logId", log.getID());
methodParams = taskParams.toJson();
logger.debug("已注入任务上下文: taskId={}, logId={}", task.getTaskId(), log.getID());
}
// 从task对象构建完整的TaskParams
TaskParams taskParams = new TaskParams();
taskParams.setTaskGroup(task.getTaskGroup()); // task表获取
taskParams.setMethodName(task.getMethodName()); // 从task表获取
// 将methodParams解析为Map并设置到params字段
Map<String, Object> params = JSON.parseObject(methodParams,
new TypeReference<Map<String, Object>>(){});
// 注入taskId和logId
params.put("taskId", task.getTaskId());
params.put("logId", log.getID());
taskParams.setParams(params);
methodParams = taskParams.toJson();
} catch (Exception e) {
logger.warn("注入任务上下文失败,使用原始参数: {}", e.getMessage());
logger.warn("构建TaskParams失败: {}", e.getMessage());
}
}
return methodParams;
}
}

View File

@@ -23,7 +23,6 @@ import org.xyzh.system.utils.LoginUtil;
import java.util.Date;
import java.util.List;
import java.util.stream.Collectors;
/**
* @description 数据采集项服务实现类
@@ -102,29 +101,9 @@ public class DataCollectionItemServiceImpl implements DataCollectionItemService
int successCount = 0;
Date now = new Date();
for (TbDataCollectionItem item : itemList) {
// 检查URL是否已存在去重
if (item.getSourceUrl() != null && !item.getSourceUrl().isEmpty()) {
TbDataCollectionItem existing = itemMapper.selectBySourceUrl(item.getSourceUrl());
if (existing != null) {
logger.debug("跳过已存在的采集项: {}", item.getSourceUrl());
continue;
}
}
// 设置默认值
item.setID(IDUtils.generateID());
item.setCreateTime(now);
item.setDeleted(false);
if (item.getStatus() == null) {
item.setStatus(0);
}
if (item.getCrawlTime() == null) {
item.setCrawlTime(now);
}
itemMapper.insert(item);
successCount++;
int result = itemMapper.batchInsertItems(itemList);
if (result > 0) {
successCount = result;
}
logger.info("批量创建采集项成功,共{}条,成功{}条", itemList.size(), successCount);
@@ -195,9 +174,8 @@ public class DataCollectionItemServiceImpl implements DataCollectionItemService
return resultDomain;
}
TbDataCollectionItem item = itemMapper.selectById(itemId);
if (item != null) {
DataCollectionItemVO vo = buildVO(item);
DataCollectionItemVO vo = itemMapper.selectVOById(itemId);
if (vo != null) {
resultDomain.success("查询成功", vo);
} else {
resultDomain.fail("采集项不存在");
@@ -218,10 +196,8 @@ public class DataCollectionItemServiceImpl implements DataCollectionItemService
}
filter.setDeleted(false);
List<TbDataCollectionItem> list = itemMapper.selectItemList(filter);
List<DataCollectionItemVO> voList = list.stream()
.map(this::buildVO)
.collect(Collectors.toList());
List<DataCollectionItemVO> voList = itemMapper.selectVOList(filter);
resultDomain.success("查询成功", voList);
} catch (Exception e) {
@@ -244,12 +220,9 @@ public class DataCollectionItemServiceImpl implements DataCollectionItemService
pageParam = new PageParam();
}
List<TbDataCollectionItem> list = itemMapper.selectItemPage(filter, pageParam);
long total = itemMapper.countItems(filter);
List<DataCollectionItemVO> voList = itemMapper.selectVOPage(filter, pageParam);
List<DataCollectionItemVO> voList = list.stream()
.map(this::buildVO)
.collect(Collectors.toList());
long total = itemMapper.countItems(filter);
PageDomain<DataCollectionItemVO> pageDomain = new PageDomain<>();
pageDomain.setDataList(voList);
@@ -274,10 +247,8 @@ public class DataCollectionItemServiceImpl implements DataCollectionItemService
return resultDomain;
}
List<TbDataCollectionItem> list = itemMapper.selectByTaskId(taskId);
List<DataCollectionItemVO> voList = list.stream()
.map(this::buildVO)
.collect(Collectors.toList());
List<DataCollectionItemVO> voList = itemMapper.selectVOByTaskId(taskId);
resultDomain.success("查询成功", voList);
} catch (Exception e) {
@@ -433,47 +404,5 @@ public class DataCollectionItemServiceImpl implements DataCollectionItemService
return resultDomain;
}
/**
* @description 构建VO对象
* @param item 采集项
* @return DataCollectionItemVO
* @author yslg
* @since 2025-11-08
*/
private DataCollectionItemVO buildVO(TbDataCollectionItem item) {
DataCollectionItemVO vo = new DataCollectionItemVO();
vo.setItem(item);
// 查询关联的定时任务
if (item.getTaskId() != null && !item.getTaskId().isEmpty()) {
TbCrontabTask task = taskMapper.selectTaskById(item.getTaskId());
vo.setTask(task);
}
// 设置状态文本
String statusText = "未处理";
if (item.getStatus() != null) {
switch (item.getStatus()) {
case 0:
statusText = "未处理";
break;
case 1:
statusText = "已转换为资源";
break;
case 2:
statusText = "已忽略";
break;
default:
statusText = "未知";
}
}
vo.setStatusText(statusText);
// 设置操作权限
vo.setCanEdit(item.getStatus() == null || item.getStatus() == 0 || item.getStatus() == 2);
vo.setCanConvert(item.getStatus() == null || item.getStatus() == 0);
return vo;
}
}

View File

@@ -8,6 +8,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
/**
@@ -41,6 +42,11 @@ public abstract class CommandTask extends BaseTask {
processBuilder.directory(workDir.toFile());
processBuilder.redirectErrorStream(true);
// 设置环境变量强制Python使用UTF-8编码(解决Windows GBK编码问题)
Map<String, String> env = processBuilder.environment();
env.put("PYTHONIOENCODING", "utf-8"); // Python I/O编码
env.put("PYTHONUTF8", "1"); // Python 3.7+ UTF-8模式
// 启动进程
Process process = processBuilder.start();

View File

@@ -18,7 +18,6 @@ public abstract class PythonCommandTask extends CommandTask {
@Autowired
protected CrawlerProperties crawlerProperties;
/**
* 获取Python可执行文件路径
*/
@@ -47,18 +46,16 @@ public abstract class PythonCommandTask extends CommandTask {
/**
* 构建Python命令
*
* 注意: 不使用 cmd /c 或 bash -c直接调用Python可执行文件
* 这样可以避免shell对JSON参数中的引号进行错误处理
* ProcessBuilder可以直接启动exe文件参数会正确传递
*/
@Override
protected List<String> buildCommand(TaskParams taskParams) throws Exception {
List<String> command = new ArrayList<>();
// 检查操作系统
String os = System.getProperty("os.name").toLowerCase();
if (os.contains("win")) {
command.add("cmd");
command.add("/c");
}
// 直接调用Python可执行文件不使用shell
command.add(getPythonPath());
// 添加Python脚本和参数由子类实现

View File

@@ -7,6 +7,7 @@ import org.springframework.stereotype.Component;
import org.xyzh.api.crontab.DataCollectionItemService;
import org.xyzh.common.core.domain.ResultDomain;
import org.xyzh.common.dto.crontab.TbDataCollectionItem;
import org.xyzh.common.utils.IDUtils;
import org.xyzh.crontab.config.CrontabProperties;
import org.xyzh.crontab.pojo.TaskParams;
import org.xyzh.crontab.task.PythonCommandTask;
@@ -17,7 +18,9 @@ import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* @description 新闻爬虫定时任务
@@ -42,43 +45,58 @@ public class NewsCrawlerTask extends PythonCommandTask {
protected List<String> buildPythonArgs(TaskParams taskParams) throws Exception {
List<String> args = new ArrayList<>();
String methodName = taskParams.getMethodName();
String source = "rmrb";
String category = "politics";
String limit = "20";
// 根据不同的方法名称构建不同的参数
if ("关键字搜索爬取".equals(methodName)) {
String query = taskParams.getParamAsString("query");
Integer total = taskParams.getParamAsInt("total");
category = query != null ? query : "politics";
limit = total != null ? total.toString() : "20";
} else if ("排行榜爬取".equals(methodName)) {
category = "ranking";
} else if ("往日精彩头条爬取".equals(methodName)) {
String startDate = taskParams.getParamAsString("startDate");
String endDate = taskParams.getParamAsString("endDate");
Boolean isYesterday = taskParams.getParamAsBoolean("isYesterday");
category = "history";
// 这里可以将日期参数传递给Python脚本
// 1. 从params获取scriptPath
String scriptPath = taskParams.getParamAsString("scriptPath");
if (scriptPath == null || scriptPath.isEmpty()) {
throw new Exception("scriptPath参数缺失");
}
// 生成输出文件名
// 2. 生成输出文件名
String timestamp = String.valueOf(System.currentTimeMillis());
String outputFile = String.format("output/news_%s_%s_%s.json", source, category, timestamp);
String outputFile = String.format("output/news_%s.json", timestamp);
// 保存输出文件路径到params中供handleResult使用
taskParams.setParam("_outputFile", outputFile);
// 添加脚本和参数
args.add("main.py");
args.add(category);
args.add(limit);
// 4. 构建命令参数
args.add(scriptPath); // 动态脚本路径
// 5. 遍历params动态构建命令行参数
if (taskParams.getParams() != null) {
for (Map.Entry<String, Object> entry : taskParams.getParams().entrySet()) {
String key = entry.getKey();
Object value = entry.getValue();
// 跳过特殊参数
if (key.startsWith("_") || key.equals("scriptPath") ||
key.equals("taskId") || key.equals("logId")) {
continue;
}
// 获取对应的Python参数名
String pythonArg = "--"+key;
if (pythonArg != null && value != null) {
if (value instanceof Boolean) {
// Boolean类型: true时只传参数名false时不传
if ((Boolean) value) {
args.add(pythonArg);
}
} else {
// String/Integer类型: 传参数名+值
args.add(pythonArg);
args.add(value.toString());
}
}
}
}
// 6. 统一添加output参数
args.add("--output");
args.add(outputFile);
logger.info("爬虫参数 - 来源: {}, 分类: {}, 数: {}", source, category, limit);
logger.info("Python脚本: {}, 命令行参数: {}", scriptPath, String.join(" ", args.subList(1, args.size())));
return args;
}
@@ -98,11 +116,12 @@ public class NewsCrawlerTask extends PythonCommandTask {
// 读取并解析结果文件
String jsonContent = Files.readString(outputPath);
List<ArticleStruct> newsList = JSON.parseObject(
jsonContent,
new TypeReference<List<ArticleStruct>>() {}
);
ResultDomain<ArticleStruct> result = JSON.parseObject(jsonContent, new TypeReference<ResultDomain<ArticleStruct>>(){});
if (!result.isSuccess()) {
logger.error("爬取新闻失败: {}", result.getMessage());
return;
}
List<ArticleStruct> newsList = result.getDataList();
logger.info("成功爬取 {} 条新闻", newsList.size());
// 获取taskId和logId
@@ -126,6 +145,8 @@ public class NewsCrawlerTask extends PythonCommandTask {
try {
List<TbDataCollectionItem> itemList = new ArrayList<>();
Date now = new Date();
SimpleDateFormat parser = new SimpleDateFormat("yyyy年MM月dd日HH:mm");
SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
for (ArticleStruct news : newsList) {
@@ -133,6 +154,7 @@ public class NewsCrawlerTask extends PythonCommandTask {
TbDataCollectionItem item = new TbDataCollectionItem();
// 基本信息
item.setID(IDUtils.generateID());
item.setTaskId(taskId);
item.setLogId(logId);
item.setTitle(news.getTitle());
@@ -156,7 +178,7 @@ public class NewsCrawlerTask extends PythonCommandTask {
String publishTimeStr = news.getPublishTime();
if (publishTimeStr != null && !publishTimeStr.isEmpty()) {
try {
item.setPublishTime(dateFormat.parse(publishTimeStr));
item.setPublishTime(dateFormat.parse(dateFormat.format(parser.parse(publishTimeStr))));
} catch (Exception e) {
logger.warn("解析发布时间失败: {}", publishTimeStr);
item.setPublishTime(now);