爬虫实现,修改class转样式,前端渲染

This commit is contained in:
2025-11-21 16:49:37 +08:00
parent 7eab82c6af
commit a492b68fa2
11 changed files with 406 additions and 45 deletions

View File

@@ -7,7 +7,7 @@
-- 1. 关键字搜索爬取
INSERT INTO `tb_crontab_task_meta` (
`id`, `meta_id`, `name`, `description`, `category`,
`bean_name`, `method_name`, `script_path`, `param_schema`,
`bean_name`, `method_name`, `script_path`, `param_schema`, `auto_publish`,
`sort_order`, `creator`, `create_time`
) VALUES (
'1',
@@ -22,19 +22,22 @@ INSERT INTO `tb_crontab_task_meta` (
{
"name": "query",
"description": "搜索关键字",
"type": "String",
"type": "Input",
"valueType": "String",
"value": "",
"required": true
},
{
"name": "total",
"description": "总新闻数量",
"type": "Integer",
"type": "InputNumber",
"valueType": "Integer",
"value": 10,
"required": true
}
]',
1,
1,
'system',
NOW()
);
@@ -42,7 +45,7 @@ INSERT INTO `tb_crontab_task_meta` (
-- 2. 排行榜爬取
INSERT INTO `tb_crontab_task_meta` (
`id`, `meta_id`, `name`, `description`, `category`,
`bean_name`, `method_name`, `script_path`, `param_schema`,
`bean_name`, `method_name`, `script_path`, `param_schema`, `auto_publish`,
`sort_order`, `creator`, `create_time`
) VALUES (
'2',
@@ -54,6 +57,7 @@ INSERT INTO `tb_crontab_task_meta` (
'execute',
'crawler/rmrb/RmrbHotPoint.py',
'[]',
1,
2,
'system',
NOW()
@@ -62,7 +66,7 @@ INSERT INTO `tb_crontab_task_meta` (
-- 3. 往日精彩头条爬取
INSERT INTO `tb_crontab_task_meta` (
`id`, `meta_id`, `name`, `description`, `category`,
`bean_name`, `method_name`, `script_path`, `param_schema`,
`bean_name`, `method_name`, `script_path`, `param_schema`, `auto_publish`,
`sort_order`, `creator`, `create_time`
) VALUES (
'3',
@@ -75,28 +79,199 @@ INSERT INTO `tb_crontab_task_meta` (
'crawler/rmrb/RmrbTrending.py',
'[
{
"name": "startDate",
"description": "开始日期",
"type": "String",
"name": "dateRange",
"description": "日期范围",
"type": "DateRangePicker",
"valueType": "String",
"value": "",
"required": false
},
{
"name": "endDate",
"description": "结束日期",
"type": "String",
"value": "",
"required": false
"required": false,
"startKey": "startDate",
"endKey": "endDate"
},
{
"name": "yesterday",
"description": "是否是昨天",
"type": "Boolean",
"type": "Switch",
"valueType": "Boolean",
"value": true,
"required": false
}
]',
1,
3,
'system',
NOW()
);
);
-- 4. Xinhua Net keyword-search crawl
-- Seeds the task-meta row 'xhw_keyword_search', whose script_path is
-- crawler/xhw/XhwSearch.py.
-- param_schema is a JSON array stored as a string; it declares two required
-- parameters:
--   query : type Input,       valueType String,  default ""
--   total : type InputNumber, valueType Integer, default 10
-- "type" is presumably the form widget the front end renders and "valueType"
-- the data type of the value -- TODO confirm against the consumer.
-- Per the column list, the two integers after the JSON literal are
-- auto_publish = 1 and sort_order = 4.
-- NOTE(review): bean_name is spelled 'newsCrewerTask'; the Java class touched
-- in this commit is NewsCrawlerTask -- confirm this matches the registered bean name.
INSERT INTO `tb_crontab_task_meta` (
`id`, `meta_id`, `name`, `description`, `category`,
`bean_name`, `method_name`, `script_path`, `param_schema`, `auto_publish`,
`sort_order`, `creator`, `create_time`
) VALUES (
'4',
'xhw_keyword_search',
'关键字搜索爬取',
'根据关键字搜索新华网新闻内容',
'新华网新闻爬取',
'newsCrewerTask',
'execute',
'crawler/xhw/XhwSearch.py',
'[
{
"name": "query",
"description": "搜索关键字",
"type": "Input",
"valueType": "String",
"value": "",
"required": true
},
{
"name": "total",
"description": "抓取数量",
"type": "InputNumber",
"valueType": "Integer",
"value": 10,
"required": true
}
]',
1,
4,
'system',
NOW()
);
-- 5. Xinhua Net hot-news crawl
-- Seeds the task-meta row 'xhw_hot_point', whose script_path is
-- crawler/xhw/XhwHotPoint.py.
-- param_schema is the empty JSON array '[]', i.e. this row declares no parameters.
-- Per the column list, the two integers after it are
-- auto_publish = 1 and sort_order = 5.
-- NOTE(review): bean_name is spelled 'newsCrewerTask'; the Java class touched
-- in this commit is NewsCrawlerTask -- confirm this matches the registered bean name.
INSERT INTO `tb_crontab_task_meta` (
`id`, `meta_id`, `name`, `description`, `category`,
`bean_name`, `method_name`, `script_path`, `param_schema`, `auto_publish`,
`sort_order`, `creator`, `create_time`
) VALUES (
'5',
'xhw_hot_point',
'热点新闻爬取',
'爬取新华网热点新闻',
'新华网新闻爬取',
'newsCrewerTask',
'execute',
'crawler/xhw/XhwHotPoint.py',
'[]',
1,
5,
'system',
NOW()
);
-- 6. Xinhua Net recommended-news crawl
-- Seeds the task-meta row 'xhw_commend', whose script_path is
-- crawler/xhw/XhwCommend.py.
-- param_schema is the empty JSON array '[]', i.e. this row declares no parameters.
-- Per the column list, the two integers after it are
-- auto_publish = 1 and sort_order = 6.
-- NOTE(review): bean_name is spelled 'newsCrewerTask'; the Java class touched
-- in this commit is NewsCrawlerTask -- confirm this matches the registered bean name.
INSERT INTO `tb_crontab_task_meta` (
`id`, `meta_id`, `name`, `description`, `category`,
`bean_name`, `method_name`, `script_path`, `param_schema`, `auto_publish`,
`sort_order`, `creator`, `create_time`
) VALUES (
'6',
'xhw_commend',
'推荐新闻爬取',
'爬取新华网推荐新闻',
'新华网新闻爬取',
'newsCrewerTask',
'execute',
'crawler/xhw/XhwCommend.py',
'[]',
1,
6,
'system',
NOW()
);
-- 7. Xuexi Qiangguo keyword-search crawl
-- Seeds the task-meta row 'xxqg_keyword_search', whose script_path is
-- crawler/xxqg/XxqgSearch.py.
-- param_schema is a JSON array stored as a string; it declares two required
-- parameters:
--   query : type Input,       valueType String,  default ""
--   total : type InputNumber, valueType Integer, default 10
-- "type" is presumably the form widget the front end renders and "valueType"
-- the data type of the value -- TODO confirm against the consumer.
-- Per the column list, the two integers after the JSON literal are
-- auto_publish = 1 and sort_order = 7.
-- NOTE(review): bean_name is spelled 'newsCrewerTask'; the Java class touched
-- in this commit is NewsCrawlerTask -- confirm this matches the registered bean name.
INSERT INTO `tb_crontab_task_meta` (
`id`, `meta_id`, `name`, `description`, `category`,
`bean_name`, `method_name`, `script_path`, `param_schema`, `auto_publish`,
`sort_order`, `creator`, `create_time`
) VALUES (
'7',
'xxqg_keyword_search',
'关键字搜索爬取',
'根据关键字搜索学习强国新闻内容',
'学习强国新闻爬取',
'newsCrewerTask',
'execute',
'crawler/xxqg/XxqgSearch.py',
'[
{
"name": "query",
"description": "搜索关键字",
"type": "Input",
"valueType": "String",
"value": "",
"required": true
},
{
"name": "total",
"description": "抓取数量",
"type": "InputNumber",
"valueType": "Integer",
"value": 10,
"required": true
}
]',
1,
7,
'system',
NOW()
);
-- 8. Xuexi Qiangguo column crawl
-- Seeds the task-meta row 'xxqg_column_crawl', whose script_path is
-- crawler/xxqg/XxqgColumn.py.
-- param_schema is a JSON array stored as a string; it declares three parameters:
--   column    : type Select, valueType String, required, default "important";
--               options are important / xuexishiping / zonghexinwen / zhongxuanbu
--   yesterday : type Switch, valueType Boolean, optional, default true
--   dateRange : type DateRangePicker, valueType String, optional, default "",
--               with startKey "start" and endKey "end"
-- "type" is presumably the form widget the front end renders -- TODO confirm.
-- startKey/endKey presumably name the two arguments the selected range is split
-- into; NOTE(review): the rmrb trending task's dateRange uses
-- "startDate"/"endDate" instead -- confirm "start"/"end" is what XxqgColumn.py expects.
-- Per the column list, the two integers after the JSON literal are
-- auto_publish = 1 and sort_order = 8.
-- NOTE(review): bean_name is spelled 'newsCrewerTask'; the Java class touched
-- in this commit is NewsCrawlerTask -- confirm this matches the registered bean name.
INSERT INTO `tb_crontab_task_meta` (
`id`, `meta_id`, `name`, `description`, `category`,
`bean_name`, `method_name`, `script_path`, `param_schema`, `auto_publish`,
`sort_order`, `creator`, `create_time`
) VALUES (
'8',
'xxqg_column_crawl',
'栏目新闻爬取',
'爬取学习强国指定栏目的新闻内容',
'学习强国新闻爬取',
'newsCrewerTask',
'execute',
'crawler/xxqg/XxqgColumn.py',
'[
{
"name": "column",
"description": "栏目名称",
"type": "Select",
"valueType": "String",
"value": "important",
"required": true,
"options": [
{"label": "重要新闻", "value": "important"},
{"label": "学习时评", "value": "xuexishiping"},
{"label": "综合新闻", "value": "zonghexinwen"},
{"label": "中宣部发布", "value": "zhongxuanbu"}
]
},
{
"name": "yesterday",
"description": "是否抓取昨天的数据",
"type": "Switch",
"valueType": "Boolean",
"value": true,
"required": false
},
{
"name": "dateRange",
"description": "日期范围",
"type": "DateRangePicker",
"valueType": "String",
"value": "",
"required": false,
"startKey": "start",
"endKey": "end"
}
]',
1,
8,
'system',
NOW()
);

View File

@@ -46,7 +46,7 @@ public class TbCrontabLog extends BaseDTO {
private String methodParams;
/**
* @description 执行状态0:失败 1:成功)
* @description 执行状态(0:失败,1:成功,2:运行中)
*/
private Integer executeStatus;

View File

@@ -58,7 +58,7 @@ public class TaskExecutor {
log.setDeleted(false);
try {
log.setExecuteStatus(0);
log.setExecuteStatus(2);
log.setExecuteMessage("执行中");
int i = logMapper.insertLog(log);

View File

@@ -15,6 +15,7 @@ import org.xyzh.api.system.role.RoleService;
import org.xyzh.common.core.domain.ResultDomain;
import org.xyzh.common.dto.crontab.TbCrontabEmailDefault;
import org.xyzh.common.dto.crontab.TbCrontabEmailRecipient;
import org.xyzh.common.dto.crontab.TbCrontabLog;
import org.xyzh.common.dto.crontab.TbCrontabTask;
import org.xyzh.common.dto.crontab.TbCrontabTaskMeta;
import org.xyzh.common.dto.crontab.TbDataCollectionItem;
@@ -25,6 +26,7 @@ import org.xyzh.common.utils.NonUtils;
import org.xyzh.common.vo.DataCollectionItemVO;
import org.xyzh.common.vo.ResourceVO;
import org.xyzh.common.vo.UserDeptRoleVO;
import org.xyzh.crontab.mapper.CrontabLogMapper;
import org.xyzh.crontab.pojo.TaskParams;
import org.xyzh.crontab.task.PythonCommandTask;
@@ -79,6 +81,8 @@ public class NewsCrawlerTask extends PythonCommandTask {
@Autowired
private RoleService roleService;
@Autowired
private CrontabLogMapper logMapper;
/**
* 构建Python脚本参数
*/
@@ -132,9 +136,12 @@ public class NewsCrawlerTask extends PythonCommandTask {
String pythonArg = "--"+key;
if (pythonArg != null && value != null) {
if (value instanceof Boolean) {
// Boolean类型: true时只传参数名false时不传
if ((Boolean) value) {
args.add(pythonArg);
args.add("true");
}else{
args.add(pythonArg);
args.add("false");
}
} else {
// String/Integer类型: 传参数名+值
@@ -305,6 +312,13 @@ public class NewsCrawlerTask extends PythonCommandTask {
} else {
logger.warn("没有有效的新闻数据需要保存");
}
if(passList.isEmpty() && notPassList.isEmpty()){
TbCrontabLog log = new TbCrontabLog();
log.setID(logId);
log.setExecuteStatus(1);
log.setExecuteMessage("未爬取到数据");
int i = logMapper.updateLog(log);
}
// 自动发布并记录成功发布的 URL 集合
Set<String> publishedUrls = new HashSet<>();