From a492b68fa21992e8cb3a892e22cf93464f908ea8 Mon Sep 17 00:00:00 2001 From: wangys <3401275564@qq.com> Date: Fri, 21 Nov 2025 16:49:37 +0800 Subject: [PATCH] =?UTF-8?q?=E7=88=AC=E8=99=AB=E5=AE=9E=E7=8E=B0=EF=BC=8C?= =?UTF-8?q?=E4=BF=AE=E6=94=B9class=E8=BD=AC=E6=A0=B7=E5=BC=8F=EF=BC=8C?= =?UTF-8?q?=E5=89=8D=E7=AB=AF=E6=B8=B2=E6=9F=93?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- schoolNewsCrawler/crawler/xhw/XhwCommend.py | 6 +- schoolNewsCrawler/crawler/xhw/XhwHotPoint.py | 6 +- schoolNewsCrawler/crawler/xxqg/XxqgCrawler.py | 64 +++++- .../.bin/mysql/sql/initCrontabMetaData.sql | 211 ++++++++++++++++-- .../xyzh/common/dto/crontab/TbCrontabLog.java | 2 +- .../xyzh/crontab/scheduler/TaskExecutor.java | 2 +- .../task/newsTask/NewsCrawlerTask.java | 16 +- schoolNewsWeb/src/types/crontab/index.ts | 33 ++- .../manage/crontab/LogManagementView.vue | 15 +- .../admin/manage/crontab/NewsCrawlerView.vue | 94 +++++++- .../manage/resource/ArticleManagementView.vue | 2 +- 11 files changed, 406 insertions(+), 45 deletions(-) diff --git a/schoolNewsCrawler/crawler/xhw/XhwCommend.py b/schoolNewsCrawler/crawler/xhw/XhwCommend.py index 8af14f7..c188437 100644 --- a/schoolNewsCrawler/crawler/xhw/XhwCommend.py +++ b/schoolNewsCrawler/crawler/xhw/XhwCommend.py @@ -1,8 +1,8 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- """ -新华网搜索爬虫命令行工具 -用法: python RmrbSearch.py --key "关键词" --total 10 --type 0 +新华网特别推荐爬虫命令行工具 +用法: python XhwCommend.py --output "输出文件路径" """ import argparse @@ -20,7 +20,7 @@ from loguru import logger def main(): """主函数""" parser = argparse.ArgumentParser( - description='新华网新闻搜索工具', + description='新华网特别推荐爬虫工具', formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" """ diff --git a/schoolNewsCrawler/crawler/xhw/XhwHotPoint.py b/schoolNewsCrawler/crawler/xhw/XhwHotPoint.py index 0216a20..356f4ae 100644 --- a/schoolNewsCrawler/crawler/xhw/XhwHotPoint.py +++ b/schoolNewsCrawler/crawler/xhw/XhwHotPoint.py @@ -1,8 +1,8 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- """ -新华网搜索爬虫命令行工具 -用法: python RmrbSearch.py --key "关键词" --total 10 --type 0 +新华网热点爬虫命令行工具 +用法: python XhwHotPoint.py --output "输出文件路径" """ import argparse @@ -20,7 +20,7 @@ from loguru import logger def main(): """主函数""" parser = argparse.ArgumentParser( - description='新华网新闻搜索工具', + description='新华网热点爬虫工具', formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" """ diff --git a/schoolNewsCrawler/crawler/xxqg/XxqgCrawler.py b/schoolNewsCrawler/crawler/xxqg/XxqgCrawler.py index f210db2..70fe8dd 100644 --- a/schoolNewsCrawler/crawler/xxqg/XxqgCrawler.py +++ b/schoolNewsCrawler/crawler/xxqg/XxqgCrawler.py @@ -250,6 +250,56 @@ class XxqgCrawler(BaseCrawler): # 相对路径,补全域名 return self.config.base_url + url + + def _extract_inline_style(self, element) -> str: + """ + 提取元素的计算样式并转换为inline style + + Args: + element: Selenium WebElement + + Returns: + inline style 字符串 + """ + # 需要提取的CSS属性列表 + css_properties = [ + 'text-align', + 'text-indent', + 'margin', + 'margin-top', + 'margin-bottom', + 'margin-left', + 'margin-right', + 'padding', + 'padding-top', + 'padding-bottom', + 'padding-left', + 'padding-right', + 'font-size', + 'font-weight', + 'font-style', + 'color', + 'background-color', + 'line-height', + 'letter-spacing', + 'word-spacing' + ] + + styles = [] + for prop in css_properties: + try: + value = element.value_of_css_property(prop) + # 过滤默认值和空值 + if value and value not in ['none', 'normal', 'auto', '0px', 'rgba(0, 0, 0, 0)', 'transparent']: + # 对于 margin/padding,如果都是 0px 就跳过 + if 'margin' in prop or 'padding' in prop: + if value == '0px' or value == '0': + continue + styles.append(f"{prop}: {value}") + except: + continue + + return "; ".join(styles) if styles else "" def parse_news_detail(self, url: str) -> NewsItem: news_item = NewsItem(title='', contentRows=[], url=url) @@ -355,11 +405,21 @@ class XxqgCrawler(BaseCrawler): text_content = child.text.strip() # 过滤空内容 if text_content: + # 提取计算样式并转换为inline style + inline_style = self._extract_inline_style(child) + tag_name = child.tag_name + + # 构建新的HTML标签(用inline style替代class) + if inline_style: + content_html = f'<{tag_name} style="{inline_style}">{child.get_attribute("innerHTML")}' + else: + content_html = f'<{tag_name}>{child.get_attribute("innerHTML")}' + news_item.contentRows.append({ "type": "text", - "content": text_content + "content": content_html }) - # logger.debug(f"提取文字: {text_content[:50]}...") + logger.debug(f"提取文字(转换样式): {text_content[:50]}...") except Exception as e: logger.warning(f"处理内容元素失败: {str(e)}") diff --git a/schoolNewsServ/.bin/mysql/sql/initCrontabMetaData.sql b/schoolNewsServ/.bin/mysql/sql/initCrontabMetaData.sql index dc9913a..998e93c 100644 --- a/schoolNewsServ/.bin/mysql/sql/initCrontabMetaData.sql +++ b/schoolNewsServ/.bin/mysql/sql/initCrontabMetaData.sql @@ -7,7 +7,7 @@ -- 1. 关键字搜索爬取 INSERT INTO `tb_crontab_task_meta` ( `id`, `meta_id`, `name`, `description`, `category`, - `bean_name`, `method_name`, `script_path`, `param_schema`, + `bean_name`, `method_name`, `script_path`, `param_schema`, `auto_publish`, `sort_order`, `creator`, `create_time` ) VALUES ( '1', @@ -22,19 +22,22 @@ INSERT INTO `tb_crontab_task_meta` ( { "name": "query", "description": "搜索关键字", - "type": "String", + "type": "Input", + "valueType": "String", "value": "", "required": true }, { "name": "total", "description": "总新闻数量", - "type": "Integer", + "type": "InputNumber", + "valueType": "Integer", "value": 10, "required": true } ]', 1, + 1, 'system', NOW() ); @@ -42,7 +45,7 @@ INSERT INTO `tb_crontab_task_meta` ( -- 2. 排行榜爬取 INSERT INTO `tb_crontab_task_meta` ( `id`, `meta_id`, `name`, `description`, `category`, - `bean_name`, `method_name`, `script_path`, `param_schema`, + `bean_name`, `method_name`, `script_path`, `param_schema`, `auto_publish`, `sort_order`, `creator`, `create_time` ) VALUES ( '2', @@ -54,6 +57,7 @@ INSERT INTO `tb_crontab_task_meta` ( 'execute', 'crawler/rmrb/RmrbHotPoint.py', '[]', + 1, 2, 'system', NOW() @@ -62,7 +66,7 @@ INSERT INTO `tb_crontab_task_meta` ( -- 3. 往日精彩头条爬取 INSERT INTO `tb_crontab_task_meta` ( `id`, `meta_id`, `name`, `description`, `category`, - `bean_name`, `method_name`, `script_path`, `param_schema`, + `bean_name`, `method_name`, `script_path`, `param_schema`, `auto_publish`, `sort_order`, `creator`, `create_time` ) VALUES ( '3', @@ -75,28 +79,199 @@ INSERT INTO `tb_crontab_task_meta` ( 'crawler/rmrb/RmrbTrending.py', '[ { - "name": "startDate", - "description": "开始日期", - "type": "String", + "name": "dateRange", + "description": "日期范围", + "type": "DateRangePicker", + "valueType": "String", "value": "", - "required": false - }, - { - "name": "endDate", - "description": "结束日期", - "type": "String", - "value": "", - "required": false + "required": false, + "startKey": "startDate", + "endKey": "endDate" }, { "name": "yesterday", "description": "是否是昨天", - "type": "Boolean", + "type": "Switch", + "valueType": "Boolean", "value": true, "required": false } ]', + 1, 3, 'system', NOW() -); \ No newline at end of file +); + +-- 4. 新华网关键字搜索爬取 +INSERT INTO `tb_crontab_task_meta` ( + `id`, `meta_id`, `name`, `description`, `category`, + `bean_name`, `method_name`, `script_path`, `param_schema`, `auto_publish`, + `sort_order`, `creator`, `create_time` +) VALUES ( + '4', + 'xhw_keyword_search', + '关键字搜索爬取', + '根据关键字搜索新华网新闻内容', + '新华网新闻爬取', + 'newsCrewerTask', + 'execute', + 'crawler/xhw/XhwSearch.py', + '[ + { + "name": "query", + "description": "搜索关键字", + "type": "Input", + "valueType": "String", + "value": "", + "required": true + }, + { + "name": "total", + "description": "抓取数量", + "type": "InputNumber", + "valueType": "Integer", + "value": 10, + "required": true + } + ]', + 1, + 4, + 'system', + NOW() +); + +-- 5. 新华网热点新闻爬取 +INSERT INTO `tb_crontab_task_meta` ( + `id`, `meta_id`, `name`, `description`, `category`, + `bean_name`, `method_name`, `script_path`, `param_schema`, `auto_publish`, + `sort_order`, `creator`, `create_time` +) VALUES ( + '5', + 'xhw_hot_point', + '热点新闻爬取', + '爬取新华网热点新闻', + '新华网新闻爬取', + 'newsCrewerTask', + 'execute', + 'crawler/xhw/XhwHotPoint.py', + '[]', + 1, + 5, + 'system', + NOW() +); + +-- 6. 新华网推荐新闻爬取 +INSERT INTO `tb_crontab_task_meta` ( + `id`, `meta_id`, `name`, `description`, `category`, + `bean_name`, `method_name`, `script_path`, `param_schema`, `auto_publish`, + `sort_order`, `creator`, `create_time` +) VALUES ( + '6', + 'xhw_commend', + '推荐新闻爬取', + '爬取新华网推荐新闻', + '新华网新闻爬取', + 'newsCrewerTask', + 'execute', + 'crawler/xhw/XhwCommend.py', + '[]', + 1, + 6, + 'system', + NOW() +); + +-- 7. 学习强国关键字搜索爬取 +INSERT INTO `tb_crontab_task_meta` ( + `id`, `meta_id`, `name`, `description`, `category`, + `bean_name`, `method_name`, `script_path`, `param_schema`, `auto_publish`, + `sort_order`, `creator`, `create_time` +) VALUES ( + '7', + 'xxqg_keyword_search', + '关键字搜索爬取', + '根据关键字搜索学习强国新闻内容', + '学习强国新闻爬取', + 'newsCrewerTask', + 'execute', + 'crawler/xxqg/XxqgSearch.py', + '[ + { + "name": "query", + "description": "搜索关键字", + "type": "Input", + "valueType": "String", + "value": "", + "required": true + }, + { + "name": "total", + "description": "抓取数量", + "type": "InputNumber", + "valueType": "Integer", + "value": 10, + "required": true + } + ]', + 1, + 7, + 'system', + NOW() +); + +-- 8. 学习强国栏目新闻爬取 +INSERT INTO `tb_crontab_task_meta` ( + `id`, `meta_id`, `name`, `description`, `category`, + `bean_name`, `method_name`, `script_path`, `param_schema`, `auto_publish`, + `sort_order`, `creator`, `create_time` +) VALUES ( + '8', + 'xxqg_column_crawl', + '栏目新闻爬取', + '爬取学习强国指定栏目的新闻内容', + '学习强国新闻爬取', + 'newsCrewerTask', + 'execute', + 'crawler/xxqg/XxqgColumn.py', + '[ + { + "name": "column", + "description": "栏目名称", + "type": "Select", + "valueType": "String", + "value": "important", + "required": true, + "options": [ + {"label": "重要新闻", "value": "important"}, + {"label": "学习时评", "value": "xuexishiping"}, + {"label": "综合新闻", "value": "zonghexinwen"}, + {"label": "中宣部发布", "value": "zhongxuanbu"} + ] + }, + { + "name": "yesterday", + "description": "是否抓取昨天的数据", + "type": "Switch", + "valueType": "Boolean", + "value": true, + "required": false + }, + { + "name": "dateRange", + "description": "日期范围", + "type": "DateRangePicker", + "valueType": "String", + "value": "", + "required": false, + "startKey": "start", + "endKey": "end" + } + ]', + 1, + 8, + 'system', + NOW() +); + diff --git a/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/dto/crontab/TbCrontabLog.java b/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/dto/crontab/TbCrontabLog.java index 4ac36b9..52ffc8b 100644 --- a/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/dto/crontab/TbCrontabLog.java +++ b/schoolNewsServ/common/common-dto/src/main/java/org/xyzh/common/dto/crontab/TbCrontabLog.java @@ -46,7 +46,7 @@ public class TbCrontabLog extends BaseDTO { private String methodParams; /** - * @description 执行状态(0:失败 1:成功) + * @description 执行状态(0:失败 1:成功,2 运行中) */ private Integer executeStatus; diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/scheduler/TaskExecutor.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/scheduler/TaskExecutor.java index 18a6b0e..15888a6 100644 --- a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/scheduler/TaskExecutor.java +++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/scheduler/TaskExecutor.java @@ -58,7 +58,7 @@ public class TaskExecutor { log.setDeleted(false); try { - log.setExecuteStatus(0); + log.setExecuteStatus(2); log.setExecuteMessage("执行中"); int i = logMapper.insertLog(log); diff --git a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/NewsCrawlerTask.java b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/NewsCrawlerTask.java index 105b2ee..1f98625 100644 --- a/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/NewsCrawlerTask.java +++ b/schoolNewsServ/crontab/src/main/java/org/xyzh/crontab/task/newsTask/NewsCrawlerTask.java @@ -15,6 +15,7 @@ import org.xyzh.api.system.role.RoleService; import org.xyzh.common.core.domain.ResultDomain; import org.xyzh.common.dto.crontab.TbCrontabEmailDefault; import org.xyzh.common.dto.crontab.TbCrontabEmailRecipient; +import org.xyzh.common.dto.crontab.TbCrontabLog; import org.xyzh.common.dto.crontab.TbCrontabTask; import org.xyzh.common.dto.crontab.TbCrontabTaskMeta; import org.xyzh.common.dto.crontab.TbDataCollectionItem; @@ -25,6 +26,7 @@ import org.xyzh.common.utils.NonUtils; import org.xyzh.common.vo.DataCollectionItemVO; import org.xyzh.common.vo.ResourceVO; import org.xyzh.common.vo.UserDeptRoleVO; +import org.xyzh.crontab.mapper.CrontabLogMapper; import org.xyzh.crontab.pojo.TaskParams; import org.xyzh.crontab.task.PythonCommandTask; @@ -79,6 +81,8 @@ public class NewsCrawlerTask extends PythonCommandTask { @Autowired private RoleService roleService; + @Autowired + private CrontabLogMapper logMapper; /** * 构建Python脚本参数 */ @@ -132,9 +136,12 @@ public class NewsCrawlerTask extends PythonCommandTask { String pythonArg = "--"+key; if (pythonArg != null && value != null) { if (value instanceof Boolean) { - // Boolean类型: true时只传参数名,false时不传 if ((Boolean) value) { args.add(pythonArg); + args.add("true"); + }else{ + args.add(pythonArg); + args.add("false"); } } else { // String/Integer类型: 传参数名+值 @@ -305,6 +312,13 @@ public class NewsCrawlerTask extends PythonCommandTask { } else { logger.warn("没有有效的新闻数据需要保存"); } + if(passList.isEmpty() && notPassList.isEmpty()){ + TbCrontabLog log = new TbCrontabLog(); + log.setID(logId); + log.setExecuteStatus(1); + log.setExecuteMessage("未爬取到数据"); + int i = logMapper.updateLog(log); + } // 自动发布并记录成功发布的 URL 集合 Set publishedUrls = new HashSet<>(); diff --git a/schoolNewsWeb/src/types/crontab/index.ts b/schoolNewsWeb/src/types/crontab/index.ts index 9c8cd84..3af7e8a 100644 --- a/schoolNewsWeb/src/types/crontab/index.ts +++ b/schoolNewsWeb/src/types/crontab/index.ts @@ -64,7 +64,7 @@ export interface CrontabLog extends BaseDTO { methodName?: string; /** 方法参数 */ methodParams?: string; - /** 执行状态(0:失败 1:成功) */ + /** 执行状态(0:失败 1:成功 2运行中) */ executeStatus?: number; /** 执行结果信息 */ executeMessage?: string; @@ -162,11 +162,38 @@ export interface CrontabParam { name: string; /** 参数描述 */ description: string; - /** 参数类型 */ - type: string; + /** + * 前端渲染的组件类型 + * - Input: 文本输入框 + * - InputNumber: 数字输入框 + * - DatePicker: 日期选择器 + * - DateRangePicker: 日期范围选择器 + * - Switch: 布尔开关 + * - Select: 下拉选择器 + */ + type: 'Input' | 'InputNumber' | 'DatePicker' | 'DateRangePicker' | 'Switch' | 'Select'; + /** + * 参数值的数据类型(后端处理使用) + * - String: 字符串 + * - Integer: 整数 + * - Boolean: 布尔值 + */ + valueType: 'String' | 'Integer' | 'Boolean'; /** 默认值 */ value: any; + /** 是否必填 */ required: boolean; + /** Select类型的选项列表 */ + options?: Array<{ + /** 选项显示文本 */ + label: string; + /** 选项值 */ + value: string; + }>; + /** DateRangePicker的开始日期参数名(如:startDate, start) */ + startKey?: string; + /** DateRangePicker的结束日期参数名(如:endDate, end) */ + endKey?: string; } /** diff --git a/schoolNewsWeb/src/views/admin/manage/crontab/LogManagementView.vue b/schoolNewsWeb/src/views/admin/manage/crontab/LogManagementView.vue index f5f6ce3..ddc7d15 100644 --- a/schoolNewsWeb/src/views/admin/manage/crontab/LogManagementView.vue +++ b/schoolNewsWeb/src/views/admin/manage/crontab/LogManagementView.vue @@ -41,6 +41,7 @@ > +
@@ -69,8 +70,11 @@ @@ -152,8 +156,11 @@
执行状态: - - {{ currentLog.executeStatus === 1 ? '成功' : '失败' }} + + {{ currentLog.executeStatus === 1 ? '成功' : currentLog.executeStatus === 2 ? '运行中' : '失败' }}
diff --git a/schoolNewsWeb/src/views/admin/manage/crontab/NewsCrawlerView.vue b/schoolNewsWeb/src/views/admin/manage/crontab/NewsCrawlerView.vue index 080c28f..730cb50 100644 --- a/schoolNewsWeb/src/views/admin/manage/crontab/NewsCrawlerView.vue +++ b/schoolNewsWeb/src/views/admin/manage/crontab/NewsCrawlerView.vue @@ -232,25 +232,67 @@ {{ param.description }} ({{ param.type }}) + + + + + + + + + + +
@@ -819,6 +861,24 @@ async function handleEdit(row: CrontabTask) { const params = JSON.parse(row.methodParams); // 排除系统参数 const { scriptPath, taskId, logId, ...restParams } = params; + + // 处理DateRangePicker参数:将startKey和endKey合并为dateRange数组 + if (method.params) { + for (const param of method.params) { + if (param.type === 'DateRangePicker' && param.startKey && param.endKey) { + const startValue = restParams[param.startKey]; + const endValue = restParams[param.endKey]; + if (startValue && endValue) { + // 合并为数组 + restParams[param.name] = [startValue, endValue]; + // 删除原始的start和end字段 + delete restParams[param.startKey]; + delete restParams[param.endKey]; + } + } + } + } + // 延迟设置,确保watch先执行完 setTimeout(() => { dynamicParams.value = restParams; @@ -971,11 +1031,12 @@ async function handleSubmit() { for (const param of selectedMethod.value.params) { const value = dynamicParams.value[param.name]; - if (param.required && param.type === 'String' && (!value || value.trim() === '')) { + // 使用valueType判断值类型 + if (param.required && param.valueType === 'String' && (!value || value.trim() === '')) { ElMessage.warning(`请输入${param.description}`); return; } - if (param.required && param.type === 'Integer' && (value === undefined || value === null || value === '')) { + if (param.required && param.valueType === 'Integer' && (value === undefined || value === null || value === '')) { ElMessage.warning(`请输入${param.description}`); return; } @@ -984,15 +1045,32 @@ async function handleSubmit() { submitting.value = true; try { + // 处理DateRangePicker参数,将数组拆分为开始和结束日期 + const processedParams = { ...dynamicParams.value }; + if (selectedMethod.value.params) { + for (const param of selectedMethod.value.params) { + if (param.type === 'DateRangePicker' && processedParams[param.name]) { + const dateRange = processedParams[param.name]; + if (Array.isArray(dateRange) && dateRange.length === 2) { + // 拆分为startKey和endKey + const startKey = (param as any).startKey || 'startDate'; + const endKey = (param as any).endKey || 'endDate'; + processedParams[startKey] = dateRange[0]; + processedParams[endKey] = dateRange[1]; + // 删除原始的range参数 + delete processedParams[param.name]; + } + } + } + } + // 构建CreateTaskRequest const requestData: CreateTaskRequest = { metaId: selectedMetaId.value, task: { ...formData, defaultRecipient: useDefaultRecipients.value, - methodParams: JSON.stringify({ - ...dynamicParams.value - }) + methodParams: JSON.stringify(processedParams) } as CrontabTask, additionalRecipients: additionalRecipients.value }; diff --git a/schoolNewsWeb/src/views/admin/manage/resource/ArticleManagementView.vue b/schoolNewsWeb/src/views/admin/manage/resource/ArticleManagementView.vue index bf6de8f..bbec306 100644 --- a/schoolNewsWeb/src/views/admin/manage/resource/ArticleManagementView.vue +++ b/schoolNewsWeb/src/views/admin/manage/resource/ArticleManagementView.vue @@ -6,11 +6,11 @@
+ 新增文章 - 数据采集