搜索关键字爬虫
This commit is contained in:
@@ -1,28 +0,0 @@
|
||||
# Crawler settings followed by the list of scheduled crawl tasks.
crawler:
|
||||
python:
|
||||
path: F:\Environment\Conda\envs\shoolNewsCrewer
|
||||
base:
|
||||
path: F:/Project/schoolNews/schoolNewsCrawler
|
||||
|
||||
crontab:
|
||||
items: # list of scheduled tasks that the front end can choose from
|
||||
- name: 人民日报新闻爬取
|
||||
methods: # crawl methods
|
||||
- name: 关键字搜索爬取
|
||||
class: org.xyzh.crontab.task.newsTask.NewsCrawlerTask
|
||||
path: crawler/RmrbSearch.py
|
||||
params:
|
||||
query: String # search keyword
|
||||
total: Integer # total number of news items
|
||||
- name: 排行榜爬取
|
||||
class: org.xyzh.crontab.task.newsTask.NewsCrawlerTask
|
||||
path: crawler/RmrbHotPoint.py
|
||||
- name: 往日精彩头条爬取
|
||||
class: org.xyzh.crontab.task.newsTask.NewsCrawlerTask
|
||||
path: crawler/RmrbTrending.py
|
||||
params:
|
||||
startDate: String # start date
|
||||
endDate: String # end date
|
||||
isYestoday: Boolean # whether it is yesterday
|
||||
|
||||
|
||||
47
schoolNewsServ/crontab/src/main/resources/application.yml
Normal file
47
schoolNewsServ/crontab/src/main/resources/application.yml
Normal file
@@ -0,0 +1,47 @@
|
||||
crawler:
|
||||
# Path to the Python executable (on Windows, point this at python.exe; if Python is already on PATH, "python" can be used directly)
|
||||
pythonPath: F:/Environment/Conda/envs/schoolNewsCrawler/python.exe
|
||||
# Root directory of the crawler scripts (the working directory of NewsCrawlerTask)
|
||||
basePath: F:/Project/schoolNews/schoolNewsCrawler
|
||||
|
||||
# The original scheduled-task list follows (unchanged; only moved into the correct file)
|
||||
crontab:
|
||||
items:
|
||||
- name: 人民日报新闻爬取
|
||||
methods:
|
||||
- name: 关键字搜索爬取
|
||||
clazz: newsCrewerTask
|
||||
# NOTE(review): "excuete_method" looks like a misspelling of "execute_method"; presumably the consumer reads this exact key, so confirm before renaming.
excuete_method: execute
|
||||
path: crawler/RmrbSearch.py
|
||||
params:
|
||||
- name: query
|
||||
description: 搜索关键字
|
||||
type: String
|
||||
value: ""
|
||||
- name: total
|
||||
description: 总新闻数量
|
||||
type: Integer
|
||||
value: 10
|
||||
- name: 排行榜爬取
|
||||
clazz: newsCrewerTask
|
||||
excuete_method: execute
|
||||
path: crawler/RmrbHotPoint.py
|
||||
- name: 往日精彩头条爬取
|
||||
clazz: newsCrewerTask
|
||||
excuete_method: execute
|
||||
path: crawler/RmrbTrending.py
|
||||
params:
|
||||
- name: startDate
|
||||
description: 开始日期
|
||||
type: String
|
||||
value: ""
|
||||
- name: endDate
|
||||
description: 结束日期
|
||||
type: String
|
||||
value: ""
|
||||
- name: yesterday
|
||||
description: 是否是昨天
|
||||
type: Boolean
|
||||
value: true
|
||||
|
||||
|
||||
@@ -186,7 +186,7 @@
|
||||
UPDATE tb_crontab_task
|
||||
SET deleted = 1,
|
||||
delete_time = NOW()
|
||||
WHERE id = #{taskId} AND deleted = 0
|
||||
WHERE task_id=#{taskId} AND deleted = 0
|
||||
</update>
|
||||
|
||||
<!-- 根据ID查询任务 -->
|
||||
@@ -194,7 +194,7 @@
|
||||
SELECT
|
||||
<include refid="Base_Column_List" />
|
||||
FROM tb_crontab_task
|
||||
WHERE id = #{taskId} AND deleted = 0
|
||||
WHERE task_id=#{taskId} AND deleted = 0
|
||||
</select>
|
||||
|
||||
<!-- 根据过滤条件查询任务列表 -->
|
||||
@@ -272,7 +272,7 @@
|
||||
UPDATE tb_crontab_task
|
||||
SET status = #{status},
|
||||
update_time = NOW()
|
||||
WHERE id = #{taskId} AND deleted = 0
|
||||
WHERE task_id=#{taskId} AND deleted = 0
|
||||
</update>
|
||||
|
||||
<!-- 根据Bean名称和方法名称查询任务 -->
|
||||
|
||||
@@ -0,0 +1,400 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE mapper
|
||||
PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
|
||||
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
|
||||
<mapper namespace="org.xyzh.crontab.mapper.DataCollectionItemMapper">
|
||||
|
||||
<!-- Base result map: binds the columns selected from tb_data_collection_item to TbDataCollectionItem properties. -->
<resultMap id="BaseResultMap" type="org.xyzh.common.dto.crontab.TbDataCollectionItem">
    <!-- Identity column -->
    <id property="id" column="id"/>

    <!-- Keys of the related task and log rows -->
    <result property="taskId" column="task_id"/>
    <result property="logId" column="log_id"/>

    <!-- Collected content -->
    <result property="title" column="title"/>
    <result property="content" column="content"/>
    <result property="summary" column="summary"/>
    <result property="source" column="source"/>
    <result property="sourceUrl" column="source_url"/>
    <result property="category" column="category"/>
    <result property="author" column="author"/>
    <result property="publishTime" column="publish_time"/>
    <result property="coverImage" column="cover_image"/>
    <result property="images" column="images"/>
    <result property="tags" column="tags"/>

    <!-- Processing state -->
    <result property="status" column="status"/>
    <result property="resourceId" column="resource_id"/>
    <result property="crawlTime" column="crawl_time"/>
    <result property="processTime" column="process_time"/>
    <result property="processor" column="processor"/>

    <!-- Audit and soft-delete columns -->
    <result property="createTime" column="create_time"/>
    <result property="updateTime" column="update_time"/>
    <result property="deleteTime" column="delete_time"/>
    <result property="deleted" column="deleted"/>
</resultMap>
|
||||
|
||||
<!-- VO result map (flat structure): collection item columns plus the joined task and log columns. -->
<resultMap id="VOResultMap" type="org.xyzh.common.vo.DataCollectionItemVO">
    <!-- Collection item fields.
         item_id is the aliased primary key of the item row, so it is declared with <id>
         rather than <result>; MyBatis uses id-flagged columns to identify result objects. -->
    <id column="item_id" property="id" />
    <result column="task_id" property="taskId" />
    <result column="log_id" property="logId" />
    <result column="title" property="title" />
    <result column="content" property="content" />
    <result column="summary" property="summary" />
    <result column="source" property="source" />
    <result column="source_url" property="sourceUrl" />
    <result column="category" property="category" />
    <result column="author" property="author" />
    <result column="publish_time" property="publishTime" />
    <result column="cover_image" property="coverImage" />
    <result column="images" property="images" />
    <result column="tags" property="tags" />
    <result column="status" property="status" />
    <result column="resource_id" property="resourceId" />
    <result column="crawl_time" property="crawlTime" />
    <result column="process_time" property="processTime" />
    <result column="processor" property="processor" />
    <result column="item_create_time" property="createTime" />
    <result column="item_update_time" property="updateTime" />

    <!-- Fields taken from the joined task row -->
    <result column="task_name" property="taskName" />
    <result column="task_group" property="taskGroup" />
    <result column="bean_name" property="beanName" />
    <result column="method_name" property="methodName" />
    <result column="method_params" property="methodParams" />

    <!-- Fields taken from the joined log row -->
    <result column="execute_status" property="executeStatus" />
    <result column="execute_duration" property="executeDuration" />
    <result column="start_time" property="startTime" />
    <result column="end_time" property="endTime" />
</resultMap>
|
||||
|
||||
<!-- Column list shared by the single-table queries. -->
<sql id="Base_Column_List">
    id, task_id, log_id,
    title, content, summary, source, source_url, category, author,
    publish_time, cover_image, images, tags,
    status, resource_id, crawl_time, process_time, processor,
    create_time, update_time, delete_time, deleted
</sql>
|
||||
|
||||
<!-- Column list for the VO queries. Aliases: i = collection item, t = task, l = log. -->
<sql id="VO_Column_List">
    i.id AS item_id,
    i.task_id, i.log_id,
    i.title, i.content, i.summary,
    i.source, i.source_url, i.category, i.author,
    i.publish_time, i.cover_image, i.images, i.tags,
    i.status, i.resource_id,
    i.crawl_time, i.process_time, i.processor,
    i.create_time AS item_create_time,
    i.update_time AS item_update_time,
    t.task_name, t.task_group, t.bean_name, t.method_name, t.method_params,
    l.execute_status, l.execute_duration, l.start_time, l.end_time
</sql>
|
||||
|
||||
<!-- Dynamic WHERE clause for methods whose filter argument is bound as @Param("filter"). -->
<sql id="Filter_Where_Clause">
    <!-- The soft-delete predicate is unconditional, so a literal WHERE is always valid here. -->
    WHERE deleted = 0
    <if test="filter != null">
        <if test="filter.id != null and filter.id != ''">AND id = #{filter.id}</if>
        <if test="filter.taskId != null and filter.taskId != ''">AND task_id = #{filter.taskId}</if>
        <if test="filter.logId != null and filter.logId != ''">AND log_id = #{filter.logId}</if>
        <if test="filter.title != null and filter.title != ''">AND title LIKE CONCAT('%', #{filter.title}, '%')</if>
        <if test="filter.source != null and filter.source != ''">AND source = #{filter.source}</if>
        <if test="filter.sourceUrl != null and filter.sourceUrl != ''">AND source_url = #{filter.sourceUrl}</if>
        <if test="filter.category != null and filter.category != ''">AND category = #{filter.category}</if>
        <if test="filter.author != null and filter.author != ''">AND author LIKE CONCAT('%', #{filter.author}, '%')</if>
        <if test="filter.status != null">AND status = #{filter.status}</if>
        <if test="filter.resourceId != null and filter.resourceId != ''">AND resource_id = #{filter.resourceId}</if>
        <if test="filter.processor != null and filter.processor != ''">AND processor = #{filter.processor}</if>
    </if>
</sql>
|
||||
|
||||
<!-- Dynamic WHERE clause for methods without an @Param annotation; properties are referenced by name directly. -->
<sql id="Item_Where_Clause">
    <!-- The soft-delete predicate is unconditional, so a literal WHERE is always valid here. -->
    WHERE deleted = 0
    <if test="_parameter != null">
        <if test="id != null and id != ''">AND id = #{id}</if>
        <if test="taskId != null and taskId != ''">AND task_id = #{taskId}</if>
        <if test="logId != null and logId != ''">AND log_id = #{logId}</if>
        <if test="title != null and title != ''">AND title LIKE CONCAT('%', #{title}, '%')</if>
        <if test="source != null and source != ''">AND source = #{source}</if>
        <if test="sourceUrl != null and sourceUrl != ''">AND source_url = #{sourceUrl}</if>
        <if test="category != null and category != ''">AND category = #{category}</if>
        <if test="author != null and author != ''">AND author LIKE CONCAT('%', #{author}, '%')</if>
        <if test="status != null">AND status = #{status}</if>
        <if test="resourceId != null and resourceId != ''">AND resource_id = #{resourceId}</if>
        <if test="processor != null and processor != ''">AND processor = #{processor}</if>
    </if>
</sql>
|
||||
|
||||
<!-- Fetch at most one non-deleted collection item by its source URL (used for de-duplication). -->
<select id="selectBySourceUrl" resultMap="BaseResultMap">
    SELECT <include refid="Base_Column_List"/>
    FROM tb_data_collection_item
    WHERE deleted = 0
      AND source_url = #{sourceUrl}
    LIMIT 1
</select>
|
||||
|
||||
<!-- List the non-deleted collection items of one task, newest first. -->
<select id="selectByTaskId" resultMap="BaseResultMap">
    SELECT <include refid="Base_Column_List"/>
    FROM tb_data_collection_item
    WHERE deleted = 0
      AND task_id = #{taskId}
    ORDER BY create_time DESC
</select>
|
||||
|
||||
<!-- List collection items matching the optional criteria of Item_Where_Clause, newest first. -->
<select id="selectItemList" resultMap="BaseResultMap">
    SELECT <include refid="Base_Column_List"/>
    FROM tb_data_collection_item
    <include refid="Item_Where_Clause"/>
    ORDER BY create_time DESC
</select>
|
||||
|
||||
<!-- Paged query of collection items matching the optional "filter" criteria, newest first.
     Page window comes from pageParam.pageSize and pageParam.offset. -->
<select id="selectItemPage" resultMap="BaseResultMap">
    SELECT
    <include refid="Base_Column_List" />
    FROM tb_data_collection_item
    <include refid="Filter_Where_Clause" />
    <!-- id is a tie-breaker: batchInsertItems stamps every row of a batch with the same NOW(),
         so ordering by create_time alone leaves the order of those rows undefined and
         LIMIT/OFFSET could repeat or skip rows between pages. -->
    ORDER BY create_time DESC, id DESC
    LIMIT #{pageParam.pageSize} OFFSET #{pageParam.offset}
</select>
|
||||
|
||||
<!-- Count the collection items matching the optional "filter" criteria. -->
<select id="countItems" resultType="long">
    SELECT COUNT(*) FROM tb_data_collection_item
    <include refid="Filter_Where_Clause"/>
</select>
|
||||
|
||||
<!-- Count non-deleted collection items, optionally narrowed by task ID and/or status. -->
<select id="countByStatus" resultType="long">
    SELECT COUNT(*) FROM tb_data_collection_item
    WHERE deleted = 0
    <if test="taskId != null and taskId != ''">AND task_id = #{taskId}</if>
    <if test="status != null">AND status = #{status}</if>
</select>
|
||||
|
||||
<!-- Insert every element of itemList with one multi-row INSERT.
     create_time and update_time are set to NOW() and deleted to 0 by the statement itself. -->
<insert id="batchInsertItems">
    INSERT INTO tb_data_collection_item (
        id, task_id, log_id,
        title, content, summary, source, source_url, category, author,
        publish_time, cover_image, images, tags,
        status, resource_id, crawl_time, process_time, processor,
        create_time, update_time, deleted
    )
    VALUES
    <foreach collection="itemList" item="row" separator=",">
        (
            #{row.id}, #{row.taskId}, #{row.logId},
            #{row.title}, #{row.content}, #{row.summary}, #{row.source}, #{row.sourceUrl},
            #{row.category}, #{row.author},
            #{row.publishTime}, #{row.coverImage}, #{row.images}, #{row.tags},
            #{row.status}, #{row.resourceId}, #{row.crawlTime}, #{row.processTime}, #{row.processor},
            NOW(), NOW(), 0
        )
    </foreach>
</insert>
|
||||
|
||||
<!-- ==================== VO查询方法(使用JOIN返回完整VO) ==================== -->
|
||||
|
||||
<!-- Load one non-deleted collection item as a VO, left-joined with its task and log rows. -->
<select id="selectVOById" resultMap="VOResultMap">
    SELECT <include refid="VO_Column_List"/>
    FROM tb_data_collection_item i
        LEFT JOIN tb_crontab_task t ON i.task_id = t.task_id
        LEFT JOIN tb_crontab_log l ON i.log_id = l.id
    WHERE i.deleted = 0
      AND i.id = #{itemId}
</select>
|
||||
|
||||
<!-- List collection item VOs (left-joined with task and log rows) matching the optional criteria, newest first.
     Criteria are read directly from the parameter object (no @Param prefix). -->
<select id="selectVOList" resultMap="VOResultMap">
    SELECT <include refid="VO_Column_List"/>
    FROM tb_data_collection_item i
        LEFT JOIN tb_crontab_task t ON i.task_id = t.task_id
        LEFT JOIN tb_crontab_log l ON i.log_id = l.id
    <!-- The soft-delete predicate is unconditional, so a literal WHERE is always valid here. -->
    WHERE i.deleted = 0
    <if test="_parameter != null">
        <if test="id != null and id != ''">AND i.id = #{id}</if>
        <if test="taskId != null and taskId != ''">AND i.task_id = #{taskId}</if>
        <if test="logId != null and logId != ''">AND i.log_id = #{logId}</if>
        <if test="title != null and title != ''">AND i.title LIKE CONCAT('%', #{title}, '%')</if>
        <if test="source != null and source != ''">AND i.source = #{source}</if>
        <if test="sourceUrl != null and sourceUrl != ''">AND i.source_url = #{sourceUrl}</if>
        <if test="category != null and category != ''">AND i.category = #{category}</if>
        <if test="author != null and author != ''">AND i.author LIKE CONCAT('%', #{author}, '%')</if>
        <if test="status != null">AND i.status = #{status}</if>
        <if test="resourceId != null and resourceId != ''">AND i.resource_id = #{resourceId}</if>
        <if test="processor != null and processor != ''">AND i.processor = #{processor}</if>
    </if>
    ORDER BY i.create_time DESC
</select>
|
||||
|
||||
<!-- Paged query of collection item VOs (left-joined with task and log rows) matching the
     optional "filter" criteria, newest first. Page window comes from pageParam.pageSize and pageParam.offset. -->
<select id="selectVOPage" resultMap="VOResultMap">
    SELECT
    <include refid="VO_Column_List" />
    FROM tb_data_collection_item i
    LEFT JOIN tb_crontab_task t ON i.task_id = t.task_id
    LEFT JOIN tb_crontab_log l ON i.log_id = l.id
    <where>
        i.deleted = 0
        <if test="filter != null">
            <if test="filter.id != null and filter.id != ''">
                AND i.id = #{filter.id}
            </if>
            <if test="filter.taskId != null and filter.taskId != ''">
                AND i.task_id = #{filter.taskId}
            </if>
            <if test="filter.logId != null and filter.logId != ''">
                AND i.log_id = #{filter.logId}
            </if>
            <if test="filter.title != null and filter.title != ''">
                AND i.title LIKE CONCAT('%', #{filter.title}, '%')
            </if>
            <if test="filter.source != null and filter.source != ''">
                AND i.source = #{filter.source}
            </if>
            <if test="filter.sourceUrl != null and filter.sourceUrl != ''">
                AND i.source_url = #{filter.sourceUrl}
            </if>
            <if test="filter.category != null and filter.category != ''">
                AND i.category = #{filter.category}
            </if>
            <if test="filter.author != null and filter.author != ''">
                AND i.author LIKE CONCAT('%', #{filter.author}, '%')
            </if>
            <if test="filter.status != null">
                AND i.status = #{filter.status}
            </if>
            <if test="filter.resourceId != null and filter.resourceId != ''">
                AND i.resource_id = #{filter.resourceId}
            </if>
            <if test="filter.processor != null and filter.processor != ''">
                AND i.processor = #{filter.processor}
            </if>
        </if>
    </where>
    <!-- i.id is a tie-breaker: batchInsertItems stamps every row of a batch with the same NOW(),
         so ordering by create_time alone leaves the order of those rows undefined and
         LIMIT/OFFSET could repeat or skip rows between pages. -->
    ORDER BY i.create_time DESC, i.id DESC
    LIMIT #{pageParam.pageSize} OFFSET #{pageParam.offset}
</select>
|
||||
|
||||
<!-- List the non-deleted collection item VOs of one task (left-joined with task and log rows), newest first. -->
<select id="selectVOByTaskId" resultMap="VOResultMap">
    SELECT <include refid="VO_Column_List"/>
    FROM tb_data_collection_item i
        LEFT JOIN tb_crontab_task t ON i.task_id = t.task_id
        LEFT JOIN tb_crontab_log l ON i.log_id = l.id
    WHERE i.deleted = 0
      AND i.task_id = #{taskId}
    ORDER BY i.create_time DESC
</select>
|
||||
|
||||
</mapper>
|
||||
Reference in New Issue
Block a user