Course and article audit

2025-11-19 15:11:30 +08:00
parent 4ab8877b80
commit d568781ce9
24 changed files with 379 additions and 65 deletions


@@ -9,6 +9,7 @@ import org.xyzh.api.crontab.DataCollectionItemService;
import org.xyzh.api.crontab.EmailDefaultService;
import org.xyzh.api.crontab.EmailRecipientService;
import org.xyzh.api.crontab.TaskMetaService;
import org.xyzh.api.news.resource.ResourceAuditService;
import org.xyzh.api.news.resource.ResourceService;
import org.xyzh.api.system.role.RoleService;
import org.xyzh.common.core.domain.ResultDomain;
@@ -65,6 +66,9 @@ public class NewsCrawlerTask extends PythonCommandTask {
@Autowired
private EmailRecipientService emailRecipientService;
@Autowired
private ResourceAuditService auditService;
@Autowired
private EmailUtils emailUtils;
@@ -196,7 +200,6 @@ public class NewsCrawlerTask extends PythonCommandTask {
logger.info("开始保存 {} 条新闻到数据库任务ID: {}日志ID: {}", newsList.size(), taskId, logId);
try {
List<TbDataCollectionItem> itemList = new ArrayList<>();
ResultDomain<TbCrontabTaskMeta> metaResult = taskMetaService.getTaskMetaByTaskId(taskId);
if (!metaResult.isSuccess() || metaResult.getData() == null) {
throw new Exception("未找到任务元数据: taskId=" + taskId);
@@ -206,7 +209,9 @@ public class NewsCrawlerTask extends PythonCommandTask {
Date now = new Date();
SimpleDateFormat parser = new SimpleDateFormat("yyyy年MM月dd日HH:mm");
SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
List<TbDataCollectionItem> itemList = new ArrayList<>();
List<TbDataCollectionItem> notPassList = new ArrayList<>();
List<TbDataCollectionItem> passList = new ArrayList<>();
for (ArticleStruct news : newsList) {
try {
TbDataCollectionItem item = new TbDataCollectionItem();
@@ -260,7 +265,14 @@ public class NewsCrawlerTask extends PythonCommandTask {
// Status and timestamps
item.setStatus(0); // 0 = unprocessed
item.setCrawlTime(now);
ResultDomain<Boolean> pass = auditService.auditText(item.getContent());
if (pass.isSuccess() && pass.getData()) {
item.setIsAudited(true);
passList.add(item);
} else {
item.setIsAudited(false);
notPassList.add(item);
}
itemList.add(item);
} catch (Exception e) {
logger.error("转换新闻数据失败: ", e);
@@ -269,26 +281,40 @@ public class NewsCrawlerTask extends PythonCommandTask {
// Batch save
Set<String> insertedUrls = new HashSet<>();
ResultDomain<TbDataCollectionItem> dataResult = new ResultDomain<>();
if (!itemList.isEmpty()) {
dataResult = itemService.batchCreateItems(itemList);
if (dataResult.isSuccess()) {
logger.info("成功保存 {} 条新闻到数据库", itemList.size());
insertedUrls.addAll(dataResult.getDataList().stream().map(TbDataCollectionItem::getSourceUrl).toList());
ResultDomain<TbDataCollectionItem> passDataResult = new ResultDomain<>();
if (!passList.isEmpty()) {
passDataResult = itemService.batchCreateItems(passList);
if (passDataResult.isSuccess()) {
logger.info("成功保存 {} 条新闻到数据库", passList.size());
insertedUrls.addAll(passDataResult.getDataList().stream().map(TbDataCollectionItem::getSourceUrl).toList());
} else {
logger.error("保存新闻到数据库失败: {}", dataResult.getMessage());
logger.error("保存新闻到数据库失败: {}", passDataResult.getMessage());
}
} else {
logger.warn("没有有效的新闻数据需要保存");
}
ResultDomain<TbDataCollectionItem> notPassDataResult = new ResultDomain<>();
if (!notPassList.isEmpty()) {
notPassDataResult = itemService.batchCreateItems(notPassList);
if (notPassDataResult.isSuccess()) {
logger.info("成功保存 {} 条新闻到数据库", notPassList.size());
insertedUrls.addAll(notPassDataResult.getDataList().stream().map(TbDataCollectionItem::getSourceUrl).toList());
} else {
logger.error("保存新闻到数据库失败: {}", notPassDataResult.getMessage());
}
} else {
logger.warn("没有有效的新闻数据需要保存");
}
// Auto-publish and record the set of successfully published URLs
Set<String> publishedUrls = new HashSet<>();
if (Boolean.TRUE.equals(taskMeta.getAutoPublish())) { // null-safe check on the autoPublish flag
publishedUrls = publishNewsToArticle(dataResult.getDataList(), task, logId);
publishedUrls = publishNewsToArticle(passDataResult.getDataList(), task, logId);
}
Set<String> notPassUrls = new HashSet<>(notPassList.stream().map(TbDataCollectionItem::getSourceUrl).toList());
// Send email notification, including auto-publish and insertion info
sendEmailNotification(task.getTaskId(), task, newsList, insertedUrls, publishedUrls);
sendEmailNotification(task.getTaskId(), task, newsList, insertedUrls, publishedUrls, notPassUrls);
} catch (Exception e) {
logger.error("保存新闻数据到数据库异常: ", e);
@@ -300,7 +326,9 @@ public class NewsCrawlerTask extends PythonCommandTask {
*/
private void sendEmailNotification(String taskId, TbCrontabTask task, List<ArticleStruct> newsList,
Set<String> insertedUrls,
Set<String> publishedUrls) {
Set<String> publishedUrls,
Set<String> notPassUrls
) {
try {
List<String> recipients = new ArrayList<>();
@@ -336,7 +364,7 @@ public class NewsCrawlerTask extends PythonCommandTask {
// 5. 构建邮件内容
String subject = "[News Crawler Notification] " + task.getTaskName() + " execution completed";
String content = buildEmailContent(task.getTaskName(), newsList, insertedUrls, publishedUrls);
String content = buildEmailContent(task.getTaskName(), newsList, insertedUrls, publishedUrls, notPassUrls);
// 6. 发送邮件
int successCount = 0;
@@ -357,8 +385,9 @@ public class NewsCrawlerTask extends PythonCommandTask {
* Build the HTML email content
*/
private String buildEmailContent(String taskName, List<ArticleStruct> newsList,
java.util.Set<String> insertedUrls,
java.util.Set<String> publishedUrls) {
Set<String> insertedUrls,
Set<String> publishedUrls,
Set<String> notPassUrls) {
StringBuilder html = new StringBuilder();
html.append("<!DOCTYPE html>")
.append("<html>")
@@ -416,6 +445,7 @@ public class NewsCrawlerTask extends PythonCommandTask {
html.append(" | <a href='").append(news.getUrl()).append("' class='news-link' target='_blank'>查看原文</a>");
}
// Insertion badge (newly added / already existed in history)
if (news.getUrl() != null && !news.getUrl().isEmpty() && insertedUrls != null) {
if (insertedUrls.contains(news.getUrl())) {
@@ -425,6 +455,12 @@ public class NewsCrawlerTask extends PythonCommandTask {
}
}
// If this item did not pass the audit, append a badge
if (notPassUrls != null && !notPassUrls.isEmpty()
&& news.getUrl() != null && notPassUrls.contains(news.getUrl())) {
html.append(" | <span style='color:#ff0000;font-weight:bold;'>[Audit Failed]</span>");
}
// If this news item was auto-published, append a badge
if (publishedUrls != null && !publishedUrls.isEmpty()
&& news.getUrl() != null && publishedUrls.contains(news.getUrl())) {
@@ -435,12 +471,6 @@ public class NewsCrawlerTask extends PythonCommandTask {
.append("</div>");
}
if (newsList.size() > 10) {
html.append("<p style='text-align: center; color: #666; margin-top: 15px;'>")
.append("还有 ").append(newsList.size() - 10).append(" 条新闻未显示,请登录系统查看详情")
.append("</p>");
}
html.append("</div>"); // news-list
html.append("</div>"); // content
@@ -498,7 +528,7 @@ public class NewsCrawlerTask extends PythonCommandTask {
resource.setAuthor(item.getAuthor());
resource.setSource(item.getSource());
resource.setSourceUrl(item.getSourceUrl());
resource.setIsAudited(true);
// Publish time: prefer the time from the collection record
Date publishTime = item.getPublishTime() != null ? item.getPublishTime() : now;
resource.setPublishTime(publishTime);
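
Since only the audit-passed batch (passDataResult.getDataList()) reaches publishNewsToArticle, hard-coding setIsAudited(true) here is consistent with the new flow. A slightly more defensive variant, should this method ever receive a mixed batch, would carry the flag over from the item (illustrative only):

// Propagate the item's audit flag instead of hard-coding true.
resource.setIsAudited(Boolean.TRUE.equals(item.getIsAudited()));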


@@ -21,6 +21,7 @@
<result column="images" property="images" />
<result column="tags" property="tags" />
<result column="status" property="status" />
<result column="is_audited" property="isAudited" />
<result column="resource_id" property="resourceId" />
<result column="crawl_time" property="crawlTime" />
<result column="process_time" property="processTime" />
@@ -51,6 +52,7 @@
<result column="images" property="images" />
<result column="tags" property="tags" />
<result column="status" property="status" />
<result column="is_audited" property="isAudited" />
<result column="resource_id" property="resourceId" />
<result column="crawl_time" property="crawlTime" />
<result column="process_time" property="processTime" />
@@ -77,7 +79,7 @@
<!-- Column list -->
<sql id="Base_Column_List">
id, task_id, log_id, title, content, summary, source, source_url, category, author,
publish_time, cover_image, images, tags, status, resource_id, crawl_time, process_time,
publish_time, cover_image, images, tags, status, is_audited, resource_id, crawl_time, process_time,
processor, execute_status, execute_message, create_time, update_time, delete_time, deleted
</sql>
@@ -98,6 +100,7 @@
i.images,
i.tags,
i.status,
i.is_audited,
i.resource_id,
i.crawl_time,
i.process_time,
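
The mapper changes above assume a matching property on the entity. A minimal sketch of that addition, assuming TbDataCollectionItem uses the conventional getter/setter pattern implied by setIsAudited(...) in the task code (field placement is illustrative):

// New column is_audited maps to a Boolean field on TbDataCollectionItem.
private Boolean isAudited;

public Boolean getIsAudited() { return isAudited; }
public void setIsAudited(Boolean isAudited) { this.isAudited = isAudited; }

The underlying table would also need the column itself (for example via an ALTER TABLE ... ADD COLUMN is_audited migration), which is not part of the files shown here.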