Files
schoolNews/schoolNewsServ/.bin/mysql/sql/sensitiveData/importSensitiveWords.sh
2025-11-24 11:50:15 +08:00

183 lines
4.7 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
##############################################
# Sensitive-word bulk import script (pure shell implementation)
# Purpose:   read sensitive words from sensitive_word_dict.txt and import
#            them into the database
# Advantage: no Python environment is needed, only the MySQL client
##############################################

# Abort on the first command that fails without being handled.
set -o errexit

# Colour codes for log output. They are stored as literal backslash text and
# only become real escape characters when the log helpers print them.
NC="\033[0m"        # reset / no colour
BLUE="\033[0;34m"
YELLOW="\033[1;33m"
GREEN="\033[0;32m"
RED="\033[0;31m"
#######################################
# Print an informational message with a green [INFO] tag.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text; backslash escapes in it are interpreted
# Outputs:   one line on stdout
#######################################
log_info() {
  printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
#######################################
# Print a warning message with a yellow [WARN] tag.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text; backslash escapes in it are interpreted
# Outputs:   one line on stdout
#######################################
log_warn() {
  local tag="${YELLOW}[WARN]${NC}"
  printf '%b\n' "${tag} $1"
}
#######################################
# Print an error message with a red [ERROR] tag.
# Globals:   RED, NC (read)
# Arguments: $1 - message text; backslash escapes in it are interpreted
# Outputs:   one line on stderr, so that errors stay visible when stdout is
#            redirected or piped
#######################################
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1" >&2
}
# Database connection settings. Each value can be overridden by exporting an
# environment variable of the same name; the text after ":-" is the fallback
# used when the variable is unset or empty.
DB_HOST="${DB_HOST:-localhost}"
DB_PORT="${DB_PORT:-3306}"
DB_USER="${DB_USER:-root}"
# NOTE(review): the fallback is a well-known weak password. It is kept only so
# existing invocations keep working; export DB_PASSWORD explicitly instead.
DB_PASSWORD="${DB_PASSWORD:-123456}"
DB_NAME="${DB_NAME:-school_news}"

# Absolute directory that contains this script; the dictionary file is looked
# up next to it. CDPATH is emptied for the `cd` because a user-defined CDPATH
# could otherwise send a relative `cd` to a different directory and make `cd`
# print that directory into the captured output.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
DICT_FILE="${SCRIPT_DIR}/sensitive_word_dict.txt"
# Start-up banner: tool title, then the target database and the dictionary
# path, framed by rule lines and followed by one blank line.
printf '%s\n' \
  "==================================================" \
  "敏感词批量导入工具 (Shell版本)" \
  "=================================================="
log_info "数据库: ${DB_HOST}:${DB_PORT}/${DB_NAME}"
log_info "敏感词文件: ${DICT_FILE}"
printf '%s\n' "==================================================" ""
# The dictionary must exist before any database work is attempted.
if [ ! -f "${DICT_FILE}" ]; then
  log_error "敏感词文件不存在: ${DICT_FILE}"
  exit 1
fi

# Count the non-empty lines of the dictionary.
# `grep -c` prints a bare number (no padding as some `wc -l` builds add) and
# exits 1 when the count is 0, which is not an error here. An exit status
# above 1 is a real failure (e.g. the file cannot be read) and must not be
# mistaken for an empty dictionary.
grep_status=0
TOTAL_WORDS=$(grep -c -v '^$' -- "${DICT_FILE}") || grep_status=$?
if [ "${grep_status}" -gt 1 ]; then
  log_error "无法读取敏感词文件: ${DICT_FILE}"
  exit 1
fi
log_info "检测到 ${TOTAL_WORDS} 个敏感词"

# Nothing to import: leave without touching the database.
if [ "${TOTAL_WORDS}" -eq 0 ]; then
  log_warn "敏感词文件为空"
  exit 0
fi
# Verify that the database can be reached before asking for confirmation.
log_info "检查数据库连接..."

# A missing client would otherwise be reported as a connection failure.
if ! command -v mysql >/dev/null 2>&1; then
  log_error "未找到 mysql 客户端命令"
  exit 1
fi

# NOTE(review): the password is passed on the command line, where other local
# users may be able to see it in the process list; consider an option file.
# Only stderr is captured (stdout is discarded) so the client's own error text
# can be shown when the probe query fails.
if ! conn_err=$(mysql -h"${DB_HOST}" -P"${DB_PORT}" -u"${DB_USER}" -p"${DB_PASSWORD}" -e "SELECT 1;" 2>&1 >/dev/null); then
  log_error "数据库连接失败"
  if [ -n "${conn_err}" ]; then
    printf '%s\n' "${conn_err}" >&2
  fi
  exit 1
fi
log_info "数据库连接成功"
echo ""
# Ask for confirmation: the import replaces every existing 'deny' word.
log_warn "准备导入 ${TOTAL_WORDS} 个敏感词到数据库"
log_warn "这将清除现有的 deny 类型敏感词"
echo ""

# Automatic mode is selected by a first argument of -y / --yes or by
# AUTO_CONFIRM=true in the environment; otherwise a single key is read.
AUTO_CONFIRM=${AUTO_CONFIRM:-false}
if [ "${1:-}" = "-y" ] || [ "${1:-}" = "--yes" ] || [ "${AUTO_CONFIRM}" = "true" ]; then
  log_info "自动确认模式,开始导入..."
else
  REPLY=""
  # `read` fails when stdin is closed or not interactive (cron, CI, pipes).
  # Without this check `set -e` would terminate the script with no message.
  if ! read -p "是否继续?(y/N): " -n 1 -r; then
    echo
    log_error "无法读取确认输入,非交互环境请使用 -y 参数或设置 AUTO_CONFIRM=true"
    exit 1
  fi
  echo
  # Anything other than a single y/Y (including just Enter) cancels.
  if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    log_warn "用户取消导入"
    exit 0
  fi
fi
echo ""
# Start the import and remember the start time for the duration report.
log_info "开始导入敏感词..."
START_TIME=$(date +%s)

# Temporary SQL file, removed on every exit path. The trap body is single
# quoted so the path is expanded (and stays quoted) only when the trap fires.
TEMP_SQL=$(mktemp)
trap 'rm -f -- "${TEMP_SQL}"' EXIT

# SQL preamble. The DELETE and all INSERTs run inside one transaction, so a
# failing INSERT does not leave the table with its old 'deny' words removed
# (the mysql client stops at the first error and COMMIT is never reached).
# NOTE(review): this only protects transactional tables such as InnoDB -
# confirm the storage engine of tb_sensitive_word.
log_info "生成SQL语句..."
cat > "${TEMP_SQL}" <<EOF
-- 敏感词批量导入
USE ${DB_NAME};
-- 设置字符集
SET NAMES utf8mb4;
START TRANSACTION;
-- 清除现有的deny类型敏感词
DELETE FROM tb_sensitive_word WHERE type = 'deny';
-- 批量插入敏感词
EOF

# Turn every non-empty dictionary line into one row of a multi-row INSERT.
# Rows are written through file descriptor 3 so the file is opened only once
# and progress messages on stdout do not end up in the SQL file.
COUNTER=0
INSERT_BATCH_SIZE=1000   # rows per INSERT statement; keeps statements bounded
while IFS= read -r word || [ -n "$word" ]; do
  # Drop the carriage return left by CRLF line endings.
  word=${word%$'\r'}
  # Skip blank lines.
  if [ -z "$word" ]; then
    continue
  fi
  # Escape for a MySQL string literal: backslashes first, then single quotes.
  # Assumes the server does not run with NO_BACKSLASH_ESCAPES (MySQL default).
  word=${word//\\/\\\\}
  word=${word//\'/\'\'}
  # The separator is written BEFORE each row, so no row needs to know whether
  # it is the last one: a new statement starts every INSERT_BATCH_SIZE rows,
  # all other rows are preceded by a comma.
  if [ $((COUNTER % INSERT_BATCH_SIZE)) -eq 0 ]; then
    if [ "${COUNTER}" -gt 0 ]; then
      printf ';\n' >&3
    fi
    printf 'INSERT INTO tb_sensitive_word (word, type) VALUES\n' >&3
  else
    printf ',\n' >&3
  fi
  printf "('%s', 'deny')" "$word" >&3
  COUNTER=$((COUNTER + 1))
  # Progress message every 1000 words.
  if [ $((COUNTER % 1000)) -eq 0 ]; then
    log_info "已处理 ${COUNTER}/${TOTAL_WORDS} 个敏感词..."
  fi
done < "${DICT_FILE}" 3>> "${TEMP_SQL}"

# No usable word was found (e.g. only blank or CR-only lines): stop here so
# the DELETE is never executed against the database.
if [ "${COUNTER}" -eq 0 ]; then
  log_warn "敏感词文件为空"
  exit 0
fi

# Terminate the last INSERT, commit, and append the verification query.
cat >> "${TEMP_SQL}" <<EOF
;
COMMIT;
-- 验证导入结果
SELECT COUNT(*) AS '导入数量' FROM tb_sensitive_word WHERE type = 'deny';
EOF
log_info "SQL语句生成完成(${COUNTER}个敏感词)"
echo ""
# Feed the generated SQL file to the mysql client; stop with exit status 1
# when the client reports a failure.
log_info "执行数据库导入..."
if ! mysql -h"${DB_HOST}" -P"${DB_PORT}" -u"${DB_USER}" -p"${DB_PASSWORD}" < "${TEMP_SQL}"; then
  log_error "数据库导入失败"
  exit 1
fi

# Success: print a framed summary with the word count and the elapsed time
# (difference of two `date +%s` readings).
END_TIME="$(date +%s)"
DURATION=$(( END_TIME - START_TIME ))
printf '\n%s\n' "=================================================="
log_info "导入完成!"
log_info "成功导入: ${COUNTER} 个敏感词"
log_info "耗时: ${DURATION}"
printf '%s\n' "=================================================="
# Verify the result: count the 'deny' rows now in the table and compare the
# number with the words written to the SQL file.
log_info "验证导入结果..."
# The query is checked explicitly; otherwise `set -e` would end the script
# without any message right after the import was reported as successful.
if ! IMPORTED_COUNT=$(mysql -h"${DB_HOST}" -P"${DB_PORT}" -u"${DB_USER}" -p"${DB_PASSWORD}" -D"${DB_NAME}" -sNe \
  "SELECT COUNT(*) FROM tb_sensitive_word WHERE type = 'deny';"); then
  log_error "验证查询失败"
  exit 1
fi
echo ""
log_info "数据库中当前有 ${IMPORTED_COUNT} 个 deny 类型敏感词"
# A mismatch is only reported as a warning; the script still ends normally.
if [ "${IMPORTED_COUNT}" -eq "${COUNTER}" ]; then
  log_info "✅ 验证通过:导入数量与预期一致"
else
  log_warn "⚠️ 导入数量不匹配:预期 ${COUNTER},实际 ${IMPORTED_COUNT}"
fi
echo ""
log_info "敏感词导入完成!"