2025-11-24 11:50:15 +08:00
|
|
|
|
#!/bin/bash
|
|
|
|
|
|
|
|
|
|
|
|
##############################################
# Sensitive-word bulk import script (pure shell implementation)
# Purpose: read sensitive words from sensitive_word_dict.txt and import them into the database
# Advantage: no Python environment is required; only the MySQL client is needed
##############################################
|
|
|
|
|
|
|
|
|
|
|
|
# Stop at the first command that fails without being checked.
set -o errexit

# ANSI color escape sequences, stored unexpanded (the log helpers expand them
# when printing). NC switches the color back off.
RED='\033[0;31m'    # red
GREEN='\033[0;32m'  # green
YELLOW='\033[1;33m' # bold yellow
BLUE='\033[0;34m'   # blue
NC='\033[0m'        # reset
|
|
|
|
|
|
|
|
|
|
|
|
#######################################
# Print an informational message prefixed with a green [INFO] tag.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text, printed verbatim
# Outputs:   one line on stdout
#######################################
log_info() {
  # %b expands the escape sequences held in the color variables; %s prints the
  # message untouched, so backslashes inside paths or words are not interpreted.
  printf '%b[INFO]%b %s\n' "${GREEN}" "${NC}" "$1"
}
|
|
|
|
|
|
|
|
|
|
|
|
#######################################
# Print a warning message prefixed with a yellow [WARN] tag.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text, printed verbatim
# Outputs:   one line on stdout
#######################################
log_warn() {
  # %b expands the escape sequences held in the color variables; %s prints the
  # message untouched, so backslashes inside paths or words are not interpreted.
  printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "$1"
}
|
|
|
|
|
|
|
|
|
|
|
|
#######################################
# Print an error message prefixed with a red [ERROR] tag.
# Globals:   RED, NC (read)
# Arguments: $1 - message text, printed verbatim
# Outputs:   one line on stderr, so errors stay visible when stdout is
#            redirected or captured
#######################################
log_error() {
  # %b expands the escape sequences held in the color variables; %s prints the
  # message untouched, so backslashes inside paths or words are not interpreted.
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$1" >&2
}
|
|
|
|
|
|
|
|
|
|
|
|
# Database connection settings. A value already present in the environment
# wins; otherwise (unset or empty) the fallback shown here is assigned.
# NOTE(review): the fallback password is a literal in the script — presumably
# a local development default; confirm it is never relied on elsewhere.
: "${DB_HOST:=localhost}"
: "${DB_PORT:=3306}"
: "${DB_USER:=root}"
: "${DB_PASSWORD:=123456}"
: "${DB_NAME:=school_news}"
|
|
|
|
|
|
|
|
|
|
|
|
# Absolute directory containing this script, resolved in a subshell so the
# caller's working directory is left unchanged.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"

# The dictionary is expected to sit next to the script.
DICT_FILE="${SCRIPT_DIR}/sensitive_word_dict.txt"
|
|
|
|
|
|
|
|
|
|
|
|
# Start-up banner: tool title, target database and dictionary path, framed by
# horizontal rules and followed by one blank line.
banner_rule="=================================================="
printf '%s\n' "${banner_rule}" "敏感词批量导入工具 (Shell版本)" "${banner_rule}"
log_info "数据库: ${DB_HOST}:${DB_PORT}/${DB_NAME}"
log_info "敏感词文件: ${DICT_FILE}"
printf '%s\n' "${banner_rule}" ""
|
|
|
|
|
|
|
|
|
|
|
|
# Abort with exit status 1 unless the dictionary exists as a regular file.
[ -f "${DICT_FILE}" ] || {
  log_error "敏感词文件不存在: ${DICT_FILE}"
  exit 1
}
|
|
|
|
|
|
|
|
|
|
|
|
#######################################
# Count the usable entries in a dictionary file.
# A line counts when it is still non-empty after one trailing carriage return
# is removed, so blank lines in CRLF files are not mistaken for words. The
# number is printed without padding (unlike `wc -l` on some platforms).
# Arguments: $1 - path of the dictionary file
# Outputs:   the count on stdout
# Returns:   non-zero when the file cannot be opened
#######################################
count_words() {
  # Redirect instead of passing the path as an operand so an empty path is an
  # error rather than a silent read from stdin.
  awk '{ sub(/\r$/, "") } length($0) > 0 { n++ } END { print n + 0 }' < "$1"
}

# Count the words up front; the total is reported now and reused later when
# the import result is verified.
TOTAL_WORDS=$(count_words "${DICT_FILE}")
log_info "检测到 ${TOTAL_WORDS} 个敏感词"

# Nothing to import: leave successfully without touching the database.
if [ "${TOTAL_WORDS}" -eq 0 ]; then
  log_warn "敏感词文件为空"
  exit 0
fi
|
|
|
|
|
|
|
|
|
|
|
|
# Verify that the database can be reached before asking for confirmation or
# modifying any data.
log_info "检查数据库连接..."

# Report a missing client explicitly instead of as a generic connection error.
if ! command -v mysql >/dev/null 2>&1; then
  log_error "未找到 mysql 客户端"
  exit 1
fi

# Run a trivial query against the target database. The client's stderr is
# captured (its stdout is discarded) so the actual reason for a failure —
# bad credentials, unreachable host, unknown database — can be shown.
if ! conn_err=$(mysql -h"${DB_HOST}" -P"${DB_PORT}" -u"${DB_USER}" -p"${DB_PASSWORD}" \
  -D"${DB_NAME}" -e "SELECT 1;" 2>&1 >/dev/null); then
  log_error "数据库连接失败"
  if [ -n "${conn_err}" ]; then
    printf '%s\n' "${conn_err}" >&2
  fi
  exit 1
fi
log_info "数据库连接成功"
echo ""
|
|
|
|
|
|
|
|
|
|
|
|
# Tell the operator what is about to happen before asking for confirmation.
log_warn "准备导入 ${TOTAL_WORDS} 个敏感词到数据库"
log_warn "这将清除现有的 deny 类型敏感词"
printf '\n'

# The prompt is skipped when the first argument is -y or --yes, or when the
# AUTO_CONFIRM environment variable equals "true".
: "${AUTO_CONFIRM:=false}"
skip_prompt=false
case "$1" in
  -y | --yes) skip_prompt=true ;;
esac
if [ "${AUTO_CONFIRM}" = "true" ]; then
  skip_prompt=true
fi

if [ "${skip_prompt}" = "true" ]; then
  log_info "自动确认模式,开始导入..."
else
  # Read a single keystroke into REPLY; anything other than y/Y cancels.
  read -r -n 1 -p "是否继续?(y/N): "
  printf '\n'
  case "${REPLY}" in
    [Yy]) ;;
    *)
      log_warn "用户取消导入"
      exit 0
      ;;
  esac
fi
printf '\n'
|
|
|
|
|
|
|
|
|
|
|
|
# Start the import and remember the start time for the duration report.
log_info "开始导入敏感词..."
START_TIME=$(date +%s)

#######################################
# Convert a dictionary file into the one-column text format LOAD DATA reads
# with its default field options.
#   - a trailing carriage return is stripped (CRLF input)
#   - empty lines are dropped so they are not loaded as empty words
#   - backslashes are doubled, because LOAD DATA treats "\" as its escape
#     character by default
# Single quotes are left alone: LOAD DATA reads field text raw, so doubling
# them SQL-literal style would store the doubled quotes in the table.
# Arguments: $1 - source dictionary path
#            $2 - destination path (overwritten)
# Returns:   non-zero when the source cannot be read
#######################################
build_word_list() {
  sed -e $'s/\r$//' -e '/^$/d' -e 's/\\/\\\\/g' "$1" > "$2"
}

# Temporary word list, removed on every exit path. The trap body is
# single-quoted so the path is expanded, and quoted, when the trap runs.
TEMP_WORDS=$(mktemp)
trap 'rm -f -- "${TEMP_WORDS}"' EXIT
build_word_list "${DICT_FILE}" "${TEMP_WORDS}"
|
|
|
|
|
|
|
|
|
|
|
|
# Bulk-load the word list with LOAD DATA LOCAL INFILE, which is much faster
# than inserting row by row. LOCAL makes the client read the file, which
# avoids the server-side secure-file-priv restriction.
#
# The DELETE and the LOAD run inside one transaction: the mysql client stops
# at the first failing statement, so if the load fails the connection closes
# before COMMIT and the existing deny words are rolled back, not lost.
# NOTE(review): this assumes tb_sensitive_word uses a transactional engine
# such as InnoDB — confirm; on a non-transactional engine the DELETE still
# takes effect immediately, as it did before.
log_info "使用 LOAD DATA LOCAL INFILE 批量导入敏感词..."
if mysql --local-infile=1 -h"${DB_HOST}" -P"${DB_PORT}" -u"${DB_USER}" -p"${DB_PASSWORD}" "${DB_NAME}" <<EOF
SET NAMES utf8mb4;
START TRANSACTION;

-- Remove the existing deny-type words
DELETE FROM tb_sensitive_word WHERE type = 'deny';

-- Load the preprocessed word list: one word per line, every row typed 'deny'
LOAD DATA LOCAL INFILE '${TEMP_WORDS}'
INTO TABLE tb_sensitive_word
CHARACTER SET utf8mb4
LINES TERMINATED BY '\n'
(word)
SET type = 'deny';

COMMIT;
EOF
then
  END_TIME=$(date +%s)
  DURATION=$((END_TIME - START_TIME))

  echo ""
  echo "=================================================="
  log_info "导入完成!"
  # The number reported here is the file-based count, not a row count
  # returned by the server.
  log_info "成功导入: ${TOTAL_WORDS} 个敏感词(基于文件统计)"
  log_info "耗时: ${DURATION} 秒"
  echo "=================================================="
else
  log_error "数据库导入失败"
  exit 1
fi
|
|
|
|
|
|
|
|
|
|
|
|
# Verify the result: ask the database how many deny-type rows it now holds and
# compare that with the number of words counted in the dictionary file.
log_info "验证导入结果..."
count_sql="SELECT COUNT(*) FROM tb_sensitive_word WHERE type = 'deny';"
IMPORTED_COUNT=$(
  mysql --host="${DB_HOST}" --port="${DB_PORT}" \
    --user="${DB_USER}" --password="${DB_PASSWORD}" \
    --database="${DB_NAME}" \
    --silent --skip-column-names --execute="${count_sql}"
)

printf '\n'
log_info "数据库中当前有 ${IMPORTED_COUNT} 个 deny 类型敏感词"

# A mismatch is reported as a warning only; the script still ends normally.
if [ "${IMPORTED_COUNT}" -eq "${TOTAL_WORDS}" ]; then
  log_info "✅ 验证通过:导入数量与预期一致"
else
  log_warn "⚠️ 导入数量不匹配:预期 ${TOTAL_WORDS},实际 ${IMPORTED_COUNT}"
fi

printf '\n'
log_info "敏感词导入完成!"
|