#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Bulk-import sensitive words into the database.

Reads one word per line from ``sensitive_word_dict.txt`` (or the file given
with ``--file``) and replaces every ``type = 'deny'`` row of
``tb_sensitive_word`` with the words that were read.
"""

import os
import sys
import argparse
from datetime import datetime

# Database connection settings, passed verbatim to ``pymysql.connect``.
# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment or a config file that is kept out of version control.
DB_CONFIG = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'password': '123456',
    'database': 'school_news',
    'charset': 'utf8mb4'
}


def get_db_connection():
    """Open a database connection using ``DB_CONFIG``.

    Returns:
        The open connection, or ``None`` when the driver cannot be imported
        or the connection attempt fails (the error is printed).
    """
    try:
        # Imported lazily so that ``--help`` and the file-parsing helpers
        # stay usable on machines where the driver is not installed; a
        # missing driver is reported through the failure path below.
        import pymysql

        return pymysql.connect(**DB_CONFIG)
    except Exception as e:
        print(f"数据库连接失败: {e}")
        return None


def read_sensitive_words(file_path):
    """Read the sensitive-word file, one word per line.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped.

    Args:
        file_path: Path of the UTF-8 encoded dictionary file.

    Returns:
        The words in file order, or an empty list when the file cannot be
        read or decoded (the error is printed).
    """
    try:
        # 'utf-8-sig' drops a leading byte-order mark if one is present and
        # otherwise behaves like 'utf-8'. With plain 'utf-8' the BOM would
        # stay attached to the first word, because str.strip() does not
        # treat U+FEFF as whitespace.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            words = [word for word in (line.strip() for line in f) if word]
    except (OSError, UnicodeError) as e:
        print(f"读取敏感词文件失败: {e}")
        return []

    print(f"成功读取 {len(words)} 个敏感词")
    return words


def batch_insert_words(connection, words, batch_size=1000):
    """Replace all ``deny`` words in ``tb_sensitive_word`` with ``words``.

    The DELETE of the existing rows and every INSERT batch run in a single
    transaction that is committed once at the end. If anything fails the
    whole transaction is rolled back, so the table never ends up with the
    old words removed and only part of the new ones stored.

    Args:
        connection: Open DB-API connection providing ``cursor()``,
            ``commit()`` and ``rollback()``.
        words: Words to insert; each is stored with type ``'deny'``.
        batch_size: Maximum number of rows sent per ``executemany`` call.
            Must be at least 1.

    Returns:
        The number of words inserted, or 0 when ``words`` is empty or the
        import failed and was rolled back.
    """
    total_words = len(words)
    if not words:
        # Nothing to import: leave the existing rows untouched rather than
        # deleting them and inserting nothing.
        print(f"批量插入完成,共插入 {total_words} 个敏感词")
        return 0

    cursor = connection.cursor()
    try:
        if batch_size < 1:
            # Rejected before any SQL runs; a zero step would make range()
            # raise and a negative step would silently insert nothing.
            raise ValueError("batch_size 必须为正整数")

        # Remove the current deny-type words; this only becomes permanent
        # with the single commit below.
        print("清理现有的deny类型敏感词...")
        cursor.execute("DELETE FROM tb_sensitive_word WHERE type = 'deny'")

        insert_sql = "INSERT INTO tb_sensitive_word (word, type) VALUES (%s, %s)"

        inserted_count = 0
        for start in range(0, total_words, batch_size):
            batch_data = [(word, 'deny') for word in words[start:start + batch_size]]
            cursor.executemany(insert_sql, batch_data)
            inserted_count += len(batch_data)
            print(f"已插入 {inserted_count}/{total_words} 个敏感词")

        # One commit for the DELETE and all batches keeps the swap atomic.
        connection.commit()
        print(f"批量插入完成,共插入 {inserted_count} 个敏感词")
        return inserted_count
    except Exception as e:
        print(f"批量插入过程中发生错误: {e}")
        connection.rollback()
        return 0
    finally:
        cursor.close()


def check_duplicates(connection, words):
    """Filter out words that already exist as ``deny`` rows.

    NOTE(review): ``batch_insert_words`` deletes every ``deny`` row before
    inserting, so feeding it the filtered list returned here would drop the
    words that already existed. ``main`` keeps this step disabled.

    Args:
        connection: Open DB-API connection providing ``cursor()``.
        words: Candidate words.

    Returns:
        The words not yet present in the table, in their original order.
        When the lookup fails the input list is returned unchanged (the
        error is printed).
    """
    cursor = connection.cursor()
    try:
        # Load the words that are already stored.
        cursor.execute("SELECT word FROM tb_sensitive_word WHERE type = 'deny'")
        existing_words = {row[0] for row in cursor.fetchall()}

        # Keep only the words that are not stored yet.
        new_words = [word for word in words if word not in existing_words]

        duplicate_count = len(words) - len(new_words)
        if duplicate_count > 0:
            print(f"发现 {duplicate_count} 个重复敏感词,将跳过")

        return new_words
    except Exception as e:
        print(f"检查重复词时发生错误: {e}")
        return words
    finally:
        cursor.close()


def main():
    """Run the import: parse arguments, read the file, confirm, insert."""
    # Parse command-line arguments.
    parser = argparse.ArgumentParser(description='敏感词批量导入工具')
    parser.add_argument('-y', '--yes', action='store_true',
                        help='自动确认导入,跳过交互式确认')
    parser.add_argument('--file', type=str,
                        help='指定敏感词文件路径(默认: sensitive_word_dict.txt)')
    args = parser.parse_args()

    print("=" * 50)
    print("敏感词批量导入工具")
    print("=" * 50)

    # Resolve the dictionary path; the default sits next to this script.
    if args.file:
        dict_file = args.file
    else:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        dict_file = os.path.join(script_dir, 'sensitive_word_dict.txt')

    if not os.path.exists(dict_file):
        print(f"敏感词文件不存在: {dict_file}")
        return

    # Read the words.
    print("正在读取敏感词文件...")
    words = read_sensitive_words(dict_file)
    if not words:
        print("没有读取到有效的敏感词")
        return

    # Connect to the database.
    print("正在连接数据库...")
    connection = get_db_connection()
    if connection is None:
        print("数据库连接失败,程序退出")
        return

    try:
        # Optional duplicate check, disabled by default.
        # print("正在检查重复敏感词...")
        # words = check_duplicates(connection, words)

        if not words:
            print("所有敏感词都已存在,无需导入")
            return

        # Ask for confirmation unless -y/--yes was given.
        print(f"准备导入 {len(words)} 个敏感词到数据库")
        if args.yes:
            print("自动确认模式,开始导入...")
        else:
            try:
                confirm = input("是否继续?(y/N): ").strip().lower()
                if confirm != 'y':
                    print("用户取消导入")
                    return
            except EOFError:
                # No stdin available (for example when run from a job
                # runner): the import proceeds without confirmation.
                print("检测到非交互式环境,自动确认导入...")
            except KeyboardInterrupt:
                print("\n用户中断导入")
                return

        # Run the import and report how long it took.
        print("开始批量导入敏感词...")
        start_time = datetime.now()
        inserted_count = batch_insert_words(connection, words)
        duration = (datetime.now() - start_time).total_seconds()

        print("=" * 50)
        print("导入完成!")
        print(f"成功导入: {inserted_count} 个敏感词")
        print(f"耗时: {duration:.2f} 秒")
        print("=" * 50)
    except Exception as e:
        print(f"程序执行过程中发生错误: {e}")
    finally:
        connection.close()
        print("数据库连接已关闭")


if __name__ == "__main__":
    main()