"""Crawler configuration.

Defines the ``Settings`` model, the module-level ``settings`` instance built
from it at import time, and the ``USER_AGENTS`` list.
"""
import os
from typing import Dict

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Application settings.

    Every field has a default, so ``Settings()`` can be constructed without
    arguments. Because the class derives from ``BaseSettings`` and points at
    ``.env`` via ``model_config``, values may also be supplied through
    environment variables or that file; names are matched case-sensitively.
    """

    # Crawler settings
    REQUEST_TIMEOUT: int = Field(default=30, description="请求超时时间(秒)")
    RETRY_TIMES: int = Field(default=3, description="重试次数")
    CONCURRENT_REQUESTS: int = Field(default=5, description="并发请求数")
    DOWNLOAD_DELAY: float = Field(default=1.0, description="下载延迟(秒)")

    # Proxy settings
    USE_PROXY: bool = Field(default=False, description="是否使用代理")
    PROXY_URL: str = Field(default="", description="代理地址")

    # Database settings (used to store the crawled data)
    DB_HOST: str = Field(default="localhost", description="数据库主机")
    DB_PORT: int = Field(default=3306, description="数据库端口")
    DB_USER: str = Field(default="root", description="数据库用户名")
    DB_PASSWORD: str = Field(default="", description="数据库密码")
    DB_NAME: str = Field(default="school_news", description="数据库名称")

    # Logging settings
    LOG_LEVEL: str = Field(default="INFO", description="日志级别")
    LOG_DIR: str = Field(default="logs", description="日志目录")
    LOG_RETENTION: str = Field(default="30 days", description="日志保留时间")

    # Output settings
    OUTPUT_DIR: str = Field(default="output", description="输出目录")
    OUTPUT_FORMAT: str = Field(default="json", description="输出格式(json/csv)")

    # People's Daily settings
    RMRB_BASE_URL: str = Field(
        default="http://www.people.com.cn",
        description="人民日报基础URL",
    )
    # Maps a category key to its Chinese display label. A factory is used so
    # each Settings instance receives its own dict rather than a shared one.
    RMRB_CATEGORIES: Dict[str, str] = Field(
        default_factory=lambda: {
            "politics": "时政",
            "society": "社会",
            "world": "国际",
            "finance": "财经",
            "tech": "科技",
            "culture": "文化",
            "education": "教育"
        },
        description="人民日报新闻分类"
    )

    # API settings (used to push the crawled data to the backend)
    API_BASE_URL: str = Field(default="http://localhost:8080", description="后端API地址")
    API_TOKEN: str = Field(default="", description="API认证Token")

    # pydantic v2 configuration. This replaces the nested ``class Config``,
    # which pydantic v2 still accepts but reports as deprecated; the three
    # options are carried over unchanged.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=True,
    )


# Global settings instance, created once when this module is imported.
settings = Settings()

# List of common User-Agent strings.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
]