Files
2025-11-10 15:22:44 +08:00

78 lines
3.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
爬虫配置文件
"""
import os
from typing import Dict

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Application configuration for the crawler.

    Every field has a default declared below. The model configuration points
    pydantic-settings at a UTF-8 encoded ``.env`` file and makes setting
    names case-sensitive.
    """

    # Pydantic v2 style configuration. The nested ``class Config`` form is
    # deprecated in Pydantic v2 and triggers a deprecation warning; the
    # options themselves are unchanged.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=True,
    )

    # Crawler settings
    REQUEST_TIMEOUT: int = Field(default=30, description="请求超时时间(秒)")
    RETRY_TIMES: int = Field(default=3, description="重试次数")
    CONCURRENT_REQUESTS: int = Field(default=5, description="并发请求数")
    DOWNLOAD_DELAY: float = Field(default=1.0, description="下载延迟(秒)")

    # Proxy settings
    USE_PROXY: bool = Field(default=False, description="是否使用代理")
    PROXY_URL: str = Field(default="", description="代理地址")

    # Database settings (used to store crawled data)
    DB_HOST: str = Field(default="localhost", description="数据库主机")
    DB_PORT: int = Field(default=3306, description="数据库端口")
    DB_USER: str = Field(default="root", description="数据库用户名")
    DB_PASSWORD: str = Field(default="", description="数据库密码")
    DB_NAME: str = Field(default="school_news", description="数据库名称")

    # Logging settings
    LOG_LEVEL: str = Field(default="INFO", description="日志级别")
    LOG_DIR: str = Field(default="logs", description="日志目录")
    LOG_RETENTION: str = Field(default="30 days", description="日志保留时间")

    # Output settings
    OUTPUT_DIR: str = Field(default="output", description="输出目录")
    OUTPUT_FORMAT: str = Field(default="json", description="输出格式(json/csv)")

    # People's Daily (RMRB) settings
    RMRB_BASE_URL: str = Field(default="http://www.people.com.cn", description="人民日报基础URL")
    # Maps a category key to its Chinese display label. A factory is used so
    # each Settings instance gets its own dict.
    RMRB_CATEGORIES: Dict[str, str] = Field(
        default_factory=lambda: {
            "politics": "时政",
            "society": "社会",
            "world": "国际",
            "finance": "财经",
            "tech": "科技",
            "culture": "文化",
            "education": "教育",
        },
        description="人民日报新闻分类",
    )

    # API settings (used to push crawled data to the backend)
    API_BASE_URL: str = Field(default="http://localhost:8080", description="后端API地址")
    API_TOKEN: str = Field(default="", description="API认证Token")
# Create the global configuration instance. This runs at module import time,
# so importing this module constructs Settings() once as a side effect.
settings = Settings()
# List of common browser User-Agent strings.
USER_AGENTS = [
    # Chrome 120 on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    # Chrome 119 on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    # Chrome 120 on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    # Firefox 121 on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    # Safari 17.1 on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
]