78 lines
3.0 KiB
Python
78 lines
3.0 KiB
Python
|
|
"""
|
|||
|
|
爬虫配置文件
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import os
from typing import Dict

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|||
|
|
|
|||
|
|
|
|||
|
|
class Settings(BaseSettings):
    """Application settings for the crawler.

    Every field has a default, so the class can be instantiated with no
    arguments. Because it derives from ``BaseSettings``, each value can be
    overridden by an environment variable or by an entry in the ``.env`` file
    named in ``model_config``. Variable names are matched case-sensitively,
    so they must be spelled exactly like the field names below.

    The ``description`` strings are runtime metadata and are kept verbatim.
    """

    # Settings-source options. pydantic-settings targets Pydantic v2, where the
    # nested ``class Config`` is deprecated in favour of ``model_config``.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=True,
    )

    # Crawler settings
    REQUEST_TIMEOUT: int = Field(default=30, description="请求超时时间(秒)")
    RETRY_TIMES: int = Field(default=3, description="重试次数")
    CONCURRENT_REQUESTS: int = Field(default=5, description="并发请求数")
    DOWNLOAD_DELAY: float = Field(default=1.0, description="下载延迟(秒)")

    # Proxy settings
    USE_PROXY: bool = Field(default=False, description="是否使用代理")
    PROXY_URL: str = Field(default="", description="代理地址")

    # Database settings (used to store the crawled data)
    DB_HOST: str = Field(default="localhost", description="数据库主机")
    DB_PORT: int = Field(default=3306, description="数据库端口")
    DB_USER: str = Field(default="root", description="数据库用户名")
    DB_PASSWORD: str = Field(default="", description="数据库密码")
    DB_NAME: str = Field(default="school_news", description="数据库名称")

    # Logging settings
    LOG_LEVEL: str = Field(default="INFO", description="日志级别")
    LOG_DIR: str = Field(default="logs", description="日志目录")
    LOG_RETENTION: str = Field(default="30 days", description="日志保留时间")

    # Output settings
    OUTPUT_DIR: str = Field(default="output", description="输出目录")
    OUTPUT_FORMAT: str = Field(default="json", description="输出格式(json/csv)")

    # People's Daily settings
    RMRB_BASE_URL: str = Field(
        default="http://www.people.com.cn",
        description="人民日报基础URL",
    )
    # Maps a category key to its Chinese display name. A factory is used so
    # each Settings instance gets its own dict rather than a shared one.
    RMRB_CATEGORIES: Dict[str, str] = Field(
        default_factory=lambda: {
            "politics": "时政",
            "society": "社会",
            "world": "国际",
            "finance": "财经",
            "tech": "科技",
            "culture": "文化",
            "education": "教育",
        },
        description="人民日报新闻分类",
    )

    # API settings (used to push the crawled data to the backend)
    API_BASE_URL: str = Field(default="http://localhost:8080", description="后端API地址")
    API_TOKEN: str = Field(default="", description="API认证Token")
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Create the global settings instance. This runs at import time, so the
# Settings class is instantiated once when this module is first imported.
settings = Settings()
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Common User-Agent strings
USER_AGENTS = [
    # Chrome 120, Windows NT 10.0 (x64)
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    # Chrome 119, Windows NT 10.0 (x64)
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    # Chrome 120, Mac OS X 10.15.7
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    # Firefox 121, Windows NT 10.0 (x64)
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    # Safari 17.1, Mac OS X 10.15.7
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
]
|
|||
|
|
|