人民日报爬虫
This commit is contained in:
77
schoolNewsCrawler/config.py
Normal file
77
schoolNewsCrawler/config.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""
|
||||
爬虫配置文件
|
||||
"""
|
||||
|
||||
import os
from typing import Dict

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Crawler application settings.

    Every field has a default and may be overridden through environment
    variables or the ``.env`` file named in ``model_config``. Variable names
    are matched case-sensitively, so they must be spelled exactly like the
    field names declared below.
    """

    # The nested ``class Config`` style is deprecated for pydantic v2 models
    # (which pydantic-settings builds on); ``model_config`` carries the same
    # three options.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=True,
    )

    # Crawler behaviour
    REQUEST_TIMEOUT: int = Field(default=30, description="请求超时时间(秒)")
    RETRY_TIMES: int = Field(default=3, description="重试次数")
    CONCURRENT_REQUESTS: int = Field(default=5, description="并发请求数")
    DOWNLOAD_DELAY: float = Field(default=1.0, description="下载延迟(秒)")

    # Proxy
    USE_PROXY: bool = Field(default=False, description="是否使用代理")
    PROXY_URL: str = Field(default="", description="代理地址")

    # Database (used to store the crawled data)
    DB_HOST: str = Field(default="localhost", description="数据库主机")
    DB_PORT: int = Field(default=3306, description="数据库端口")
    DB_USER: str = Field(default="root", description="数据库用户名")
    DB_PASSWORD: str = Field(default="", description="数据库密码")
    DB_NAME: str = Field(default="school_news", description="数据库名称")

    # Logging
    LOG_LEVEL: str = Field(default="INFO", description="日志级别")
    LOG_DIR: str = Field(default="logs", description="日志目录")
    LOG_RETENTION: str = Field(default="30 days", description="日志保留时间")

    # Output
    OUTPUT_DIR: str = Field(default="output", description="输出目录")
    OUTPUT_FORMAT: str = Field(default="json", description="输出格式(json/csv)")

    # People's Daily (人民日报) source
    RMRB_BASE_URL: str = Field(default="http://www.people.com.cn", description="人民日报基础URL")
    # Category key -> Chinese category label. A factory is used so each
    # Settings instance gets its own dict rather than sharing one default.
    RMRB_CATEGORIES: Dict[str, str] = Field(
        default_factory=lambda: {
            "politics": "时政",
            "society": "社会",
            "world": "国际",
            "finance": "财经",
            "tech": "科技",
            "culture": "文化",
            "education": "教育",
        },
        description="人民日报新闻分类",
    )

    # Backend API (used to push the crawled data to the backend)
    API_BASE_URL: str = Field(default="http://localhost:8080", description="后端API地址")
    API_TOKEN: str = Field(default="", description="API认证Token")
|
||||
|
||||
|
||||
# 创建全局配置实例
|
||||
# Global settings instance, built once when this module is imported.
settings = Settings()
|
||||
|
||||
|
||||
# Common desktop browser User-Agent strings.
USER_AGENTS = [
    # Chrome 120 on Windows 10 (x64)
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    # Chrome 119 on Windows 10 (x64)
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    # Chrome 120 on macOS 10.15.7
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    # Firefox 121 on Windows 10 (x64)
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) "
    "Gecko/20100101 Firefox/121.0",
    # Safari 17.1 on macOS 10.15.7
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
    "(KHTML, like Gecko) Version/17.1 Safari/605.1.15",
]
|
||||
|
||||
Reference in New Issue
Block a user