人民日报爬虫

This commit is contained in:
2025-11-10 15:22:44 +08:00
parent 08df5f1e8a
commit e8b76278e9
36 changed files with 4241 additions and 0 deletions

View File

@@ -0,0 +1,77 @@
"""
爬虫配置文件
"""
import os
from typing import Dict

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Application settings for the crawler.

    Every attribute is a pydantic-settings field. Values are read from
    environment variables and from the ``.env`` file (UTF-8 encoded), with
    variable names matched case-sensitively; when no value is supplied the
    default declared on the field is used.
    """

    # pydantic v2 configuration. This replaces the nested ``class Config``,
    # which is the deprecated v1 style, while keeping the same three options.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=True,
    )

    # Crawler settings
    REQUEST_TIMEOUT: int = Field(default=30, description="请求超时时间(秒)")
    RETRY_TIMES: int = Field(default=3, description="重试次数")
    CONCURRENT_REQUESTS: int = Field(default=5, description="并发请求数")
    DOWNLOAD_DELAY: float = Field(default=1.0, description="下载延迟(秒)")

    # Proxy settings
    USE_PROXY: bool = Field(default=False, description="是否使用代理")
    PROXY_URL: str = Field(default="", description="代理地址")

    # Database settings (used to store the crawled data)
    DB_HOST: str = Field(default="localhost", description="数据库主机")
    DB_PORT: int = Field(default=3306, description="数据库端口")
    DB_USER: str = Field(default="root", description="数据库用户名")
    DB_PASSWORD: str = Field(default="", description="数据库密码")
    DB_NAME: str = Field(default="school_news", description="数据库名称")

    # Logging settings
    LOG_LEVEL: str = Field(default="INFO", description="日志级别")
    LOG_DIR: str = Field(default="logs", description="日志目录")
    LOG_RETENTION: str = Field(default="30 days", description="日志保留时间")

    # Output settings
    OUTPUT_DIR: str = Field(default="output", description="输出目录")
    OUTPUT_FORMAT: str = Field(default="json", description="输出格式(json/csv)")

    # People's Daily settings
    RMRB_BASE_URL: str = Field(
        default="http://www.people.com.cn",
        description="人民日报基础URL",
    )
    # Maps a category key to its Chinese display name. A factory is used so
    # each Settings instance gets its own dict rather than a shared one.
    RMRB_CATEGORIES: Dict[str, str] = Field(
        default_factory=lambda: {
            "politics": "时政",
            "society": "社会",
            "world": "国际",
            "finance": "财经",
            "tech": "科技",
            "culture": "文化",
            "education": "教育",
        },
        description="人民日报新闻分类",
    )

    # API settings, used to push the crawled data to the backend
    API_BASE_URL: str = Field(default="http://localhost:8080", description="后端API地址")
    API_TOKEN: str = Field(default="", description="API认证Token")
# Create the global settings instance. It is built once, when this module is
# first imported, so any validation error in the configuration surfaces at
# import time.
settings = Settings()
# List of common desktop browser User-Agent strings.
# NOTE(review): presumably one of these is chosen when building request
# headers -- confirm against the callers.
USER_AGENTS = [
    # Chrome 120 on Windows 10
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    # Chrome 119 on Windows 10
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/119.0.0.0 Safari/537.36"
    ),
    # Chrome 120 on macOS
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    # Firefox 121 on Windows 10
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) "
        "Gecko/20100101 Firefox/121.0"
    ),
    # Safari 17.1 on macOS
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/17.1 Safari/605.1.15"
    ),
]