Initial commit: 招标信息爬虫与分析系统

This commit is contained in:
ztb-system
2026-02-13 18:15:20 +08:00
commit d2fa06801f
38 changed files with 5415 additions and 0 deletions

151
scheduler.py Normal file
View File

@@ -0,0 +1,151 @@
# -*- coding: utf-8 -*-
r"""
Scheduled crawl entry point -- automatically collects the previous day's data.

Usage:
1. Run directly (one-off collection of yesterday's data):
   python scheduler.py
2. Windows scheduled task (runs automatically at 08:00 every day):
   schtasks /create /tn "ZTB_Spider" /tr "python <project_path>\scheduler.py" /sc daily /st 08:00
3. Linux cron (at 08:00 every day):
   0 8 * * * cd /path/to/ztb && python scheduler.py >> logs/cron.log 2>&1
"""
import logging
import sys
import os
import traceback
from datetime import datetime

# Ensure the project root is on sys.path so the sibling modules below import
# regardless of the current working directory.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from config import ZHEJIANG_CONFIG, TAIZHOU_CONFIG, SPIDER_CONFIG, DATA_DIR
from spiders import ZhejiangSpider, TaizhouSpider
from spiders.base import setup_logging

# Shared project logger; configured later by setup_logging() in run_daily().
logger = logging.getLogger("ztb")
# ============ Crawl task configuration ============
# Define here which tasks run every day. Each entry is consumed by run_task():
#   site        - spider to use ("zhejiang" or "taizhou")
#   max_pages   - page cap for the listing crawl
#   category    - listing category filter (site-specific Chinese label)
#   notice_type - notice type filter (site-specific Chinese label)
#   process     - run the processing pipeline on the crawled results
#   upload      - passed through to the pipeline's upload flag
DAILY_TASKS = [
    # Zhejiang province - engineering construction - tender document publicity
    {
        "site": "zhejiang",
        "max_pages": 100,
        "category": "工程建设",
        "notice_type": "招标文件公示",
        "process": True,
        "upload": True,
    },
    # Zhejiang province - engineering construction - tender announcement
    {
        "site": "zhejiang",
        "max_pages": 100,
        "category": "工程建设",
        "notice_type": "招标公告",
        "process": True,
        "upload": True,
    },
    # Zhejiang province - engineering construction - clarification / amendment
    {
        "site": "zhejiang",
        "max_pages": 100,
        "category": "工程建设",
        "notice_type": "澄清修改",
        "process": True,
        "upload": True,
    },
    # Taizhou - engineering construction - tender document publicity
    {
        "site": "taizhou",
        "max_pages": 100,
        "category": "工程建设",
        "notice_type": "招标文件公示",
        "process": True,
        "upload": True,
    },
]
def run_task(task: dict, date_filter: str = "yesterday") -> int:
    """Execute one crawl task and return the number of records collected.

    Args:
        task: one entry from DAILY_TASKS (site, max_pages, category,
            notice_type, process, upload).
        date_filter: date window forwarded to the spider; defaults to
            "yesterday" for the daily schedule.

    Returns:
        Count of records the spider collected (0 for an unknown site).
    """
    site = task["site"]
    notice_type = task.get("notice_type")

    # Map site key -> (site config, spider class); unknown sites are logged
    # and skipped rather than raising.
    registry = {
        "zhejiang": (ZHEJIANG_CONFIG, ZhejiangSpider),
        "taizhou": (TAIZHOU_CONFIG, TaizhouSpider),
    }
    entry = registry.get(site)
    if entry is None:
        logger.error(f"未知站点: {site}")
        return 0
    site_config, spider_cls = entry
    spider = spider_cls(site_config, SPIDER_CONFIG, DATA_DIR)

    spider.crawl(
        max_pages=task.get("max_pages", 10),
        category=task.get("category"),
        notice_type=notice_type,
        date_filter=date_filter,
    )
    spider.save_to_csv()

    # Optional post-processing (AI extraction + Jiandaoyun upload); only runs
    # when there are results and a notice type to key the pipeline on.
    if task.get("process") and spider.results and notice_type:
        from processors import ProcessingPipeline

        ProcessingPipeline().process_results(
            spider.results,
            site=site,
            notice_type=notice_type,
            upload=task.get("upload", False),
        )

    return len(spider.results)
def run_daily():
    """Run every task in DAILY_TASKS once, logging a summary at the end.

    Failures in individual tasks are caught and recorded so the remaining
    tasks still run.

    Returns:
        (total, errors): total records collected across all tasks, and the
        descriptions of the tasks that raised.
    """
    setup_logging()
    started_at = datetime.now()
    task_count = len(DAILY_TASKS)
    banner = "=" * 40

    logger.info(banner)
    logger.info(f"定时任务启动: {started_at.strftime('%Y-%m-%d %H:%M:%S')}")
    logger.info(f"{task_count} 个任务")
    logger.info(banner)

    total = 0
    errors = []
    for idx, task in enumerate(DAILY_TASKS, 1):
        # Human-readable task label used in progress and failure logs.
        parts = [task["site"], task.get("category", "全部")]
        if task.get("notice_type"):
            parts.append(task["notice_type"])
        desc = " / ".join(parts)

        logger.info(f"[{idx}/{task_count}] {desc}")
        try:
            count = run_task(task)
        except Exception as exc:
            # Keep the full traceback at debug level; the summary stays terse.
            logger.error(f"[{idx}/{task_count}] 失败: {exc}")
            logger.debug(traceback.format_exc())
            errors.append(desc)
        else:
            total += count
            logger.info(f"[{idx}/{task_count}] 完成,{count}")

    elapsed = (datetime.now() - started_at).total_seconds()
    logger.info(banner)
    logger.info(f"定时任务完成: 共 {total} 条, 耗时 {elapsed:.0f}s")
    if errors:
        logger.error(f"失败任务: {', '.join(errors)}")
    logger.info(banner)
    return total, errors
if __name__ == "__main__":
    # Propagate failure to the OS so cron / schtasks (see module docstring)
    # can detect a bad run: exit 1 if any task failed, 0 otherwise.
    _, failed_tasks = run_daily()
    sys.exit(1 if failed_tasks else 0)