Files
ztb/main.py
2026-02-13 18:15:20 +08:00

154 lines
5.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""
公共资源交易中心爬虫 - 主程序
支持:浙江省、台州市
可选DeepSeek AI 处理 + 简道云上传
"""
import argparse
import logging
from config import ZHEJIANG_CONFIG, TAIZHOU_CONFIG, SPIDER_CONFIG, DATA_DIR
from spiders import ZhejiangSpider, TaizhouSpider
from spiders.base import setup_logging
logger = logging.getLogger("ztb")
def crawl_zhejiang(max_pages=5, category=None, notice_type=None,
                   date_filter=None, download_attachment=False):
    """Crawl the Zhejiang provincial public resource trading center.

    Runs a full crawl with the given filters, writes the scraped rows to
    CSV, and returns the list of result records.
    """
    zj_spider = ZhejiangSpider(ZHEJIANG_CONFIG, SPIDER_CONFIG, DATA_DIR)
    zj_spider.crawl(
        max_pages=max_pages,
        category=category,
        notice_type=notice_type,
        date_filter=date_filter,
        download_attachment=download_attachment,
    )
    zj_spider.save_to_csv()
    return zj_spider.results
def crawl_taizhou(max_pages=5, category=None, notice_type=None,
                  date_filter=None, download_attachment=False):
    """Crawl the Taizhou public resource trading center.

    Runs a full crawl with the given filters, writes the scraped rows to
    CSV, and returns the list of result records.
    """
    tz_spider = TaizhouSpider(TAIZHOU_CONFIG, SPIDER_CONFIG, DATA_DIR)
    tz_spider.crawl(
        max_pages=max_pages,
        category=category,
        notice_type=notice_type,
        date_filter=date_filter,
        download_attachment=download_attachment,
    )
    tz_spider.save_to_csv()
    return tz_spider.results
def crawl_all(max_pages=5, category=None, notice_type=None,
              date_filter=None, download_attachment=False):
    """Crawl every supported site in turn and return the combined results."""
    all_results = []
    # Run each site crawler with the same filter set, separated in the log.
    for site_crawler in (crawl_zhejiang, crawl_taizhou):
        logger.info("=" * 40)
        all_results.extend(
            site_crawler(max_pages, category, notice_type,
                         date_filter, download_attachment))
    logger.info(f"全部爬取完成,共 {len(all_results)} 条数据")
    return all_results
def _run_ai_processing(args, results):
    """Feed crawl results through the DeepSeek processing pipeline.

    For ``--site all`` the results are partitioned by their ``来源`` (source)
    field and each site's subset is processed separately, because the
    pipeline needs to know which site's schema to apply.
    """
    from processors import ProcessingPipeline
    pipeline = ProcessingPipeline()
    if args.site == 'all':
        # Map each site's display name (as stored in the '来源' field)
        # back to its pipeline site key.
        source_to_site = {
            ZHEJIANG_CONFIG['name']: 'zhejiang',
            TAIZHOU_CONFIG['name']: 'taizhou',
        }
        for source, site_name in source_to_site.items():
            site_results = [
                r for r in results if r.get('来源') == source]
            if site_results:
                pipeline.process_results(
                    site_results, site=site_name,
                    notice_type=args.type, upload=args.upload,
                )
    else:
        pipeline.process_results(
            results, site=args.site,
            notice_type=args.type, upload=args.upload,
        )
def main():
    """Command-line entry point: parse arguments, crawl, optionally run AI processing."""
    parser = argparse.ArgumentParser(description='公共资源交易中心爬虫')
    parser.add_argument(
        '-s', '--site',
        choices=['zhejiang', 'taizhou', 'all'],
        default='zhejiang',
        help='选择爬取的网站 (默认: zhejiang)'
    )
    parser.add_argument(
        '-p', '--pages',
        type=int,
        default=None,
        help='爬取页数 (默认: 5, 指定日期时默认100)'
    )
    parser.add_argument(
        '-c', '--category',
        default=None,
        help='交易领域 (如: 工程建设, 政府采购)'
    )
    parser.add_argument(
        '-t', '--type',
        default=None,
        help='公告类型 (如: 招标公告, 招标文件公示)'
    )
    parser.add_argument(
        '-d', '--date',
        default=None,
        help='日期过滤 (yesterday 或 2026-02-03)'
    )
    parser.add_argument(
        '-a', '--attachment',
        action='store_true',
        help='下载并解析附件'
    )
    parser.add_argument(
        '-P', '--process',
        action='store_true',
        help='启用 DeepSeek AI 处理(提取结构化字段)'
    )
    parser.add_argument(
        '-U', '--upload',
        action='store_true',
        help='上传处理结果到简道云(需配合 -P 使用)'
    )
    args = parser.parse_args()
    setup_logging()
    # 页数:指定日期时自动放大,确保抓完全部数据
    # (with a date filter the default 5 pages may miss older entries)
    max_pages = args.pages
    if max_pages is None:
        max_pages = 100 if args.date else 5
    # Fix: -U depends on -P (as the help text states), but an upload-only
    # invocation used to be a silent no-op. Warn the user explicitly.
    if args.upload and not args.process:
        logger.warning("上传 (-U) 需配合 -P 使用,已跳过上传")
    # 为台州招标计划公示设置默认的工程建设类别
    if args.site == 'taizhou' and args.type == '招标计划公示' and not args.category:
        args.category = '工程建设'
        logger.info("为台州招标计划公示自动设置类别: 工程建设")
    # 爬取 — dispatch table replaces the if/elif chain; choices= on --site
    # guarantees the key is present.
    crawlers = {
        'zhejiang': crawl_zhejiang,
        'taizhou': crawl_taizhou,
        'all': crawl_all,
    }
    results = crawlers[args.site](
        max_pages, args.category, args.type, args.date, args.attachment)
    # AI 处理 — requires a notice type so the pipeline knows which schema
    # to extract; skipped (with a warning) otherwise.
    if args.process and results and args.type:
        _run_ai_processing(args, results)
    elif args.process and not args.type:
        logger.warning("启用AI处理时需指定公告类型 (-t),已跳过")
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()