# -*- coding: utf-8 -*-
"""
Public resource trading center crawler - main program.

Supported sites: Zhejiang Province, Taizhou City.
Optional: DeepSeek AI processing + Jiandaoyun upload.
"""

import argparse
import logging

from config import ZHEJIANG_CONFIG, TAIZHOU_CONFIG, SPIDER_CONFIG, DATA_DIR
from spiders import ZhejiangSpider, TaizhouSpider
from spiders.base import setup_logging

logger = logging.getLogger("ztb")


def _run_spider(spider_cls, site_config, max_pages, category, notice_type,
                date_filter, download_attachment):
    """Instantiate one spider, crawl, save to CSV and return its results.

    Args:
        spider_cls: Spider class to instantiate (called with the site
            config, SPIDER_CONFIG and DATA_DIR).
        site_config: Site-specific configuration passed to the spider.
        max_pages: Forwarded to ``spider.crawl`` as ``max_pages``.
        category: Forwarded to ``spider.crawl`` as ``category``.
        notice_type: Forwarded to ``spider.crawl`` as ``notice_type``.
        date_filter: Forwarded to ``spider.crawl`` as ``date_filter``.
        download_attachment: Forwarded to ``spider.crawl`` as
            ``download_attachment``.

    Returns:
        The spider's ``results`` attribute after crawling.
    """
    spider = spider_cls(site_config, SPIDER_CONFIG, DATA_DIR)
    spider.crawl(
        max_pages=max_pages,
        category=category,
        notice_type=notice_type,
        date_filter=date_filter,
        download_attachment=download_attachment,
    )
    spider.save_to_csv()
    return spider.results


def crawl_zhejiang(max_pages=5, category=None, notice_type=None,
                   date_filter=None, download_attachment=False):
    """Crawl the Zhejiang provincial public resource trading center.

    Runs ``ZhejiangSpider`` with ``ZHEJIANG_CONFIG``, saves the output via
    ``save_to_csv()`` and returns ``spider.results``.
    """
    return _run_spider(
        ZhejiangSpider, ZHEJIANG_CONFIG,
        max_pages, category, notice_type, date_filter, download_attachment,
    )


def crawl_taizhou(max_pages=5, category=None, notice_type=None,
                  date_filter=None, download_attachment=False):
    """Crawl the Taizhou public resource trading center.

    Runs ``TaizhouSpider`` with ``TAIZHOU_CONFIG``, saves the output via
    ``save_to_csv()`` and returns ``spider.results``.
    """
    return _run_spider(
        TaizhouSpider, TAIZHOU_CONFIG,
        max_pages, category, notice_type, date_filter, download_attachment,
    )


def crawl_all(max_pages=5, category=None, notice_type=None,
              date_filter=None, download_attachment=False):
    """Crawl every supported site and return the combined results.

    Zhejiang is crawled first, then Taizhou; both receive the same
    arguments. Returns a single list holding the results of both crawls.
    """
    all_results = []
    for crawl in (crawl_zhejiang, crawl_taizhou):
        # Separator line in the log before each site's crawl.
        logger.info("=" * 40)
        all_results.extend(
            crawl(max_pages, category, notice_type, date_filter,
                  download_attachment)
        )

    logger.info("全部爬取完成,共 %d 条数据", len(all_results))
    return all_results


def _build_parser():
    """Build the command-line argument parser for the crawler."""
    parser = argparse.ArgumentParser(description='公共资源交易中心爬虫')
    parser.add_argument(
        '-s', '--site',
        choices=['zhejiang', 'taizhou', 'all'],
        default='zhejiang',
        help='选择爬取的网站 (默认: zhejiang)'
    )
    parser.add_argument(
        '-p', '--pages',
        type=int,
        default=None,
        help='爬取页数 (默认: 5, 指定日期时默认100)'
    )
    parser.add_argument(
        '-c', '--category',
        default=None,
        help='交易领域 (如: 工程建设, 政府采购)'
    )
    parser.add_argument(
        '-t', '--type',
        default=None,
        help='公告类型 (如: 招标公告, 招标文件公示)'
    )
    parser.add_argument(
        '-d', '--date',
        default=None,
        help='日期过滤 (yesterday 或 2026-02-03)'
    )
    parser.add_argument(
        '-a', '--attachment',
        action='store_true',
        help='下载并解析附件'
    )
    parser.add_argument(
        '-P', '--process',
        action='store_true',
        help='启用 DeepSeek AI 处理(提取结构化字段)'
    )
    parser.add_argument(
        '-U', '--upload',
        action='store_true',
        help='上传处理结果到简道云(需配合 -P 使用)'
    )
    return parser


def _process_results(results, site, notice_type, upload):
    """Run the AI processing pipeline over crawled results.

    When ``site`` is ``'all'`` the results are grouped by their ``'来源'``
    field (matched against each site config's ``'name'``) and each non-empty
    group is processed under its own site name; otherwise all results are
    processed under ``site``.
    """
    # Imported lazily so the processors package is only loaded when the
    # -P option is actually used.
    from processors import ProcessingPipeline

    pipeline = ProcessingPipeline()

    if site != 'all':
        pipeline.process_results(
            results,
            site=site,
            notice_type=notice_type,
            upload=upload,
        )
        return

    # Group by source site so each batch is processed with its own site name.
    source_to_site = {
        ZHEJIANG_CONFIG['name']: 'zhejiang',
        TAIZHOU_CONFIG['name']: 'taizhou',
    }
    for source, site_name in source_to_site.items():
        site_results = [r for r in results if r.get('来源') == source]
        if site_results:
            pipeline.process_results(
                site_results,
                site=site_name,
                notice_type=notice_type,
                upload=upload,
            )


def main():
    """Parse CLI arguments, crawl the selected site(s), optionally post-process.

    AI processing runs only when ``-P`` is given together with ``-t`` and the
    crawl returned results. ``-U`` only has an effect as part of AI
    processing, so a warning is logged when it is given without ``-P``.
    """
    args = _build_parser().parse_args()

    setup_logging()

    # Upload happens only inside the AI processing step; without -P the
    # flag would otherwise be ignored silently.
    if args.upload and not args.process:
        logger.warning("上传 (-U) 需配合 AI 处理 (-P) 使用,已忽略上传选项")

    # Page count: when a date filter is given, raise the default so the
    # whole day's data gets fetched.
    max_pages = args.pages
    if max_pages is None:
        max_pages = 100 if args.date else 5

    # Default the category for Taizhou "招标计划公示" notices.
    # NOTE(review): this default is applied only for `-s taizhou`, not for
    # `-s all` — confirm whether that is intended.
    if (args.site == 'taizhou' and args.type == '招标计划公示'
            and not args.category):
        args.category = '工程建设'
        logger.info("为台州招标计划公示自动设置类别: 工程建设")

    # Crawl. argparse `choices` guarantees args.site is one of these keys.
    crawlers = {
        'zhejiang': crawl_zhejiang,
        'taizhou': crawl_taizhou,
        'all': crawl_all,
    }
    results = crawlers[args.site](
        max_pages, args.category, args.type, args.date, args.attachment)

    # AI processing.
    if not args.process:
        return
    if not args.type:
        logger.warning("启用AI处理时需指定公告类型 (-t),已跳过")
        return
    if results:
        _process_results(results, args.site, args.type, args.upload)


if __name__ == '__main__':
    main()