# --- web-viewer chrome from the original paste, kept as a comment ---
# Files: ztb/main.py
# 154 lines, 5.2 KiB, Python
# Raw Permalink Normal View History

# -*- coding: utf-8 -*-
"""
公共资源交易中心爬虫 - 主程序
支持浙江省台州市
可选DeepSeek AI 处理 + 简道云上传
"""
import argparse
import logging
from config import ZHEJIANG_CONFIG, TAIZHOU_CONFIG, SPIDER_CONFIG, DATA_DIR
from spiders import ZhejiangSpider, TaizhouSpider
from spiders.base import setup_logging
logger = logging.getLogger("ztb")
def crawl_zhejiang(max_pages=5, category=None, notice_type=None,
                   date_filter=None, download_attachment=False):
    """Crawl the Zhejiang provincial public resource trading center.

    Runs the spider with the given filters, persists the scraped rows to
    CSV, and returns the list of result records.
    """
    zj_spider = ZhejiangSpider(ZHEJIANG_CONFIG, SPIDER_CONFIG, DATA_DIR)
    zj_spider.crawl(
        max_pages=max_pages,
        category=category,
        notice_type=notice_type,
        date_filter=date_filter,
        download_attachment=download_attachment,
    )
    zj_spider.save_to_csv()
    return zj_spider.results
def crawl_taizhou(max_pages=5, category=None, notice_type=None,
                  date_filter=None, download_attachment=False):
    """Crawl the Taizhou municipal public resource trading center.

    Runs the spider with the given filters, persists the scraped rows to
    CSV, and returns the list of result records.
    """
    tz_spider = TaizhouSpider(TAIZHOU_CONFIG, SPIDER_CONFIG, DATA_DIR)
    tz_spider.crawl(
        max_pages=max_pages,
        category=category,
        notice_type=notice_type,
        date_filter=date_filter,
        download_attachment=download_attachment,
    )
    tz_spider.save_to_csv()
    return tz_spider.results
def crawl_all(max_pages=5, category=None, notice_type=None,
              date_filter=None, download_attachment=False):
    """Crawl every supported site in turn and return the combined results."""
    combined = []
    # Run the per-site crawlers in a fixed order, separated by a log rule.
    for crawler in (crawl_zhejiang, crawl_taizhou):
        logger.info("=" * 40)
        combined.extend(
            crawler(max_pages, category, notice_type,
                    date_filter, download_attachment)
        )
    logger.info(f"全部爬取完成,共 {len(combined)} 条数据")
    return combined
def main():
    """CLI entry point.

    Parses command-line options, runs the selected spider(s), and optionally
    post-processes the results with the DeepSeek pipeline and uploads them
    to Jiandaoyun.
    """
    parser = argparse.ArgumentParser(description='公共资源交易中心爬虫')
    parser.add_argument(
        '-s', '--site',
        choices=['zhejiang', 'taizhou', 'all'],
        default='zhejiang',
        help='选择爬取的网站 (默认: zhejiang)'
    )
    parser.add_argument(
        '-p', '--pages',
        type=int,
        default=None,
        help='爬取页数 (默认: 5, 指定日期时默认100)'
    )
    parser.add_argument(
        '-c', '--category',
        default=None,
        help='交易领域 (如: 工程建设, 政府采购)'
    )
    parser.add_argument(
        '-t', '--type',
        default=None,
        help='公告类型 (如: 招标公告, 招标文件公示)'
    )
    parser.add_argument(
        '-d', '--date',
        default=None,
        help='日期过滤 (yesterday 或 2026-02-03)'
    )
    parser.add_argument(
        '-a', '--attachment',
        action='store_true',
        help='下载并解析附件'
    )
    parser.add_argument(
        '-P', '--process',
        action='store_true',
        help='启用 DeepSeek AI 处理(提取结构化字段)'
    )
    parser.add_argument(
        '-U', '--upload',
        action='store_true',
        help='上传处理结果到简道云(需配合 -P 使用)'
    )
    args = parser.parse_args()
    setup_logging()

    # FIX: the help text says -U requires -P, but the original silently
    # ignored -U when -P was absent; surface that to the user.
    if args.upload and not args.process:
        logger.warning("上传 (-U) 需配合 AI 处理 (-P) 使用,已忽略上传选项")

    # Page count: widen automatically when a date filter is given so the
    # whole backlog for that date is covered.
    max_pages = args.pages
    if max_pages is None:
        max_pages = 100 if args.date else 5

    # Taizhou's 招标计划公示 listing only exists under the 工程建设 category.
    if args.site == 'taizhou' and args.type == '招标计划公示' and not args.category:
        args.category = '工程建设'
        logger.info("为台州招标计划公示自动设置类别: 工程建设")

    # argparse restricts --site to these three choices, so the dict lookup
    # cannot miss.
    crawlers = {
        'zhejiang': crawl_zhejiang,
        'taizhou': crawl_taizhou,
        'all': crawl_all,
    }
    results = crawlers[args.site](
        max_pages, args.category, args.type, args.date, args.attachment)

    # Optional DeepSeek AI processing (+ optional Jiandaoyun upload).
    if args.process and results and args.type:
        from processors import ProcessingPipeline
        pipeline = ProcessingPipeline()
        if args.site == 'all':
            # Process per originating site so each batch uses the schema
            # of the site it came from; the '来源' column carries the
            # site's display name.
            source_to_site = {
                ZHEJIANG_CONFIG['name']: 'zhejiang',
                TAIZHOU_CONFIG['name']: 'taizhou',
            }
            for source, site_name in source_to_site.items():
                site_results = [
                    r for r in results if r.get('来源') == source]
                if site_results:
                    pipeline.process_results(
                        site_results, site=site_name,
                        notice_type=args.type, upload=args.upload,
                    )
        else:
            pipeline.process_results(
                results, site=args.site,
                notice_type=args.type, upload=args.upload,
            )
    elif args.process and not args.type:
        logger.warning("启用AI处理时需指定公告类型 (-t),已跳过")
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == '__main__':
    main()