# -*- coding: utf-8 -*-
"""
Public resource trading center crawler — main program.

Supported sites: Zhejiang Province, Taizhou City.
Optional: DeepSeek AI processing + Jiandaoyun (简道云) upload.
"""
import argparse
import logging

from config import ZHEJIANG_CONFIG, TAIZHOU_CONFIG, SPIDER_CONFIG, DATA_DIR
from spiders import ZhejiangSpider, TaizhouSpider
from spiders.base import setup_logging

# Module-level logger shared by all crawl entry points.
logger = logging.getLogger("ztb")
def crawl_zhejiang(max_pages=5, category=None, notice_type=None,
                   date_filter=None, download_attachment=False):
    """Crawl the Zhejiang provincial public resource trading center.

    Runs the spider with the given filters, persists the scraped rows
    to CSV, and returns them.
    """
    zj_spider = ZhejiangSpider(ZHEJIANG_CONFIG, SPIDER_CONFIG, DATA_DIR)
    zj_spider.crawl(
        max_pages=max_pages,
        category=category,
        notice_type=notice_type,
        date_filter=date_filter,
        download_attachment=download_attachment,
    )
    zj_spider.save_to_csv()
    return zj_spider.results
def crawl_taizhou(max_pages=5, category=None, notice_type=None,
                  date_filter=None, download_attachment=False):
    """Crawl the Taizhou public resource trading center.

    Runs the spider with the given filters, persists the scraped rows
    to CSV, and returns them.
    """
    tz_spider = TaizhouSpider(TAIZHOU_CONFIG, SPIDER_CONFIG, DATA_DIR)
    tz_spider.crawl(
        max_pages=max_pages,
        category=category,
        notice_type=notice_type,
        date_filter=date_filter,
        download_attachment=download_attachment,
    )
    tz_spider.save_to_csv()
    return tz_spider.results
def crawl_all(max_pages=5, category=None, notice_type=None,
              date_filter=None, download_attachment=False):
    """Crawl every supported site and return the combined result rows."""
    combined = []
    # Same filter arguments are forwarded to each site-specific crawler.
    for site_crawler in (crawl_zhejiang, crawl_taizhou):
        logger.info("=" * 40)
        combined.extend(site_crawler(max_pages, category, notice_type,
                                     date_filter, download_attachment))
    logger.info(f"全部爬取完成,共 {len(combined)} 条数据")
    return combined
def main():
    """Command-line entry point.

    Parses CLI options, runs the selected site crawler(s), and optionally
    feeds the results through the DeepSeek AI processing pipeline with an
    optional Jiandaoyun upload.
    """
    parser = argparse.ArgumentParser(description='公共资源交易中心爬虫')
    parser.add_argument(
        '-s', '--site',
        choices=['zhejiang', 'taizhou', 'all'],
        default='zhejiang',
        help='选择爬取的网站 (默认: zhejiang)'
    )
    parser.add_argument(
        '-p', '--pages',
        type=int,
        default=None,
        help='爬取页数 (默认: 5, 指定日期时默认100)'
    )
    parser.add_argument(
        '-c', '--category',
        default=None,
        help='交易领域 (如: 工程建设, 政府采购)'
    )
    parser.add_argument(
        '-t', '--type',
        default=None,
        help='公告类型 (如: 招标公告, 招标文件公示)'
    )
    parser.add_argument(
        '-d', '--date',
        default=None,
        help='日期过滤 (yesterday 或 2026-02-03)'
    )
    parser.add_argument(
        '-a', '--attachment',
        action='store_true',
        help='下载并解析附件'
    )
    parser.add_argument(
        '-P', '--process',
        action='store_true',
        help='启用 DeepSeek AI 处理(提取结构化字段)'
    )
    parser.add_argument(
        '-U', '--upload',
        action='store_true',
        help='上传处理结果到简道云(需配合 -P 使用)'
    )

    args = parser.parse_args()
    setup_logging()

    # Fix: -U without -P used to be silently ignored even though the help
    # text says upload requires processing — warn the user explicitly.
    if args.upload and not args.process:
        logger.warning("-U/--upload 需配合 -P/--process 使用,已忽略")

    # Page count: widen automatically when a date filter is given so the
    # whole day's data can be collected.
    max_pages = args.pages
    if max_pages is None:
        max_pages = 100 if args.date else 5

    # Crawl
    results = []
    # Taizhou's "招标计划公示" listing requires a category; default it.
    if args.site == 'taizhou' and args.type == '招标计划公示' and not args.category:
        args.category = '工程建设'
        logger.info("为台州招标计划公示自动设置类别: 工程建设")

    if args.site == 'zhejiang':
        results = crawl_zhejiang(
            max_pages, args.category, args.type, args.date, args.attachment)
    elif args.site == 'taizhou':
        results = crawl_taizhou(
            max_pages, args.category, args.type, args.date, args.attachment)
    elif args.site == 'all':
        results = crawl_all(
            max_pages, args.category, args.type, args.date, args.attachment)

    # AI processing (requires results and an explicit notice type)
    if args.process and results and args.type:
        # Imported lazily so the crawler works without the AI dependencies.
        from processors import ProcessingPipeline
        pipeline = ProcessingPipeline()
        if args.site == 'all':
            # Process per site: the pipeline needs to know which site each
            # row came from, recovered via the '来源' (source) field.
            source_to_site = {
                ZHEJIANG_CONFIG['name']: 'zhejiang',
                TAIZHOU_CONFIG['name']: 'taizhou',
            }
            for source, site_name in source_to_site.items():
                site_results = [
                    r for r in results if r.get('来源') == source]
                if site_results:
                    pipeline.process_results(
                        site_results, site=site_name,
                        notice_type=args.type, upload=args.upload,
                    )
        else:
            pipeline.process_results(
                results, site=args.site,
                notice_type=args.type, upload=args.upload,
            )
    elif args.process and not args.type:
        logger.warning("启用AI处理时需指定公告类型 (-t),已跳过")
# Script entry point: run the CLI only when executed directly.
if __name__ == '__main__':
    main()