154 lines
5.2 KiB
Python
154 lines
5.2 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
公共资源交易中心爬虫 - 主程序
|
|||
|
|
支持:浙江省、台州市
|
|||
|
|
可选:DeepSeek AI 处理 + 简道云上传
|
|||
|
|
"""
|
|||
|
|
import argparse
|
|||
|
|
import logging
|
|||
|
|
from config import ZHEJIANG_CONFIG, TAIZHOU_CONFIG, SPIDER_CONFIG, DATA_DIR
|
|||
|
|
from spiders import ZhejiangSpider, TaizhouSpider
|
|||
|
|
from spiders.base import setup_logging
|
|||
|
|
|
|||
|
|
logger = logging.getLogger("ztb")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def crawl_zhejiang(max_pages=5, category=None, notice_type=None,
                   date_filter=None, download_attachment=False):
    """Crawl the Zhejiang provincial public resource trading center.

    Runs the Zhejiang spider for up to *max_pages* pages, optionally
    filtered by trade category, notice type and date, persists the rows
    to CSV, and returns the collected result records.
    """
    zj_spider = ZhejiangSpider(ZHEJIANG_CONFIG, SPIDER_CONFIG, DATA_DIR)
    zj_spider.crawl(
        max_pages=max_pages,
        category=category,
        notice_type=notice_type,
        date_filter=date_filter,
        download_attachment=download_attachment,
    )
    zj_spider.save_to_csv()
    return zj_spider.results
|
|||
|
|
|
|||
|
|
|
|||
|
|
def crawl_taizhou(max_pages=5, category=None, notice_type=None,
                  date_filter=None, download_attachment=False):
    """Crawl the Taizhou municipal public resource trading center.

    Runs the Taizhou spider for up to *max_pages* pages, optionally
    filtered by trade category, notice type and date, persists the rows
    to CSV, and returns the collected result records.
    """
    tz_spider = TaizhouSpider(TAIZHOU_CONFIG, SPIDER_CONFIG, DATA_DIR)
    tz_spider.crawl(
        max_pages=max_pages,
        category=category,
        notice_type=notice_type,
        date_filter=date_filter,
        download_attachment=download_attachment,
    )
    tz_spider.save_to_csv()
    return tz_spider.results
|
|||
|
|
|
|||
|
|
|
|||
|
|
def crawl_all(max_pages=5, category=None, notice_type=None,
              date_filter=None, download_attachment=False):
    """Crawl every supported site in sequence and return the combined results.

    Delegates to the per-site crawl functions (Zhejiang first, then
    Taizhou), concatenating their result lists in that order.
    """
    combined = []
    # Fixed order matters only for log readability; each site is independent.
    for crawl_fn in (crawl_zhejiang, crawl_taizhou):
        logger.info("=" * 40)
        site_rows = crawl_fn(max_pages, category, notice_type,
                             date_filter, download_attachment)
        combined.extend(site_rows)

    logger.info(f"全部爬取完成,共 {len(combined)} 条数据")
    return combined
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _build_parser():
    """Build the CLI argument parser (kept separate so main() stays readable)."""
    parser = argparse.ArgumentParser(description='公共资源交易中心爬虫')
    parser.add_argument(
        '-s', '--site',
        choices=['zhejiang', 'taizhou', 'all'],
        default='zhejiang',
        help='选择爬取的网站 (默认: zhejiang)'
    )
    parser.add_argument(
        '-p', '--pages',
        type=int,
        default=None,
        help='爬取页数 (默认: 5, 指定日期时默认100)'
    )
    parser.add_argument(
        '-c', '--category',
        default=None,
        help='交易领域 (如: 工程建设, 政府采购)'
    )
    parser.add_argument(
        '-t', '--type',
        default=None,
        help='公告类型 (如: 招标公告, 招标文件公示)'
    )
    parser.add_argument(
        '-d', '--date',
        default=None,
        help='日期过滤 (yesterday 或 2026-02-03)'
    )
    parser.add_argument(
        '-a', '--attachment',
        action='store_true',
        help='下载并解析附件'
    )
    parser.add_argument(
        '-P', '--process',
        action='store_true',
        help='启用 DeepSeek AI 处理(提取结构化字段)'
    )
    parser.add_argument(
        '-U', '--upload',
        action='store_true',
        help='上传处理结果到简道云(需配合 -P 使用)'
    )
    return parser


def _run_processing(args, results):
    """Run the DeepSeek processing pipeline over *results*, per site.

    For ``--site all`` the results are split by their '来源' field and
    processed one site at a time, since processing rules are site-specific.
    Uploads to Jiandaoyun when ``--upload`` was given.
    """
    # Imported lazily so the AI dependencies are only needed when -P is used.
    from processors import ProcessingPipeline
    pipeline = ProcessingPipeline()
    if args.site == 'all':
        # Group mixed results back into per-site batches via the '来源' column.
        source_to_site = {
            ZHEJIANG_CONFIG['name']: 'zhejiang',
            TAIZHOU_CONFIG['name']: 'taizhou',
        }
        for source, site_name in source_to_site.items():
            site_results = [r for r in results if r.get('来源') == source]
            if site_results:
                pipeline.process_results(
                    site_results, site=site_name,
                    notice_type=args.type, upload=args.upload,
                )
    else:
        pipeline.process_results(
            results, site=args.site,
            notice_type=args.type, upload=args.upload,
        )


def main():
    """CLI entry point: parse arguments, crawl the selected site(s), and
    optionally run AI processing / upload on the collected results."""
    args = _build_parser().parse_args()
    setup_logging()

    # 页数:指定日期时自动放大,确保抓完全部数据
    # (when a date filter is active, default to 100 pages so the whole day
    # is captured; otherwise a small default of 5)
    max_pages = args.pages
    if max_pages is None:
        max_pages = 100 if args.date else 5

    # 为台州招标计划公示设置默认的工程建设类别
    # (this notice type only exists under that category on the Taizhou site
    # — presumably; confirm against the site structure)
    if args.site == 'taizhou' and args.type == '招标计划公示' and not args.category:
        args.category = '工程建设'
        logger.info("为台州招标计划公示自动设置类别: 工程建设")

    # Dispatch table replaces the if/elif chain; choices= guarantees the key.
    crawlers = {
        'zhejiang': crawl_zhejiang,
        'taizhou': crawl_taizhou,
        'all': crawl_all,
    }
    results = crawlers[args.site](
        max_pages, args.category, args.type, args.date, args.attachment)

    # AI 处理 — requires results and an explicit notice type.
    if args.process and results and args.type:
        _run_processing(args, results)
    elif args.process and not args.type:
        logger.warning("启用AI处理时需指定公告类型 (-t),已跳过")
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == '__main__':
    main()
|