Initial commit: 招标信息爬虫与分析系统
This commit is contained in:
153
main.py
Normal file
153
main.py
Normal file
@@ -0,0 +1,153 @@
|
||||
# -*- coding: utf-8 -*-
"""
Public resource trading center spider - main program.

Supported sites: Zhejiang Province, Taizhou City.
Optional: DeepSeek AI processing + Jiandaoyun (简道云) upload.
"""
import argparse
import logging

from config import ZHEJIANG_CONFIG, TAIZHOU_CONFIG, SPIDER_CONFIG, DATA_DIR
from spiders import ZhejiangSpider, TaizhouSpider
from spiders.base import setup_logging

# Module-level logger shared by every crawl helper in this file.
logger = logging.getLogger("ztb")
|
||||
|
||||
|
||||
def crawl_zhejiang(max_pages=5, category=None, notice_type=None,
                   date_filter=None, download_attachment=False):
    """Crawl the Zhejiang provincial public resource trading center.

    Runs the spider with the given filters, persists everything to CSV,
    and returns the list of scraped records.
    """
    spider = ZhejiangSpider(ZHEJIANG_CONFIG, SPIDER_CONFIG, DATA_DIR)
    spider.crawl(
        max_pages=max_pages,
        category=category,
        notice_type=notice_type,
        date_filter=date_filter,
        download_attachment=download_attachment,
    )
    spider.save_to_csv()
    return spider.results
|
||||
|
||||
|
||||
def crawl_taizhou(max_pages=5, category=None, notice_type=None,
                  date_filter=None, download_attachment=False):
    """Crawl the Taizhou public resource trading center.

    Runs the spider with the given filters, persists everything to CSV,
    and returns the list of scraped records.
    """
    spider = TaizhouSpider(TAIZHOU_CONFIG, SPIDER_CONFIG, DATA_DIR)
    spider.crawl(
        max_pages=max_pages,
        category=category,
        notice_type=notice_type,
        date_filter=date_filter,
        download_attachment=download_attachment,
    )
    spider.save_to_csv()
    return spider.results
|
||||
|
||||
|
||||
def crawl_all(max_pages=5, category=None, notice_type=None,
              date_filter=None, download_attachment=False):
    """Crawl every supported site in sequence and return combined results."""
    combined = []
    # Same filter arguments are forwarded to each per-site crawler.
    for crawl_fn in (crawl_zhejiang, crawl_taizhou):
        logger.info("=" * 40)
        combined.extend(crawl_fn(max_pages, category, notice_type,
                                 date_filter, download_attachment))
    logger.info(f"全部爬取完成,共 {len(combined)} 条数据")
    return combined
|
||||
|
||||
|
||||
def _build_parser():
    """Build the command-line argument parser (all CLI options in one place)."""
    parser = argparse.ArgumentParser(description='公共资源交易中心爬虫')
    parser.add_argument(
        '-s', '--site',
        choices=['zhejiang', 'taizhou', 'all'],
        default='zhejiang',
        help='选择爬取的网站 (默认: zhejiang)'
    )
    parser.add_argument(
        '-p', '--pages',
        type=int,
        default=None,
        help='爬取页数 (默认: 5, 指定日期时默认100)'
    )
    parser.add_argument(
        '-c', '--category',
        default=None,
        help='交易领域 (如: 工程建设, 政府采购)'
    )
    parser.add_argument(
        '-t', '--type',
        default=None,
        help='公告类型 (如: 招标公告, 招标文件公示)'
    )
    parser.add_argument(
        '-d', '--date',
        default=None,
        help='日期过滤 (yesterday 或 2026-02-03)'
    )
    parser.add_argument(
        '-a', '--attachment',
        action='store_true',
        help='下载并解析附件'
    )
    parser.add_argument(
        '-P', '--process',
        action='store_true',
        help='启用 DeepSeek AI 处理(提取结构化字段)'
    )
    parser.add_argument(
        '-U', '--upload',
        action='store_true',
        help='上传处理结果到简道云(需配合 -P 使用)'
    )
    return parser


def _process_results(results, args):
    """Run the DeepSeek processing pipeline over crawl results.

    For `--site all` the combined results are split back into per-site
    batches (matched on the record's 来源/source field) so each batch is
    processed with the correct site name.
    """
    # Imported lazily, as in the original code, so the AI/processing
    # dependencies are only required when -P is actually used.
    from processors import ProcessingPipeline
    pipeline = ProcessingPipeline()
    if args.site == 'all':
        source_to_site = {
            ZHEJIANG_CONFIG['name']: 'zhejiang',
            TAIZHOU_CONFIG['name']: 'taizhou',
        }
        for source, site_name in source_to_site.items():
            site_results = [r for r in results if r.get('来源') == source]
            if site_results:
                pipeline.process_results(
                    site_results, site=site_name,
                    notice_type=args.type, upload=args.upload,
                )
    else:
        pipeline.process_results(
            results, site=args.site,
            notice_type=args.type, upload=args.upload,
        )


def main():
    """CLI entry point: parse arguments, crawl the chosen site(s), then
    optionally run AI processing/upload on the results."""
    args = _build_parser().parse_args()
    setup_logging()

    # Page count: widened automatically when a date filter is given, to
    # make sure the whole day's data can be fetched.
    max_pages = args.pages
    if max_pages is None:
        max_pages = 100 if args.date else 5

    # Taizhou's bidding-plan notices require an explicit category; default
    # it to 工程建设 (construction) when the user gave none.
    if args.site == 'taizhou' and args.type == '招标计划公示' and not args.category:
        args.category = '工程建设'
        logger.info("为台州招标计划公示自动设置类别: 工程建设")

    # Dispatch table replaces the original if/elif chain; same behavior.
    crawlers = {
        'zhejiang': crawl_zhejiang,
        'taizhou': crawl_taizhou,
        'all': crawl_all,
    }
    results = []
    crawl_fn = crawlers.get(args.site)
    if crawl_fn is not None:
        results = crawl_fn(max_pages, args.category, args.type,
                           args.date, args.attachment)

    # AI processing needs a notice type (-t) so the pipeline knows which
    # structured schema to extract; silently skipped when no results.
    if args.process and results and args.type:
        _process_results(results, args)
    elif args.process and not args.type:
        logger.warning("启用AI处理时需指定公告类型 (-t),已跳过")
|
||||
|
||||
|
||||
# Standard script entry guard: only run the CLI when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user