87 lines
2.4 KiB
Python
87 lines
2.4 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
||
|
|
"""
|
||
|
|
测试上传招标公告和招标计划表单
|
||
|
|
"""
|
||
|
|
import logging
|
||
|
|
import sys
|
||
|
|
import os
|
||
|
|
|
||
|
|
# 添加当前目录到模块搜索路径
|
||
|
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||
|
|
|
||
|
|
# Import the configuration, the spider, and the processing pipeline
|
||
|
|
from config import ZHEJIANG_CONFIG, SPIDER_CONFIG, DATA_DIR
|
||
|
|
from spiders import ZhejiangSpider
|
||
|
|
from processors import ProcessingPipeline
|
||
|
|
|
||
|
|
# Configure logging through basicConfig: INFO threshold and a
# "timestamp - level - message" line format.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
def crawl_and_upload(notice_type, max_pages=1):
    """Crawl, process, and upload the forms for one notice type.

    The function runs three logged stages:

    1. Crawl: build a ``ZhejiangSpider`` from the module-level configuration,
       call ``crawl`` for the "工程建设" category, and call ``save_to_csv``.
    2. Process: pass ``spider.results`` to
       ``ProcessingPipeline.process_results`` with ``upload=True``.
    3. Report: log how many records were processed and preview three key
       fields of the first two records.

    Args:
        notice_type: Notice type forwarded to both the spider and the
            pipeline (the callers in this file pass "招标公告" or "招标计划").
        max_pages: Page limit forwarded to ``spider.crawl``.

    Returns:
        None. If the crawl yields no results, an error is logged and the
        function returns before the processing stage.
    """
    banner = "=" * 70
    logger.info(f"\n{banner}")
    logger.info(f"开始处理: {notice_type}")
    logger.info(banner)

    # Stage 1: crawl the listing pages and persist the raw rows as CSV.
    logger.info("1. 爬取数据:")
    crawler = ZhejiangSpider(ZHEJIANG_CONFIG, SPIDER_CONFIG, DATA_DIR)
    crawler.crawl(max_pages=max_pages, category="工程建设", notice_type=notice_type)
    crawler.save_to_csv()

    crawled = crawler.results
    crawled_count = len(crawled)
    logger.info(f"爬取完成,共获取 {crawled_count} 条数据")

    # Nothing to process or upload when the crawl came back empty.
    if crawled_count == 0:
        logger.error("爬取失败,无数据")
        return None

    # Stage 2: run the processing pipeline.
    logger.info("\n2. 处理数据:")
    records = ProcessingPipeline().process_results(
        crawled,
        site="zhejiang",
        notice_type=notice_type,
        upload=True,  # upload to Jiandaoyun (per the original comment)
    )

    # Stage 3: summarize the outcome.
    logger.info("\n3. 处理结果:")
    logger.info(f"成功处理 {len(records)} 条数据")

    # Preview the key fields of the first two records only.
    logger.info("\n前2条数据关键信息:")
    preview_fields = ("项目名称", "项目批准文号", "批准文号")
    for position, entry in enumerate(records[:2], start=1):
        logger.info(f"\n测试 {position}")
        for field in preview_fields:
            logger.info(f"{field}: {entry.get(field, '文档未提及')}")
|
||
|
|
|
||
|
|
def main():
    """Run the crawl-and-upload flow for each notice type under test.

    Logs a start message, then calls ``crawl_and_upload`` with
    ``max_pages=1`` for "招标公告" first and "招标计划" second.
    """
    logger.info("开始上传招标公告和招标计划表单")

    # Order matters only for log readability: notices first, then plans.
    for notice_type in ("招标公告", "招标计划"):
        crawl_and_upload(notice_type, max_pages=1)
|
||
|
|
|
||
|
|
# Run the upload test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|