Initial commit: 招标信息爬虫与分析系统
This commit is contained in:
100
test_single_item.py
Normal file
100
test_single_item.py
Normal file
@@ -0,0 +1,100 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
测试单条数据处理
|
||||
"""
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Put the directory containing this script first on the module search path
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
# Import the configuration, the Zhejiang spider, and the processing pipeline
|
||||
from config import REGION_CONFIGS, PROCESSING_CONFIG
|
||||
from spiders.zhejiang import ZhejiangSpider
|
||||
from processors.pipeline import ProcessingPipeline
|
||||
|
||||
# Logging setup: request INFO level on the root logger and render every record
# as "<timestamp> - <level> - <message>".
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Logger named after this module; main() writes all of its output through it.
logger = logging.getLogger(__name__)
|
||||
|
||||
# URL stored in the fixture's "链接" (link) field.
TEST_URL = (
    "https://ggzy.zj.gov.cn/jyxxgk/002001/002001011/20260212/"
    "9a7966d8-80f4-475b-897e-f7631bc64d0c.html"
)

# Hand-written record standing in for one spider result. The Chinese keys are
# runtime data and must stay exactly as written.
TEST_ITEM = {
    # Title: "[tag]<project name>[<approval number>]"
    "标题": "[招标文件](测-试)临海市房建施工0212-2招标文件公示[A3300000090000695005001]",
    # Publication date
    "发布日期": "2026-02-12",
    # Region
    "地区": "临海市",
    # Notice type
    "公告类型": "招标文件公示",
    # Link to the announcement page
    "链接": TEST_URL,
    # Source site
    "来源": "浙江省公共资源交易中心",
}
|
||||
|
||||
def _parse_title(title):
    """Extract the project name and approval number from a notice title.

    The pattern looks for a ``[招标文件]`` or ``[招标公告]`` tag, followed by
    the project name, followed by a bracketed code of upper-case ASCII
    letters and digits that must end the title (trailing whitespace allowed).

    Args:
        title: The raw title string of a scraped item.

    Returns:
        A ``(project_name, approval_number)`` tuple with surrounding
        whitespace stripped, or ``None`` when the title does not match.
    """
    # Kept as a function-scope import, as in the original code: ``re`` is not
    # among the imports at the top of this script.
    import re

    title_pattern = r"\[(?:招标文件|招标公告)\]\s*(.*?)\s*\[([A-Z0-9]+)\]\s*$"
    match = re.search(title_pattern, title)
    if match is None:
        return None
    return match.group(1).strip(), match.group(2).strip()


def main():
    """Run the single-item check: parse the title, then run the pipeline.

    Steps:
      1. Parse ``TEST_ITEM["标题"]`` and log the extracted project name and
         approval number, or a warning when the title does not match.
      2. Copy ``TEST_ITEM``, attach the parsed fields when available, and
         pass the single record to ``ProcessingPipeline.process_results``
         with ``upload=False``.
      3. Log selected fields of the first processed record, or an error when
         the pipeline result is empty.

    Returns:
        None. All results are reported through ``logger``.
    """
    logger.info("开始单条数据测试")

    # 1. Title parsing. The title is parsed once and the result reused below;
    #    the copied item carries the same title, so a second parse would
    #    yield the same values.
    parsed = _parse_title(TEST_ITEM["标题"])
    if parsed:
        project_name, project_approval = parsed
        logger.info("标题解析结果:")
        logger.info(" 项目名称: %s", project_name)
        logger.info(" 项目批准文号: %s", project_approval)
    else:
        logger.warning("标题解析失败")

    # 2. Processing pipeline.
    logger.info("\n测试处理管道:")
    pipeline = ProcessingPipeline()

    # Work on a shallow copy so the module-level fixture stays untouched, and
    # attach the two title-derived fields (per the original comment, this
    # imitates what ZhejiangSpider does -- not verifiable from this file).
    test_item_with_fields = TEST_ITEM.copy()
    if parsed:
        test_item_with_fields["项目名称"] = parsed[0]
        test_item_with_fields["项目批准文号"] = parsed[1]

    logger.info("添加字段后的测试项:")
    logger.info(" 项目名称: %s", test_item_with_fields.get('项目名称', '无'))
    logger.info(" 项目批准文号: %s", test_item_with_fields.get('项目批准文号', '无'))

    # Feed the pipeline a one-element result list; upload is disabled.
    processed = pipeline.process_results(
        [test_item_with_fields],
        site="zhejiang",
        notice_type="招标文件公示",
        upload=False,
    )

    # 3. Report the outcome.
    if not processed:
        logger.error("处理失败")
        return

    record = processed[0]
    logger.info("\n处理结果:")
    # Fields to report; a missing key is logged with the placeholder text.
    report_fields = ("项目名称", "项目批准文号", "批准文号", "最高投标限价", "最高限价")
    for field in report_fields:
        logger.info(" %s: %s", field, record.get(field, '文档未提及'))


# Run the check only when this file is executed directly as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user