101 lines
3.4 KiB
Python
101 lines
3.4 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
测试单条数据处理
|
|||
|
|
"""
|
|||
|
|
import logging
|
|||
|
|
import sys
|
|||
|
|
import os
|
|||
|
|
|
|||
|
|
# 添加当前目录到模块搜索路径
|
|||
|
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|||
|
|
|
|||
|
|
# 导入配置和处理器
|
|||
|
|
from config import REGION_CONFIGS, PROCESSING_CONFIG
|
|||
|
|
from spiders.zhejiang import ZhejiangSpider
|
|||
|
|
from processors.pipeline import ProcessingPipeline
|
|||
|
|
|
|||
|
|
# Logging setup: INFO level, each record rendered as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger used by everything in this script.
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
# Detail-page URL of the notice used as the test fixture.
TEST_URL = "https://ggzy.zj.gov.cn/jyxxgk/002001/002001011/20260212/9a7966d8-80f4-475b-897e-f7631bc64d0c.html"

# Hand-written stand-in for one crawler result; "链接" points at TEST_URL.
TEST_ITEM = {
    "标题": "[招标文件](测-试)临海市房建施工0212-2招标文件公示[A3300000090000695005001]",
    "发布日期": "2026-02-12",
    "地区": "临海市",
    "公告类型": "招标文件公示",
    "链接": TEST_URL,
    "来源": "浙江省公共资源交易中心",
}
|
|||
|
|
|
|||
|
|
def _parse_title(title):
    """Split a bracketed notice title into project name and approval number.

    The title must start its match with ``[招标文件]`` or ``[招标公告]`` and
    end with a bracketed code made of uppercase ASCII letters and digits;
    whatever lies between the two is taken as the project name.

    Args:
        title: Raw notice title string.

    Returns:
        A ``(project_name, project_approval)`` tuple of stripped strings, or
        ``None`` when the title does not match the expected layout.
    """
    # Imported locally: the module's top-level imports do not include ``re``.
    import re

    title_pattern = r"\[(?:招标文件|招标公告)\]\s*(.*?)\s*\[([A-Z0-9]+)\]\s*$"
    match = re.search(title_pattern, title)
    if match is None:
        return None
    return match.group(1).strip(), match.group(2).strip()


def main():
    """Run a single-item check of title parsing and the processing pipeline.

    Step 1 parses the title of ``TEST_ITEM`` and logs the extracted project
    name and approval number, or a warning when the title does not match.
    Step 2 copies ``TEST_ITEM``, adds the parsed fields when available, passes
    the copy to ``ProcessingPipeline.process_results`` and logs selected
    fields of the first returned record, or an error when nothing is returned.
    """
    logger.info("开始单条数据测试")

    # 1. Title parsing. The result is reused in step 2 because the copied
    #    item carries the same title.
    parsed = _parse_title(TEST_ITEM["标题"])
    if parsed is not None:
        project_name, project_approval = parsed
        logger.info("标题解析结果:")
        logger.info(" 项目名称: %s", project_name)
        logger.info(" 项目批准文号: %s", project_approval)
    else:
        logger.warning("标题解析失败")

    # 2. Processing pipeline.
    logger.info("\n测试处理管道:")
    pipeline = ProcessingPipeline()

    # Mimic the spider's output by attaching the parsed fields to a copy, so
    # the module-level fixture is never mutated.
    test_item_with_fields = TEST_ITEM.copy()
    if parsed is not None:
        test_item_with_fields["项目名称"] = parsed[0]
        test_item_with_fields["项目批准文号"] = parsed[1]

    logger.info("添加字段后的测试项:")
    logger.info(" 项目名称: %s", test_item_with_fields.get("项目名称", "无"))
    logger.info(" 项目批准文号: %s", test_item_with_fields.get("项目批准文号", "无"))

    # The pipeline takes a list of results, so wrap the single item.
    results = [test_item_with_fields]

    # NOTE(review): upload=False presumably skips any upload step — confirm
    # against ProcessingPipeline.process_results.
    processed = pipeline.process_results(
        results,
        site="zhejiang",
        notice_type="招标文件公示",
        upload=False,
    )

    if not processed:
        logger.error("处理失败")
        return

    # Report the fields of interest from the first processed record.
    record = processed[0]
    logger.info("\n处理结果:")
    for field in ("项目名称", "项目批准文号", "批准文号", "最高投标限价", "最高限价"):
        logger.info(" %s: %s", field, record.get(field, "文档未提及"))
|
|||
|
|
|
|||
|
|
# Script entry point: run the single-item check only on direct execution,
# not when this module is imported.
if __name__ == "__main__":
    main()
|