# -*- coding: utf-8 -*-
"""
测试修复后的项目名称和批准文号提取逻辑
"""
|
|||
|
|
import logging
|
|||
|
|
import sys
|
|||
|
|
import os
|
|||
|
|
|
|||
|
|
# Put this file's own directory at the front of the module search path.
_HERE = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, _HERE)

# Logging setup: INFO level, "<timestamp> - <level> - <message>" lines.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
def test_project_name_extraction_fix():
    """Exercise the project-name / approval-number extraction on sample titles.

    Each sample title is wrapped in a minimal API-style record and passed to
    ``ZhejiangSpider._parse_record``. The ``项目名称`` (project name) and
    ``项目批准文号`` (approval number) fields of the parsed result are logged,
    followed by one verdict per title:

    * error   -- the approval number is still embedded in the project name;
    * warning -- no approval number was extracted, so nothing can be verified;
    * info    -- an approval number was extracted and is absent from the name.

    Results are reported through the module logger only; the function makes
    no assertions and returns ``None``.
    """
    logger.info("开始测试修复后的项目名称提取逻辑")

    # Sample titles; every one ends with a bracketed code such as [A33...].
    test_titles = [
        "湖堤生态修复工程[A3306010720060234001001]",
        "[招标文件](测-试)临海市房建施工0212-2招标文件公示[A3300000090000695005001]",
        "[招标文件]通途路(大闸路-湖西路)拓宽改造工程(监理)项目招标文件预公示[A3302010220026373001001]",
        "[招标文件]集成电路链主企业配套产业园(南片)B、H、F~G地块及配套项目-B地块建设工程(01地块)施工招标文件公示[A3306021280001738001001]",
        "[招标文件]临海市副中心城市片区基础设施更新改造工程—沿河路、前王路及镇政府停车场改造提升招标文件公示[A3300000090000689001001]",
        "[招标文件]宁波市海曙绿道提升工程(施工)招标文件预公示[A3302030230026386001001]",
        "[招标文件]嘉科微二号园一号楼改造提升工程设计采购施工总承包(EPC)招标文件公示[A3304010550007317001001]",
    ]

    # Import the spider that owns the parsing function (kept function-local,
    # as in the original, so it runs after the module-level sys.path setup).
    from spiders.zhejiang import ZhejiangSpider

    for title in test_titles:
        logger.info("\n测试标题: %s", title)

        # Simulate the parsing step. _parse_record takes (record, source);
        # record should be the raw record returned by the API and must
        # contain a "title" field.
        # NOTE(review): the "infod" key may be a typo for "infoid" -- confirm
        # against the API schema before changing it.
        api_record = {
            "title": title,
            "linkurl": "",
            "webdate": "2026-02-13",
            "infod": "",
            "categoryname": "",
        }

        # Called on the class itself, without an instance.
        # NOTE(review): presumably a staticmethod -- confirm in spiders.zhejiang.
        parsed_item = ZhejiangSpider._parse_record(api_record, "测试")

        logger.info(" 提取结果:")
        logger.info(" 项目名称: %s", parsed_item.get('项目名称', '未提取'))
        logger.info(" 项目批准文号: %s", parsed_item.get('项目批准文号', '未提取'))

        # Verify that the approval number was removed from the project name.
        project_name = parsed_item.get('项目名称', '')
        approval_number = parsed_item.get('项目批准文号', '')
        if not approval_number:
            # Without an extracted approval number the removal cannot be
            # checked; reporting success here would hide a failed extraction.
            logger.warning(" ⚠️ 警告: 未提取到项目批准文号,无法验证是否已从项目名称中删除")
        elif approval_number in project_name:
            logger.error(" ❌ 错误: 批准文号 '%s' 仍在项目名称中", approval_number)
        else:
            logger.info(" ✅ 正确: 批准文号已从项目名称中删除")
|
|||
|
|
|
|||
|
|
def main():
    """Script entry point: run the extraction check once."""
    test_project_name_extraction_fix()


# Run the check only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()
|