Initial commit: 招标信息爬虫与分析系统
This commit is contained in:
105
test_publish_time_extraction.py
Normal file
105
test_publish_time_extraction.py
Normal file
@@ -0,0 +1,105 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
测试发布时间提取功能
|
||||
"""
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
import csv
|
||||
|
||||
# 添加当前目录到模块搜索路径
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
# 导入配置和处理器
|
||||
from processors.content_fetcher import ContentFetcher
|
||||
|
||||
# Configure root logging: INFO and above, one "<time> - <level> - <message>"
# line per record.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Path of the latest scraped CSV (per the original comment). It is a relative
# path, so it is resolved against the current working directory.
CSV_FILE = "data/浙江省公共资源交易中心_20260213_161312.csv"
|
||||
|
||||
def read_csv_data(file_path):
    """Read a CSV file and return its rows as a list of dicts.

    The first row is used as the header (the ``csv.DictReader`` default), so
    each returned dict maps a column name to that row's value.

    The file is opened with ``newline=''``, as the ``csv`` module requires, so
    line breaks embedded in quoted fields are preserved. It is decoded as
    ``utf-8-sig`` so that a leading byte-order mark is dropped instead of
    being glued to the first column name; files without a BOM decode exactly
    as plain UTF-8.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list with one dict per data row; empty if there are no data rows.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8-sig', newline='') as f:
        return list(csv.DictReader(f))
|
||||
|
||||
def test_publish_time_extraction():
    """Spot-check publish-time extraction against the scraped CSV.

    Reads ``CSV_FILE``, takes the first three rows, fetches each row's page
    text through ``ContentFetcher.get_full_content`` and looks for a
    "发布时间:" label in it. The value after the label is compared with the
    row's "发布日期" column and the outcome is logged.

    Nothing is returned and nothing is asserted; every result is reported
    through the module logger. Rows without a link, or whose content cannot
    be fetched, are skipped with a warning.
    """
    # Local import, done once per call rather than on every loop iteration.
    import re

    # ``\s*`` may span a line break, so a value on the line after the label is
    # still captured. The value ends at a newline OR at the end of the text,
    # so a timestamp on the very last line is not missed.
    publish_time_pattern = re.compile(r'发布时间:\s*(.*?)(?:\n|$)')

    logger.info("开始测试发布时间提取功能")

    # 1. Load the CSV rows.
    if not os.path.exists(CSV_FILE):
        logger.error(f"CSV文件不存在: {CSV_FILE}")
        return

    data = read_csv_data(CSV_FILE)
    logger.info(f"读取完成,共 {len(data)} 条数据")

    if not data:
        logger.error("无数据可测试")
        return

    # 2. Only the first three rows are checked.
    test_data = data[:3]
    logger.info(f"选择前 {len(test_data)} 条数据进行测试")

    # 3. Fetch each page and compare the extracted publish time.
    logger.info("\n开始测试发布时间提取:")
    fetcher = ContentFetcher()
    separator = '-' * 60

    for i, item in enumerate(test_data, 1):
        title = item.get("标题", "")
        url = item.get("链接", "")
        # csv.DictReader fills columns missing from a short row with None;
        # normalise to a stripped string so the comparison below cannot raise.
        csv_publish_date = (item.get("发布日期") or "").strip()

        logger.info(f"\n{separator}")
        logger.info(f"测试 {i}")
        logger.info(f"{separator}")
        logger.info(f"标题: {title}")
        logger.info(f"URL: {url}")
        logger.info(f"CSV发布日期: {csv_publish_date}")

        if not url:
            logger.warning("无链接,跳过")
            continue

        content = fetcher.get_full_content(url)
        if not content:
            logger.warning("获取内容失败,跳过")
            continue

        if "发布时间:" not in content:
            logger.warning("✗ 未提取到发布时间")
            continue

        match = publish_time_pattern.search(content)
        publish_time = match.group(1).strip() if match else ""
        if not publish_time:
            # The label is present but nothing usable follows it.
            logger.warning("✗ 发布时间格式不正确")
            continue

        logger.info(f"提取的发布时间: {publish_time}")

        if not csv_publish_date:
            # "" is a substring of every string, so the containment check
            # would report success without comparing anything.
            logger.warning("✗ CSV发布日期为空,无法比较")
        elif csv_publish_date in publish_time:
            logger.info("✓ 发布时间提取正确")
        else:
            logger.warning("✗ 发布时间与CSV日期不一致")
|
||||
|
||||
def main():
    """Script entry point: run the publish-time extraction check."""
    test_publish_time_extraction()


# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user