106 lines
3.1 KiB
Python
106 lines
3.1 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
||
|
|
"""
|
||
|
|
Manual test script for the publish-time extraction feature.
|
||
|
|
"""
|
||
|
|
import logging
|
||
|
|
import sys
|
||
|
|
import os
|
||
|
|
import csv
|
||
|
|
|
||
|
|
# Add this script's directory to the module search path
|
||
|
|
# Inserted at index 0 so this directory is searched before the other
# sys.path entries; presumably this is what lets the `processors` package
# import resolve when the script is launched from elsewhere -- TODO confirm.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||
|
|
|
||
|
|
# Import the content fetcher processor
|
||
|
|
from processors.content_fetcher import ContentFetcher
|
||
|
|
|
||
|
|
# Configure logging at import time: INFO threshold and a
# "timestamp - level - message" record layout.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by every function in this script.
logger = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
# Path to the most recent CSV file
|
||
|
|
# NOTE: this is a relative path, so it is resolved against the current
# working directory at run time, not against this script's location.
CSV_FILE = "data/浙江省公共资源交易中心_20260213_161312.csv"
|
||
|
|
|
||
|
|
def read_csv_data(file_path):
    """Read a CSV file and return its rows as a list of dicts.

    The first row is treated as the header (the ``csv.DictReader`` default),
    so every returned dict maps a column name to that row's cell text.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list with one dict per data row. The list is empty when the file
        is empty or contains only a header row.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # newline='' hands line-ending handling to the csv module, so line breaks
    # inside quoted fields are kept exactly as written (the csv docs require
    # this). 'utf-8-sig' still decodes plain UTF-8, but drops a leading BOM so
    # it does not end up glued to the first column name.
    with open(file_path, 'r', encoding='utf-8-sig', newline='') as f:
        return list(csv.DictReader(f))
|
||
|
|
|
||
|
|
# Label that precedes the publish time in the fetched content.
_PUBLISH_TIME_LABEL = "发布时间:"


def _extract_publish_time(content):
    """Return the text that follows the first publish-time label in ``content``.

    Whitespace after the label (line breaks included) is skipped, and the
    value then runs to the end of that line, or to the end of ``content``
    when the last line has no trailing newline.

    Args:
        content: Text to search.

    Returns:
        ``None`` when the label does not occur; otherwise the stripped value,
        which is an empty string when nothing follows the label.
    """
    _, label, tail = content.partition(_PUBLISH_TIME_LABEL)
    if not label:
        return None
    return tail.lstrip().partition("\n")[0].strip()


def test_publish_time_extraction():
    """Spot-check publish-time extraction against the dates stored in the CSV.

    Reads ``CSV_FILE``, takes its first three rows, fetches the content for
    each row's link through ``ContentFetcher.get_full_content`` and checks
    that the text after the "发布时间:" label contains the row's CSV publish
    date. Every outcome is reported through ``logger``; nothing is returned
    and no exception is raised for a failed check.

    Rows are skipped (with a warning) when they have no link or when no
    content could be fetched. A row whose CSV date is empty is reported as
    not comparable instead of being counted as a match.
    """
    logger.info("开始测试发布时间提取功能")

    # 1. Load the CSV rows.
    if not os.path.exists(CSV_FILE):
        logger.error(f"CSV文件不存在: {CSV_FILE}")
        return

    data = read_csv_data(CSV_FILE)
    logger.info(f"读取完成,共 {len(data)} 条数据")

    if not data:
        logger.error("无数据可测试")
        return

    # 2. Only the first three rows are checked.
    test_data = data[:3]
    logger.info(f"选择前 {len(test_data)} 条数据进行测试")

    # 3. Fetch each page and compare the extracted publish time.
    logger.info("\n开始测试发布时间提取:")
    fetcher = ContentFetcher()
    separator = '-' * 60

    for i, item in enumerate(test_data, 1):
        title = item.get("标题", "")
        url = item.get("链接", "")
        # DictReader stores None for cells missing from a short row; fall
        # back to "" so the substring test below cannot raise TypeError.
        csv_publish_date = (item.get("发布日期") or "").strip()

        logger.info(f"\n{separator}")
        logger.info(f"测试 {i}")
        logger.info(f"{separator}")
        logger.info(f"标题: {title}")
        logger.info(f"URL: {url}")
        logger.info(f"CSV发布日期: {csv_publish_date}")

        if not url:
            logger.warning("无链接,跳过")
            continue

        # Presumably returns the page text as a str, or a falsy value on
        # failure -- TODO confirm against ContentFetcher.
        content = fetcher.get_full_content(url)
        if not content:
            logger.warning("获取内容失败,跳过")
            continue

        publish_time = _extract_publish_time(content)
        if publish_time is None:
            logger.warning("✗ 未提取到发布时间")
            continue
        if not publish_time:
            # The label is present but no value follows it.
            logger.warning("✗ 发布时间格式不正确")
            continue

        logger.info(f"提取的发布时间: {publish_time}")

        # An empty string is a substring of every string, so an empty CSV
        # date must be handled separately or it would always "match".
        if not csv_publish_date:
            logger.warning("✗ CSV发布日期为空,无法比较")
        elif csv_publish_date in publish_time:
            logger.info("✓ 发布时间提取正确")
        else:
            logger.warning("✗ 发布时间与CSV日期不一致")
|
||
|
|
|
||
|
|
def main():
    """Script entry point: run the publish-time extraction check once."""
    test_publish_time_extraction()
|
||
|
|
|
||
|
|
# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|