fix: 爬虫已提取发布时间时,不再被 content_fetcher 覆盖

This commit is contained in:
ztb-system
2026-03-03 18:07:01 +08:00
parent 233d41a833
commit baa94a13b7

View File

@@ -87,16 +87,16 @@ class ProcessingPipeline:
extracted = self.deepseek.extract_fields(
content, ai_fields, region_name)
# 4. 提取发布时间从content中
import re
publish_time_match = re.search(r'发布时间:\s*(.*?)\n', content)
if publish_time_match:
extracted_publish_time = publish_time_match.group(1).strip()
# 如果提取到了更详细的发布时间(包含时分秒),更新记录
if extracted_publish_time:
record["发布时间"] = extracted_publish_time
record["项目发布时间"] = extracted_publish_time # 同时更新项目发布时间,确保一致性
logger.info(f" ✓ 发布时间: {extracted_publish_time}")
# 4. 提取发布时间:爬虫已提取则优先使用,否则从content中提取
if not record.get("发布时间"):
import re
publish_time_match = re.search(r'发布时间:\s*(.*?)\n', content)
if publish_time_match:
extracted_publish_time = publish_time_match.group(1).strip()
if extracted_publish_time:
record["发布时间"] = extracted_publish_time
record["项目发布时间"] = extracted_publish_time
logger.info(f" ✓ 发布时间: {record.get('发布时间', '未知')}")
# 5. 合并结果AI 优先,原有值保底)
for field in ai_fields: