diff --git a/processors/pipeline.py b/processors/pipeline.py index fa7eec9..6a1892e 100644 --- a/processors/pipeline.py +++ b/processors/pipeline.py @@ -87,16 +87,16 @@ class ProcessingPipeline: extracted = self.deepseek.extract_fields( content, ai_fields, region_name) - # 4. 提取发布时间(从content中) - import re - publish_time_match = re.search(r'发布时间:\s*(.*?)\n', content) - if publish_time_match: - extracted_publish_time = publish_time_match.group(1).strip() - # 如果提取到了更详细的发布时间(包含时分秒),更新记录 - if extracted_publish_time: - record["发布时间"] = extracted_publish_time - record["项目发布时间"] = extracted_publish_time # 同时更新项目发布时间,确保一致性 - logger.info(f" ✓ 发布时间: {extracted_publish_time}") + # 4. 提取发布时间:爬虫已提取则优先使用,否则从content中提取 + if not record.get("发布时间"): + import re + publish_time_match = re.search(r'发布时间:\s*(.*?)\n', content) + if publish_time_match: + extracted_publish_time = publish_time_match.group(1).strip() + if extracted_publish_time: + record["发布时间"] = extracted_publish_time + record["项目发布时间"] = extracted_publish_time + logger.info(f" ✓ 发布时间: {record.get('发布时间', '未知')}") # 5. 合并结果(AI 优先,原有值保底) for field in ai_fields: