From baa94a13b7da6307b0659307d2895c1403a5903e Mon Sep 17 00:00:00 2001 From: ztb-system Date: Tue, 3 Mar 2026 18:07:01 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E7=88=AC=E8=99=AB=E5=B7=B2=E6=8F=90?= =?UTF-8?q?=E5=8F=96=E5=8F=91=E5=B8=83=E6=97=B6=E9=97=B4=E6=97=B6=E4=B8=8D?= =?UTF-8?q?=E5=86=8D=E8=A2=ABcontent=5Ffetcher=E8=A6=86=E7=9B=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- processors/pipeline.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/processors/pipeline.py b/processors/pipeline.py index fa7eec9..6a1892e 100644 --- a/processors/pipeline.py +++ b/processors/pipeline.py @@ -87,16 +87,16 @@ class ProcessingPipeline: extracted = self.deepseek.extract_fields( content, ai_fields, region_name) - # 4. 提取发布时间(从content中) - import re - publish_time_match = re.search(r'发布时间:\s*(.*?)\n', content) - if publish_time_match: - extracted_publish_time = publish_time_match.group(1).strip() - # 如果提取到了更详细的发布时间(包含时分秒),更新记录 - if extracted_publish_time: - record["发布时间"] = extracted_publish_time - record["项目发布时间"] = extracted_publish_time # 同时更新项目发布时间,确保一致性 - logger.info(f" ✓ 发布时间: {extracted_publish_time}") + # 4. 提取发布时间:爬虫已提取则优先使用,否则从content中提取 + if not record.get("发布时间"): + import re + publish_time_match = re.search(r'发布时间:\s*(.*?)\n', content) + if publish_time_match: + extracted_publish_time = publish_time_match.group(1).strip() + if extracted_publish_time: + record["发布时间"] = extracted_publish_time + record["项目发布时间"] = extracted_publish_time + logger.info(f" ✓ 发布时间: {record.get('发布时间', '未知')}") # 5. 合并结果(AI 优先,原有值保底) for field in ai_fields: