fix: 爬虫已提取发布时间时不再被content_fetcher覆盖
This commit is contained in:
@@ -87,16 +87,16 @@ class ProcessingPipeline:
|
|||||||
extracted = self.deepseek.extract_fields(
|
extracted = self.deepseek.extract_fields(
|
||||||
content, ai_fields, region_name)
|
content, ai_fields, region_name)
|
||||||
|
|
||||||
# 4. 提取发布时间(从content中)
|
# 4. 提取发布时间:爬虫已提取则优先使用,否则从content中提取
|
||||||
|
if not record.get("发布时间"):
|
||||||
import re
|
import re
|
||||||
publish_time_match = re.search(r'发布时间:\s*(.*?)\n', content)
|
publish_time_match = re.search(r'发布时间:\s*(.*?)\n', content)
|
||||||
if publish_time_match:
|
if publish_time_match:
|
||||||
extracted_publish_time = publish_time_match.group(1).strip()
|
extracted_publish_time = publish_time_match.group(1).strip()
|
||||||
# 如果提取到了更详细的发布时间(包含时分秒),更新记录
|
|
||||||
if extracted_publish_time:
|
if extracted_publish_time:
|
||||||
record["发布时间"] = extracted_publish_time
|
record["发布时间"] = extracted_publish_time
|
||||||
record["项目发布时间"] = extracted_publish_time # 同时更新项目发布时间,确保一致性
|
record["项目发布时间"] = extracted_publish_time
|
||||||
logger.info(f" ✓ 发布时间: {extracted_publish_time}")
|
logger.info(f" ✓ 发布时间: {record.get('发布时间', '未知')}")
|
||||||
|
|
||||||
# 5. 合并结果(AI 优先,原有值保底)
|
# 5. 合并结果(AI 优先,原有值保底)
|
||||||
for field in ai_fields:
|
for field in ai_fields:
|
||||||
|
|||||||
Reference in New Issue
Block a user