fix: 爬虫已提取发布时间时不再被content_fetcher覆盖
This commit is contained in:
@@ -87,16 +87,16 @@ class ProcessingPipeline:
|
||||
extracted = self.deepseek.extract_fields(
|
||||
content, ai_fields, region_name)
|
||||
|
||||
# 4. 提取发布时间(从content中)
|
||||
import re
|
||||
publish_time_match = re.search(r'发布时间:\s*(.*?)\n', content)
|
||||
if publish_time_match:
|
||||
extracted_publish_time = publish_time_match.group(1).strip()
|
||||
# 如果提取到了更详细的发布时间(包含时分秒),更新记录
|
||||
if extracted_publish_time:
|
||||
record["发布时间"] = extracted_publish_time
|
||||
record["项目发布时间"] = extracted_publish_time # 同时更新项目发布时间,确保一致性
|
||||
logger.info(f" ✓ 发布时间: {extracted_publish_time}")
|
||||
# 4. 提取发布时间:爬虫已提取则优先使用,否则从content中提取
|
||||
if not record.get("发布时间"):
|
||||
import re
|
||||
publish_time_match = re.search(r'发布时间:\s*(.*?)\n', content)
|
||||
if publish_time_match:
|
||||
extracted_publish_time = publish_time_match.group(1).strip()
|
||||
if extracted_publish_time:
|
||||
record["发布时间"] = extracted_publish_time
|
||||
record["项目发布时间"] = extracted_publish_time
|
||||
logger.info(f" ✓ 发布时间: {record.get('发布时间', '未知')}")
|
||||
|
||||
# 5. 合并结果(AI 优先,原有值保底)
|
||||
for field in ai_fields:
|
||||
|
||||
Reference in New Issue
Block a user