Files
yiliao/backend/deepseek_analyzer.py

284 lines
9.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
DeepSeek医疗数据分析器
使用DeepSeek API分析OCR提取的医疗数据,补充缺失的参考范围和单位
"""
import json
from typing import Dict, List, Optional

import requests
class DeepSeekAnalyzer:
    """Complete OCR-extracted lab items through the DeepSeek chat API.

    Items are sent to the chat-completions endpoint in batches. The model is
    asked to fill in the ``unit``, ``reference`` and ``point`` fields. Any
    batch whose request fails, or whose reply is not a usable list of items,
    is passed through unchanged.
    """

    # Number of items sent to the API per request.
    _BATCH_SIZE = 20

    # System message describing the task to the model.
    # NOTE(review): as written, this text tells the model to set ``point`` to
    # "" for high, low and normal results alike, so the three cases cannot be
    # told apart. The intended high/low markers may have been lost from the
    # literal; confirm against the consumers of ``point`` and restore them.
    _SYSTEM_PROMPT = """你是一个专业的医学检验数据分析专家。你的任务是:
1. 分析医疗检测项目数据
2. 为缺失参考范围(reference)的项目补充标准参考范围
3. 为缺失单位(unit)的项目补充正确单位
4. 判断结果是否在正常范围内:
- 如果结果在正常范围内point字段设为空字符串""
- 如果结果高于正常范围point字段设为""
- 如果结果低于正常范围point字段设为""
- 如果是定性结果如Negative/Positive且结果正常point为空异常则标注
请严格按照JSON格式返回不要添加任何额外说明。"""

    def __init__(self, api_key: str):
        """Store the API key and prepare the endpoint URL and request headers.

        Args:
            api_key: DeepSeek API key, sent as a Bearer token.
        """
        self.api_key = api_key
        self.api_url = "https://api.deepseek.com/v1/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    def analyze_medical_data(self, items: List[Dict]) -> List[Dict]:
        """Fill in missing reference ranges, units and flags for lab items.

        Args:
            items: Lab test items extracted by OCR.

        Returns:
            One entry per input item, in input order. Batches that could not
            be analyzed keep their original items.
        """
        all_results = []
        for start in range(0, len(items), self._BATCH_SIZE):
            batch = items[start:start + self._BATCH_SIZE]
            print(f" 处理第 {start//self._BATCH_SIZE + 1} 批 ({len(batch)} 项)...")
            result = self._analyze_batch(batch)
            # A usable result always has exactly len(batch) >= 1 entries, so
            # None is the only failure signal; keep the original data then.
            all_results.extend(batch if result is None else result)
        return all_results

    def _analyze_batch(self, items: List[Dict]) -> Optional[List[Dict]]:
        """Send one batch to the API and return the completed items.

        Args:
            items: The batch of lab items to analyze.

        Returns:
            The validated list of items from the model, or ``None`` when the
            request fails, times out, returns a non-200 status, or the reply
            is not a JSON array of ``len(items)`` objects.
        """
        prompt = self._build_prompt(items)
        payload = {
            "model": "deepseek-chat",
            "messages": [
                {"role": "system", "content": self._SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            "temperature": 0.1,
            "max_tokens": 4000
        }
        try:
            response = requests.post(
                self.api_url,
                headers=self.headers,
                json=payload,
                timeout=60
            )
            if response.status_code != 200:
                print(f" ⚠ API错误: {response.status_code} - {response.text[:100]}")
                return None
            content = response.json()['choices'][0]['message']['content']
            return self._parse_items(content, len(items))
        except json.JSONDecodeError as e:
            print(f" ⚠ JSON解析错误: {e}")
            return None
        except requests.exceptions.Timeout:
            print(" ⚠ API请求超时")
            return None
        except Exception as e:
            # Best-effort boundary: any other failure (network error,
            # unexpected response layout) falls back to the original data.
            print(f" ⚠ 请求错误: {e}")
            return None

    @staticmethod
    def _parse_items(content: str, expected_count: int) -> Optional[List[Dict]]:
        """Parse the model reply into a validated list of item dicts.

        Args:
            content: Raw reply text, optionally wrapped in a markdown fence.
            expected_count: Number of items that were sent in the batch.

        Returns:
            The parsed list, or ``None`` if the JSON is not an array of
            objects or its length differs from ``expected_count``.

        Raises:
            json.JSONDecodeError: If the (unfenced) text is not valid JSON.
        """
        text = content
        # Strip a markdown code fence if the model added one.
        if '```json' in text:
            text = text.split('```json', 1)[1].split('```', 1)[0]
        elif '```' in text:
            text = text.split('```', 2)[1]
        parsed = json.loads(text.strip())

        # extend() on a dict would add its keys as strings, so insist on a
        # list of objects before handing the data back.
        if not isinstance(parsed, list) or not all(isinstance(entry, dict) for entry in parsed):
            print(" ⚠ 返回数据格式错误: 期望检测项对象的JSON数组")
            return None
        # A different item count means items were dropped or invented;
        # keeping the original batch avoids silently losing lab results.
        if len(parsed) != expected_count:
            print(f" ⚠ 返回项数不匹配: 期望 {expected_count} 项, 实际 {len(parsed)} 项")
            return None
        return parsed

    def _build_prompt(self, items: List[Dict]) -> str:
        """Build the user prompt for one batch.

        Only the six fields the model needs are sent; missing fields default
        to an empty string.

        Args:
            items: The batch of lab items.

        Returns:
            The prompt text with the simplified items embedded as JSON.
        """
        fields = ("abb", "project", "result", "point", "unit", "reference")
        simplified = [{name: item.get(name, "") for name in fields} for item in items]
        prompt = f"""请分析以下医疗检测数据,补充缺失的参考范围和单位,并判断结果是否正常:
{json.dumps(simplified, ensure_ascii=False, indent=2)}
要求:
1. 为每个项目补充完整的reference参考范围和unit单位
2. 根据result判断是否在正常范围内设置point字段正常为空偏高为"",偏低为""
3. 定性结果如Negative、Positive正常时point为空异常时根据具体情况标注
4. 保持原有的abb、project、result字段不变
请直接返回JSON数组格式不要添加任何说明文字"""
        return prompt
def test_deepseek():
    """Run a manual smoke test of DeepSeekAnalyzer and print the result.

    Three sample lab items with empty ``point``/``unit``/``reference`` fields
    are sent through ``analyze_medical_data`` and the outcome is printed as
    JSON. The API key below is a placeholder and must be replaced with a
    real key.
    """
    def blank_item(abb, project, result):
        # Sample entry whose fields to be completed are all left empty.
        return {"abb": abb, "project": project, "result": result,
                "point": "", "unit": "", "reference": ""}

    sample_rows = (
        ("WBC", "White Blood Cell", "5.95"),
        ("PRO", "Protein", "Negative"),
        ("GLU", "Glucose", "6.5"),
    )
    sample_items = [blank_item(*row) for row in sample_rows]

    analyzer = DeepSeekAnalyzer("YOUR_DEEPSEEK_API_KEY")
    analyzed = analyzer.analyze_medical_data(sample_items)
    print(json.dumps(analyzed, ensure_ascii=False, indent=2))
def call_deepseek(prompt: str, api_key: str) -> str:
    """Send a single user prompt to the DeepSeek chat API and return the reply.

    Args:
        prompt: Text sent as the only (user) message.
        api_key: DeepSeek API key, sent as a Bearer token.

    Returns:
        The ``content`` of the first choice in the response.

    Raises:
        requests.HTTPError: Via ``raise_for_status()`` on an error status.
    """
    payload = {
        "model": "deepseek-chat",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 8000,
    }
    reply = requests.post(
        "https://api.deepseek.com/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        json=payload,
        timeout=120,
    )
    reply.raise_for_status()
    first_choice = reply.json()["choices"][0]
    return first_choice["message"]["content"]
def _parse_deepseek_json(reply: str) -> dict:
    """Extract the JSON object from a DeepSeek reply.

    The reply is parsed directly first; if that fails, the first markdown
    code fence (with or without a ``json`` tag) is tried.

    Raises:
        ValueError: If no valid JSON is found or the JSON is not an object.
    """
    import re

    try:
        parsed = json.loads(reply)
    except json.JSONDecodeError:
        # Only a decode failure triggers the fallback; anything else
        # (including KeyboardInterrupt) must propagate.
        fenced = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', reply)
        if fenced is None:
            raise ValueError(f"无法解析DeepSeek返回的JSON: {reply[:500]}") from None
        try:
            parsed = json.loads(fenced.group(1))
        except json.JSONDecodeError as err:
            raise ValueError(f"无法解析DeepSeek返回的JSON: {reply[:500]}") from err

    # The prompt requests an object keyed by ABB; anything else breaks the
    # declared dict contract.
    if not isinstance(parsed, dict):
        raise ValueError(f"DeepSeek返回的JSON不是对象: {reply[:500]}")
    return parsed


def process_with_deepseek(ocr_data: list, template_abbs: list, api_key: str) -> dict:
    """Ask DeepSeek to clean OCR lab data and map it onto the template ABBs.

    Args:
        ocr_data: Raw OCR items, embedded in the prompt as JSON.
        template_abbs: ABB identifiers to fill, embedded in the prompt via
            their Python string representation.
        api_key: DeepSeek API key passed to ``call_deepseek``.

    Returns:
        The parsed JSON object from the model reply; the prompt asks for a
        mapping of ABB to ``result``/``unit``/``reference``/``point``.

    Raises:
        ValueError: If the reply contains no parseable JSON object.
    """
    prompt = f"""你是医疗数据处理专家。请处理以下OCR提取的医疗检测数据并匹配到模板中的ABB。
## OCR提取的原始数据
```json
{json.dumps(ocr_data, ensure_ascii=False, indent=2)}
```
## 模板中需要填充的ABB列表
{template_abbs}
## 任务要求:
1. 清理OCR数据中的错误和噪音
2. 将每个有效数据项匹配到正确的模板ABB
3. 正确分离result结果、unit单位、reference参考范围
4. 对于尿检项目如PRO、GLU、KET、NIT等结果通常是Negative/Positive这是正确的定性结果
5. 过滤掉明显错误的数据如result为".""0"、空值等)
6. 如果同一个ABB有多条数据选择最合理的一条
## 输出格式:
请返回JSON格式结构如下
```json
{{
"ABB1": {{"result": "数值或定性结果", "unit": "单位", "reference": "参考范围", "point": "提示"}},
"ABB2": {{"result": "...", "unit": "...", "reference": "...", "point": ""}},
...
}}
```
只返回JSON不要其他说明文字。确保JSON格式正确可解析。
"""
    print("正在调用DeepSeek处理数据...")
    result = call_deepseek(prompt, api_key)
    return _parse_deepseek_json(result)
def process_ocr_data_main():
    """Process the extracted OCR data with DeepSeek and save the result.

    This was originally the ``main`` function of ``deepseek_process.py``.
    The API key comes from the ``DEEPSEEK_API_KEY`` environment variable, or
    from an interactive prompt if that is empty. OCR items are read from
    ``extracted_medical_data.json`` next to this file and the processed data
    is written to ``deepseek_processed_data.json`` in the same directory.

    Returns:
        The processed data, or ``None`` when the API key is empty or the OCR
        data file is missing.
    """
    import os
    from pathlib import Path

    here = Path(__file__).parent

    # Prefer the environment variable; prompt only when it is unset or empty.
    api_key = os.environ.get("DEEPSEEK_API_KEY", "") or input("请输入DeepSeek API Key: ").strip()
    if not api_key:
        print("❌ API Key不能为空")
        return

    # Load the OCR data.
    ocr_file = here / "extracted_medical_data.json"
    if not ocr_file.exists():
        print("❌ 未找到OCR数据文件")
        return
    with ocr_file.open('r', encoding='utf-8') as src:
        loaded = json.load(src)
    # The file may be a bare list or an object wrapping the list under 'items'.
    if isinstance(loaded, dict):
        ocr_items = loaded.get('items', loaded)
    else:
        ocr_items = loaded
    print(f"加载 {len(ocr_items)} 条OCR数据")

    # Load the template ABB configuration.
    from config import load_abb_config
    abb_config = load_abb_config()
    template_abbs = abb_config.get('abb_list', [])
    print(f"模板中有 {len(template_abbs)} 个ABB")

    # Run the DeepSeek processing step.
    processed = process_with_deepseek(ocr_items, template_abbs, api_key)

    # Save the processed data.
    output_file = here / "deepseek_processed_data.json"
    with output_file.open('w', encoding='utf-8') as dst:
        json.dump(processed, dst, ensure_ascii=False, indent=2)
    print(f"✅ DeepSeek处理完成{len(processed)} 个有效项")
    print(f"✅ 已保存到: {output_file}")
    return processed
if __name__ == "__main__":
    import sys

    # `--process` as the first argument runs the OCR pipeline; anything else
    # (or no argument) runs the API smoke test.
    cli_args = sys.argv[1:]
    if cli_args and cli_args[0] == '--process':
        process_ocr_data_main()
    else:
        test_deepseek()