Files
yiliao/backend/deepseek_analyzer.py

284 lines
9.6 KiB
Python
Raw Normal View History

"""
DeepSeek medical data analyzer.
Uses the DeepSeek API to analyze OCR-extracted medical data and to fill in missing reference ranges and units.
"""
import json
from typing import Dict, List, Optional

import requests
class DeepSeekAnalyzer:
    """Enrich OCR-extracted lab items through the DeepSeek chat-completions API.

    Items are sent in batches; the model is asked to fill in the ``reference``
    and ``unit`` fields and to set the ``point`` flag for out-of-range results.
    Whenever a batch cannot be enriched, the original items of that batch are
    returned unchanged.
    """

    # Flags the model is asked to write into ``point`` for abnormal results.
    # NOTE(review): the prompts previously asked for "" in the high and low
    # cases as well, which made them indistinguishable from "normal". The
    # arrows are the assumed intended markers - confirm against whatever
    # consumes the ``point`` field downstream.
    HIGH_MARK = "↑"
    LOW_MARK = "↓"

    def __init__(self, api_key: str):
        """Store the API key and prepare the endpoint URL and request headers."""
        self.api_key = api_key
        self.api_url = "https://api.deepseek.com/v1/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }

    def analyze_medical_data(self, items: List[Dict]) -> List[Dict]:
        """Return ``items`` with reference range, unit and flag filled in.

        Args:
            items: Test items extracted by OCR.

        Returns:
            One entry per input item, in input order. A batch whose API call
            failed, or whose reply is not a list of dicts of the same length
            as the batch, contributes its original items instead.
        """
        batch_size = 20
        all_results: List[Dict] = []
        starts = range(0, len(items), batch_size)
        for batch_no, start in enumerate(starts, start=1):
            batch = items[start:start + batch_size]
            print(f" 处理第 {batch_no} 批 ({len(batch)} 项)...")
            result = self._analyze_batch(batch)
            if self._is_usable_result(result, len(batch)):
                all_results.extend(result)
                continue
            if result:
                # A reply arrived but cannot be merged safely: extending with
                # a dict would add its keys, and a shorter list would
                # silently drop test items.
                print(" ⚠ 返回结果格式或数量不符,保留原始数据")
            all_results.extend(batch)
        return all_results

    @staticmethod
    def _is_usable_result(result, expected_len: int) -> bool:
        """Tell whether ``result`` is a list of ``expected_len`` dicts."""
        return (
            isinstance(result, list)
            and len(result) == expected_len
            and all(isinstance(entry, dict) for entry in result)
        )

    def _analyze_batch(self, items: List[Dict]) -> Optional[List[Dict]]:
        """Send one batch to the API and return the decoded JSON reply.

        Returns:
            The parsed reply, or ``None`` on a non-200 status, a timeout, a
            JSON decoding failure or any other error (each is printed).
        """
        payload = {
            "model": "deepseek-chat",
            "messages": [
                {"role": "system", "content": self._build_system_prompt()},
                {"role": "user", "content": self._build_prompt(items)},
            ],
            "temperature": 0.1,
            "max_tokens": 4000,
        }
        try:
            response = requests.post(
                self.api_url,
                headers=self.headers,
                json=payload,
                timeout=60,
            )
            if response.status_code != 200:
                print(f" ⚠ API错误: {response.status_code} - {response.text[:100]}")
                return None
            content = response.json()['choices'][0]['message']['content']
            return self._parse_content(content)
        except json.JSONDecodeError as e:
            print(f" ⚠ JSON解析错误: {e}")
            return None
        except requests.exceptions.Timeout:
            print(" ⚠ API请求超时")
            return None
        except Exception as e:
            # Boundary handler: a failed batch must not abort the whole run.
            print(f" ⚠ 请求错误: {e}")
            return None

    @staticmethod
    def _parse_content(content: str):
        """Decode the model reply, unwrapping a markdown code fence if present."""
        if '```json' in content:
            content = content.split('```json')[1].split('```')[0]
        elif '```' in content:
            content = content.split('```')[1].split('```')[0]
        return json.loads(content.strip())

    def _build_system_prompt(self) -> str:
        """Return the system message describing the task and the ``point`` flags."""
        return f"""你是一个专业的医学检验数据分析专家。你的任务是:
1. 分析医疗检测项目数据
2. 为缺失参考范围(reference)的项目补充标准参考范围
3. 为缺失单位(unit)的项目补充正确单位
4. 判断结果是否在正常范围内
- 如果结果在正常范围内point字段设为空字符串""
- 如果结果高于正常范围point字段设为"{self.HIGH_MARK}"
- 如果结果低于正常范围point字段设为"{self.LOW_MARK}"
- 如果是定性结果如Negative/Positive且结果正常point为空异常则标注
请严格按照JSON格式返回不要添加任何额外说明"""

    def _build_prompt(self, items: List[Dict]) -> str:
        """Build the user message for one batch.

        Only the six fields the model needs are forwarded; a field missing
        from an item is sent as an empty string.
        """
        fields = ("abb", "project", "result", "point", "unit", "reference")
        simplified = [
            {name: item.get(name, "") for name in fields}
            for item in items
        ]
        prompt = f"""请分析以下医疗检测数据,补充缺失的参考范围和单位,并判断结果是否正常:
{json.dumps(simplified, ensure_ascii=False, indent=2)}
要求
1. 为每个项目补充完整的reference参考范围和unit单位
2. 根据result判断是否在正常范围内设置point字段正常为空偏高为"{self.HIGH_MARK}"偏低为"{self.LOW_MARK}"
3. 定性结果如NegativePositive正常时point为空异常时根据具体情况标注
4. 保持原有的abbprojectresult字段不变
请直接返回JSON数组格式不要添加任何说明文字"""
        return prompt
def test_deepseek():
    """Smoke-test the analyzer against the live DeepSeek API and print the result.

    The key is read from the ``DEEPSEEK_API_KEY`` environment variable (the
    same variable ``process_ocr_data_main`` uses). When it is unset or empty
    the placeholder below is used, which has to be replaced by a real key.
    """
    import os

    api_key = os.environ.get("DEEPSEEK_API_KEY") or "YOUR_DEEPSEEK_API_KEY"
    analyzer = DeepSeekAnalyzer(api_key)
    # Sample items with unit/reference left empty for the model to fill in.
    test_items = [
        {"abb": "WBC", "project": "White Blood Cell", "result": "5.95", "point": "", "unit": "", "reference": ""},
        {"abb": "PRO", "project": "Protein", "result": "Negative", "point": "", "unit": "", "reference": ""},
        {"abb": "GLU", "project": "Glucose", "result": "6.5", "point": "", "unit": "", "reference": ""},
    ]
    result = analyzer.analyze_medical_data(test_items)
    print(json.dumps(result, ensure_ascii=False, indent=2))
def call_deepseek(prompt: str, api_key: str) -> str:
    """Send ``prompt`` as a single user message to DeepSeek and return the reply text.

    Args:
        prompt: Text of the user message.
        api_key: Key placed in the ``Authorization: Bearer`` header.

    Returns:
        The ``content`` of the first choice in the JSON response.

    Raises:
        requests.exceptions.HTTPError: Raised by ``raise_for_status`` when the
            server answers with an error status.
    """
    endpoint = "https://api.deepseek.com/v1/chat/completions"
    payload = {
        "model": "deepseek-chat",
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.1,
        "max_tokens": 8000,
    }
    reply = requests.post(
        endpoint,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        json=payload,
        timeout=120,
    )
    reply.raise_for_status()
    first_choice = reply.json()["choices"][0]
    return first_choice["message"]["content"]
def process_with_deepseek(ocr_data: list, template_abbs: list, api_key: str) -> dict:
    """Ask DeepSeek to clean the OCR items and map them onto the template ABBs.

    Args:
        ocr_data: Raw OCR items; embedded in the prompt as JSON.
        template_abbs: ABB identifiers of the template; embedded in the prompt
            using their Python string representation.
        api_key: Key forwarded to ``call_deepseek``.

    Returns:
        The JSON value decoded from the model reply. The reply is parsed
        directly first; if that fails, the contents of the first markdown
        code fence are parsed instead.

    Raises:
        ValueError: The reply is neither valid JSON nor contains a code fence.
            (``json.JSONDecodeError``, itself a ``ValueError``, is raised when
            a fence is present but its contents are not valid JSON.)
    """
    import re

    prompt = f"""你是医疗数据处理专家。请处理以下OCR提取的医疗检测数据并匹配到模板中的ABB。
## OCR提取的原始数据
```json
{json.dumps(ocr_data, ensure_ascii=False, indent=2)}
```
## 模板中需要填充的ABB列表
{template_abbs}
## 任务要求:
1. 清理OCR数据中的错误和噪音
2. 将每个有效数据项匹配到正确的模板ABB
3. 正确分离result结果unit单位reference参考范围
4. 对于尿检项目如PROGLUKETNIT等结果通常是Negative/Positive这是正确的定性结果
5. 过滤掉明显错误的数据如result为".""0"空值等
6. 如果同一个ABB有多条数据选择最合理的一条
## 输出格式:
请返回JSON格式结构如下
```json
{{
"ABB1": {{"result": "数值或定性结果", "unit": "单位", "reference": "参考范围", "point": "提示"}},
"ABB2": {{"result": "...", "unit": "...", "reference": "...", "point": ""}},
...
}}
```
只返回JSON不要其他说明文字确保JSON格式正确可解析
"""
    print("正在调用DeepSeek处理数据...")
    result = call_deepseek(prompt, api_key)
    try:
        # Happy path: the reply is bare JSON.
        return json.loads(result)
    except json.JSONDecodeError:
        # Only a decoding failure triggers the fallback; a bare ``except``
        # here would also swallow KeyboardInterrupt and SystemExit.
        pass
    json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', result)
    if json_match:
        return json.loads(json_match.group(1))
    raise ValueError(f"无法解析DeepSeek返回的JSON: {result[:500]}")
def process_ocr_data_main():
    """Run the OCR -> DeepSeek -> JSON file pipeline (formerly ``main`` of deepseek_process.py).

    Reads ``extracted_medical_data.json`` next to this module, sends its items
    together with the template ABB list to ``process_with_deepseek`` and
    writes the answer to ``deepseek_processed_data.json`` in the same
    directory.

    Returns:
        The processed data, or ``None`` when no API key was supplied or the
        OCR data file does not exist.
    """
    from pathlib import Path
    import os

    # Environment variable first; prompt interactively only when it is empty.
    api_key = os.environ.get("DEEPSEEK_API_KEY", "") or input("请输入DeepSeek API Key: ").strip()
    if not api_key:
        print("❌ API Key不能为空")
        return

    base_dir = Path(__file__).parent
    ocr_file = base_dir / "extracted_medical_data.json"
    if not ocr_file.exists():
        print("❌ 未找到OCR数据文件")
        return
    with ocr_file.open('r', encoding='utf-8') as src:
        loaded = json.load(src)
    # The file holds either a bare list or a dict wrapping it under 'items'.
    if isinstance(loaded, dict):
        ocr_items = loaded.get('items', loaded)
    else:
        ocr_items = loaded
    print(f"加载 {len(ocr_items)} 条OCR数据")

    from config import load_abb_config
    template_abbs = load_abb_config().get('abb_list', [])
    print(f"模板中有 {len(template_abbs)} 个ABB")

    processed_data = process_with_deepseek(ocr_items, template_abbs, api_key)

    output_file = base_dir / "deepseek_processed_data.json"
    with output_file.open('w', encoding='utf-8') as dst:
        json.dump(processed_data, dst, ensure_ascii=False, indent=2)
    print(f"✅ DeepSeek处理完成{len(processed_data)} 个有效项")
    print(f"✅ 已保存到: {output_file}")
    return processed_data
if __name__ == "__main__":
    import sys

    # ``--process`` as first argument runs the OCR pipeline; anything else
    # (including no argument) runs the API smoke test. The slice is simply
    # empty when no argument was given, so no length check is needed.
    if sys.argv[1:2] == ['--process']:
        process_ocr_data_main()
    else:
        test_deepseek()