284 lines
9.6 KiB
Python
284 lines
9.6 KiB
Python
"""
|
||
DeepSeek医疗数据分析器
|
||
使用DeepSeek API分析OCR提取的医疗数据,补充缺失的参考范围和单位
|
||
"""
|
||
|
||
import json
import re
from typing import Dict, List, Optional

import requests
|
||
|
||
|
||
class DeepSeekAnalyzer:
    """Fill in missing fields of OCR-extracted lab results via the DeepSeek chat API.

    Items are sent to the ``deepseek-chat`` model in batches. The model is
    asked to supply the ``reference`` and ``unit`` fields and to set the
    ``point`` marker. Whenever a batch cannot be analyzed, the original items
    of that batch are returned unchanged.
    """

    # Item keys forwarded to the model; any other key is left out of the prompt.
    _PROMPT_FIELDS = ("abb", "project", "result", "point", "unit", "reference")

    # System message sent with every batch request.
    _SYSTEM_PROMPT = "\n".join((
        "你是一个专业的医学检验数据分析专家。你的任务是:",
        "1. 分析医疗检测项目数据",
        "2. 为缺失参考范围(reference)的项目补充标准参考范围",
        "3. 为缺失单位(unit)的项目补充正确单位",
        "4. 判断结果是否在正常范围内:",
        '- 如果结果在正常范围内,point字段设为空字符串""',
        '- 如果结果高于正常范围,point字段设为"↑"',
        '- 如果结果低于正常范围,point字段设为"↓"',
        "- 如果是定性结果(如Negative/Positive),且结果正常,point为空;异常则标注",
        "",
        "请严格按照JSON格式返回,不要添加任何额外说明。",
    ))

    def __init__(self, api_key: str):
        """Store the API key and prepare the endpoint URL and request headers.

        Args:
            api_key: DeepSeek API key, sent as a bearer token.
        """
        self.api_key = api_key
        self.api_url = "https://api.deepseek.com/v1/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    def analyze_medical_data(self, items: List[Dict], batch_size: int = 20) -> List[Dict]:
        """Analyze lab items batch by batch and return the completed list.

        Args:
            items: Lab test items extracted by OCR.
            batch_size: Number of items sent per API request (default 20).

        Returns:
            One list holding, for each batch, either the items returned by the
            API or, when the call failed or the reply was not a non-empty list
            of dicts, the original items of that batch.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        all_results: List[Dict] = []
        for start in range(0, len(items), batch_size):
            batch = items[start:start + batch_size]
            print(f" 处理第 {start // batch_size + 1} 批 ({len(batch)} 项)...")

            result = self._analyze_batch(batch)
            if self._is_item_list(result):
                # NOTE(review): only the six prompt fields are sent to the
                # model, so other keys of the original items are not merged
                # back into these results.
                all_results.extend(result)
            else:
                if result:
                    # Valid JSON but the wrong shape (e.g. an object wrapping
                    # the array); extending with it would add its keys.
                    print(" ⚠ API返回的数据格式无效,保留原始数据")
                # Keep the original data when the batch could not be analyzed.
                all_results.extend(batch)

        return all_results

    @staticmethod
    def _is_item_list(result) -> bool:
        """Return True when ``result`` is a non-empty list made only of dicts."""
        return (
            isinstance(result, list)
            and bool(result)
            and all(isinstance(entry, dict) for entry in result)
        )

    @staticmethod
    def _extract_json_text(content: str) -> str:
        """Return the JSON text of a model reply, without any markdown fence.

        The first fenced block is used when one is present, whatever its
        language tag; an unterminated fence runs to the end of the reply.
        Without a fence the whole reply is returned. The result is stripped.
        """
        fenced = re.search(r'```[A-Za-z]*\s*([\s\S]*?)(?:```|\Z)', content)
        return (fenced.group(1) if fenced else content).strip()

    def _analyze_batch(self, items: List[Dict]) -> Optional[List[Dict]]:
        """Send one batch to the API and return the parsed JSON reply.

        Returns:
            The decoded JSON on success, or ``None`` when the HTTP status is
            not 200, the request times out, the reply is not valid JSON, or
            any other error occurs (a message is printed in each case).
        """
        payload = {
            "model": "deepseek-chat",
            "messages": [
                {"role": "system", "content": self._SYSTEM_PROMPT},
                {"role": "user", "content": self._build_prompt(items)},
            ],
            "temperature": 0.1,
            "max_tokens": 4000
        }

        try:
            response = requests.post(
                self.api_url,
                headers=self.headers,
                json=payload,
                timeout=60
            )

            if response.status_code != 200:
                print(f" ⚠ API错误: {response.status_code} - {response.text[:100]}")
                return None

            content = response.json()['choices'][0]['message']['content']
            return json.loads(self._extract_json_text(content))

        except json.JSONDecodeError as e:
            print(f" ⚠ JSON解析错误: {e}")
            return None
        except requests.exceptions.Timeout:
            print(" ⚠ API请求超时")
            return None
        except Exception as e:
            # Best-effort boundary: any other failure makes the caller fall
            # back to the original batch.
            print(f" ⚠ 请求错误: {e}")
            return None

    def _build_prompt(self, items: List[Dict]) -> str:
        """Build the user prompt for one batch.

        Each item is reduced to the keys in ``_PROMPT_FIELDS`` (missing keys
        become empty strings) and embedded as pretty-printed JSON.
        """
        simplified = [
            {field: item.get(field, "") for field in self._PROMPT_FIELDS}
            for item in items
        ]
        data_json = json.dumps(simplified, ensure_ascii=False, indent=2)

        return "\n".join((
            "请分析以下医疗检测数据,补充缺失的参考范围和单位,并判断结果是否正常:",
            "",
            data_json,
            "",
            "要求:",
            "1. 为每个项目补充完整的reference(参考范围)和unit(单位)",
            '2. 根据result判断是否在正常范围内,设置point字段(正常为空,偏高为"↑",偏低为"↓")',
            "3. 定性结果(如Negative、Positive):正常时point为空,异常时根据具体情况标注",
            "4. 保持原有的abb、project、result字段不变",
            "",
            "请直接返回JSON数组格式,不要添加任何说明文字:",
        ))
|
||
|
||
|
||
def test_deepseek():
    """Run a manual smoke test of DeepSeekAnalyzer and print the result.

    The API key is read from the ``DEEPSEEK_API_KEY`` environment variable.
    When that variable is unset or empty, the placeholder key is used, so the
    request is still sent exactly as before.
    """
    import os

    api_key = os.environ.get("DEEPSEEK_API_KEY") or "YOUR_DEEPSEEK_API_KEY"

    analyzer = DeepSeekAnalyzer(api_key)

    # Sample items with the unit and reference fields left empty.
    test_items = [
        {"abb": "WBC", "project": "White Blood Cell", "result": "5.95", "point": "", "unit": "", "reference": ""},
        {"abb": "PRO", "project": "Protein", "result": "Negative", "point": "", "unit": "", "reference": ""},
        {"abb": "GLU", "project": "Glucose", "result": "6.5", "point": "", "unit": "", "reference": ""},
    ]

    result = analyzer.analyze_medical_data(test_items)
    print(json.dumps(result, ensure_ascii=False, indent=2))
|
||
|
||
|
||
def call_deepseek(prompt: str, api_key: str) -> str:
    """Send one user prompt to the DeepSeek chat-completions API.

    Args:
        prompt: Text sent as the single user message.
        api_key: DeepSeek API key, sent as a bearer token.

    Returns:
        The message content of the first choice in the response.

    Raises:
        requests.HTTPError: When the response status signals an error
            (raised by ``raise_for_status``).
    """
    reply = requests.post(
        "https://api.deepseek.com/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        json={
            "model": "deepseek-chat",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.1,
            "max_tokens": 8000,
        },
        timeout=120,
    )
    reply.raise_for_status()

    first_choice = reply.json()["choices"][0]
    return first_choice["message"]["content"]
|
||
|
||
|
||
def _parse_json_reply(reply: str):
    """Decode the JSON payload of a model reply.

    The reply is first parsed as-is. If that fails, the content of the first
    markdown code fence (optionally tagged ``json``) is parsed instead.

    Raises:
        ValueError: If neither attempt yields valid JSON. The message
            contains the first 500 characters of the reply.
    """
    candidates = [reply]
    fenced = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', reply)
    if fenced:
        candidates.append(fenced.group(1))

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            # Only decoding failures are expected here; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            continue

    raise ValueError(f"无法解析DeepSeek返回的JSON: {reply[:500]}")


def process_with_deepseek(ocr_data: list, template_abbs: list, api_key: str) -> dict:
    """Ask DeepSeek to clean the OCR data and match it to the template ABBs.

    Args:
        ocr_data: Raw OCR items, embedded in the prompt as JSON.
        template_abbs: ABB identifiers of the template, embedded in the
            prompt as-is.
        api_key: DeepSeek API key.

    Returns:
        The JSON value decoded from the model reply. The prompt requests an
        object keyed by ABB, but the shape is not checked here.

    Raises:
        ValueError: If no valid JSON can be extracted from the reply.
    """
    prompt = f"""你是医疗数据处理专家。请处理以下OCR提取的医疗检测数据,并匹配到模板中的ABB。

## OCR提取的原始数据:
```json
{json.dumps(ocr_data, ensure_ascii=False, indent=2)}
```

## 模板中需要填充的ABB列表:
{template_abbs}

## 任务要求:
1. 清理OCR数据中的错误和噪音
2. 将每个有效数据项匹配到正确的模板ABB
3. 正确分离result(结果)、unit(单位)、reference(参考范围)
4. 对于尿检项目(如PRO、GLU、KET、NIT等),结果通常是Negative/Positive,这是正确的定性结果
5. 过滤掉明显错误的数据(如result为"."、"0"、空值等)
6. 如果同一个ABB有多条数据,选择最合理的一条

## 输出格式:
请返回JSON格式,结构如下:
```json
{{
"ABB1": {{"result": "数值或定性结果", "unit": "单位", "reference": "参考范围", "point": "提示"}},
"ABB2": {{"result": "...", "unit": "...", "reference": "...", "point": ""}},
...
}}
```

只返回JSON,不要其他说明文字。确保JSON格式正确可解析。
"""

    print("正在调用DeepSeek处理数据...")
    result = call_deepseek(prompt, api_key)

    return _parse_json_reply(result)
|
||
|
||
|
||
def process_ocr_data_main():
    """Run the OCR post-processing pipeline (formerly main() of deepseek_process.py).

    Steps: obtain the API key from ``DEEPSEEK_API_KEY`` or interactive input,
    load ``extracted_medical_data.json`` from this module's directory, load
    the template ABB list from the project config, call
    ``process_with_deepseek`` and write the outcome to
    ``deepseek_processed_data.json`` in the same directory.

    Returns:
        The processed data, or ``None`` when no API key was supplied or the
        OCR data file does not exist.
    """
    import os
    from pathlib import Path

    # Prefer the environment variable; prompt only when it is unset or empty.
    key = os.environ.get("DEEPSEEK_API_KEY", "") or input("请输入DeepSeek API Key: ").strip()
    if not key:
        print("❌ API Key不能为空")
        return

    base_dir = Path(__file__).parent

    # Load the OCR data.
    source_path = base_dir / "extracted_medical_data.json"
    if not source_path.exists():
        print("❌ 未找到OCR数据文件")
        return

    with source_path.open('r', encoding='utf-8') as handle:
        raw = json.load(handle)

    # A dict may wrap the records under 'items'; otherwise use the data as loaded.
    if isinstance(raw, dict):
        records = raw.get('items', raw)
    else:
        records = raw
    print(f"加载 {len(records)} 条OCR数据")

    # Load the template ABB configuration.
    from config import load_abb_config
    abb_names = load_abb_config().get('abb_list', [])
    print(f"模板中有 {len(abb_names)} 个ABB")

    # Hand the data to DeepSeek.
    outcome = process_with_deepseek(records, abb_names, key)

    # Persist the processed data.
    target_path = base_dir / "deepseek_processed_data.json"
    with target_path.open('w', encoding='utf-8') as handle:
        json.dump(outcome, handle, ensure_ascii=False, indent=2)

    print(f"✅ DeepSeek处理完成,共 {len(outcome)} 个有效项")
    print(f"✅ 已保存到: {target_path}")

    return outcome
|
||
|
||
|
||
if __name__ == "__main__":
    import sys

    # `--process` as the first CLI argument runs the OCR pipeline;
    # any other invocation runs the API smoke test.
    entry_point = process_ocr_data_main if sys.argv[1:2] == ['--process'] else test_deepseek
    entry_point()
|