""" DeepSeek医疗数据分析器 使用DeepSeek API分析OCR提取的医疗数据,补充缺失的参考范围和单位 """ import json import requests from typing import List, Dict class DeepSeekAnalyzer: def __init__(self, api_key: str): self.api_key = api_key self.api_url = "https://api.deepseek.com/v1/chat/completions" self.headers = { "Authorization": f"Bearer {api_key}", "Content-Type": "application/json" } def analyze_medical_data(self, items: List[Dict]) -> List[Dict]: """ 分析医疗数据,补充缺失的参考范围、单位和提示 Args: items: OCR提取的医疗检测项列表 Returns: 补充完整的医疗检测项列表 """ # 分批处理,每批20个项目 batch_size = 20 all_results = [] for i in range(0, len(items), batch_size): batch = items[i:i+batch_size] print(f" 处理第 {i//batch_size + 1} 批 ({len(batch)} 项)...") result = self._analyze_batch(batch) if result: all_results.extend(result) else: # 如果API调用失败,保留原始数据 all_results.extend(batch) return all_results def _analyze_batch(self, items: List[Dict]) -> List[Dict]: """分析一批医疗数据""" # 构建提示词 prompt = self._build_prompt(items) try: response = requests.post( self.api_url, headers=self.headers, json={ "model": "deepseek-chat", "messages": [ { "role": "system", "content": """你是一个专业的医学检验数据分析专家。你的任务是: 1. 分析医疗检测项目数据 2. 为缺失参考范围(reference)的项目补充标准参考范围 3. 为缺失单位(unit)的项目补充正确单位 4. 判断结果是否在正常范围内: - 如果结果在正常范围内,point字段设为空字符串"" - 如果结果高于正常范围,point字段设为"↑" - 如果结果低于正常范围,point字段设为"↓" - 如果是定性结果(如Negative/Positive),且结果正常,point为空;异常则标注 请严格按照JSON格式返回,不要添加任何额外说明。""" }, { "role": "user", "content": prompt } ], "temperature": 0.1, "max_tokens": 4000 }, timeout=60 ) if response.status_code == 200: result = response.json() content = result['choices'][0]['message']['content'] # 解析JSON响应 # 处理可能的markdown代码块 if '```json' in content: content = content.split('```json')[1].split('```')[0] elif '```' in content: content = content.split('```')[1].split('```')[0] return json.loads(content.strip()) else: print(f" ⚠ API错误: {response.status_code} - {response.text[:100]}") return None except json.JSONDecodeError as e: print(f" ⚠ JSON解析错误: {e}") return None except requests.exceptions.Timeout: print(" ⚠ API请求超时") return None except Exception as e: print(f" ⚠ 请求错误: {e}") return None def _build_prompt(self, items: List[Dict]) -> str: """构建分析提示词""" # 简化数据,只保留必要字段 simplified = [] for item in items: simplified.append({ "abb": item.get("abb", ""), "project": item.get("project", ""), "result": item.get("result", ""), "point": item.get("point", ""), "unit": item.get("unit", ""), "reference": item.get("reference", "") }) prompt = f"""请分析以下医疗检测数据,补充缺失的参考范围和单位,并判断结果是否正常: {json.dumps(simplified, ensure_ascii=False, indent=2)} 要求: 1. 为每个项目补充完整的reference(参考范围)和unit(单位) 2. 根据result判断是否在正常范围内,设置point字段(正常为空,偏高为"↑",偏低为"↓") 3. 定性结果(如Negative、Positive):正常时point为空,异常时根据具体情况标注 4. 保持原有的abb、project、result字段不变 请直接返回JSON数组格式,不要添加任何说明文字:""" return prompt def test_deepseek(): """测试DeepSeek API""" # 需要替换为实际的API Key api_key = "YOUR_DEEPSEEK_API_KEY" analyzer = DeepSeekAnalyzer(api_key) # 测试数据 test_items = [ {"abb": "WBC", "project": "White Blood Cell", "result": "5.95", "point": "", "unit": "", "reference": ""}, {"abb": "PRO", "project": "Protein", "result": "Negative", "point": "", "unit": "", "reference": ""}, {"abb": "GLU", "project": "Glucose", "result": "6.5", "point": "", "unit": "", "reference": ""}, ] result = analyzer.analyze_medical_data(test_items) print(json.dumps(result, ensure_ascii=False, indent=2)) def call_deepseek(prompt: str, api_key: str) -> str: """调用DeepSeek API(通用接口)""" url = "https://api.deepseek.com/v1/chat/completions" headers = { "Authorization": f"Bearer {api_key}", "Content-Type": "application/json" } data = { "model": "deepseek-chat", "messages": [ {"role": "user", "content": prompt} ], "temperature": 0.1, "max_tokens": 8000 } response = requests.post(url, headers=headers, json=data, timeout=120) response.raise_for_status() return response.json()["choices"][0]["message"]["content"] def process_with_deepseek(ocr_data: list, template_abbs: list, api_key: str) -> dict: """让DeepSeek处理OCR数据并匹配到模板ABB""" prompt = f"""你是医疗数据处理专家。请处理以下OCR提取的医疗检测数据,并匹配到模板中的ABB。 ## OCR提取的原始数据: ```json {json.dumps(ocr_data, ensure_ascii=False, indent=2)} ``` ## 模板中需要填充的ABB列表: {template_abbs} ## 任务要求: 1. 清理OCR数据中的错误和噪音 2. 将每个有效数据项匹配到正确的模板ABB 3. 正确分离result(结果)、unit(单位)、reference(参考范围) 4. 对于尿检项目(如PRO、GLU、KET、NIT等),结果通常是Negative/Positive,这是正确的定性结果 5. 过滤掉明显错误的数据(如result为"."、"0"、空值等) 6. 如果同一个ABB有多条数据,选择最合理的一条 ## 输出格式: 请返回JSON格式,结构如下: ```json {{ "ABB1": {{"result": "数值或定性结果", "unit": "单位", "reference": "参考范围", "point": "提示"}}, "ABB2": {{"result": "...", "unit": "...", "reference": "...", "point": ""}}, ... }} ``` 只返回JSON,不要其他说明文字。确保JSON格式正确可解析。 """ print("正在调用DeepSeek处理数据...") result = call_deepseek(prompt, api_key) # 提取JSON try: # 尝试直接解析 return json.loads(result) except: # 尝试从markdown代码块提取 import re json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', result) if json_match: return json.loads(json_match.group(1)) raise ValueError(f"无法解析DeepSeek返回的JSON: {result[:500]}") def process_ocr_data_main(): """ 处理OCR数据的主函数(原deepseek_process.py的main函数) """ from pathlib import Path import os # 从环境变量获取API Key api_key = os.environ.get("DEEPSEEK_API_KEY", "") if not api_key: api_key = input("请输入DeepSeek API Key: ").strip() if not api_key: print("❌ API Key不能为空") return # 加载OCR数据 ocr_file = Path(__file__).parent / "extracted_medical_data.json" if not ocr_file.exists(): print("❌ 未找到OCR数据文件") return with open(ocr_file, 'r', encoding='utf-8') as f: data = json.load(f) ocr_items = data.get('items', data) if isinstance(data, dict) else data print(f"加载 {len(ocr_items)} 条OCR数据") # 加载模板ABB配置 from config import load_abb_config config = load_abb_config() template_abbs = config.get('abb_list', []) print(f"模板中有 {len(template_abbs)} 个ABB") # 调用DeepSeek处理 processed_data = process_with_deepseek(ocr_items, template_abbs, api_key) # 保存处理后的数据 output_file = Path(__file__).parent / "deepseek_processed_data.json" with open(output_file, 'w', encoding='utf-8') as f: json.dump(processed_data, f, ensure_ascii=False, indent=2) print(f"✅ DeepSeek处理完成,共 {len(processed_data)} 个有效项") print(f"✅ 已保存到: {output_file}") return processed_data if __name__ == "__main__": import sys if len(sys.argv) > 1 and sys.argv[1] == '--process': process_ocr_data_main() else: test_deepseek()