695 lines
24 KiB
Python
695 lines
24 KiB
Python
"""
Handle test items that appear in the PDF but not in the report template.

- Identify the extra items
- Classify them with DeepSeek
- Insert a table at the end of the matching module
"""
|
||
|
||
import json
from copy import deepcopy
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import requests
from docx import Document
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Pt, Cm
|
||
|
||
|
||
def clean_reference_range(reference: str) -> str:
    """Normalize a reference-range string for display.

    Surrounding brackets are removed and an upper-bound-only range such as
    ``<5``, ``≤5`` or ``<=5`` is rewritten as ``0-5``.

    Args:
        reference: Raw reference text. Falsy values (``None``, ``""``) are
            returned unchanged.

    Returns:
        The cleaned reference text.
    """
    import re  # local import, as in the original implementation

    if not reference:
        return reference

    ref = str(reference).strip()

    # Remove one layer of a fully matching bracket pair. Both ASCII and
    # full-width forms are handled; the original listed the ASCII pair twice,
    # which left the second branch unreachable.
    bracket_pairs = (('(', ')'), ('(', ')'), ('[', ']'), ('[', ']'))
    for opening, closing in bracket_pairs:
        if ref.startswith(opening) and ref.endswith(closing):
            ref = ref[1:-1]
            break

    # Drop leftover or unbalanced parentheses at either end, e.g. "(<5".
    ref = ref.lstrip('((').rstrip('))').strip()

    # "<X", "≤X" and "<=X" all mean "from 0 up to X".
    match = re.match(r'^(?:<=|[<<≤])\s*([\d.]+)\s*$', ref)
    if match:
        ref = f"0-{match.group(1)}"

    return ref.strip()
|
||
|
||
|
||
class ExtraItemsHandler:
    """Handle test items found in the PDF that the report template lacks.

    Workflow: identify items whose abbreviation (ABB) is not known to the
    template, classify them into report modules (DeepSeek, or a keyword
    fallback), optionally generate clinical-significance text, and insert one
    table per item after the last table of the matching module.
    """

    # Module that receives every item that could not be classified elsewhere.
    FALLBACK_MODULE = 'Other Tests'

    def __init__(self, api_key: Optional[str] = None):
        """Load the ABB configuration and build the lookup tables.

        Args:
            api_key: DeepSeek API key. When missing, classification falls back
                to keyword matching and no clinical text is generated.
        """
        self.api_key = api_key
        self.api_url = "https://api.deepseek.com/v1/chat/completions"

        # Load the ABB configuration (project-local module, imported lazily).
        from config import load_abb_config
        self.abb_config = load_abb_config()

        # Known ABBs, including aliases, compared case-insensitively.
        self.known_abbs = set()
        for abb in self.abb_config.get('abb_list', []):
            self.known_abbs.add(abb.upper())
        for alias in self.abb_config.get('abb_aliases', {}):
            self.known_abbs.add(alias.upper())

        # Module name -> lowercase keywords used to locate the module in the
        # document text.
        self.module_keywords = {
            'Urine Test': ['urine detection', 'urine test', '尿液检测'],
            'Complete Blood Count': ['complete blood count', 'blood count', '血常规'],
            'Blood Sugar': ['blood sugar', 'glucose', '血糖'],
            'Lipid Profile': ['lipid profile', 'lipid panel', '血脂'],
            'Blood Type': ['blood type', '血型'],
            'Blood Coagulation': ['blood coagulation', 'coagulation', '凝血功能'],
            'Four Infectious Diseases': ['infectious disease', '传染病'],
            'Serum Electrolytes': ['electrolyte', '电解质'],
            'Liver Function': ['liver function', '肝功能'],
            'Kidney Function': ['kidney function', '肾功能'],
            'Myocardial Enzyme': ['myocardial enzyme', 'cardiac enzyme', '心肌酶'],
            'Thyroid Function': ['thyroid function', '甲状腺'],
            'Thromboembolism': ['thromboembolism', 'cardiovascular risk', '心脑血管'],
            'Bone Metabolism': ['bone metabolism', '骨代谢'],
            'Microelement': ['microelement', 'trace element', '微量元素'],
            'Lymphocyte Subpopulation': ['lymphocyte subpopulation', '淋巴细胞亚群'],
            'Humoral Immunity': ['humoral immunity', 'immunoglobulin', '体液免疫'],
            'Inflammatory Reaction': ['inflammatory', 'inflammation', '炎症'],
            'Autoantibody': ['autoantibody', '自身抗体'],
            'Female Hormone': ['female hormone', '女性激素'],
            'Male Hormone': ['male hormone', '男性激素'],
            'Tumor Markers': ['tumor marker', '肿瘤标志物'],
            'Imaging': ['imaging', '影像'],
            'Female-specific': ['female-specific', 'gynecological', '女性专项'],
            'Other Tests': ['other test', '其他检测']
        }

    @staticmethod
    def _abb_key(item: Dict) -> str:
        """Return the item's ABB upper-cased; '' when missing or None."""
        return str(item.get('abb') or '').upper()

    @staticmethod
    def _extract_json(content: str) -> Any:
        """Parse a model reply as JSON, unwrapping a Markdown code fence."""
        if '```json' in content:
            content = content.split('```json')[1].split('```')[0]
        elif '```' in content:
            content = content.split('```')[1].split('```')[0]
        return json.loads(content.strip())

    def _post_chat(self, prompt: str, max_tokens: int):
        """POST a single-message chat request and return the HTTP response."""
        return requests.post(
            self.api_url,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            },
            json={
                "model": "deepseek-chat",
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1,
                "max_tokens": max_tokens
            },
            timeout=60
        )

    def _map_classification(self, classification: Any,
                            extra_items: List[Dict]) -> Dict[str, List[Dict]]:
        """Turn {module: [ABB, ...]} from the model into {module: [item, ...]}.

        Unknown ABBs are ignored, an ABB is assigned at most once, and every
        item the model did not mention goes to the fallback module so that no
        item is lost.
        """
        if not isinstance(classification, dict):
            raise ValueError("classification reply is not a JSON object")

        by_abb = {self._abb_key(item): item for item in extra_items}
        assigned = set()
        result = {}

        for module, abbs in classification.items():
            if not isinstance(abbs, (list, tuple)):
                abbs = [abbs]
            bucket = []
            for abb in abbs:
                key = str(abb).upper()
                if key in by_abb and key not in assigned:
                    bucket.append(by_abb[key])
                    assigned.add(key)
            if bucket:
                result[module] = bucket

        leftovers = [item for key, item in by_abb.items() if key not in assigned]
        if leftovers:
            print(f" ⚠️ {len(leftovers)} 个项目未被分类,归入 {self.FALLBACK_MODULE}")
            result.setdefault(self.FALLBACK_MODULE, []).extend(leftovers)

        return result

    def identify_extra_items(self, extracted_items: List[Dict]) -> List[Dict]:
        """Return the extracted items whose ABB is not known to the template.

        Args:
            extracted_items: All items extracted by OCR.

        Returns:
            Items with a non-empty ABB that is absent from ``known_abbs``.
        """
        extra_items = []
        for item in extracted_items:
            abb = self._abb_key(item)
            # Items without an ABB cannot be matched or classified; skip them.
            if abb and abb not in self.known_abbs:
                extra_items.append(item)

        print(f" 识别到 {len(extra_items)} 个额外项目(模板中没有)")
        return extra_items

    def classify_items_with_deepseek(self, extra_items: List[Dict]) -> Dict[str, List[Dict]]:
        """Classify the extra items into report modules using DeepSeek.

        Falls back to keyword matching when no API key is configured, the API
        answers with a non-200 status, or the reply cannot be parsed.

        Args:
            extra_items: Items returned by ``identify_extra_items``.

        Returns:
            {module name: [items]}.
        """
        if not extra_items:
            return {}

        if not self.api_key:
            print(" ⚠️ 未配置DeepSeek API Key,使用默认分类")
            return self._default_classify(extra_items)

        # One description line per item.
        items_desc = []
        for item in extra_items:
            desc = f"- ABB: {item.get('abb', '')}, 项目名: {item.get('project', '')}"
            if item.get('result'):
                desc += f", 结果: {item.get('result', '')}"
            if item.get('unit'):
                desc += f" {item.get('unit', '')}"
            items_desc.append(desc)

        # Modules the model may choose from.
        modules = list(self.abb_config.get('modules', {}).keys())

        prompt = f"""你是医学检验专家,请将以下检测项目分类到对应的检测模块中。

## 待分类的检测项目:
{chr(10).join(items_desc)}

## 可用的检测模块:
{', '.join(modules)}

## 要求:
1. 根据项目的医学属性,将每个项目分配到最合适的模块
2. 如果项目不属于任何已有模块,分配到 "Other Tests"
3. 返回JSON格式

## 输出格式:
```json
{{
"模块名1": ["ABB1", "ABB2"],
"模块名2": ["ABB3"],
...
}}
```

只返回JSON,不要其他说明。"""

        # Best-effort boundary: any network or parsing failure degrades to the
        # keyword classifier instead of aborting the report.
        try:
            response = self._post_chat(prompt, max_tokens=2000)
            if response.status_code != 200:
                print(f" ⚠️ DeepSeek API错误: {response.status_code}")
                return self._default_classify(extra_items)

            content = response.json()['choices'][0]['message']['content']
            result = self._map_classification(self._extract_json(content), extra_items)

            print(f" ✓ DeepSeek分类完成: {len(result)} 个模块")
            return result
        except Exception as e:
            print(f" ⚠️ DeepSeek分类失败: {e}")
            return self._default_classify(extra_items)

    def _default_classify(self, extra_items: List[Dict]) -> Dict[str, List[Dict]]:
        """Classify by keyword matching (used when DeepSeek is unavailable).

        The first keyword contained in the item's ABB or project name decides
        the module; unmatched items go to the fallback module.
        """
        # NOTE(review): substring matching on short keywords such as 'ery' can
        # hit unrelated project names — confirm this is acceptable.
        keyword_to_module = {
            'crp': 'Inflammatory Reaction',
            'esr': 'Inflammatory Reaction',
            'hs-crp': 'Inflammatory Reaction',
            'tgab': 'Thyroid Function',
            'tpoab': 'Thyroid Function',
            'ery': 'Urine Test',
            'cib': 'Microelement',
            'mib': 'Microelement',
        }

        result = {}
        for item in extra_items:
            abb_lower = str(item.get('abb') or '').lower()
            project_lower = str(item.get('project') or '').lower()

            module = next(
                (target for keyword, target in keyword_to_module.items()
                 if keyword in abb_lower or keyword in project_lower),
                self.FALLBACK_MODULE,
            )
            result.setdefault(module, []).append(item)

        return result

    def generate_clinical_significance(self, items: List[Dict]) -> Dict[str, Dict[str, str]]:
        """Ask DeepSeek for a short clinical-significance text per item.

        Args:
            items: Items to explain.

        Returns:
            {ABB: {"clinical_en": "...", "clinical_cn": "..."}}; an empty dict
            when there are no items, no API key, or the request fails.
        """
        if not items or not self.api_key:
            return {}

        items_desc = []
        for item in items:
            desc = f"- {item.get('abb', '')}: {item.get('project', '')}"
            if item.get('result'):
                desc += f", 结果: {item.get('result', '')}"
            items_desc.append(desc)

        prompt = f"""你是医学检验专家,请为以下检测项目生成简短的临床意义解释。

## 检测项目:
{chr(10).join(items_desc)}

## 要求:
1. 每个项目提供英文和中文解释
2. 解释简洁,约30-50字
3. 说明该指标的临床意义

## 输出格式(JSON):
```json
{{
"ABB1": {{
"clinical_en": "English explanation...",
"clinical_cn": "中文解释..."
}}
}}
```

只返回JSON。"""

        # Best-effort: explanations are optional, so failures only warn.
        try:
            response = self._post_chat(prompt, max_tokens=4000)
            if response.status_code != 200:
                print(f" ⚠️ DeepSeek API错误: {response.status_code}")
                return {}

            content = response.json()['choices'][0]['message']['content']
            explanations = self._extract_json(content)
            if not isinstance(explanations, dict):
                raise ValueError("explanation reply is not a JSON object")
            return explanations
        except Exception as e:
            print(f" ⚠️ 生成临床意义失败: {e}")

        return {}

    def find_module_position(self, doc: "Document", module_name: str) -> int:
        """Locate the last table belonging to a module.

        The module starts at the first body element whose text contains one of
        the module's keywords and ends at the next element whose text contains
        a keyword of any other module (or at the end of the document).

        Args:
            doc: Word document object.
            module_name: Module name.

        Returns:
            Index, among the body's children, of the module's last table;
            -1 when the module or a table inside it is not found.
        """
        keywords = self.module_keywords.get(module_name, [module_name.lower()])
        children = list(doc._body._body)

        def text_of(elem) -> str:
            return ''.join(elem.itertext()).strip().lower()

        # Module start: first element mentioning one of this module's keywords.
        module_start_idx = -1
        for i, elem in enumerate(children):
            text = text_of(elem)
            if any(kw in text for kw in keywords):
                module_start_idx = i
                break

        if module_start_idx < 0:
            return -1

        # Module end: the next element mentioning another module's keyword.
        other_keywords = [
            kw
            for kws in self.module_keywords.values()
            for kw in kws
            if kw not in keywords
        ]
        module_end_idx = len(children)
        for i in range(module_start_idx + 1, len(children)):
            text = text_of(children[i])
            if any(kw in text for kw in other_keywords):
                module_end_idx = i
                break

        # Last table inside [start, end).
        last_table_idx = -1
        for i in range(module_start_idx, module_end_idx):
            if children[i].tag.endswith('}tbl'):
                last_table_idx = i

        return last_table_idx

    def create_item_table(self, doc: "Document", item: Dict, clinical_en: str = "", clinical_cn: str = "") -> Any:
        """Create the 4-row, 6-column table for one test item.

        Rows: thin spacer, bilingual header, data, merged clinical-significance
        row. ``doc.add_table`` appends the table to the end of the document;
        the caller moves the returned element to its final position.

        Args:
            doc: Word document object.
            item: Item data (abb, project, result, point, reference, unit).
            clinical_en: English clinical significance.
            clinical_cn: Chinese clinical significance.

        Returns:
            The underlying table XML element (``table._tbl``).
        """
        table = doc.add_table(rows=4, cols=6)
        table.alignment = WD_TABLE_ALIGNMENT.CENTER
        table.autofit = False

        # Column widths.
        widths = [Cm(2.5), Cm(3.5), Cm(2.5), Cm(2.5), Cm(2.5), Cm(2.5)]
        for row in table.rows:
            for cell, width in zip(row.cells, widths):
                cell.width = width

        def set_font(run, bold=False, font_size=10.5):
            run.bold = bold
            run.font.name = 'Times New Roman'
            run.font.size = Pt(font_size)
            # East-Asian glyphs need their own font setting.
            run._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')

        def as_text(value) -> str:
            # Avoid rendering a literal "None" and tolerate numeric values.
            return '' if value is None else str(value)

        def fill_centered(cell, text, bold=False, font_size=10.5):
            paragraph = cell.paragraphs[0]
            paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
            set_font(paragraph.add_run(text), bold=bold, font_size=font_size)

        # Row 0: empty spacer row (carries the solid top border).
        row0 = table.rows[0]
        row0.height = Cm(0.05)
        for cell in row0.cells:
            cell.text = ''

        # Row 1: bilingual header.
        headers = [
            ('Abb', '简称'), ('Project', '项目'), ('Result', '结果'),
            ('Point', '提示'), ('Refer', '参考'), ('Unit', '单位')
        ]
        for cell, (en, cn) in zip(table.rows[1].cells, headers):
            fill_centered(cell, f'{en}\n{cn}', bold=True, font_size=9)

        # Row 2: data, as (text, bold, font size) per column.
        columns = [
            (as_text(item.get('abb')), True, 10.5),
            (as_text(item.get('project')), True, 10.5),
            (as_text(item.get('result')), False, 10.5),
            (as_text(item.get('point')), False, 10.5),
            (clean_reference_range(as_text(item.get('reference'))), False, 9),
            (as_text(item.get('unit')), False, 9),
        ]
        for cell, (text, bold, size) in zip(table.rows[2].cells, columns):
            fill_centered(cell, text, bold=bold, font_size=size)

        # Row 3: clinical significance, merged across all six columns.
        sig_cells = table.rows[3].cells
        merged = sig_cells[0].merge(sig_cells[-1])

        p = merged.paragraphs[0]
        p.alignment = WD_ALIGN_PARAGRAPH.LEFT

        if clinical_en:
            set_font(p.add_run('Clinical Significance: '), bold=True, font_size=9)
            set_font(p.add_run(clinical_en), font_size=9)

        # Line break only when both languages are present.
        if clinical_en and clinical_cn:
            p.add_run('\n')

        if clinical_cn:
            set_font(p.add_run('临床意义:'), bold=True, font_size=9)
            set_font(p.add_run(clinical_cn), font_size=9)

        self._set_table_borders(table)

        return table._tbl

    def _set_table_borders(self, table):
        """Apply borders: solid top on the first row, dashed everywhere else."""
        def set_cell_border(cell, **kwargs):
            tcPr = cell._tc.get_or_add_tcPr()
            # Replace any existing border definition instead of stacking
            # several w:tcBorders elements on the same cell.
            for stale in tcPr.findall(qn('w:tcBorders')):
                tcPr.remove(stale)

            tcBorders = OxmlElement('w:tcBorders')
            for edge in ['top', 'left', 'bottom', 'right']:
                if edge in kwargs:
                    element = OxmlElement(f'w:{edge}')
                    element.set(qn('w:val'), kwargs[edge].get('val', 'single'))
                    element.set(qn('w:sz'), str(kwargs[edge].get('sz', 4)))
                    element.set(qn('w:color'), kwargs[edge].get('color', '000000'))
                    tcBorders.append(element)
            tcPr.append(tcBorders)

        border_solid = {'val': 'single', 'sz': 4, 'color': '000000'}
        border_dashed = {'val': 'dashed', 'sz': 4, 'color': 'AAAAAA'}

        for i, row in enumerate(table.rows):
            top = border_solid if i == 0 else border_dashed
            # A merged cell is listed once per grid column it spans; style
            # each underlying cell element only once.
            styled = []
            for cell in row.cells:
                if any(cell._tc is tc for tc in styled):
                    continue
                styled.append(cell._tc)
                set_cell_border(cell, top=top, bottom=border_dashed,
                                left=border_dashed, right=border_dashed)
                # 1 is the value of WD_CELL_VERTICAL_ALIGNMENT.CENTER.
                cell.vertical_alignment = 1

    def insert_extra_items_to_doc(self, doc_path: str, classified_items: Dict[str, List[Dict]],
                                  explanations: Optional[Dict[str, Dict[str, str]]] = None) -> str:
        """Insert one table per extra item after the last table of its module.

        The document is modified and saved in place. Modules that cannot be
        located in the document are skipped with a warning.

        Args:
            doc_path: Path of the document.
            classified_items: {module name: [items]}.
            explanations: {ABB: {"clinical_en": "...", "clinical_cn": "..."}}.

        Returns:
            Path of the processed document (same as ``doc_path``).
        """
        if not classified_items:
            print(" 没有额外项目需要插入")
            return doc_path

        # The model may return ABB keys in any case; items are looked up by
        # their upper-cased ABB, so normalise the keys once here.
        explanations_by_abb = {
            str(key).upper(): value
            for key, value in (explanations or {}).items()
            if isinstance(value, dict)
        }

        doc = Document(doc_path)
        body = doc._body._body

        inserted_count = 0

        for module_name, items in classified_items.items():
            if not items:
                continue

            print(f" 处理模块 [{module_name}]: {len(items)} 个项目")

            insert_pos = self.find_module_position(doc, module_name)
            if insert_pos < 0:
                print(f" ⚠️ 未找到模块 [{module_name}],跳过")
                continue

            # Chain the new tables one after another behind the anchor.
            anchor = list(body)[insert_pos]
            for item in items:
                abb = self._abb_key(item)
                exp = explanations_by_abb.get(abb, {})

                table_elem = self.create_item_table(
                    doc, item, exp.get('clinical_en', ''), exp.get('clinical_cn', ''))

                # addnext moves the element from the end of the document.
                anchor.addnext(table_elem)
                anchor = table_elem
                inserted_count += 1
                print(f" ✓ 插入 {abb}")

        doc.save(doc_path)
        print(f" ✓ 共插入 {inserted_count} 个额外项目表格")

        return doc_path
|
||
|
||
|
||
def process_extra_items(extracted_items: List[Dict], doc_path: str, api_key: str = None) -> str:
    """Run the whole extra-item pipeline on an already filled document.

    Steps: identify the items missing from the template, classify them,
    generate clinical-significance text, and insert the tables.

    Args:
        extracted_items: All items extracted by OCR.
        doc_path: Path of the filled document.
        api_key: DeepSeek API key.

    Returns:
        Path of the processed document; ``doc_path`` itself when there is
        nothing to process.
    """
    banner = "=" * 60

    print("\n" + banner)
    print("处理额外检测项目(模板中没有的项目)")
    print(banner)

    handler = ExtraItemsHandler(api_key)

    # Step 1: find the items the template does not know.
    extra_items = handler.identify_extra_items(extracted_items)
    if not extra_items:
        print(" 没有额外项目需要处理")
        return doc_path

    print("\n 额外项目列表:")
    for entry in extra_items:
        print(f" - {entry.get('abb', '')}: {entry.get('project', '')} = {entry.get('result', '')}")

    # Step 2: classify them into modules.
    print("\n 正在分类...")
    classified_items = handler.classify_items_with_deepseek(extra_items)
    if classified_items:
        print("\n 分类结果:")
        for module, members in classified_items.items():
            abbs = [member.get('abb', '') for member in members]
            print(f" [{module}]: {abbs}")

    # Step 3: generate the clinical-significance text.
    print("\n 正在生成临床意义...")
    explanations = handler.generate_clinical_significance(extra_items)

    # Step 4: insert the tables into the document.
    print("\n 正在插入表格...")
    result_path = handler.insert_extra_items_to_doc(doc_path, classified_items, explanations)

    print("\n" + banner)
    print("额外项目处理完成")
    print(banner)

    return result_path
|
||
|
||
|
||
if __name__ == "__main__":
    # Manual smoke test: identify (and, with an API key, classify) the extra
    # items stored in extracted_medical_data.json next to this file.
    import os

    api_key = os.getenv("DEEPSEEK_API_KEY", "")

    # Load the extracted data, if present.
    extracted_file = Path(__file__).parent / "extracted_medical_data.json"
    if extracted_file.exists():
        with open(extracted_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # The file holds either a bare list or a dict with an 'items' key.
        if isinstance(data, dict):
            items = data.get('items', data)
        else:
            items = data

        handler = ExtraItemsHandler(api_key)
        extra_items = handler.identify_extra_items(items)

        print(f"\n识别到 {len(extra_items)} 个额外项目:")
        for item in extra_items:
            print(f" - {item.get('abb', '')}: {item.get('project', '')}")

        if extra_items and api_key:
            classified = handler.classify_items_with_deepseek(extra_items)
            print("\n分类结果:")
            for module, items in classified.items():
                abbs = [item.get('abb', '') for item in items]
                print(f" [{module}]: {abbs}")
|