6490 lines
274 KiB
Python
6490 lines
274 KiB
Python
"""
|
||
从医疗报告PDF中提取数据,匹配模板结构,填入Word模板
|
||
"""
|
||
import sys
|
||
import io
|
||
import os
|
||
|
||
# Work around garbled Chinese output in Windows consoles.
if sys.platform == 'win32':
    # Exported for child processes; streams already opened by this
    # interpreter are handled by the explicit reconfigure below.
    os.environ['PYTHONIOENCODING'] = 'utf-8'
    # Switch the console code page to UTF-8 (65001); command output is discarded.
    os.system('chcp 65001 >nul 2>&1')
    # Re-encode stdout/stderr. Each stream is probed on its own because either
    # one may be None or replaced by an object without ``reconfigure``.
    for _stream in (sys.stdout, sys.stderr):
        _reconfigure = getattr(_stream, 'reconfigure', None)
        if _reconfigure is not None:
            _reconfigure(encoding='utf-8', errors='replace')
|
||
|
||
import fitz
|
||
import json
|
||
import re
|
||
import time
|
||
import requests
|
||
import base64
|
||
from pathlib import Path
|
||
from docx import Document
|
||
from docx.shared import Pt, Cm, Inches
|
||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||
from docx.enum.table import WD_TABLE_ALIGNMENT
|
||
from docx.oxml.ns import qn
|
||
from docx.oxml import OxmlElement
|
||
from copy import deepcopy
|
||
from dotenv import load_dotenv
|
||
|
||
# 加载.env环境变量
|
||
load_dotenv(Path(__file__).parent / ".env")
|
||
|
||
# 导入优化版解析函数
|
||
from parse_medical_v2 import parse_medical_data_v2, clean_extracted_data_v2
|
||
|
||
|
||
def find_health_program_boundary(doc):
    """
    Locate the "Client Health Program" heading among the body's children.

    Args:
        doc: document object exposing ``doc.element.body`` (an XML element
            whose direct children are scanned in order).

    Returns:
        Index just past the first child whose text contains the heading
        (so the heading itself is inside the protected range), or 80 when
        no child matches.
    """
    markers = ('客户健康方案', 'Client Health Program')

    for position, node in enumerate(list(doc.element.body)):
        # Flatten all text nested under this top-level child.
        content = ''.join(node.itertext()).strip()
        if any(marker in content for marker in markers):
            print(f" [保护] 找到保护边界: 位置 {position}, 内容: {content[:50]}...")
            # +1 keeps the heading element itself inside the protected region.
            return position + 1

    # Fallback used when the heading is absent.
    print(f" [保护] 未找到'客户健康方案',使用默认边界: 80")
    return 80
|
||
|
||
|
||
def find_examination_file_region(doc):
    """
    Locate the "Client Functional Medical Examination File" region.

    The body's children are scanned in order. Every child containing the
    region title moves the start index forward; once a start is known, the
    first child containing the urine-detection heading ends the scan.

    Args:
        doc: document object exposing ``doc.element.body``.

    Returns:
        ``(start_index, end_index)``. Both are -1 when the title is never
        found; when only the title is found the end is ``start_index + 20``.
    """
    title_markers = ('功能医学检测档案', 'Functional Medical Examination File')
    stop_markers = ('尿液检测', 'Urine Detection')

    region_start = -1
    region_end = -1

    for position, node in enumerate(list(doc.element.body)):
        content = ''.join(node.itertext()).strip()

        if any(marker in content for marker in title_markers):
            region_start = position
            print(f" [保护] 找到'客户功能医学检测档案'区域起始: 位置 {position}")

        # The end marker only counts after a start has been seen.
        if region_start < 0:
            continue
        if any(marker in content for marker in stop_markers):
            region_end = position
            print(f" [保护] 找到'客户功能医学检测档案'区域结束: 位置 {position}")
            break

    if region_start >= 0 and region_end < 0:
        # Start found but no end marker: assume a fixed-size region.
        region_end = region_start + 20
        print(f" [保护] 未找到结束边界,使用默认: {region_end}")

    return (region_start, region_end)
|
||
|
||
|
||
def copy_protected_region_from_template(template_path, output_path, boundary):
    """
    Rebuild ``output_path`` from the template's front matter plus processed data.

    Steps:
    1. Take the first ``boundary`` body children of the template (skipping any
       body-level ``sectPr``).
    2. Append the processed file's body children that follow its
       "Client Health Program" heading (or that start at ``boundary`` when the
       heading is absent), again skipping ``sectPr``.
    3. Append the template's first body-level ``sectPr`` so that its
       header/footer references are the ones in effect.

    The merged ``word/document.xml`` is written into a copy of the *template*
    archive, which then replaces ``output_path``. Failures are printed (with
    traceback) and swallowed; in that case ``output_path`` is left untouched.

    Args:
        template_path: path of the .docx template.
        output_path: path of the processed .docx; overwritten on success.
        boundary: number of leading template elements to keep; values <= 0
            skip the whole operation.
    """
    import zipfile
    import shutil
    from lxml import etree
    import os

    if boundary <= 0:
        print(" [保护] 边界无效,跳过复制")
        return

    temp_output = str(output_path) + ".temp_output"
    temp_result = str(output_path) + ".temp_result"

    try:
        # Work from a snapshot of the processed file.
        shutil.copy(output_path, temp_output)

        ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

        with zipfile.ZipFile(template_path, 'r') as z:
            template_xml = z.read('word/document.xml')
        template_tree = etree.fromstring(template_xml)
        template_body = template_tree.find('.//w:body', ns)

        with zipfile.ZipFile(temp_output, 'r') as z:
            output_xml = z.read('word/document.xml')
        output_tree = etree.fromstring(output_xml)
        output_body = output_tree.find('.//w:body', ns)

        if template_body is None or output_body is None:
            print(" [保护] 无法找到 body 元素")
            return

        template_children = list(template_body)
        output_children = list(output_body)

        print(f" [保护] 模板元素: {len(template_children)}, 处理后元素: {len(output_children)}")

        # Find where the data section starts in the processed file.
        output_start = -1
        for i, elem in enumerate(output_children):
            text = ''.join(elem.itertext()).strip()
            if 'Client Health Program' in text or '客户健康方案' in text:
                output_start = i + 1
                print(f" [保护] 找到 Client Health Program 位置: {i}")
                break

        if output_start < 0:
            output_start = boundary
            print(f" [保护] 使用默认起始位置: {output_start}")
        else:
            print(f" [保护] 数据起始位置: {output_start}")

        # Empty the template body; it is rebuilt below.
        for elem in list(template_body):
            template_body.remove(elem)

        # Pristine second parse of the same bytes (no need to reopen the zip);
        # it serves as the read-only source for the elements copied below.
        orig_template_tree = etree.fromstring(template_xml)
        orig_template_body = orig_template_tree.find('.//w:body', ns)
        orig_template_children = list(orig_template_body)

        # 1. Leading template elements (the protected front matter).
        added_count = 0
        for i in range(min(boundary, len(orig_template_children))):
            elem = orig_template_children[i]
            if elem.tag.endswith('}sectPr'):
                continue
            # Serialize/parse round trip yields an independent copy.
            elem_copy = etree.fromstring(etree.tostring(elem))
            template_body.append(elem_copy)
            added_count += 1

        print(f" [保护] 已添加模板前 {added_count} 个元素")

        # The template's body-level sectPr carries the footer reference.
        sectPr = None
        for elem in orig_template_children:
            if elem.tag.endswith('}sectPr'):
                sectPr = etree.fromstring(etree.tostring(elem))
                print(f" [保护] 使用模板的 sectPr(包含页脚引用)")
                break

        # 2. Data elements from the processed file.
        data_count = 0
        for i in range(output_start, len(output_children)):
            elem = output_children[i]
            if elem.tag.endswith('}sectPr'):
                continue
            elem_copy = etree.fromstring(etree.tostring(elem))
            template_body.append(elem_copy)
            data_count += 1

        print(f" [保护] 已添加 {data_count} 个数据元素")

        # 3. sectPr goes last in the body.
        if sectPr is not None:
            template_body.append(sectPr)

        print(f" [保护] 合并后总元素: {len(list(template_body))}")

        new_xml = etree.tostring(template_tree, xml_declaration=True, encoding='UTF-8', standalone='yes')

        # Copy every template part, swapping in the merged document.xml.
        with zipfile.ZipFile(template_path, 'r') as zin:
            with zipfile.ZipFile(temp_result, 'w', zipfile.ZIP_DEFLATED) as zout:
                for item in zin.infolist():
                    if item.filename == 'word/document.xml':
                        zout.writestr(item, new_xml)
                    else:
                        zout.writestr(item, zin.read(item.filename))

        shutil.move(temp_result, output_path)
        print(f" [保护] ✓ 前四页保护完成")

    except Exception as e:
        # Best effort: report and leave output_path as it was.
        print(f" [保护] 复制失败: {e}")
        import traceback
        traceback.print_exc()
    finally:
        for f in [temp_output, temp_result]:
            if os.path.exists(f):
                try:
                    os.remove(f)
                except OSError:
                    # Cleanup is best effort; a stale temp file is harmless.
                    pass
|
||
|
||
|
||
def fix_footer_reference(template_path, output_path):
    """
    Restore the footer (and header) references in the output's final ``sectPr``.

    The template's ``sectPr`` elements are scanned in document order until one
    with a ``footerReference`` is found. Header references seen up to and
    including that ``sectPr`` are collected as well. If the last body-level
    ``sectPr`` of the output has no ``footerReference``, the collected header
    references followed by the footer reference are inserted at its start and
    ``output_path`` is rewritten.

    Failures are printed (with traceback) and swallowed. The temporary archive
    is always removed.

    Args:
        template_path: path of the .docx template.
        output_path: path of the .docx to patch in place.
    """
    import zipfile
    import shutil
    from lxml import etree
    import os

    ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
          'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'}
    r_ns = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}'

    temp_result = str(output_path) + '.temp_footer.docx'

    try:
        # Read the template's document.xml.
        with zipfile.ZipFile(template_path, 'r') as z:
            template_xml = z.read('word/document.xml')
        template_tree = etree.fromstring(template_xml)
        template_body = template_tree.find('.//w:body', ns)
        if template_body is None:
            print(" [页脚] 模板中没有找到 body 元素,跳过")
            return

        # First template sectPr that carries a footerReference.
        footer_ref = None
        header_refs = []

        for sectPr in template_body.findall('.//w:sectPr', ns):
            for child in sectPr:
                if 'footerReference' in child.tag:
                    footer_ref = etree.fromstring(etree.tostring(child))
                    print(f" [页脚] 找到模板页脚引用: {child.get(r_ns + 'id')}")
                if 'headerReference' in child.tag:
                    header_refs.append(etree.fromstring(etree.tostring(child)))
            if footer_ref is not None:
                break

        if footer_ref is None:
            print(" [页脚] 模板中没有找到页脚引用,跳过")
            return

        # Read the output's document.xml.
        with zipfile.ZipFile(output_path, 'r') as z:
            output_xml = z.read('word/document.xml')
        output_tree = etree.fromstring(output_xml)
        output_body = output_tree.find('.//w:body', ns)
        if output_body is None:
            print(" [页脚] 输出文件中没有找到 body 元素,跳过")
            return

        # The body-level sectPr is searched from the end of the body.
        output_sectPr = None
        for elem in reversed(list(output_body)):
            if elem.tag.endswith('}sectPr'):
                output_sectPr = elem
                break

        if output_sectPr is None:
            print(" [页脚] 输出文件中没有找到 sectPr,跳过")
            return

        if any('footerReference' in child.tag for child in output_sectPr):
            print(" [页脚] 输出文件已有页脚引用,跳过")
            return

        # References go at the start of sectPr: headers first, then the footer.
        insert_pos = 0
        for header_ref in header_refs:
            output_sectPr.insert(insert_pos, header_ref)
            insert_pos += 1
        output_sectPr.insert(insert_pos, footer_ref)

        print(f" [页脚] 已添加页脚引用到输出文件")

        new_xml = etree.tostring(output_tree, xml_declaration=True, encoding='UTF-8', standalone='yes')

        # Rewrite the archive with the patched document.xml.
        with zipfile.ZipFile(output_path, 'r') as zin:
            with zipfile.ZipFile(temp_result, 'w', zipfile.ZIP_DEFLATED) as zout:
                for item in zin.infolist():
                    if item.filename == 'word/document.xml':
                        zout.writestr(item, new_xml)
                    else:
                        zout.writestr(item, zin.read(item.filename))

        shutil.move(temp_result, output_path)
        print(f" [页脚] ✓ 页脚修复完成")

    except Exception as e:
        # Best effort: report and keep the output file as it was.
        print(f" [页脚] 修复失败: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # After a successful move the temp file no longer exists; this only
        # removes a partially written archive left by a failure.
        if os.path.exists(temp_result):
            try:
                os.remove(temp_result)
            except OSError:
                pass
|
||
|
||
|
||
def backup_protected_region(doc):
    """
    Deep-copy the protected leading elements of the document body.

    The boundary comes from ``find_health_program_boundary``. When that
    boundary is larger than the number of body children (possible with its
    fallback value), only the existing children are copied instead of raising
    ``IndexError``.

    Args:
        doc: document object exposing ``doc.element.body``.

    Returns:
        ``(boundary, backup)`` where ``backup`` is a list of deep copies of the
        first ``boundary`` body children, or ``(-1, [])`` when the boundary is
        not positive.
    """
    boundary = find_health_program_boundary(doc)
    if boundary <= 0:
        print(f" [保护] 未找到保护边界,跳过备份")
        return -1, []

    children = list(doc.element.body)
    # Slicing tolerates boundary > len(children).
    backup = [deepcopy(node) for node in children[:boundary]]

    print(f" [保护] 已备份保护区域:boundary={boundary}, backup_len={len(backup)}")
    return boundary, backup
|
||
|
||
|
||
def restore_protected_region(doc, boundary, backup):
    """
    Replace the first ``boundary`` body children with copies of ``backup``.

    The current leading elements are removed, then a deep copy of every backup
    element is inserted at the front so the backup order is preserved. Errors
    on individual elements are printed and skipped.

    Args:
        doc: document object exposing ``doc.element.body``.
        boundary: number of leading elements to replace; nothing happens when
            it is not positive.
        backup: elements previously saved; nothing happens when it is empty.
    """
    if boundary <= 0 or not backup:
        print(f" [保护] 跳过恢复:boundary={boundary}, backup_len={len(backup) if backup else 0}")
        return

    body = doc.element.body
    current = list(body)

    print(f" [保护] 开始恢复保护区域:boundary={boundary}, backup_len={len(backup)}, current_children={len(current)}")

    # Drop the present protected elements, last one first.
    stale = current[:min(boundary, len(current))]
    for node in stale[::-1]:
        try:
            body.remove(node)
        except Exception as e:
            print(f" [保护] 删除元素失败: {e}")

    # Inserting at index 0 while walking the backup backwards restores order.
    for saved in backup[::-1]:
        try:
            body.insert(0, deepcopy(saved))
        except Exception as e:
            print(f" [保护] 插入元素失败: {e}")

    print(f" [保护] 恢复完成,当前children数量: {len(list(body))}")
|
||
|
||
|
||
def set_cell_border(cell, **kwargs):
    """
    Set the top/left/bottom/right borders of a table cell.

    Args:
        cell: python-docx table cell (its ``_tc`` element is modified).
        **kwargs: optional ``top``, ``left``, ``bottom``, ``right`` entries,
            each a dict with optional keys ``val`` (default ``'single'``),
            ``sz`` (default 4) and ``color`` (default ``'000000'``). Other
            keys in those dicts are ignored.

    An existing ``w:tcBorders`` element is reused, and an existing edge is
    updated in place, so calling this repeatedly on the same cell does not
    pile up duplicate border elements.
    """
    tcPr = cell._tc.get_or_add_tcPr()

    tcBorders = tcPr.find(qn('w:tcBorders'))
    if tcBorders is None:
        tcBorders = OxmlElement('w:tcBorders')
        tcPr.append(tcBorders)

    for edge in ['top', 'left', 'bottom', 'right']:
        if edge not in kwargs:
            continue
        spec = kwargs[edge]
        element = tcBorders.find(qn(f'w:{edge}'))
        if element is None:
            element = OxmlElement(f'w:{edge}')
            tcBorders.append(element)
        element.set(qn('w:val'), spec.get('val', 'single'))
        element.set(qn('w:sz'), str(spec.get('sz', 4)))
        element.set(qn('w:color'), spec.get('color', '000000'))
|
||
|
||
|
||
# Paired items: two measurements rendered in one table (two data rows that
# share a single clinical-significance text).
# Layout: base abbreviation -> (paired abbreviation, base Chinese name,
# paired Chinese name)
PAIRED_ITEMS = {
    'NEUT': ('NEUT%', '中性粒细胞数量', '中性粒细胞百分含量'),
    'EOS': ('EOS%', '嗜酸细胞数量', '嗜酸细胞百分含量'),
    'BAS': ('BAS%', '嗜碱细胞数量', '嗜碱细胞百分含量'),
    'LYMPH': ('LYMPH%', '淋巴细胞数量', '淋巴细胞百分含量'),
    'MONO': ('MONO%', '单核细胞数量', '单核细胞百分含量'),
    'TOTAL RBC': ('RBC COUNT', '红细胞总数', '红细胞计数'),
}

# Reverse lookup: paired abbreviation -> base abbreviation
PAIRED_ITEMS_REVERSE = {
    entry[0]: base_abb for base_abb, entry in PAIRED_ITEMS.items()
}

# Every abbreviation that takes part in a pair (base or paired side); used to
# skip stand-alone handling of those items.
ALL_PAIRED_ABBS = set(PAIRED_ITEMS) | set(PAIRED_ITEMS_REVERSE)
|
||
|
||
|
||
def get_paired_item(abb):
    """
    Look up the partner of a paired item.

    The abbreviation is upper-cased and stripped before lookup.

    Returns:
        ``(paired_abb, is_base, base_cn, percent_cn)`` where ``paired_abb`` is
        the other member of the pair and ``is_base`` tells whether ``abb`` is
        the base side. ``(None, None, None, None)`` when ``abb`` is not part
        of any pair.
    """
    key = abb.upper().strip()

    is_base = key in PAIRED_ITEMS
    base_key = key if is_base else PAIRED_ITEMS_REVERSE.get(key)
    if base_key is None:
        return (None, None, None, None)

    partner_of_base, base_cn, percent_cn = PAIRED_ITEMS[base_key]
    # From the base side the partner is the paired entry; from the paired
    # side the partner is the base abbreviation itself.
    partner = partner_of_base if is_base else base_key
    return (partner, is_base, base_cn, percent_cn)
|
||
|
||
|
||
def is_paired_item(abb):
    """Return True when ``abb`` (upper-cased, stripped) is either side of a pair."""
    normalized = abb.upper().strip()
    return normalized in ALL_PAIRED_ABBS
|
||
|
||
|
||
def is_paired_base_item(abb):
    """Return True when ``abb`` (upper-cased, stripped) is the base side of a pair, e.g. NEUT or EOS."""
    normalized = abb.upper().strip()
    return normalized in PAIRED_ITEMS
|
||
|
||
|
||
def is_paired_percent_item(abb):
    """Return True when ``abb`` (upper-cased, stripped) is the paired side of a pair, e.g. NEUT% or EOS%."""
    normalized = abb.upper().strip()
    return normalized in PAIRED_ITEMS_REVERSE
|
||
|
||
|
||
def clean_reference_range(reference: str) -> str:
    """
    Normalise a reference-range string.

    1. Remove one enclosing pair of brackets: ASCII parentheses, full-width
       parentheses or square brackets (first matching kind only).
    2. Remove a remaining lone leading/trailing parenthesis (ASCII, then
       full-width).
    3. Rewrite an upper bound ``<X``, ``≤X`` or ``<=X`` as ``0-X``.

    Examples:
        "(3.5-5.5)" -> "3.5-5.5"
        "<0.2"      -> "0-0.2"
        "≤10"       -> "0-10"
        "(阴性)"   -> "阴性"

    Falsy input (``None``, ``""``) is returned unchanged.
    """
    import re

    if not reference:
        return reference

    cleaned = reference.strip()

    ascii_parens = ('(', ')')
    wide_parens = ('(', ')')
    square = ('[', ']')

    # Step 1: at most one fully enclosing pair is peeled off.
    for opener, closer in (ascii_parens, wide_parens, square):
        if cleaned.startswith(opener) and cleaned.endswith(closer):
            cleaned = cleaned[1:-1]
            break

    # Step 2: unmatched parentheses left at either end.
    for opener, closer in (ascii_parens, wide_parens):
        if cleaned.startswith(opener):
            cleaned = cleaned[1:]
        if cleaned.endswith(closer):
            cleaned = cleaned[:-1]

    cleaned = cleaned.strip()

    # Step 3: "<X" / "≤X" first, then "<=X"; each becomes "0-X".
    for pattern in (r'^[<≤]\s*([\d\.]+)\s*$', r'^<=\s*([\d\.]+)\s*$'):
        bound = re.match(pattern, cleaned)
        if bound:
            cleaned = f"0-{bound.group(1)}"

    return cleaned.strip()
|
||
|
||
|
||
def create_medical_item_table(doc, abb, project_name, result, clinical_en, clinical_cn, include_header=False):
    """
    Append a table for a single test item to ``doc`` and return it.

    Layout (6 columns):
        Row 0: empty spacer row with a solid top border
        Row 1: header (Abb | Project | Result | Point | Refer | Unit),
               only when ``include_header`` is True
        Next:  data row (abb, project name, result; the last three cells
               are left empty)
        Last:  clinical significance, merged across all columns, with one
               English paragraph and one Chinese paragraph

    All borders are dashed grey except the solid black top of row 0. An empty
    paragraph is appended to ``doc`` after the table as a separator.

    Args:
        doc: python-docx ``Document``.
        abb: abbreviation shown in the first column.
        project_name: item name shown in the second column.
        result: result value; converted with ``str()``.
        clinical_en: English clinical-significance text.
        clinical_cn: Chinese clinical-significance text.
        include_header: add the column-header row when True.

    Returns:
        The created table.
    """
    # Local import: only this function needs these enums.
    from docx.enum.table import WD_CELL_VERTICAL_ALIGNMENT, WD_ROW_HEIGHT_RULE

    num_rows = 4 if include_header else 3
    table = doc.add_table(rows=num_rows, cols=6)
    table.alignment = WD_TABLE_ALIGNMENT.CENTER
    table.autofit = False

    # Column widths
    widths = [Cm(2.5), Cm(3.5), Cm(2.5), Cm(2.5), Cm(2.5), Cm(2.5)]
    for row in table.rows:
        for idx, width in enumerate(widths):
            row.cells[idx].width = width

    def set_font(run, bold=False, font_size=10.5):
        """Body font: Times New Roman, with 宋体 for East-Asian text."""
        run.bold = bold
        run.font.name = 'Times New Roman'
        run.font.size = Pt(font_size)
        run._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')

    def set_clinical_font(run, bold=False):
        """Clinical-significance font: 华文楷体, 11 pt."""
        run.bold = bold
        run.font.name = '华文楷体'
        run.font.size = Pt(11)
        run._element.rPr.rFonts.set(qn('w:eastAsia'), '华文楷体')

    # === Row 0: spacer ===
    row0 = table.rows[0]
    row0.height = Cm(0.05)  # minimal height
    # Fixed height, as intended. The previous literal 1 is AT_LEAST in
    # WD_ROW_HEIGHT_RULE, which lets the row grow beyond the requested height.
    row0.height_rule = WD_ROW_HEIGHT_RULE.EXACTLY

    for cell in row0.cells:
        cell.text = ''
        p = cell.paragraphs[0]
        p.paragraph_format.space_before = 0
        p.paragraph_format.space_after = 0
        p.paragraph_format.line_spacing = 0
        run = p.add_run()
        run.font.size = Pt(1)

    # Row indices shift by one when the header row is present.
    data_row_idx = 2 if include_header else 1
    sig_row_idx = 3 if include_header else 2

    # === Header row (optional) ===
    if include_header:
        header_row = table.rows[1]
        headers = [
            ('Abb', '简称'), ('Project', '项目'), ('Result', '结果'),
            ('Point', '提示'), ('Refer', '参考'), ('Unit', '单位')
        ]
        for idx, (en, cn) in enumerate(headers):
            p = header_row.cells[idx].paragraphs[0]
            p.alignment = WD_ALIGN_PARAGRAPH.CENTER
            run = p.add_run(f'{en}\n{cn}')
            set_font(run, bold=True, font_size=9)

    # === Data row ===
    data_row = table.rows[data_row_idx]

    # 1. Abbreviation
    p = data_row.cells[0].paragraphs[0]
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run(abb)
    set_font(run, bold=True)

    # 2. Project name
    p = data_row.cells[1].paragraphs[0]
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run(project_name)
    set_font(run, bold=True)

    # 3. Result
    p = data_row.cells[2].paragraphs[0]
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run(str(result))
    set_font(run)

    # 4-6. Point, Refer, Unit stay empty; only the alignment is set.
    for idx in [3, 4, 5]:
        p = data_row.cells[idx].paragraphs[0]
        p.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # === Clinical-significance row ===
    sig_row = table.rows[sig_row_idx]
    top_cell = sig_row.cells[0]
    for i in range(1, 6):
        top_cell.merge(sig_row.cells[i])

    # First paragraph: English text
    p = top_cell.paragraphs[0]
    p.alignment = WD_ALIGN_PARAGRAPH.LEFT
    run = p.add_run('Clinical Significance: ')
    set_clinical_font(run, bold=True)
    run = p.add_run(clinical_en)
    set_clinical_font(run)

    # Second paragraph: Chinese text, kept as its own paragraph
    p_cn = top_cell.add_paragraph()
    p_cn.alignment = WD_ALIGN_PARAGRAPH.LEFT
    run = p_cn.add_run('临床意义:')
    set_clinical_font(run, bold=True)
    run = p_cn.add_run(clinical_cn)
    set_clinical_font(run)

    # === Borders ===
    border_solid = {'val': 'single', 'sz': 4, 'color': '000000', 'space': 0}
    border_dashed = {'val': 'dashed', 'sz': 4, 'color': 'AAAAAA', 'space': 0}

    # A merged cell appears once per spanned grid column in ``row.cells``.
    # Remember the underlying elements so each real cell is styled only once.
    styled = []
    for i, row in enumerate(table.rows):
        # Only the very first row gets the solid top edge.
        top = border_solid if i == 0 else border_dashed
        for cell in row.cells:
            tc = cell._tc
            if any(tc is done for done in styled):
                continue
            styled.append(tc)

            set_cell_border(cell, top=top, bottom=border_dashed,
                            left=border_dashed, right=border_dashed)
            cell.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER

    # Separator paragraph after the table
    doc.add_paragraph()

    return table
|
||
|
||
# Baidu OCR (high-accuracy) credentials.
# SECURITY: the literals below are real-looking secrets committed to source.
# They are kept only as a fallback so existing setups keep working; rotate
# them and supply the values through the environment (e.g. the .env file)
# instead, then delete the fallbacks.
APP_ID = os.environ.get('BAIDU_OCR_APP_ID') or '121295102'
API_KEY = os.environ.get('BAIDU_OCR_API_KEY') or '8cT0hIWTLPubtwT3Qils9q00'
SECRET_KEY = os.environ.get('BAIDU_OCR_SECRET_KEY') or 'PPPUH7RwkuyijLqwzzoaWlXohUvm3pZs'
|
||
|
||
# 获取access_token(带重试机制)
|
||
def get_access_token(max_retries: int = 3):
    """
    Request a Baidu OCR access token, retrying on failure.

    Up to ``max_retries`` attempts are made. After a failed attempt that is
    not the last one, the function sleeps ``unit * attempt_number`` seconds,
    where ``unit`` is 3 for connection errors and 2 for everything else.

    Returns:
        The token string, or ``None`` when every attempt failed.
    """
    endpoint = "https://aip.baidubce.com/oauth/2.0/token"
    credentials = {
        "grant_type": "client_credentials",
        "client_id": API_KEY,
        "client_secret": SECRET_KEY
    }

    for attempt in range(max_retries):
        backoff_unit = 2
        try:
            payload = requests.post(endpoint, params=credentials, timeout=30).json()
            access_token = payload.get('access_token')
            if access_token:
                return access_token
            print(f" ⚠️ 获取token失败: {payload.get('error', 'unknown error')}")
        except requests.exceptions.Timeout:
            print(f" ⚠️ 获取token超时,{attempt+1}/{max_retries} 次重试...")
        except requests.exceptions.ConnectionError:
            print(f" ⚠️ 获取token连接失败,{attempt+1}/{max_retries} 次重试...")
            backoff_unit = 3
        except Exception as e:
            print(f" ⚠️ 获取token异常: {e},{attempt+1}/{max_retries} 次重试...")

        # No waiting after the final attempt.
        if attempt < max_retries - 1:
            time.sleep(backoff_unit * (attempt + 1))

    return None
|
||
|
||
ACCESS_TOKEN = None  # Not persisted: starts empty on every run and is fetched on first use
|
||
|
||
def extract_pdf_with_position(pdf_path: str, max_retries: int = 3) -> list:
    """
    OCR every page of a PDF with the Baidu "accurate" endpoint.

    Each page is rendered at 150 dpi, sent as base64 PNG, and the recognised
    lines are collected together with their location. Pages that fail in the
    first pass get a second pass after a pause; their items are appended after
    the first-pass items.

    Args:
        pdf_path: path of the PDF file.
        max_retries: maximum number of retries per page and pass (so up to
            ``max_retries + 1`` requests per page and pass).

    Returns:
        List of dicts with keys ``text``, ``location`` (dict as returned by
        the service, ``{}`` when absent) and ``page`` (1-based). An empty list
        when no access token could be obtained.
    """
    global ACCESS_TOKEN
    if not ACCESS_TOKEN:
        ACCESS_TOKEN = get_access_token()
    if not ACCESS_TOKEN:
        print(" ❌ 获取access_token失败")
        return []

    url = f"https://aip.baidubce.com/rest/2.0/ocr/v1/accurate?access_token={ACCESS_TOKEN}"

    # Service error codes that trigger a retry.
    # NOTE(review): 110/111 look like token-invalid/expired codes; retrying
    # with the same token may never succeed — consider refreshing the token.
    network_errors = [18, 19, 100, 110, 111, 282000, 282003, 282004]

    all_items = []      # text blocks with position
    failed_pages = []   # 0-based indices of pages that failed in pass one

    doc = fitz.open(pdf_path)
    try:
        print(f" PDF共 {len(doc)} 页")

        def ocr_single_page(page_idx, retry_count=0):
            """OCR one page; returns ``(items, success)``."""
            # Render and encode once; retries reuse the same payload.
            pix = doc[page_idx].get_pixmap(dpi=150)
            payload = {"image": base64.b64encode(pix.tobytes('png')).decode()}

            attempt = retry_count
            while True:
                try:
                    response = requests.post(url, data=payload, timeout=30)
                    result = response.json()

                    if 'words_result' in result:
                        page_items = [
                            {
                                'text': item['words'],
                                'location': item.get('location', {}),
                                'page': page_idx + 1
                            }
                            for item in result['words_result']
                        ]
                        print(f" 第 {page_idx+1} 页: {len(result['words_result'])} 行")
                        return page_items, True

                    if 'error_code' not in result:
                        print(f" 第 {page_idx+1} 页: 未知响应格式")
                        return [], False

                    error_code = result['error_code']
                    error_msg = result.get('error_msg', '')
                    if error_code not in network_errors or attempt >= max_retries:
                        print(f" 第 {page_idx+1} 页错误: {error_code} - {error_msg}")
                        return [], False

                    print(f" 第 {page_idx+1} 页网络错误 ({error_code}),{attempt+1}/{max_retries} 次重试...")
                    wait_unit = 2

                except requests.exceptions.Timeout:
                    if attempt >= max_retries:
                        print(f" 第 {page_idx+1} 页超时,已达最大重试次数")
                        return [], False
                    print(f" 第 {page_idx+1} 页超时,{attempt+1}/{max_retries} 次重试...")
                    wait_unit = 2

                except requests.exceptions.ConnectionError:
                    if attempt >= max_retries:
                        print(f" 第 {page_idx+1} 页连接失败,已达最大重试次数")
                        return [], False
                    print(f" 第 {page_idx+1} 页连接失败,{attempt+1}/{max_retries} 次重试...")
                    wait_unit = 3

                except Exception as e:
                    if attempt >= max_retries:
                        print(f" 第 {page_idx+1} 页异常: {e}")
                        return [], False
                    print(f" 第 {page_idx+1} 页异常 ({e}),{attempt+1}/{max_retries} 次重试...")
                    wait_unit = 2

                # Linearly growing pause before the next attempt.
                time.sleep(wait_unit * (attempt + 1))
                attempt += 1

        # Pass 1: every page
        for page_idx in range(len(doc)):
            page_items, success = ocr_single_page(page_idx)
            if success:
                all_items.extend(page_items)
            else:
                failed_pages.append(page_idx)
            time.sleep(0.3)

        # Pass 2: pages that failed
        if failed_pages:
            print(f"\n ⚠️ {len(failed_pages)} 页提取失败,进行第二轮重试...")
            time.sleep(5)  # give the service some time before retrying

            still_failed = []
            for page_idx in failed_pages:
                print(f" 重试第 {page_idx+1} 页...")
                page_items, success = ocr_single_page(page_idx)
                if success:
                    all_items.extend(page_items)
                else:
                    still_failed.append(page_idx + 1)  # report 1-based page numbers
                time.sleep(1)

            if still_failed:
                print(f"\n ❌ 以下页面提取失败(可能需要手动检查): {still_failed}")
            else:
                print(f" ✓ 所有失败页面重试成功")
    finally:
        # Release the PDF handle even if rendering or OCR raised.
        doc.close()

    return all_items
|
||
|
||
|
||
def group_by_rows(items: list, y_threshold: int = 15) -> list:
    """
    Group positioned text blocks into visual rows.

    Items are ordered by ``(page, top)``. An item joins the current row when
    it is on the same page and its ``top`` is within ``y_threshold`` of the
    ``top`` of the row's *first* item; otherwise it starts a new row. Each
    row is finally ordered by ``left``. Missing ``top``/``left`` count as 0.

    Returns:
        List of rows, each a list of the original item dicts.
    """
    if not items:
        return []

    def top_of(entry):
        return entry['location'].get('top', 0)

    def left_of(entry):
        return entry['location'].get('left', 0)

    ordered = sorted(items, key=lambda entry: (entry['page'], top_of(entry)))

    rows = []
    pending = []
    # Anchor = page and top of the item that opened the pending row.
    anchor_page, anchor_top = -1, -100

    for entry in ordered:
        entry_page = entry['page']
        entry_top = top_of(entry)

        same_row = entry_page == anchor_page and abs(entry_top - anchor_top) <= y_threshold
        if same_row:
            pending.append(entry)
            continue

        # Page change or vertical gap: close the pending row, open a new one.
        if pending:
            rows.append(sorted(pending, key=left_of))
        pending = [entry]
        anchor_page, anchor_top = entry_page, entry_top

    if pending:
        rows.append(sorted(pending, key=left_of))

    return rows
|
||
|
||
|
||
def extract_pdf_text(pdf_path: str) -> str:
    """Legacy interface: OCR the PDF and return plain text, one grouped row per line."""
    positioned = extract_pdf_with_position(pdf_path)
    return "\n".join(
        " ".join(block['text'] for block in row)
        for row in group_by_rows(positioned)
    )
|
||
|
||
|
||
def extract_patient_info(ocr_text: str) -> dict:
    """
    Extract basic patient information from OCR text.

    Two report layouts are recognised:

    * Chinese header line within the first 20 lines, e.g.
      ``姓名 姚友胜 性别男 体检单号1125041700091 年龄59``. The exam date is
      derived from characters 3-8 of the exam number (YYMMDD) when they form
      a plausible month/day; otherwise a ``检查日期`` line within the first 50
      lines is used.
    * English ``Patient Name`` / ``Sex`` / ``Age`` / ``Collected Date`` lines
      anywhere in the text. These are applied afterwards, so they override
      values from the Chinese header; when several lines match, the last one
      wins.

    Both ``:`` and the full-width ``:`` are accepted after English labels.
    ``Age`` is only matched when not preceded by an ASCII letter, so labels
    such as ``Page:`` or ``Percentage:`` are not mistaken for it.

    Args:
        ocr_text: OCR output, one row per line. ``None`` is treated as empty.

    Returns:
        dict with keys ``name``, ``gender`` ('男性'/'女性'), ``age`` (digits as
        str), ``nation`` (always '中国'), ``exam_time`` (``YYYY-MM-DD`` when the
        date could be parsed, otherwise the raw text), ``project`` (fixed
        value) and ``report_time`` (today's date). Fields not found stay ''.
    """
    from datetime import datetime

    info = {
        'name': '',
        'gender': '',
        'age': '',
        'nation': '中国',  # default; not read from the OCR text
        'exam_time': '',
        'project': '功能医学检测套餐',  # fixed value
        'report_time': datetime.now().strftime('%Y-%m-%d'),
    }

    lines = (ocr_text or '').split('\n')

    # ---------- Chinese report header ----------
    for line in lines[:20]:
        if '姓名' not in line or not ('性别' in line or '年龄' in line):
            continue

        name_m = re.search(r'姓名\s*(\S+)', line)
        if name_m:
            # Cut off a following label glued to the name by OCR.
            raw = re.split(r'性别|年龄|体检', name_m.group(1))[0]
            if raw:
                info['name'] = raw

        gender_m = re.search(r'性别\s*(男|女)', line)
        if gender_m:
            info['gender'] = '男性' if gender_m.group(1) == '男' else '女性'

        age_m = re.search(r'年龄\s*(\d+)', line)
        if age_m:
            info['age'] = age_m.group(1)

        # Exam number: 2-char prefix + YY + MM + DD + serial.
        id_m = re.search(r'体检单号\s*(\d+)', line)
        if id_m:
            id_str = id_m.group(1)
            if len(id_str) >= 8:
                yy, mm, dd = id_str[2:4], id_str[4:6], id_str[6:8]
                if 1 <= int(mm) <= 12 and 1 <= int(dd) <= 31:
                    info['exam_time'] = f'20{yy}-{mm}-{dd}'

        break  # only the first patient header line is used

    # ---------- Chinese report: exam date fallback ----------
    for line in lines[:50]:
        if '检查日期' in line and not info['exam_time']:
            date_m = re.search(r'(\d{4}[-/]\d{1,2}[-/]\d{1,2})', line)
            if date_m:
                info['exam_time'] = date_m.group(1)

    # ---------- English report ----------
    date_formats = ('%d %b %Y', '%Y-%m-%d', '%Y/%m/%d')

    for line in lines:
        line_lower = line.lower().strip()

        # Name: "Patient Name: MR. SHUNHU YU"
        if 'patient name' in line_lower:
            match = re.search(r'patient\s*name\s*[::]\s*(.+)', line, re.IGNORECASE)
            if match:
                name = match.group(1).strip()
                # Drop a leading courtesy title.
                name = re.sub(r'^(MR\.|MS\.|MRS\.|MISS\.?)\s*', '', name, flags=re.IGNORECASE)
                info['name'] = name.strip()

        # Gender: "Sex : Male" / "Sex : Female" ("female" contains "male",
        # so it has to be tested first).
        if 'sex' in line_lower and ('male' in line_lower or 'female' in line_lower):
            if 'female' in line_lower:
                info['gender'] = '女性'
            elif 'male' in line_lower:
                info['gender'] = '男性'

        # Age: "Age : 57Y6M17D" or "Age : 35"
        if 'age' in line_lower and (':' in line or ':' in line):
            match = re.search(r'(?<![A-Za-z])age\s*[::]\s*(\d+)', line, re.IGNORECASE)
            if match:
                info['age'] = match.group(1)

        # Exam date: "Collected Date/Time: 20 Dec 2025" or "Collected Date : 2025-07-20"
        if 'collected' in line_lower and ('date' in line_lower or 'time' in line_lower):
            match = re.search(
                r'collected\s*(?:date)?(?:/time)?\s*[::]\s*(.+?)(?:\s+\d{1,2}[::]\d{2})?$',
                line, re.IGNORECASE)
            if match:
                date_str = match.group(1).strip()
                for fmt in date_formats:
                    try:
                        parsed = datetime.strptime(date_str, fmt)
                    except ValueError:
                        continue
                    info['exam_time'] = parsed.strftime('%Y-%m-%d')
                    break
                else:
                    # Unknown format: keep the text as found.
                    info['exam_time'] = date_str

    return info
|
||
|
||
|
||
def fill_patient_info_in_template(doc, patient_info: dict):
    """
    Write patient information into the matching paragraphs of the template.

    Each paragraph is matched against the field keywords in a fixed order;
    only the first matching field is considered for a paragraph. When that
    field has a non-empty value, every run of the paragraph is blanked and
    the label plus value is written into the first run (or into a new run
    when the paragraph has none).

    Args:
        doc: python-docx ``Document``; ``doc.paragraphs`` is scanned.
        patient_info: dict with the keys ``name``, ``gender``, ``age``,
            ``nation``, ``exam_time``, ``project``, ``report_time``;
            missing keys count as empty.

    Returns:
        Number of paragraphs that were filled.
    """
    # (keyword searched in the paragraph, label written back, value)
    field_specs = [
        ('Name', 'Name / 姓名 :', patient_info.get('name', '')),
        ('Gender', 'Gender / 性别 :', patient_info.get('gender', '')),
        ('Age', 'Age / 年龄 :', patient_info.get('age', '')),
        ('Nation', 'Nation / 国籍 :', patient_info.get('nation', '')),
        ('Time / 体检', 'Time / 体检时间 :', patient_info.get('exam_time', '')),
        ('Project', 'Project / 体检项目 :', patient_info.get('project', '')),
        ('Time / 报告', 'Time / 报告时间 :', patient_info.get('report_time', '')),
    ]

    filled_count = 0

    for para in doc.paragraphs:
        text = para.text.strip()

        # First field whose keyword occurs in the paragraph, if any.
        hit = next(
            ((label, value) for keyword, label, value in field_specs if keyword in text),
            None,
        )
        if hit is None:
            continue

        label, value = hit
        # An empty value leaves the paragraph untouched.
        if not value:
            continue

        runs = para.runs
        for run in runs:
            run.text = ''

        new_text = label + value
        if runs:
            runs[0].text = new_text
        else:
            para.add_run(new_text)

        filled_count += 1
        print(f" ✓ 填充: {label}{value}")

    print(f" 共填充 {filled_count} 个患者信息字段")
    return filled_count
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Lookup tables and helpers used by parse_medical_data
# ---------------------------------------------------------------------------

# Lower-cased project-name fragment -> abbreviation (ABB).
# Lookup tries the longest key first (see _SORTED_ABB_KEYS); for keys of equal
# length the insertion order below decides, so do not reorder casually.
_NAME_TO_ABB = {
    # Complete blood count
    'mean cell hb concentration': 'MCHC', 'mchc': 'MCHC',
    'follicle stimulating': 'FSH', 'fsh': 'FSH', 'folicle stimulating': 'FSH',
    'mean corpuscular hemoglobin concentration': 'MCHC',
    'mean corpuscular hemoglobin': 'MCH',
    'rbc distribution width': 'RDW', 'rdw': 'RDW',
    'red cell distribution width': 'RDW',
    'total wbc': 'WBC', 'white blood cell': 'WBC', 'wbc': 'WBC',
    'red blood cell': 'RBC', 'rbc count': 'RBC', 'total rbc': 'RBC',
    # NOTE: 'hb' is deliberately not a key; it would match unrelated names.
    'hemoglobin(hb)': 'Hb', 'hemoglobin': 'Hb',
    'hematocrit': 'HCT', 'hct': 'HCT',
    'mean cell volume': 'MCV', 'mcv': 'MCV', 'mean corpuscular volume': 'MCV',
    'platelet count': 'PLT', 'platelet': 'PLT', 'plt': 'PLT',
    'mean platelet volume': 'MPV', 'mpv': 'MPV',
    'neutrophil': 'NEUT', 'neut': 'NEUT',
    'lymphocyte': 'LYMPH', 'lymph': 'LYMPH',
    'monocyte': 'MONO', 'mono': 'MONO',
    'eosinophil': 'EOS', 'eos': 'EOS',
    'basophil': 'BAS', 'bas': 'BAS',
    'esr': 'ESR', 'erythrocyte sedimentation': 'ESR',
    'glucose(fasting)': 'FPG', 'fasting glucose': 'FPG', 'glucose': 'GLU', 'glu': 'GLU',
    'hba1c': 'HbA1c', 'glycated hemoglobin': 'HbA1c', 'haemoglobin a1c': 'HbA1c',
    'haemoglobin alc': 'HbA1c', 'hemoglobin a1c': 'HbA1c',
    # Lipids
    'hdl-cholesterol': 'HDL', 'hdl cholesterol': 'HDL', 'hdl': 'HDL',
    'ldl-cholesterol': 'LDL', 'ldl cholesterol': 'LDL', 'ldl direct': 'LDL', 'ldl': 'LDL',
    'vldl-cholesterol': 'VLDL', 'vldl': 'VLDL',
    'total cholesterol': 'TC', 'cholesterol': 'TC',
    'triglyceride': 'TG', 'tg': 'TG',
    # Liver / kidney
    'alt': 'ALT', 'sgpt': 'ALT', 'alanine aminotransferase': 'ALT',
    'ast': 'AST', 'sgot': 'AST', 'aspartate aminotransferase': 'AST',
    'gamma glutamyl transferase': 'GGT', 'gamma gt': 'GGT', 'gamma-gt': 'GGT',
    'ggt': 'GGT', 'ggt(': 'GGT',
    'alp': 'ALP', 'alkaline phosphatase': 'ALP',
    'total bilirubin': 'TBIL', 'bilirubin total': 'TBIL', 'bilirubin(total)': 'TBIL',
    'direct bilirubin': 'DBIL', 'bilirubin(direct)': 'DBIL', 'bilirubin direct': 'DBIL',
    'ldh': 'LDH', 'lactate dehydrogenase': 'LDH',
    'inr': 'INR',
    'beta crosslap': 'CTX', 'beta-crosslap': 'CTX',
    'anion gap': 'AG',
    'estimated average glucose': 'EAG',
    'total protein': 'TP',
    'albumin': 'ALB', 'alb': 'ALB',
    'globulin': 'GLB',
    'bun': 'BUN', 'urea nitrogen': 'BUN', 'blood urea nitrogen': 'BUN',
    'carcinoembryonic': 'CEA', 'cea': 'CEA', 'carcinoembryonic antigen': 'CEA',
    'uric acid': 'UA', 'uricacid': 'UA', 'ua': 'UA', 'uric acid.': 'UA',
    'egfr': 'eGFR',
    # Hormones
    'tsh': 'TSH', 'thyroid stimulating': 'TSH',
    'ft3': 'FT3', 'free t3': 'FT3',
    'ft4': 'FT4', 'free t4': 'FT4',
    't3': 'T3', 't4': 'T4',
    'estrogen': 'E2', 'estradiol': 'E2', 'estradiol(e2)': 'E2',
    'progesterone': 'PROG',
    'testosterone': 'TESTO',
    'lh': 'LH',
    'cortisol': 'Cortisol',
    'igf-1': 'IGF-1', 'igf1': 'IGF-1',
    'dhea': 'DHEA', 'dhea-s': 'DHEA-S',
    'prolactin': 'PRL',
    # Tumour markers / hepatitis panel
    'afp': 'AFP', 'alpha fetoprotein': 'AFP',
    'ca125': 'CA125', 'ca 125': 'CA125',
    'ca153': 'CA153', 'ca 15-3': 'CA153', 'carbohydrate antigen 15-3': 'CA153',
    'carbohydrate antigen 15': 'CA153',
    'ca199': 'CA199', 'ca 19-9': 'CA199', 'carbohydrate antigen 19-9': 'CA199',
    'carbohydrate antigen 19': 'CA199',
    'psa': 'PSA',
    'hepatitis b surface antigen': 'HBsAg', 'hbsag': 'HBsAg', 'hbs ag': 'HBsAg',
    'hepatitis b surface antibody': 'HBsAb', 'hbsab': 'HBsAb', 'anti-hbs': 'HBsAb',
    'hbs ab': 'HBsAb',
    'hepatitis be antigen': 'HBeAg', 'hbeag': 'HBeAg', 'hbe ag': 'HBeAg',
    'hepatitis be antibody': 'HBeAb', 'hbeab': 'HBeAb', 'hbe ab': 'HBeAb',
    # Urinalysis.  'ph' used to be listed twice ('PH', then 'pH'); the later
    # entry won, so the effective value 'pH' is kept here.
    'ph': 'pH', 'acidity': 'PH',
    'specific gravity': 'SG', 'sp gravity': 'SG',
    'transparency': 'Clarity', 'clear': 'Clarity',
    'ketone': 'KET', 'ket': 'KET', 'ketones': 'KET',
    'bilirubin': 'BIL', 'bil': 'BIL',
    'urobilinogen': 'URO', 'uro': 'URO',
    'nitrite': 'NIT', 'nit': 'NIT',
    'leukocyte': 'LEU', 'leu': 'LEU', 'leucocyte': 'LEU',
    'erythrocyte': 'ERY', 'ery': 'ERY',
    'color': 'Color', 'colour': 'Color',
    'clarity': 'Clarity', 'turbidity': 'Clarity', 'appearance': 'Clarity',
    'bacteria': 'BAC', 'bact': 'BAC',
    'mucus': 'MUC',
    'yeast': 'Yeast',
    'crystal': 'CRY',
    'hepatitis b core antibody': 'HBcAb', 'hbcab': 'HBcAb', 'anti-hbc': 'HBcAb',
    'hbc ab': 'HBcAb',
    'hepatitis c antibody': 'Anti-HCV', 'anti-hcv': 'Anti-HCV', 'hcv ab': 'Anti-HCV',
    'hiv': 'HIV',
    'h.pylori': 'H.pylori IgG', 'h. pylori': 'H.pylori IgG', 'helicobacter': 'H.pylori IgG',
    # Minerals / vitamins.  'ca' is deliberately absent (matches "clinical").
    'calcium': 'Ca',
    'phosphorus': 'P', 'phosphate': 'P',
    'iron': 'Fe', 'serum iron': 'Fe',
    'ferritin': 'Ferritin',
    'zinc': 'Zn', 'zn': 'Zn',
    'copper': 'Cu', 'cu': 'Cu',
    'magnesium': 'Mg', 'mg': 'Mg',
    'vitamin b12': 'VitB12', 'vit b12': 'VitB12', 'b12': 'VitB12',
    'folate': 'Folate', 'folic acid': 'Folate',
    'vitamin d': '25-OH-VitD', '25-oh vitamin d': '25-OH-VitD',
    '25-hydroxy': '25-OH-VitD', 'vitamin d total': '25-OH-VitD',
    # Inflammation / immunity
    'crp': 'CRP', 'c-reactive protein': 'CRP',
    'hs-crp': 'hs-CRP',
    'rf': 'RF', 'rheumatoid factor': 'RF',
    'ana': 'ANA', 'antinuclear antibody': 'ANA',
    'immunoglobulin g': 'IgG', 'immunoglobulin a': 'IgA',
    'immunoglobulin m': 'IgM', 'immunoglobulin e': 'IgE',
    'igg': 'IgG', 'iga': 'IgA', 'igm': 'IgM', 'ige': 'IgE',
    'c3': 'C3', 'c4': 'C4',
    'nk cell': 'NK', 'cd16': 'NK', 'cd56': 'NK',
    # Bone metabolism
    'osteocalcin': 'OSTE',
    'p1np': 'P1NP',
    'ctx': 'CTX',
    'pth': 'PTH',
    # Blood group
    'abo group': 'ABO', 'abo blood group': 'ABO',
    'rh group': 'Rh', 'rh blood group': 'Rh',
    'sg': 'SG',
    'lipoprotein(a)': 'LP(A)', 'lipoprotein a': 'LP(A)',
    'apolipoprotein a1': 'APOA1', 'apolipoprotein a': 'APOA1',
    'apolipoprotein b': 'APOB',
    'protein': 'PRO',
    # Coagulation
    'prothrombin time': 'PT', 'pt': 'PT', 'prothrombin time(pt)': 'PT',
    'thrombin time': 'TT', 'tt': 'TT', 'thrombin time(tt)': 'TT',
    'fibrinogen': 'FIB', 'fibrinogen level': 'FIB',
    'd-dimer': 'D-Dimer', 'fdp d-dimer': 'D-Dimer',
    'aptt': 'APTT', 'activated partial thromboplastin': 'APTT',
    # Electrolytes
    'sodium': 'Na', 'na': 'Na',
    'potassium': 'K', 'k': 'K',
    'chloride': 'Cl', 'cl': 'Cl',
    'tco2': 'TCO2', 'co2': 'TCO2',
    # Homocysteine
    'homocysteine': 'HCY', 'hcy': 'HCY',
    # Heavy metals
    'lead': 'Pb', 'lead in blood': 'Pb',
    'chromium': 'Cr', 'chromium in blood': 'Cr',
    'manganese': 'Mn', 'manganese in blood': 'Mn',
    'nickel': 'Ni', 'nickel in blood': 'Ni',
    # More tumour markers
    'nse': 'NSE', 'neuron specific enolase': 'NSE',
    'cyfra': 'CYFRA21-1', 'cyfra 21-1': 'CYFRA21-1',
    # Lipid ratios
    'cholesterol/hdl-c ratio': 'TC/HDL', 'cholesterol/hdl ratio': 'TC/HDL', 'tc/hdl': 'TC/HDL',
    'ldl/hdl ratio': 'LDL/HDL', 'ldl/hdl': 'LDL/HDL',
    # Cardiac enzymes
    'ck-mb': 'CK-MB', 'ckmb': 'CK-MB', 'creatine kinase-mb': 'CK-MB',
    'creatine kinase': 'CK', 'ck': 'CK',
    # Thyroid
    'total t4': 'T4', 'totalt4': 'T4', 'thyroxine(t4)': 'T4',
    # Inflammation
    'aso': 'ASO', 'anti-streptolysin': 'ASO', 'anti streptolysin': 'ASO',
    'aso(anti-streptolysin': 'ASO',
    # Auto-antibodies
    'anti smith': 'Anti-Sm', 'anti-sm': 'Anti-Sm',
    'anti-n rnp': 'Anti-RNP', 'anti rnp': 'Anti-RNP',
}

# Longest key first so that the most specific name wins.
_SORTED_ABB_KEYS = sorted(_NAME_TO_ABB, key=len, reverse=True)

# Short keys that must match as a whole word ('alt' would otherwise match
# "cobalt"/"totalt4", 'ast' would match "contrast", ...).
_EXACT_MATCH_KEYS = frozenset({
    'ph', 'sg', 'ca', 'mg', 'na', 'k', 'cl', 'p', 'fe', 'zn', 'cu', 'ni', 'cr', 'mn', 'pb',
    'alt', 'ast', 'ggt', 'alp', 'ldh', 'bun', 'ua', 'tg', 'tc', 't3', 't4', 'fsh', 'lh',
    'hb', 'rbc', 'wbc', 'plt', 'mcv', 'mch', 'hct', 'rdw', 'mpv',
    'crp', 'rf', 'ana', 'pth', 'nse', 'cea', 'afp', 'psa', 'hiv',
})

# A line containing any of these (lower-case) fragments is report boilerplate
# or a mis-recognised reference range and is ignored.  Fragments are chosen to
# avoid false hits (plain 'tel' would match "platelet").
_SKIP_WORDS = (
    'page ', 'patient name', 'doctor:', 'laboratory', 'specimen.', 'specimen type',
    'collected date', 'printed', 'method:', 'bangkok', 'thailand',
    'tel.', 'tel(', 'fax.', 'fax-', 'email:', 'iso 15189', 'iso15189',
    'accreditation', 'lab no.', 'lab no:', 'labno', 'mrn.', 'mrn:', 'requested date',
    'received date', 'address/', 'sex :', 'sex:', 'age :', 'age:',
    'dob :', 'dob:', 'ref.no', 'copyright', 'reported by', 'authorised by',
    'print date', 'remark:', 'remark(', 'confidential', 'this report',
    'reference range', 'test name', 'result unit', 'edta blood',
    'morphology:', 'morphology.', 'adequate', 'differential count',
    'complete blood count', 'issue date', 'revision', 'normal range',
    'for 10-year', 'this equation', 'calculated by', 'outlab',
    'approved by', 'trimester', 'women(', 'female 21', 'post-menopause',
    'cytoplasmic', 'oct1114', 'comment:', 'comment.', 'secs',
    'report by', 'method:', 'method.', 'age:', 'age .', 'dr:', 'dr.',
    'age...',
    # Noise: reference ranges / annotations recognised as data
    'borderline high', 'borderline low',
    'female 12-', 'male 12-', 'female 14-', 'male 14-', 'female 15-', 'male 15-',
    'female 16-', 'male 16-', 'female 17-', 'male 17-', 'female 18-', 'male 18-',
    'female years', 'male years', 'thai male', 'thai female',
    'serum am', 'serum pm', 'years 501', 'years 508', 'years 1717',
    'years 546', 'years 468', 'years 231', 'years 225',
    'scc 0', 'high =', 'low =', 'age = ', 'rbc = 0', 'high = 160',
    'bilirubin = negative', 'bilirubin negative',
)

# Project names accepted even without a trailing colon / dot leader.
_KNOWN_SHORT_PROJECTS = frozenset({
    'ph', 'sg', 'pro', 'glu', 'nit', 'ket', 'bld', 'ery', 'leu', 'wbc', 'rbc',
    'color', 'turbidity',
})

# "Project name ....:" on a line of its own (ASCII or full-width colon).
_STANDARD_PROJECT_RE = re.compile(r'^[A-Za-z][A-Za-z0-9\s\-\(\)\.]+[\.:\uff1a]+\s*$')
# "(*)Thrombin Time" - star-prefixed project, no colon required.
_STAR_PROJECT_RE = re.compile(r'^\(\*\)([A-Za-z][A-Za-z0-9\s\-]+)$')
# A line consisting of a single unit token such as "mg/dL" or "10^3/uL".
_UNIT_LINE_RE = re.compile(r'^[\*a-zA-Z0-9\^\/\%\-\.]+$')

_QUALITATIVE_VALUE_PATTERNS = (
    r'^([ABO]|AB)\b',  # blood group
    r'^(Positive|Negative|Reactive|Non[- ]?[Rr]eactive|Normal|Abnormal|Adequate|Yellow|Clear|Straw|Amber)\b',
)

_NOISE_PROJECTS = frozenset({
    'age', 'high', 'low', 'a', 'h', 'l', 'clinical info',
    'context', 'guidelines', 'standards', 'personal data',
    'copyright', 'report', 'specimen', 'method',
})
_NOISE_PATTERNS = ('female ', 'male ', 'years ', 'handled following',
                   'evolving clinical', 'privacy laws')
_WBC_DIFF_ABBS = frozenset({'NEUT', 'LYMPH', 'MONO', 'EOS', 'BAS'})


def _find_abb(project_name):
    """Return the abbreviation for *project_name*.

    Known names are resolved through ``_NAME_TO_ABB`` (longest key first;
    short keys must match as a whole word).  Unknown names get an acronym
    built from the first letter of each word, capped at six characters.
    """
    pl = project_name.lower().strip()
    for key in _SORTED_ABB_KEYS:
        if key in _EXACT_MATCH_KEYS:
            escaped = re.escape(key)
            if (pl == key
                    or re.match(rf'^{escaped}[\s\.\:\d]', pl)
                    or re.search(rf'\b{escaped}\b', pl)):
                return _NAME_TO_ABB[key]
        elif key in pl:
            return _NAME_TO_ABB[key]

    words = [w for w in project_name.split() if w and w[0].isalpha()]
    if words:
        return ''.join(w[0].upper() for w in words)[:6]
    return project_name[:6].upper()


def _parse_value_line(text):
    """Parse a value line and return ``(result, point, unit)``.

    ``result`` is ``None`` when the line holds no recognisable value.
    ``point`` is an up or down arrow when an H / L flag follows the number.
    """
    text = text.strip()

    # Numeric value: "5.7H%", "140H", "230 H mg/dL", "158.00mg/dL", "95".
    # The look-ahead requires a real digit: a bare dot leader ("........ 6.0")
    # must not be taken for the number itself.
    m = re.match(r'^(?=\.?\d)([\d\.]+)\s*([HL])?\s*(.*)$', text, re.IGNORECASE)
    if m:
        point = ''
        if m.group(2):
            point = '↑' if m.group(2).upper() == 'H' else '↓'
        return m.group(1), point, m.group(3).strip()

    # Qualitative value, possibly followed by extra text
    # ("Yellow [Normal: Yellow]").
    for pattern in _QUALITATIVE_VALUE_PATTERNS:
        m = re.match(pattern, text, re.IGNORECASE)
        if m:
            return m.group(1), '', ''

    # Dot / colon leader followed by a number: "......... 6.0 (4.5-8.0)".
    m = re.match(r'^[\.:\s]+([<>]?\d+\.?\d*)\s*(.*)$', text)
    if m:
        return m.group(1), '', m.group(2).strip()

    return None, '', ''


def _split_unit_reference(rest):
    """Split ``"mg/dL (70-99)"`` into ``("mg/dL", "(70-99)")``.

    The first parenthesised group is taken as the reference range and
    everything before it as the unit.
    """
    rest = rest.strip() if rest else ''
    reference = ''
    ref_match = re.search(r'\(([^\)]+)\)', rest)
    if ref_match:
        reference = f'({ref_match.group(1)})'
        rest = rest[:ref_match.start()].strip()
    return rest, reference


def _is_skip_line(line):
    """True when *line* contains one of the boilerplate fragments."""
    lowered = line.lower()
    return any(word in lowered for word in _SKIP_WORDS)


def _starts_project(line):
    """True when *line* (non-empty) opens a new multi-line project entry."""
    if _STANDARD_PROJECT_RE.match(line) or _STAR_PROJECT_RE.match(line):
        return True
    return line.split()[0].lower().strip('.:') in _KNOWN_SHORT_PROJECTS


def _is_unit_line(line):
    """True when *line* looks like a stand-alone unit token."""
    return bool(
        _UNIT_LINE_RE.match(line)
        and not line[0].isdigit()
        and any(ch.isalpha() or ch == '%' for ch in line)
    )


def _make_item(abb, project, result, point, unit, reference, source_file):
    """Build one result record in the shape used throughout the pipeline."""
    return {
        'abb': abb,
        'project': project,
        'result': result,
        'point': point,
        'unit': unit,
        'reference': reference,
        'source': source_file,
    }


def parse_medical_data(text: str, source_file: str) -> list:
    """Parse laboratory results out of OCR text.

    Two layouts are understood:

    * multi-line: a project line (``"Glucose....:"``, ``"(*)Thrombin Time"``
      or a known short name such as ``"pH 6.0"``) followed by a value line,
      an optional unit line and an optional reference-range line;
    * single-line: project, dot leader, value and optional unit / range on
      one line (``"ESR 1 Hour ......: 20 H mm/hr"``).

    Args:
        text: OCR text of one report.
        source_file: name stored in each item's ``source`` field.

    Returns:
        A list of dicts with the keys ``abb``, ``project``, ``result``,
        ``point`` (an up/down arrow or ``''``), ``unit``, ``reference`` and
        ``source``.
    """
    items = []
    lines = [ln.strip() for ln in text.split('\n') if ln.strip()]

    i = 0
    while i < len(lines):
        line = lines[i]

        if _is_skip_line(line):
            i += 1
            continue

        is_standard_project = _STANDARD_PROJECT_RE.match(line)
        is_star_project = _STAR_PROJECT_RE.match(line)
        first_word = line.split()[0].lower().strip('.:')
        is_known_project = first_word in _KNOWN_SHORT_PROJECTS

        # ---- multi-line layout -------------------------------------------
        if is_standard_project or is_known_project or is_star_project:
            # Text following a known short project name on the same line.
            # Reset for every line: a value left over from an earlier line
            # must never be attributed to this project.
            remaining = ''

            if is_standard_project:
                project = re.sub(r'[\.:\uff1a]+\s*$', '', line).strip()
                project = re.sub(r'\.+', '', project).strip()
                project = re.sub(r'^\(\*\)', '', project).strip()
            elif is_star_project:
                project = is_star_project.group(1).strip()
            else:
                parts = line.split(maxsplit=1)
                project = parts[0].strip('.:')
                remaining = parts[1] if len(parts) > 1 else ''

            abb = _find_abb(project)

            result = None
            unit = ''
            reference = ''
            point = ''

            if remaining:
                r, p, u = _parse_value_line(remaining)
                if r:
                    result, point = r, p
                    unit, reference = _split_unit_reference(u)

            j = i + 1
            limit = min(len(lines), i + 6)

            # Phase 1: look for the value on the following lines.
            while j < limit and result is None:
                next_line = lines[j]

                if _is_skip_line(next_line):
                    j += 1
                    continue
                if _STANDARD_PROJECT_RE.match(next_line):
                    break  # next project starts; this one has no value

                r, p, u = _parse_value_line(next_line)
                if r:
                    result = r
                    point = p if p else point
                    if u:
                        parsed_unit, parsed_ref = _split_unit_reference(u)
                        unit = parsed_unit or unit
                        reference = parsed_ref or reference
                    j += 1
                    continue

                if _is_unit_line(next_line) and not unit:
                    unit = next_line
                j += 1

            # Phase 2: with a value in hand, pick up an optional unit line
            # and the reference-range line that directly follow it.  Stops at
            # the first line that is neither, without consuming it.
            if result is not None:
                while j < limit and not reference:
                    next_line = lines[j]

                    if _is_skip_line(next_line):
                        j += 1
                        continue
                    if _starts_project(next_line):
                        break
                    if next_line.startswith(('(', '<', '>')):
                        reference = (next_line if next_line.startswith('(')
                                     else f'({next_line})')
                        j += 1
                        break
                    if (not unit and _is_unit_line(next_line)
                            and _parse_value_line(next_line)[0] is None):
                        unit = next_line
                        j += 1
                        continue
                    break

            if result and abb:
                project_lower = project.lower()
                is_noise = (
                    project_lower in _NOISE_PROJECTS
                    or (project_lower == 'rbc' and result == '0')
                    or result in ('.', ':', '-', '/')
                    or len(project) > 50  # over-long names are never real
                    or any(p in project_lower for p in _NOISE_PATTERNS)
                )

                if not is_noise:
                    # WBC differential: decide between absolute count and
                    # percentage from the unit or the reference range.
                    if abb.upper() in _WBC_DIFF_ABBS:
                        is_percentage = False
                        if unit and '%' in unit:
                            is_percentage = True
                        elif reference:
                            ref_match = re.search(
                                r'\(?([\d\.]+)\s*[-–]\s*([\d\.]+)\)?', reference)
                            if ref_match:
                                try:
                                    low = float(ref_match.group(1))
                                    high = float(ref_match.group(2))
                                except ValueError:
                                    pass  # e.g. "1.2.3" - not a usable range
                                else:
                                    if (0 <= low <= 100 and 0 <= high <= 100
                                            and '10^' not in reference
                                            and '*10' not in reference):
                                        is_percentage = True

                        if is_percentage:
                            abb = abb.upper() + '%'
                            if not unit:
                                unit = '%'

                    items.append(_make_item(abb, project, result, point,
                                            unit, reference, source_file))

            i = j
            continue

        # ---- single-line layout: qualitative result ----------------------
        # "Project....: Negative" or "Project..... . Negative"
        match = re.match(
            r'^(.+?)[\.\s]{2,}[:\uff1a]?\s*(Negative|Positive|Non[- ]?Reactive|Reactive|Normal|B|A|AB|O|Yellow|Clear)\b',
            line, re.IGNORECASE)
        if match:
            project = re.sub(r'\.+', '', match.group(1).strip()).strip()
            result = match.group(2).strip()

            project_lower = project.lower()
            is_noise = (
                project_lower in ('age', 'high', 'low', 'a', 'h', 'l')
                or any(p in project_lower for p in ('female ', 'male ', 'years '))
            )
            if not is_noise:
                items.append(_make_item(_find_abb(project), project, result,
                                        '', '', '', source_file))
            # Noise rows are dropped rather than retried against the
            # looser patterns below.
            i += 1
            continue

        # ---- single-line layout: "HIV-1/HIV-2 Antibody.....: Non Reactive"
        match = re.match(
            r'^([A-Za-z][A-Za-z0-9\s\-\(\)/\.]+)[:\uff1a]+\s*(Non[- ]?[Rr]eactive|Reactive|Negative|Positive|Yellow|Clear)$',
            line, re.IGNORECASE)
        if match:
            project = re.sub(r'\.+', '', match.group(1).strip()).strip()
            items.append(_make_item(_find_abb(project), project, match.group(2),
                                    '', '', '', source_file))
            i += 1
            continue

        # ---- single-line layout: numeric result --------------------------
        # "ESR 1 Hour ...........: 20 H mm/hr" or "pH......... 6.0 (4.5-8.0)"
        match = re.match(
            r'^(.+?)[\.\s]{2,}[:\uff1a]?\s*([<>]?\d+\.?\d*)\s*([HL])?\s*(.*)$',
            line, re.IGNORECASE)
        if match:
            project = re.sub(r'\.+', '', match.group(1).strip()).strip()
            flag = (match.group(3) or '').upper()
            point = '↑' if flag == 'H' else ('↓' if flag == 'L' else '')
            unit, reference = _split_unit_reference(match.group(4))
            items.append(_make_item(_find_abb(project), project, match.group(2),
                                    point, unit, reference, source_file))
            i += 1
            continue

        # ---- fallback 1: "CEA(Carcinoembryonic Antigen).: 1.41" ----------
        match = re.match(
            r'^([A-Za-z][A-Za-z0-9\s\-]+)\([^\)]+\)[\.:\s]+\s*([<>]?\d+\.?\d*)\s*(.*)$',
            line)
        if match:
            project = match.group(1).strip()
            unit, reference = _split_unit_reference(match.group(3))
            items.append(_make_item(_find_abb(project), project, match.group(2),
                                    '', unit, reference, source_file))
            i += 1
            continue

        # ---- fallback 2: dot leader then colon/space and a result --------
        # "Color............ Yellow", "Specific Gravity.........: 1.030"
        match = re.match(r'^([A-Za-z][A-Za-z0-9\s\-/\(\)]*?)\.{3,}[:\s]+(.+)$', line)
        if match:
            project = match.group(1).strip()
            rest = match.group(2).strip()
            abb = _find_abb(project)

            result = None
            unit = ''
            reference = ''

            num_match = re.match(r'^([<>]?\d+\.?\d*)\s*([HL])?\s*(.*)$', rest,
                                 re.IGNORECASE)
            if num_match:
                result = num_match.group(1)
                unit, reference = _split_unit_reference(num_match.group(3))
            else:
                qual_match = re.match(
                    r'^(Negative|Positive|Yellow|Clear|Normal|Non[- ]?Reactive|Reactive)\b',
                    rest, re.IGNORECASE)
                if qual_match:
                    result = qual_match.group(1)

            if result and abb:
                items.append(_make_item(abb, project, result, '', unit,
                                        reference, source_file))
                i += 1
                continue

        i += 1

    return items
|
||
|
||
|
||
def clean_extracted_data(items: list) -> list:
    """Clean parsed lab items and repair common OCR parsing mistakes.

    Steps applied to every item:

    1. drop obvious noise (single-letter abbreviations, "age", cycle phases);
    2. when the result is empty but the unit holds a colour or a qualitative
       word, move that word into the result;
    3. drop items whose result is empty or a placeholder;
    4. strip reference-range text that leaked into the unit;
    5. drop pH / SG values outside their plausible range.

    Kept items are updated in place (``result``, ``unit`` and, when it was
    empty, ``reference``) and returned in their original order.

    Args:
        items: list of item dicts as produced by the parser.

    Returns:
        The list of items that survived cleaning.
    """
    colors = ('yellow', 'amber', 'straw', 'colorless', 'red', 'brown', 'dark', 'clear')
    # 'non-reactive' is handled separately before this list is consulted.
    qualitative = ('negative', 'positive', 'reactive', 'normal')
    empty_results = ('', '.', '-', '/')

    cleaned = []

    for item in items:
        abb = item.get('abb', '').upper()
        result = item.get('result', '')
        unit = item.get('unit', '')
        project = item.get('project', '')
        reference = item.get('reference', '')

        # 1. Obvious noise
        project_lower = project.lower()
        if abb in ('A', 'H', 'L', 'R', 'AGE'):
            continue
        if project_lower in ('age', 'high', 'low', 'received', 'collected'):
            continue
        if 'phase' in project_lower or 'trimester' in project_lower:
            continue

        # 2. Result that ended up in the unit field (e.g. Color -> "Yellow")
        if result in empty_results and unit:
            unit_lower = unit.lower()
            for color in colors:
                if color in unit_lower:
                    result = color.capitalize()
                    # "[Normal: 1-5]" style range embedded in the unit
                    if '[' in unit and 'normal' in unit_lower:
                        ref_match = re.search(r'\[.*?(\d.*?)\]', unit, re.IGNORECASE)
                        if ref_match:
                            reference = ref_match.group(1)
                    unit = ''
                    break

            unit_lower = unit.lower()
            # "non-reactive" contains "reactive": test the negated form first,
            # otherwise a non-reactive finding would be stored as "Reactive".
            if re.search(r'non[- ]?reactive', unit_lower):
                result = 'Non-reactive'
                unit = ''
            else:
                for word in qualitative:
                    if word in unit_lower:
                        result = word.capitalize()
                        unit = ''
                        break

        # 3. Invalid / placeholder results
        if result in ('', '.', '-', '/', '00', '99', '999'):
            continue

        # 4. Reference-range text inside the unit: keep only the leading token
        if unit and ('[' in unit or 'normal' in unit.lower()):
            unit_match = re.match(r'^([a-zA-Z0-9\^/%\*]+)', unit)
            if unit_match and len(unit_match.group(1)) <= 15:
                unit = unit_match.group(1)
            else:
                unit = ''

        # 5. Plausibility checks: pH within 4.0-9.0, SG within 1.000-1.050.
        #    Values that cannot be read as a number are dropped as well.
        if abb in ('PH', 'SG'):
            try:
                val = float(result.replace(',', '.'))
            except (ValueError, AttributeError, TypeError):
                continue
            low, high = (4.0, 9.0) if abb == 'PH' else (1.000, 1.050)
            if not (low <= val <= high):
                continue

        # 6. Write the repaired fields back
        item['result'] = result
        item['unit'] = unit
        if reference and not item.get('reference'):
            item['reference'] = reference

        cleaned.append(item)

    return cleaned
|
||
|
||
|
||
def extract_all_pdfs(pdf_dir: str) -> tuple:
    """Extract lab items from every ``*.pdf`` directly inside *pdf_dir*.

    Each PDF is converted to text with ``extract_pdf_text`` and parsed with
    ``parse_medical_data_v2``; the combined items are then passed through
    ``clean_extracted_data_v2`` once.

    Returns:
        tuple: ``(all_items, ocr_texts)`` - the cleaned item list and a dict
        mapping each PDF file name to its extracted text, kept so later
        steps can reuse it.
    """
    ocr_texts = {}  # {pdf_name: ocr_text}
    collected = []

    for pdf_file in list(Path(pdf_dir).glob("*.pdf")):
        name = pdf_file.name
        print(f"\n📄 处理: {name}")

        ocr_text = extract_pdf_text(str(pdf_file))
        ocr_texts[name] = ocr_text

        parsed = parse_medical_data_v2(ocr_text, name)
        print(f" ✓ 提取 {len(parsed)} 个检测项")
        collected.extend(parsed)

    # Cleaning runs once over the combined list, not per file.
    all_items = clean_extracted_data_v2(collected)
    print(f"\n ✓ 清洗后保留 {len(all_items)} 个有效检测项")

    return all_items, ocr_texts
|
||
|
||
|
||
def match_with_template(extracted_items: list, template_config: dict) -> dict:
    """Match extracted lab items against the template structure.

    Items are grouped by upper-cased abbreviation.  For each group the best
    candidate is chosen (an abnormal flag outweighs everything else, then
    presence of unit, reference and project name).  When the abbreviation
    exists in the template, the chosen item is annotated in place with the
    template's ``module`` and, when available, ``project_cn``.

    Args:
        extracted_items: item dicts with at least an ``abb`` key.
        template_config: either the old format ``{'items': [...]}`` or the
            new format ``{'modules': {name: {'items': [...]}}}``.

    Returns:
        Dict keyed by the abbreviation as first seen in *extracted_items*.
        Items without a template entry are included too (without ``module``)
        so that later steps can treat them as missing projects.
    """
    # Accept both the old and the new configuration layout.
    if 'items' in template_config:
        template_items = template_config['items']
    elif 'modules' in template_config:
        template_items = [
            {
                'abb': entry.get('abb', ''),
                'project': entry.get('project', ''),
                'project_cn': entry.get('project_cn', ''),
                'module': module_name,
            }
            for module_name, module_data in template_config['modules'].items()
            for entry in module_data.get('items', [])
        ]
    else:
        template_items = []

    qualitative_abbs = {
        'PRO', 'GLU', 'KET', 'BIL', 'NIT', 'URO', 'LEU', 'BLD',
        'HBSAG', 'HBSAB', 'HBEAG', 'HBEAB', 'HBCAB', 'ANTI-HCV', 'HIV', 'RPR',
        'ANA', 'ANTI-SM', 'ANTI-RNP', 'RF',
    }
    valid_qualitative = {
        'negative', 'positive', 'trace', 'normal', 'abnormal',
        'reactive', 'non-reactive', 'nonreactive', 'weak positive',
        '1+', '2+', '3+', '4+', '+-',
    }
    abnormal_marks = ('↑', '↓', 'H', 'L', '高', '低')

    def is_valid_result(abb, result):
        """Check whether *result* is plausible for the project *abb*."""
        if not result:
            return False
        result_lower = result.lower().strip()
        abb_upper = abb.upper()

        if abb_upper in qualitative_abbs:
            if (result_lower in valid_qualitative
                    or result_lower.replace('+', '').replace('-', '') in ('1', '2', '3', '4')):
                return True
            # Some qualitative projects also report a number (e.g. a titre).
            return bool(re.search(r'\d', result))

        if abb_upper in ('ABO', 'RH'):
            return result_lower in ('a', 'b', 'ab', 'o', 'positive', 'negative',
                                    'rh+', 'rh-', '+', '-')

        if abb_upper == 'COLOR':
            return result_lower in ('yellow', 'amber', 'straw', 'colorless',
                                    'red', 'brown', 'dark')

        if abb_upper in ('PH', 'SG'):
            try:
                val = float(result.replace(',', '.'))
            except ValueError:
                return False
            if abb_upper == 'PH':
                return 4.0 <= val <= 9.0
            return 1.000 <= val <= 1.050

        # Numeric results: must contain a digit and not be a placeholder.
        if re.search(r'\d', result):
            if len(result) > 30:
                return False
            if result_lower in ('00', '99', '999'):
                return False
            return True

        return False

    def score(item):
        """Rank a candidate: abnormal flag (+10), unit/reference/project (+1 each)."""
        points = 10 if item.get('point', '').strip() in abnormal_marks else 0
        points += 1 if item.get('unit') else 0
        points += 1 if item.get('reference') else 0
        points += 1 if item.get('project') else 0
        return points

    # Template index by abbreviation.  Alias parts of a slash-separated
    # abbreviation are registered first and exact abbreviations afterwards,
    # so that e.g. the parts of "TC/HDL" can never replace the template's
    # own "TC" entry.
    template_by_abb = {}
    for t_item in template_items:
        t_abb = t_item['abb'].upper()
        if '/' in t_abb:
            for part in t_abb.split('/'):
                template_by_abb[part] = t_item
    for t_item in template_items:
        template_by_abb[t_item['abb'].upper()] = t_item

    # Group extracted items case-insensitively, remembering the spelling of
    # the first occurrence for use as the output key.
    items_by_abb = {}
    original_abb_map = {}
    for item in extracted_items:
        abb_upper = item['abb'].upper()
        if abb_upper not in items_by_abb:
            items_by_abb[abb_upper] = []
            original_abb_map[abb_upper] = item['abb']
        items_by_abb[abb_upper].append(item)

    matched = {}
    unmatched = []

    for abb_upper, group in items_by_abb.items():
        original_abb = original_abb_map.get(abb_upper, abb_upper)

        valid_items = [it for it in group if is_valid_result(abb_upper, it.get('result', ''))]
        if not valid_items:
            # Nothing validated: fall back to the first candidate.
            valid_items = group[:1]

        # max() keeps the earliest candidate among equal scores.
        best = max(valid_items, key=score)

        template_item = template_by_abb.get(abb_upper)
        if template_item is None:
            unmatched.append(best)
            continue

        if original_abb not in matched:
            best['module'] = template_item.get('module', '')
            if template_item.get('project_cn'):
                best['project_cn'] = template_item['project_cn']
            matched[original_abb] = best

    print(f"\n匹配结果: {len(matched)} 个匹配, {len(unmatched)} 个未匹配")

    # Unmatched items are returned as well so they can be handled as
    # missing projects later on.
    for item in unmatched:
        original_abb = item.get('abb', '')
        if original_abb and original_abb not in matched:
            matched[original_abb] = item

    return matched
|
||
|
||
|
||
def remove_placeholder_tables(doc):
    """Strip the template's tables down to their module-title rows.

    For every table in ``doc.tables``:

    * empty rows, header rows, data rows and "Clinical Significance" rows
      are removed (they are regenerated by later steps);
    * module-title rows are kept; any ``{{placeholder}}`` text inside them
      is cleared.

    A module-title table should therefore end up with exactly one row.

    Args:
        doc: python-docx ``Document``.

    Returns:
        Number of rows that were removed.
    """
    removed_count = 0
    placeholder_pattern = re.compile(r'\{\{[^}]*\}\}')

    # Full module names, lower-case, as they appear in title rows.
    module_title_patterns = [
        'blood sugar', 'blood count', 'complete blood count', 'urine detection', 'urine test',
        'liver function', 'kidney function', 'lipid profile', 'lipid panel',
        'thyroid function', 'thyroid', 'tumor marker', 'electrolyte', 'serum electrolyte',
        'coagulation', 'blood coagulation', 'immune', 'humoral immunity',
        'bone metabolism', 'infectious disease', 'four infectious',
        'heavy metal', 'microelement', 'trace element',
        'cardiovascular', 'thromboembolism', 'autoantibody', 'autoimmune',
        'blood type', 'inflammatory', 'lymphocyte',
        'female hormone', 'male hormone', 'female-specific', 'imaging',
        'myocardial enzyme', 'cardiac enzyme',
        '血常规', '尿液检测', '肝功能', '肾功能', '血脂', '甲状腺功能', '甲状腺',
        '肿瘤标志物', '电解质', '血糖', '凝血功能', '凝血', '体液免疫', '免疫功能',
        '骨代谢', '传染病', '重金属', '微量元素', '心脑血管', '自身抗体',
        '血型', '炎症', '淋巴细胞', '女性激素', '男性激素', '女性专项', '影像',
        '心肌酶', '女性荷尔蒙', '男性荷尔蒙'
    ]

    def is_module_title_row(row_text):
        """Decide whether *row_text* is a genuine module-title row.

        A title row repeats one module name at least three times and contains
        little else: the row must be shorter than three times the combined
        length of those repetitions.
        """
        row_text_lower = row_text.lower()
        for pattern in module_title_patterns:
            count = row_text_lower.count(pattern)
            if count >= 3 and len(row_text_lower) < len(pattern) * count * 3:
                return True
        return False

    for table in doc.tables:
        rows_to_remove = []

        for row in table.rows:
            row_text = ' '.join(c.text for c in row.cells).strip()

            if row_text and is_module_title_row(row_text):
                # Keep the title row, but clear any placeholder text in it.
                if '{{' in row_text:
                    for cell in row.cells:
                        if '{{' in cell.text:
                            cell.text = placeholder_pattern.sub('', cell.text).strip()
                continue

            # Everything else goes: empty rows, header rows, data rows and
            # Clinical Significance rows.
            rows_to_remove.append(row)

        tbl = table._tbl
        for row in rows_to_remove:
            try:
                tbl.remove(row._tr)
            except Exception as exc:
                # Best effort: one row that cannot be detached must not abort
                # the clean-up, but the failure should be visible.
                print(f"  [警告] 删除表格行失败: {exc}")
            else:
                removed_count += 1

    return removed_count
|
||
|
||
|
||
def find_module_title_position(doc, module_name):
    """
    Locate the table that carries a module's title row.

    In the template the module title is the first row of a table, not a
    paragraph. Two kinds of rows can mention a module keyword:

    - module-title rows: the module name repeated, e.g.
      "Blood Sugar\\n血糖 Blood Sugar\\n血糖 ..."
    - Clinical Significance rows: long descriptive text that may merely
      contain the keyword

    Only the former counts as a match.

    Args:
        doc: document exposing ``tables`` and ``element.body``.
        module_name: standard module name (e.g. ``'Blood Sugar'``). Unknown
            names fall back to a single keyword, the lower-cased name.

    Returns:
        Index of the matching table element among the children of
        ``doc.element.body`` (new tables are inserted after it), or ``-1``
        when no title row is found.
    """
    # Standard module name -> search keywords (all lower-case).
    module_titles = {
        # The 24 standard modules
        'Urine Test': ['urine test', 'urine detection', '尿液检测', '尿常规'],
        'Complete Blood Count': ['complete blood count', 'cbc', '血常规'],
        'Blood Sugar': ['blood sugar', '糖代谢', '血糖'],
        'Lipid Profile': ['lipid profile', 'lipid panel', '血脂'],
        'Blood Type': ['blood type', '血型'],
        'Blood Coagulation': ['blood coagulation', 'coagulation', '凝血功能', '凝血'],
        'Four Infectious Diseases': ['infectious disease', '传染病', 'four infectious'],
        'Serum Electrolytes': ['serum electrolyte', 'electrolyte', '电解质', '血清电解质'],
        'Liver Function': ['liver function', '肝功能'],
        'Kidney Function': ['kidney function', '肾功能'],
        'Myocardial Enzyme': ['myocardial enzyme', 'cardiac enzyme', '心肌酶', '心肌酶谱'],
        'Thyroid Function': ['thyroid function', '甲状腺功能', '甲功'],
        'Thromboembolism': ['thromboembolism', 'cardiovascular risk', '心脑血管', '血栓'],
        'Bone Metabolism': ['bone metabolism', '骨代谢'],
        'Microelement': ['microelement', 'trace element', 'heavy metal', '微量元素', '重金属'],
        'Lymphocyte Subpopulation': ['lymphocyte subpopulation', 'lymphocyte', '淋巴细胞亚群'],
        'Humoral Immunity': ['humoral immunity', 'immune function', '体液免疫', '免疫功能'],
        'Inflammatory Reaction': ['inflammatory reaction', 'inflammation', '炎症', '血沉'],
        'Autoantibody': ['autoantibody', 'autoimmune', '自身抗体', '自身免疫'],
        'Female Hormone': ['female hormone', '女性激素', '女性荷尔蒙'],
        # A bare 'male hormone' keyword would also match inside
        # 'female hormone', hence the compound keywords here.
        'Male Hormone': ['male hormone\n男性荷尔蒙', '男性激素', '男性荷尔蒙male hormone'],
        'Tumor Markers': ['tumor marker', '肿瘤标志物'],
        'Imaging': ['imaging', '影像'],
        'Female-specific': ['female-specific', 'gynecological', '妇科', '女性专项'],
    }

    titles = module_titles.get(module_name, [module_name.lower()])
    body = doc.element.body

    # Words that mark a long row as Clinical Significance prose.
    cs_indicators = ['used to', 'helps to', 'reflects', 'indicates', 'evaluating',
                     'diagnosis of', 'marker of', 'assessment', 'screening']
    # Words that mark a short row as descriptive rather than a title.
    description_words = ['content', 'level', 'reflects', 'indicates', 'assisting',
                         'diagnosis', 'evaluating', 'normal', 'reference']

    def is_module_title_row(row_text):
        """
        Tell a module-title row apart from a Clinical Significance row.

        A title row
        1. contains the module name, usually repeated,
        2. does not start with "Clinical Significance",
        3. carries no long descriptive content.
        """
        row_text_lower = row_text.lower().strip()

        # Reject Clinical Significance rows outright.
        if row_text_lower.startswith('clinical significance'):
            return False
        if '临床意义' in row_text and len(row_text) > 100:
            return False

        for title in titles:
            title_lower = title.lower()

            # Keyword repeated (>= 2): most likely the title row.
            if row_text_lower.count(title_lower) >= 2:
                is_long_description = (
                    len(row_text) > 500
                    and any(ind in row_text_lower for ind in cs_indicators)
                )
                return not is_long_description

            # Short row containing the keyword once: a title unless it
            # reads like a description.
            if len(row_text) < 150 and title_lower in row_text_lower:
                if not any(dw in row_text_lower for dw in description_words):
                    return True

        return False

    for table in doc.tables:
        # Only the first three rows can hold the module title.
        for row in table.rows[:3] if isinstance(table.rows, list) else [
            table.rows[i] for i in range(min(3, len(table.rows)))
        ]:
            row_text = ' '.join(c.text.strip() for c in row.cells)
            row_text_lower = row_text.lower()

            # Cheap keyword screen first, then the stricter title check.
            if not any(title in row_text_lower for title in titles):
                continue
            if not is_module_title_row(row_text):
                continue

            # Report where this table sits among the body's children.
            tbl_element = table._tbl
            for idx, child in enumerate(body):
                if child is tbl_element:
                    return idx

    return -1
|
||
|
||
|
||
def detect_gender(matched_data: dict, abb_config: dict) -> str:
    """
    [Deprecated] Guess the patient's gender from the matched hormone items.

    Note: this function is no longer used. Gender is now taken from the OCR
    text (via ``patient_info['gender']``); it is kept only as a fallback
    reference.

    Decision order:
    1. TPSA/FPSA/PSA (prostate-specific antigen) present -> male
    2. AMH (anti-Mullerian hormone) or CA125/CA15-3/SCC present -> female
    3. Otherwise, if an E2 (estradiol) value parses as a number:
       > 50 -> female, else male
    4. Otherwise -> female (COR originally lived in the female module)

    COR/Cortisol takes no part in the decision because it is itself one of
    the items that is assigned according to gender.

    Args:
        matched_data: mapping of ABB -> item dict; only ``'result'`` is read.
        abb_config: configuration dict; only ``'abb_aliases'`` is read.

    Returns:
        ``'male'`` or ``'female'``.
    """
    abb_aliases = abb_config.get('abb_aliases', {})

    psa_abbs = {'TPSA', 'FPSA', 'PSA', 'F/TPSA'}
    female_tumor_abbs = {'CA125', 'CA15-3', 'CA153', 'SCC'}

    def normalize(abb):
        """Map *abb* through the alias table (exact, then upper-cased key)."""
        abb_upper = abb.upper().strip()
        return abb_aliases.get(abb, abb_aliases.get(abb_upper, abb)).upper()

    has_amh = False  # female-only item
    has_psa = False  # male-only item
    has_female_tumor_markers = False
    e2_value = None  # estradiol value, if one could be parsed

    for abb, data in matched_data.items():
        result = data.get('result', '')
        if not result or result in ['', '.', '-', '/']:
            continue

        # An item counts when either its raw or its alias-normalised ABB hits.
        names = {abb.upper().strip(), normalize(abb)}

        if 'AMH' in names:
            has_amh = True
            print(f" 发现 AMH(抗缪勒氏管激素)→ 女性特有项目")

        if names & psa_abbs:
            has_psa = True
            print(f" 发现 {abb}(前列腺特异性抗原)→ 男性特有项目")

        if names & female_tumor_abbs:
            has_female_tumor_markers = True
            print(f" 发现 {abb}(女性肿瘤标志物)→ 女性特有项目")

        if 'E2' in names:
            try:
                # str() so numeric results are parsed too instead of being
                # dropped by an AttributeError on .replace().
                e2_value = float(str(result).replace(',', '').strip())
            except ValueError:
                # Non-numeric E2 result (e.g. "<18.4"): leave e2_value as is.
                pass
            else:
                print(f" 发现 E2(雌二醇)= {e2_value}")

    if has_psa:
        print(f" ✓ 检测结果: 男性 (发现前列腺特异性抗原)")
        return 'male'

    if has_amh or has_female_tumor_markers:
        print(f" ✓ 检测结果: 女性 (发现女性特有项目)")
        return 'female'

    # Fall back to the E2 level (threshold: 50).
    if e2_value is not None:
        if e2_value > 50:
            print(f" ✓ 检测结果: 女性 (E2 = {e2_value} > 50)")
            return 'female'
        print(f" ✓ 检测结果: 男性 (E2 = {e2_value} <= 50)")
        return 'male'

    # Default to female (COR originally lived in the female module).
    print(f" ✓ 检测结果: 女性 (默认)")
    return 'female'
|
||
|
||
|
||
def fill_word_template_new(template_path: str, matched_data: dict, output_path: str, api_key: str = None, patient_info: dict = None):
    """
    Fill the Word template using one newly inserted table per lab item.

    Procedure:
    1. Delete the template's existing placeholder/data rows.
    2. Group the items by module: hormone items go to the gender-specific
       hormone module, other items follow the ``abb_to_module`` config
       mapping, and whatever is left is classified through DeepSeek.
    3. Walk the modules in the standard order (standard of 2.pdf), then any
       extra modules, and insert a table for every item after the module's
       title table. Items not matched to a standard project end up at the
       tail of their module.

    Args:
        template_path: path of the Word template.
        matched_data: mapping of ABB -> item dict (``result``, ``point``,
            ``reference``, ``unit``, ``project``, ``project_cn`` are read).
        output_path: where to save the document; nothing is saved when falsy.
        api_key: DeepSeek API key (optional).
        patient_info: patient info dict; ``gender`` ('男性' / '女性') is read,
            as extracted from the OCR text.

    Returns:
        The filled document object.
    """
    doc = Document(template_path)

    # Step 0: remove the placeholder rows.
    print("\n 🧹 正在删除占位符行...")
    removed = remove_placeholder_tables(doc)
    print(f" ✓ 已删除 {removed} 个占位符行")

    # Load the configuration: module mapping, item info and standard order.
    from config import load_abb_config, get_standard_module_order, sort_items_by_standard_order, normalize_abb, normalize_module_name
    abb_config = load_abb_config()
    abb_to_module = abb_config.get('abb_to_module', {})
    abb_to_info = abb_config.get('abb_to_info', {})
    standard_module_order = get_standard_module_order()

    def lookup_by_abb(mapping, abb, normalized_abb, default):
        """
        Look an ABB up in *mapping*, trying exact spellings first.

        Exact matches come first because some ABBs are case-sensitive
        (TG vs Tg); the upper-cased variants are kept for backward
        compatibility. Returns *default* when no variant yields a truthy
        value.
        """
        for key in (normalized_abb, abb, normalized_abb.upper(), abb.upper()):
            value = mapping.get(key, default)
            if value:
                return value
        return default

    # Gender comes from patient_info (OCR text); the Chinese labels are
    # mapped to 'male' / 'female'.
    gender_from_ocr = patient_info.get('gender', '') if patient_info else ''
    if gender_from_ocr == '男性':
        detected_gender = 'male'
        print(f" ✓ 性别: 男性 (从OCR文本提取)")
    elif gender_from_ocr == '女性':
        detected_gender = 'female'
        print(f" ✓ 性别: 女性 (从OCR文本提取)")
    else:
        # No gender extracted from the OCR text: default to female.
        detected_gender = 'female'
        print(f" ⚠️ 未从OCR文本提取到性别,使用默认值: 女性")

    # Module that receives every hormone item for this patient.
    hormone_target_module = 'Male Hormone' if detected_gender == 'male' else 'Female Hormone'
    hormone_modules = ('Male Hormone', 'Female Hormone')

    # All hormone-related ABBs (they may appear in either hormone module).
    hormone_abbs = {
        'E2', 'PROG', 'FSH', 'LH', 'PRL', 'T', 'DHEAS', 'COR', 'CORTISOL',
        'IGF-1', 'IGF1', 'AMH', 'TESTO'
    }

    by_module = {}
    unclassified_items = []  # items the config could not place
    config_classified = 0
    deepseek_classified = 0

    print("\n 📂 步骤1: 根据配置文件分类...")

    for abb, data in matched_data.items():
        result = data.get('result', '')
        if not result or result in ['', '.', '-', '/']:
            continue

        normalized_abb = normalize_abb(abb, abb_config)
        module = lookup_by_abb(abb_to_module, abb, normalized_abb, '')

        # Hormone handling takes priority over the config mapping so the
        # item always lands in the module matching the patient's gender.
        is_hormone_abb = (
            abb.upper().strip() in hormone_abbs
            or normalized_abb.upper().strip() in hormone_abbs
            or module in hormone_modules
        )

        if is_hormone_abb:
            by_module.setdefault(hormone_target_module, []).append((abb, data))
            config_classified += 1
            print(f" ✓ {abb} → [{hormone_target_module}] (荷尔蒙项目,根据性别: {detected_gender})")
        elif module:
            by_module.setdefault(module, []).append((abb, data))
            config_classified += 1
        else:
            # Left for DeepSeek.
            unclassified_items.append((abb, data))

    print(f" ✓ 配置文件分类: {config_classified} 个项目")
    print(f" ⏳ 待DeepSeek分类: {len(unclassified_items)} 个项目")

    if unclassified_items:
        print("\n 🤖 步骤2: 使用DeepSeek分类未匹配项目...")
        still_unclassified = []
        for abb, data in unclassified_items:
            module = classify_abb_module(abb, data.get('project', abb), api_key)
            if not module:
                print(f" ✗ {abb} 无法分类")
                still_unclassified.append((abb, data))
                continue

            original_module = module
            module = normalize_module_name(module, abb_config)

            if module in hormone_modules:
                # A hormone module from DeepSeek is always re-targeted by gender.
                module = hormone_target_module
                print(f" ✓ {abb} → [{original_module}] → [{module}] (荷尔蒙项目,根据性别: {detected_gender})")
            elif original_module != module:
                print(f" ✓ {abb} → [{original_module}] → [{module}]")
            else:
                print(f" ✓ {abb} → [{module}]")

            by_module.setdefault(module, []).append((abb, data))
            deepseek_classified += 1

        # Keep only what DeepSeek could not place either.
        unclassified_items = still_unclassified
        print(f" ✓ DeepSeek分类: {deepseek_classified} 个项目")

    total_classified = config_classified + deepseek_classified
    print(f"\n 📋 分类完成: 共 {total_classified} 个项目,分布在 {len(by_module)} 个模块")
    if unclassified_items:
        print(f" ⚠️ {len(unclassified_items)} 个项目无法分类: {[i[0] for i in unclassified_items]}")

    # Step 3: fill the modules in standard order.
    added_count = 0
    skipped_modules = []

    print("\n 📝 步骤3: 按标准顺序填充模块...")

    def find_paired_item_in_list(items, target_abb):
        """Return the ``(abb, data)`` entry of *items* matching *target_abb* (case-insensitive), or None."""
        target_upper = target_abb.upper().strip()
        for abb, data in items:
            if abb.upper().strip() == target_upper:
                return (abb, data)
        return None

    def process_module_items(doc, module, sorted_items, position, abb_to_info, abb_config, api_key, gender=None):
        """
        Insert the tables for one module, merging paired items into one table.

        Insertion starts at *position* and the insert position advances by 2
        after every successful insert. Only the first table of the module
        gets a header row. Returns the final insert position.
        """
        nonlocal added_count

        insert_pos = position
        is_first_item = True
        processed_abbs = set()  # upper-cased ABBs already written

        for abb, data in sorted_items:
            abb_upper = abb.upper().strip()

            # Already emitted as the partner of an earlier paired item.
            if abb_upper in processed_abbs:
                continue

            result = data.get('result', '')
            point = data.get('point', '')
            reference = data.get('reference', '')
            unit = data.get('unit', '')

            normalized_abb = normalize_abb(abb, abb_config)
            info = lookup_by_abb(abb_to_info, abb, normalized_abb, {})

            # Chinese name: config first, then the extracted data, and as a
            # last resort a DeepSeek translation of the English name.
            name = info.get('project_cn') or data.get('project_cn')
            if not name:
                english_name = info.get('project') or data.get('project', abb)
                name = translate_project_name_to_chinese(abb, english_name, api_key)

            if is_paired_item(abb):
                paired_abb, is_base, base_cn, percent_cn = get_paired_item(abb)

                # Is the partner present in this module's data as well?
                paired_item_data = find_paired_item_in_list(sorted_items, paired_abb) if paired_abb else None

                if paired_item_data:
                    # Both halves exist: build one paired table. The ABBs are
                    # passed as they appear in the data (original casing).
                    paired_abb_actual, paired_data = paired_item_data
                    current_values = (result, point, reference, unit)
                    partner_values = tuple(
                        paired_data.get(field, '')
                        for field in ('result', 'point', 'reference', 'unit')
                    )

                    if is_base:
                        # Current item is the base item.
                        base_abb_name, percent_abb_name = abb, paired_abb_actual
                        base_values, percent_values = current_values, partner_values
                    else:
                        # Current item is the percentage item.
                        base_abb_name, percent_abb_name = paired_abb_actual, abb
                        base_values, percent_values = partner_values, current_values

                    # The explanation is requested for the current item.
                    ai_explanation = get_ai_explanation(abb, name, result, api_key, gender=gender)

                    try:
                        insert_paired_items_table_with_both_data(
                            doc, insert_pos,
                            base_abb_name, percent_abb_name,
                            base_cn, percent_cn,
                            *base_values,
                            *percent_values,
                            ai_explanation['en'], ai_explanation['cn'],
                            include_header=is_first_item  # header only on the module's first table
                        )
                        added_count += 1
                        insert_pos += 2
                        is_first_item = False

                        # Both halves are done.
                        processed_abbs.add(abb_upper)
                        processed_abbs.add(paired_abb.upper().strip())
                        print(f" ✓ 配对项目: {base_abb_name} + {percent_abb_name}")
                        continue
                    except Exception as e:
                        # Fall through to the plain single-item table below.
                        print(f" ✗ 添加配对项目 {abb} 失败: {e}")
                else:
                    # Only one half of the pair exists: use a plain table.
                    print(f" ℹ️ 配对项目 {abb} 的配对项 {paired_abb} 不存在,使用普通表格")

            # Plain item: its own table.
            ai_explanation = get_ai_explanation(abb, name, result, api_key, gender=gender)

            try:
                insert_table_after_position(
                    doc, insert_pos, abb, name, result,
                    ai_explanation['en'], ai_explanation['cn'],
                    point=point, reference=reference, unit=unit,
                    include_header=is_first_item  # header only on the module's first table
                )
                added_count += 1
                insert_pos += 2
                is_first_item = False
                processed_abbs.add(abb_upper)
            except Exception as e:
                print(f" ✗ 添加 {abb} 失败: {e}")

        return insert_pos

    def fill_module(module, items, label):
        """Sort one module's items, find its title table and insert the tables."""
        # Standard items first, non-standard ones after them.
        sorted_items = sort_items_by_standard_order(items, module, abb_config)
        position = find_module_title_position(doc, module)

        if position < 0:
            # No title table for this module: skip it and report later.
            skipped_modules.append((module, len(items)))
            return

        print(f" 📍 {label} [{module}] 标题位置: {position}, 共 {len(sorted_items)} 个项目")
        process_module_items(doc, module, sorted_items, position, abb_to_info, abb_config, api_key, gender=detected_gender)

    # Standard modules in their standard order ...
    for module in standard_module_order:
        if module in by_module:
            fill_module(module, by_module[module], '模块')

    # ... then the modules outside the standard order.
    for module, items in by_module.items():
        if module not in standard_module_order:
            fill_module(module, items, '额外模块')

    if skipped_modules:
        print(f"\n ⚠️ 跳过的模块(找不到标题):")
        for mod, cnt in skipped_modules:
            print(f" - {mod}: {cnt} 个项目")

    if unclassified_items:
        print(f"\n ⚠️ 无法分类的项目:")
        for abb, data in unclassified_items:
            print(f" - {abb}: {data.get('result', '')}")

    print(f"\n✓ 已为 {added_count} 个项目创建单独表格")

    # Save through the safe-save helper.
    if output_path:
        from xml_safe_save import safe_save
        safe_save(doc, output_path, template_path)
        print(f"✓ 保存到: {output_path}")

    return doc
|
||
|
||
|
||
def fill_word_template(template_path: str, matched_data: dict, output_path: str, api_key: str = None, patient_info: dict = None):
    """
    Fill the Word template with the matched data (legacy-compatible entry point).

    This is a thin wrapper: all work is delegated to
    ``fill_word_template_new``. The former in-place filling code that used
    to follow the ``return`` was unreachable (and referenced names that were
    never defined in this function), so it has been removed.

    Args:
        template_path: path of the Word template.
        matched_data: matched data dict.
        output_path: output file path.
        api_key: DeepSeek API key (optional).
        patient_info: patient info dict (optional).

    Returns:
        Whatever ``fill_word_template_new`` returns (the filled document).
    """
    return fill_word_template_new(template_path, matched_data, output_path, api_key, patient_info)
|
||
|
||
|
||
# DeepSeek API configuration: the key is read from the environment (.env)
# first, otherwise the fallback key below is used.
# SECURITY(review): a literal API key is committed to source control here.
# It is kept only so existing deployments that rely on the fallback keep
# working; it should be rotated and the fallback removed so the key comes
# from the environment alone.
DEEPSEEK_API_KEY = os.environ.get('DEEPSEEK_API_KEY', '') or "sk-a8653b2b866b4e26a0dea234a498b1fa"
DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"

# On-disk DeepSeek cache, stored next to this file.
DEEPSEEK_CACHE_FILE = Path(__file__).parent / "deepseek_cache.json"
_deepseek_cache = None  # in-memory copy of the cache, loaded lazily
|
||
|
||
def load_deepseek_cache():
    """
    Return the DeepSeek cache dict, loading it from disk on first use.

    The loaded dict is memoised in the module-level ``_deepseek_cache``, so
    later calls return the same object. When the cache file is missing,
    unreadable, not valid JSON, or does not hold a JSON object, an empty
    cache ``{'classifications': {}, 'explanations': {}}`` is used instead.
    """
    global _deepseek_cache
    if _deepseek_cache is not None:
        return _deepseek_cache

    loaded = None
    if DEEPSEEK_CACHE_FILE.exists():
        try:
            with open(DEEPSEEK_CACHE_FILE, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f" ⚠️ DeepSeek缓存读取失败,使用空缓存: {exc}")

    # Callers index the cache by section name, so anything but a dict
    # (e.g. a JSON list) is unusable and is replaced by the empty cache.
    if not isinstance(loaded, dict):
        loaded = {'classifications': {}, 'explanations': {}}

    _deepseek_cache = loaded
    return _deepseek_cache
|
||
|
||
def save_deepseek_cache():
    """
    Write the in-memory DeepSeek cache to ``DEEPSEEK_CACHE_FILE``.

    Nothing is written while the cache is unloaded or empty. The JSON is
    first written to a sibling ``.tmp`` file and then renamed over the real
    file, so an interrupted write cannot leave a truncated cache behind
    (which the loader would discard).

    Raises:
        OSError: if the file cannot be written or renamed.
    """
    if not _deepseek_cache:
        return

    tmp_file = DEEPSEEK_CACHE_FILE.with_name(DEEPSEEK_CACHE_FILE.name + '.tmp')
    try:
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(_deepseek_cache, f, ensure_ascii=False, indent=2)
        tmp_file.replace(DEEPSEEK_CACHE_FILE)
    except BaseException:
        # Do not leave a half-written temp file behind; then re-raise.
        if tmp_file.exists():
            tmp_file.unlink()
        raise
|
||
|
||
|
||
def translate_project_name_to_chinese(abb: str, project_name: str, api_key: str = None) -> str:
    """
    Translate an English test/project name into Chinese using DeepSeek.

    Successful translations are stored in the shared DeepSeek cache under
    ``cache['translations']`` with the key ``"<abb>:<project_name>"`` and are
    reused on later calls.

    Args:
        abb: Project abbreviation, sent to the model as context.
        project_name: English project name to translate.
        api_key: DeepSeek API key; without it no translation is attempted.

    Returns:
        The Chinese name, or ``project_name`` unchanged when the name or key is
        missing, the API call fails, or the reply contains no usable text.
    """
    if not project_name or not api_key:
        return project_name

    cache = load_deepseek_cache()
    translations = cache.setdefault('translations', {})

    cache_key = f"{abb}:{project_name}"
    cached = translations.get(cache_key)
    if cached:
        # Empty entries left by earlier runs are ignored and re-translated.
        return cached

    prompt = f"""请将以下医学检测项目名称翻译为中文。只返回中文翻译,不要其他内容。

项目缩写: {abb}
英文名称: {project_name}

要求:
1. 使用标准医学术语
2. 简洁准确
3. 只返回中文名称,不要其他说明"""

    try:
        response = call_deepseek_api(prompt, api_key, max_tokens=100, timeout=30)
        if response:
            # Keep only the first line, then drop surrounding quotes. Doing it in
            # this order avoids a dangling quote when the model appends extra lines.
            lines = response.strip().splitlines()
            cn_name = lines[0].strip().strip('"\'“”‘’').strip() if lines else ''

            # Never cache or return an empty translation.
            if cn_name:
                translations[cache_key] = cn_name
                save_deepseek_cache()
                return cn_name
    except Exception as e:
        # Translation is best-effort: report the problem and fall back to the
        # untranslated name.
        print(f" ⚠️ 翻译 {abb} 失败: {e}")

    return project_name
|
||
|
||
|
||
def enhance_data_with_deepseek(matched_data: dict, api_key: str) -> dict:
    """
    Fill gaps in the matched lab data with the help of DeepSeek.

    Two passes are made over ``matched_data``:

    1. Items without a reference range (numeric or qualitative results) are
       sent to DeepSeek and the returned ranges are written to ``'reference'``.
    2. Numeric items that have a reference range but no ``'point'`` flag are
       sent to DeepSeek, and any ``↑`` / ``↓`` / ``H`` / ``L`` verdict is
       written to ``'point'``.

    At most 30 items are sent per pass to keep the prompt short; further items
    are left untouched. API or parsing failures are reported and skipped.

    Args:
        matched_data: Mapping of abbreviation -> item dict. The item fields read
            here are ``result``, ``reference``, ``point``, ``unit`` and
            ``project``; ``None`` or non-string values are tolerated.
        api_key: DeepSeek API key passed through to ``call_deepseek_api``.

    Returns:
        The same ``matched_data`` object, updated in place.
    """
    items_need_reference = []  # items that still need a reference range
    items_need_check = []      # items whose abnormal flag has to be decided

    # Substrings that mark a result as qualitative rather than numeric.
    qualitative_keywords = ['negative', 'positive', 'non-reactive', 'reactive',
                            'normal', 'abnormal', '阴性', '阳性', '正常', '异常',
                            'clear', 'cloudy', 'yellow', 'amber', 'trace', 'nil']

    for abb, data in matched_data.items():
        result = _deepseek_text_field(data, 'result')
        reference = _deepseek_text_field(data, 'reference')
        point = _deepseek_text_field(data, 'point')
        unit = _deepseek_text_field(data, 'unit')
        project = data.get('project', abb)

        is_qualitative = any(kw in result.lower() for kw in qualitative_keywords)

        # Qualitative result without a reference range: ask for one.
        if is_qualitative and not reference:
            items_need_reference.append({
                'abb': abb,
                'project': project,
                'result': result,
                'unit': unit,
                'is_qualitative': True
            })
            continue

        try:
            # Only used to decide whether the result is numeric.
            float(result.replace(',', '').replace(' ', ''))
        except (ValueError, TypeError):
            # Non-numeric text that is not a known qualitative word: still try
            # to obtain a reference range for it.
            if not reference and result:
                items_need_reference.append({
                    'abb': abb,
                    'project': project,
                    'result': result,
                    'unit': unit,
                    'is_qualitative': True
                })
            continue

        if not reference:
            items_need_reference.append({
                'abb': abb,
                'project': project,
                'result': result,
                'unit': unit,
                'is_qualitative': False
            })
        elif not point:
            # Reference range present but no flag yet: needs an abnormal check.
            items_need_check.append({
                'abb': abb,
                'project': project,
                'result': result,
                'reference': reference,
                'unit': unit
            })

    print(f" 需要补充参考范围: {len(items_need_reference)} 个项目")
    print(f" 需要判断异常: {len(items_need_check)} 个项目")

    # 1. Fill in missing reference ranges.
    if items_need_reference:
        print(" 正在调用DeepSeek补充参考范围...")
        items_desc = []
        for item in items_need_reference[:30]:  # cap to keep the prompt short
            desc = f"- {item['abb']}: {item['project']}, 结果: {item['result']}"
            if item['unit']:
                desc += f" {item['unit']}"
            if item.get('is_qualitative'):
                desc += " (定性检测)"
            items_desc.append(desc)

        prompt = f"""你是一位医学检验专家。请为以下检测项目提供标准参考范围。

## 检测项目:
{chr(10).join(items_desc)}

## 要求:
1. 提供成人的标准参考范围
2. 数值型参考范围格式示例:3.5-5.5、0-10、0-40
3. 定性检测的参考范围通常是:Negative、Non-Reactive、Normal、Clear 等
4. 如果不确定,可以返回空字符串
5. 不要使用 < 或 > 符号,用具体范围表示,如 <5 改为 0-5

## 输出格式(JSON):
```json
{{
"ABB1": "参考范围",
"ABB2": "参考范围"
}}
```

只返回JSON,不要其他说明。"""

        try:
            response = call_deepseek_api(prompt, api_key, max_tokens=1000, timeout=60)
            if response:
                references = _deepseek_parse_json_object(response)
                updated_count = 0
                for abb, ref in references.items():
                    matched_key = _deepseek_match_key(matched_data, abb)
                    if matched_key is not None and ref:
                        matched_data[matched_key]['reference'] = ref
                        updated_count += 1
                print(f" ✓ 已补充 {updated_count} 个项目的参考范围")
        except Exception as e:
            # Best-effort enrichment: report and continue with what we have.
            print(f" ⚠️ 补充参考范围失败: {e}")

    # 2. Decide abnormal flags.
    if items_need_check:
        print(" 正在调用DeepSeek判断异常项目...")
        items_desc = []
        for item in items_need_check[:30]:  # cap to keep the prompt short
            desc = f"- {item['abb']}: {item['project']}, 结果: {item['result']}, 参考范围: {item['reference']}"
            if item['unit']:
                desc += f", 单位: {item['unit']}"
            items_desc.append(desc)

        prompt = f"""你是一位医学检验专家。请判断以下检测项目的结果是否异常。

## 检测项目:
{chr(10).join(items_desc)}

## 判断规则:
1. 如果结果超出参考范围上限,标记为 "↑"(偏高)
2. 如果结果低于参考范围下限,标记为 "↓"(偏低)
3. 如果结果在参考范围内,标记为 ""(正常,空字符串)
4. 参考范围格式可能是:3.5-5.5、<10、>100、0-40 等

## 输出格式(JSON):
```json
{{
"ABB1": "↑",
"ABB2": "↓",
"ABB3": ""
}}
```

只返回JSON,不要其他说明。"""

        try:
            response = call_deepseek_api(prompt, api_key, max_tokens=1000, timeout=60)
            if response:
                abnormal_flags = _deepseek_parse_json_object(response)
                abnormal_count = 0
                for abb, flag in abnormal_flags.items():
                    # Resolve the key the same way as for reference ranges so
                    # mixed-case abbreviations are not silently dropped.
                    matched_key = _deepseek_match_key(matched_data, abb)
                    if matched_key is not None and flag in ['↑', '↓', 'H', 'L']:
                        matched_data[matched_key]['point'] = flag
                        abnormal_count += 1
                        print(f" ✓ {matched_key}: {flag}")
                print(f" ✓ 发现 {abnormal_count} 个新异常项目")
        except Exception as e:
            # Best-effort enrichment: report and continue with what we have.
            print(f" ⚠️ 判断异常失败: {e}")

    return matched_data


def _deepseek_text_field(data, field):
    """Return ``data[field]`` as a stripped string; missing or ``None`` becomes ''."""
    value = data.get(field)
    return '' if value is None else str(value).strip()


def _deepseek_parse_json_object(response):
    """Strip an optional Markdown code fence from a reply and parse it as a JSON object."""
    import json

    if '```json' in response:
        response = response.split('```json')[1].split('```')[0]
    elif '```' in response:
        response = response.split('```')[1].split('```')[0]

    parsed = json.loads(response.strip())
    if not isinstance(parsed, dict):
        raise ValueError("DeepSeek返回的不是JSON对象")
    return parsed


def _deepseek_match_key(matched_data, abb):
    """Find the key of ``matched_data`` for ``abb``: exact match first, then case-insensitive."""
    if abb in matched_data:
        return abb
    for candidate in (abb.upper(), abb.lower()):
        if candidate in matched_data:
            return candidate
    folded = abb.casefold()
    for key in matched_data:
        if isinstance(key, str) and key.casefold() == folded:
            return key
    return None
|
||
|
||
|
||
def call_deepseek_api(prompt: str, api_key: str = None, max_tokens: int = 2000, timeout: int = 120) -> str:
    """
    Send one user message to the DeepSeek chat-completions endpoint.

    Args:
        prompt: Text sent as the single user message.
        api_key: API key; when falsy the module-level ``DEEPSEEK_API_KEY`` is used.
        max_tokens: Value of the ``max_tokens`` request field.
        timeout: Timeout in seconds handed to ``requests.post``.

    Returns:
        The content of the first returned choice, or ``None`` when no key is
        available, the HTTP status is not 200, or the request or response
        handling raises.
    """
    key = api_key if api_key else DEEPSEEK_API_KEY
    if not key:
        return None

    payload = {
        "model": "deepseek-chat",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.3,
        "max_tokens": max_tokens
    }
    request_headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json"
    }

    try:
        response = requests.post(DEEPSEEK_API_URL, headers=request_headers, json=payload, timeout=timeout)
        if response.status_code != 200:
            print(f" ⚠ DeepSeek API错误: {response.status_code}")
            return None
        return response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        # Any network or parsing problem is reported and turned into None.
        print(f" ⚠ DeepSeek请求失败: {e}")
        return None
|
||
|
||
|
||
def classify_abb_module(abb: str, project_name: str, api_key: str = None) -> str:
    """
    Decide which report module a test item belongs to.

    Resolution order:
        1. Disambiguation of the overloaded abbreviations ``TG`` and ``CR``.
        2. Exact lookup of the upper-cased abbreviation.
        3. Keyword search in the project name (longest keyword first).
        4. Cached DeepSeek classification for ``"<abb>:<project_name>"``.
        5. A DeepSeek API call (only when ``api_key`` is given); a valid answer
           is cached.

    Args:
        abb: Item abbreviation; matched case-insensitively.
        project_name: Item name in English or Chinese; ``None`` is treated as ''.
        api_key: DeepSeek API key; without it step 5 is skipped.

    Returns:
        The module name, or ``'Other'`` when nothing matches.
    """
    abb_upper = abb.upper()
    project_lower = (project_name or '').lower()

    # Module -> abbreviations that belong to it (all upper case).
    module_abbs = {
        # Urinalysis
        'Urine Detection': (
            'COLOR', 'CLARITY', 'SG', 'PH', 'PRO', 'GLU', 'KET', 'NIT', 'URO',
            'BIL', 'LEU', 'ERY', 'BLD', 'CRY', 'BAC',
        ),
        # Complete blood count
        'Complete Blood Count': (
            'WBC', 'RBC', 'HB', 'HGB', 'HCT', 'MCV', 'MCH', 'MCHC', 'PLT',
            'RDW', 'RDW-SD', 'RDW-CV', 'MPV', 'PDW', 'PCT', 'P-LCR',
            'NEUT', 'NEUT%', 'LYMPH', 'LYMPH%', 'MONO', 'MONO%',
            'EOS', 'EOS%', 'BAS', 'BAS%', 'ESR',
        ),
        # Liver function
        'Liver Function': (
            'ALT', 'AST', 'GGT', 'ALP', 'TBIL', 'DBIL', 'IBIL', 'TP', 'ALB',
            'GLB', 'A/G', 'LDH', 'CHE', 'TF',
        ),
        # Kidney function. 'CR' is deliberately absent: it is shared with
        # chromium and resolved by the project name further down.
        'Kidney Function': (
            'BUN', 'CREA', 'UA', 'EGFR', 'CYS-C', 'CYSC', 'Β2-MG', 'B2-MG',
        ),
        # Lipids
        'Lipid Panel': (
            'TC', 'TG', 'HDL', 'LDL', 'VLDL', 'APOA1', 'APOB', 'LP(A)', 'FFA',
        ),
        # Electrolytes
        'Electrolytes': (
            'NA', 'K', 'CL', 'CA', 'P', 'MG', 'FE', 'ZN', 'CU', 'TCO2', 'AG',
        ),
        # Glucose metabolism
        'Glucose': (
            'FPG', 'FBS', 'HBA1C', 'OGTT', 'INS', 'C-PEP', 'EAG',
        ),
        # Thyroid
        'Thyroid': (
            'TSH', 'FT3', 'FT4', 'T3', 'T4', 'TG-AB', 'TGAB', 'TPO-AB',
        ),
        # Hormones
        'Hormone': (
            'E2', 'PROG', 'TESTO', 'FSH', 'LH', 'PRL', 'CORTISOL', 'DHEA-S', 'IGF-1',
        ),
        # Tumor markers
        'Tumor Markers': (
            'AFP', 'CEA', 'CA125', 'CA153', 'CA199', 'PSA', 'FPSA', 'TPSA',
            'F/TPSA', 'NSE', 'CYFRA21-1', 'SCC', 'CA724', 'CA72-4', 'CA19-9',
            'CA24-2', 'CA50', 'PROGRP',
        ),
        # Coagulation
        'Coagulation': (
            'PT', 'APTT', 'TT', 'FIB', 'D-DIMER', 'INR', 'FDP',
        ),
        # Infectious disease
        'Infectious Disease': (
            'HBSAG', 'HBSAB', 'HBEAG', 'HBEAB', 'HBCAB', 'ANTI-HCV', 'HIV',
            'RPR', 'TPPA', 'H.PYLORI',
        ),
        # Immune function
        'Immune Function': (
            'IGG', 'IGA', 'IGM', 'IGE', 'C3', 'C4', 'CRP', 'HS-CRP', 'RF',
            'ANA', 'ANTI-SM', 'ANTI-RNP', 'ASO', 'NK',
        ),
        # Bone metabolism
        'Bone Metabolism': (
            'OSTE', 'P1NP', 'CTX', 'PTH', '25-OH-VITD', '25-OH-VD2+D3',
            'VD3', 'VD2', 'OST',
        ),
        # Heavy metals
        'Heavy Metals': (
            'PB', 'MN', 'NI', 'CR', 'CD', 'HG',
        ),
        # Vitamins ('FER' is ferritin, grouped here as anaemia related)
        'Vitamin': (
            'VITB12', 'FOLATE', 'VITD', 'VITA', 'VITE', 'VITK1',
            'VITB1', 'VITB2', 'VITB3', 'VITB5', 'VITB6', 'FER',
        ),
        # Homocysteine
        'Homocysteine': ('HCY',),
        # Blood type
        'Blood Type': ('ABO', 'RH'),
    }
    abb_module_map = {
        code: module for module, codes in module_abbs.items() for code in codes
    }

    # TG: thyroglobulin (Thyroid) vs. triglycerides (Lipid Panel, the default).
    if abb_upper == 'TG':
        if '甲状腺' in project_lower or 'thyroglobulin' in project_lower:
            return 'Thyroid'

    # CR: creatinine (Kidney Function) vs. chromium (Heavy Metals, the default).
    # The original table listed 'CR' twice, so the kidney entry was silently
    # overwritten and creatinine was always classified as a heavy metal.
    if abb_upper == 'CR':
        if 'creatinine' in project_lower or '肌酐' in project_lower:
            return 'Kidney Function'

    if abb_upper in abb_module_map:
        return abb_module_map[abb_upper]

    # Project-name keywords (English + Chinese). Insertion order matters: it
    # breaks ties between keywords of equal length in the sort below.
    keyword_module = {
        # Urinalysis
        'urine': 'Urine Detection', 'urinary': 'Urine Detection',
        '尿液': 'Urine Detection', '尿检': 'Urine Detection', '酸碱度': 'Urine Detection',
        '浊度': 'Urine Detection', '隐血': 'Urine Detection', '亚硝酸盐': 'Urine Detection', '酮体': 'Urine Detection',
        # Complete blood count
        'blood cell': 'Complete Blood Count', 'hemoglobin': 'Complete Blood Count',
        'platelet': 'Complete Blood Count', 'neutrophil': 'Complete Blood Count',
        '中性粒细胞': 'Complete Blood Count', '淋巴细胞数量': 'Complete Blood Count',
        '血红蛋白': 'Complete Blood Count', '血小板': 'Complete Blood Count',
        '嗜酸': 'Complete Blood Count', '嗜碱': 'Complete Blood Count', '单核细胞': 'Complete Blood Count',
        '红细胞': 'Complete Blood Count', '白细胞': 'Complete Blood Count',
        # Liver function
        'liver': 'Liver Function', 'hepat': 'Liver Function', 'bilirubin': 'Liver Function',
        '肝功能': 'Liver Function', '总蛋白': 'Liver Function', '白蛋白': 'Liver Function',
        '球蛋白': 'Liver Function', '胆红素': 'Liver Function', '转氨酶': 'Liver Function',
        '碱性磷酸酶': 'Liver Function', '谷氨酰': 'Liver Function',
        # Kidney function
        'kidney': 'Kidney Function', 'renal': 'Kidney Function', 'creatinine': 'Kidney Function',
        '肾功能': 'Kidney Function', '肌酐': 'Kidney Function', '尿素氮': 'Kidney Function', '尿酸': 'Kidney Function',
        # Lipids
        'cholesterol': 'Lipid Panel', 'triglyceride': 'Lipid Panel', 'lipid': 'Lipid Panel',
        '胆固醇': 'Lipid Panel', '甘油三酯': 'Lipid Panel', '脂蛋白': 'Lipid Panel', '血脂': 'Lipid Panel',
        # Glucose
        'glucose': 'Glucose', 'sugar': 'Glucose', 'hba1c': 'Glucose', 'insulin': 'Glucose',
        '空腹血糖': 'Glucose', '糖化血红蛋白': 'Glucose', '血糖': 'Glucose',
        # Thyroid
        'thyroid': 'Thyroid', 'tsh': 'Thyroid',
        '甲状腺': 'Thyroid', '促甲状腺': 'Thyroid',
        # Hormones
        'estrogen': 'Hormone', 'testosterone': 'Hormone', 'progesterone': 'Hormone',
        'cortisol': 'Hormone', 'hormone': 'Hormone',
        '雌二醇': 'Hormone', '孕酮': 'Hormone', '睾酮': 'Hormone', '催乳素': 'Hormone',
        '皮质醇': 'Hormone', '荷尔蒙': 'Hormone', '促卵泡': 'Hormone', '促黄体': 'Hormone',
        '脱氢表雄酮': 'Hormone', '生长因子': 'Hormone', '抗缪勒': 'Hormone',
        # Tumor markers
        'tumor': 'Tumor Markers', 'cancer': 'Tumor Markers', 'antigen': 'Tumor Markers',
        '肿瘤': 'Tumor Markers', '甲胎蛋白': 'Tumor Markers', '癌胚抗原': 'Tumor Markers',
        '铁蛋白': 'Tumor Markers', '糖类抗原': 'Tumor Markers', '前列腺': 'Tumor Markers',
        '鳞状细胞': 'Tumor Markers', '降钙素': 'Tumor Markers', '烯醇化酶': 'Tumor Markers',
        # Coagulation
        'coagul': 'Coagulation', 'thrombin': 'Coagulation', 'fibrin': 'Coagulation',
        '凝血': 'Coagulation', '纤维蛋白原': 'Coagulation',
        # Infectious disease
        'hepatitis': 'Infectious Disease', 'hiv': 'Infectious Disease', 'syphilis': 'Infectious Disease',
        '乙肝': 'Infectious Disease', '丙肝': 'Infectious Disease', '梅毒': 'Infectious Disease',
        '传染病': 'Infectious Disease', '免疫缺陷病毒': 'Infectious Disease',
        # Immune function
        'immun': 'Immune Function', 'antibod': 'Immune Function', 'complement': 'Immune Function',
        '红细胞沉降': 'Immune Function', '免疫球蛋白': 'Immune Function', '补体': 'Immune Function',
        'c反应蛋白': 'Immune Function', '抗链球菌': 'Immune Function', '抗核抗体': 'Immune Function',
        '类风湿因子': 'Immune Function', '炎症': 'Immune Function',
        # Bone metabolism
        'bone': 'Bone Metabolism', 'osteocalcin': 'Bone Metabolism',
        '骨代谢': 'Bone Metabolism', '骨钙素': 'Bone Metabolism', '甲状旁腺': 'Bone Metabolism',
        '维生素d': 'Bone Metabolism', '胶原': 'Bone Metabolism',
        # Heavy metals / trace elements
        'metal': 'Heavy Metals', 'lead': 'Heavy Metals', 'mercury': 'Heavy Metals',
        '微量元素': 'Heavy Metals', '重金属': 'Heavy Metals',
        # Vitamins
        'vitamin': 'Vitamin', 'folate': 'Vitamin', 'b12': 'Vitamin',
        # Homocysteine
        'homocysteine': 'Homocysteine',
        '同型半胱氨酸': 'Homocysteine',
        # Blood type
        '血型': 'Blood Type',
        # Cardiac enzymes
        '肌酸激酶': 'Immune Function', '乳酸脱氢酶': 'Immune Function',
        # Electrolytes
        '电解质': 'Electrolytes', '钾': 'Electrolytes', '钠': 'Electrolytes', '氯': 'Electrolytes',
        '钙': 'Electrolytes', '镁': 'Electrolytes', '磷': 'Electrolytes',
        # Gastric function
        '胃蛋白酶原': 'Immune Function', '胃泌素': 'Immune Function',
        # Vitamins
        '维生素': 'Vitamin',
        # Imaging
        '影像': 'Other', '心电图': 'Other', 'b超': 'Other',
        # Women's health
        '妇科': 'Other', '女性专项': 'Other',
    }

    # Longest keyword first, so that e.g. '糖化血红蛋白' wins over '血红蛋白'.
    for keyword, module in sorted(keyword_module.items(), key=lambda x: len(x[0]), reverse=True):
        if keyword in project_lower:
            return module

    # Rules failed: consult the cache, then (optionally) DeepSeek.
    cache = load_deepseek_cache()
    cache_key = f"{abb}:{project_name}"

    if cache_key in cache.get('classifications', {}):
        return cache['classifications'][cache_key]

    if api_key:
        prompt = f"""请判断以下医学检测项目属于哪个检测模块,只返回模块名称(英文):

项目缩写: {abb}
项目名称: {project_name}

可选模块:
- Urine Detection(尿液检测)
- Complete Blood Count(血常规)
- Liver Function(肝功能)
- Kidney Function(肾功能)
- Lipid Panel(血脂)
- Electrolytes(电解质)
- Glucose(糖代谢)
- Thyroid(甲状腺功能)
- Hormone(激素)
- Tumor Markers(肿瘤标志物)
- Coagulation(凝血功能)
- Infectious Disease(传染病)
- Immune Function(免疫功能)
- Bone Metabolism(骨代谢)
- Heavy Metals(重金属)
- Vitamin(维生素)
- Other(其他)

只返回英文模块名称,不要其他内容。"""

        result = call_deepseek_api(prompt, api_key, max_tokens=50)
        if result:
            result = result.strip()
            # Accept the answer only if it names one of the known modules.
            valid_modules = ['Urine Detection', 'Complete Blood Count', 'Liver Function',
                             'Kidney Function', 'Lipid Panel', 'Electrolytes', 'Glucose',
                             'Thyroid', 'Hormone', 'Tumor Markers', 'Coagulation',
                             'Infectious Disease', 'Immune Function', 'Bone Metabolism',
                             'Heavy Metals', 'Vitamin', 'Other']
            for vm in valid_modules:
                if vm.lower() in result.lower():
                    # The section may be absent in an older cache file.
                    cache.setdefault('classifications', {})[cache_key] = vm
                    save_deepseek_cache()
                    return vm

    return 'Other'
|
||
|
||
|
||
def get_ai_explanation(abb: str, project_name: str, result: str, api_key: str = None, gender: str = None) -> dict:
    """
    Return the clinical-significance text for a test item.

    Sources are tried in this order:
        1. ``template_explanations.json`` next to this file.
        2. The DeepSeek cache (``cache['explanations']``).
        3. A DeepSeek API call (only when ``api_key`` is given); a valid answer
           is cached.
        4. A small built-in table, then a generic sentence.

    Args:
        abb: Item abbreviation.
        project_name: Item name, used in the prompt, cache key and generic text.
        result: Test result. Accepted for interface compatibility; not used.
        api_key: DeepSeek API key; without it step 3 is skipped.
        gender: ``'male'`` selects the ``CORTISOL`` template entry for
            COR/Cortisol items; any other value selects ``COR``.

    Returns:
        A dict with the keys ``'en'`` and ``'cn'``.
    """
    import json as json_module
    from pathlib import Path

    # Abbreviation as extracted from reports -> key used in the template file.
    abb_aliases = {
        'WBC': 'WBC COUNT',
        'ABO': 'BLOOD TYPE',
        'Rh': 'BLOOD TYPE RH',
        'HCV': 'HCV-IGM',
        'Scr': 'SCR',
        'DBil': 'DBIL',
        'TBil': 'TBIL',
        'HbA1C': 'HBA1C',
        'Hcy': 'HCY',
        'Fer': 'FER',
        'TgAb': 'TGAB',
        'pH': 'PH',
        'β-CTX': 'Β-CTX',
        'Color': 'COLOR',
        'Clarity': 'TUR',
        'BIL': 'BIL',  # urine bilirubin
        'URO': 'URO',  # urobilinogen
        'ERY': 'BLD',  # urine erythrocytes / occult blood
        'IgA': 'IGA',
        'IgE': 'IGE',
        'IgG': 'IGG',
        'IgM': 'IGM',
        'Lp(a)': 'LP(A)',
        'hs-CRP': 'hs-CRP',
        # Electrolytes and trace elements (case mapping)
        'Cl': 'CL',
        'Na': 'NA',
        'Mg': 'MG',
        'Ca': 'CA',
        'K': 'K',
        'P': 'P',
        # Heavy metals (case mapping)
        'Pb': 'PB',
        'Cr': 'CR',
        'Hg': 'HG',
        'Cd': 'CD',
        'Mn': 'MN',
        'Ni': 'NI',
        'Zn': 'ZN',
        'Cu': 'CU',
        'Fe': 'FE',
        # Other
        'CIB': 'CIB',
    }

    def clinical_pair(entry):
        """Return {'en', 'cn'} when a template entry has both texts, else None."""
        if entry.get('clinical_en') and entry.get('clinical_cn'):
            return {'en': entry['clinical_en'], 'cn': entry['clinical_cn']}
        return None

    # COR/Cortisol: the template entry depends on gender.
    lookup_abb = abb
    abb_upper = abb.upper().strip()
    if abb_upper in ['COR', 'CORTISOL']:
        lookup_abb = 'CORTISOL' if gender == 'male' else 'COR'

    # Apply the alias table.
    if lookup_abb in abb_aliases:
        lookup_abb = abb_aliases[lookup_abb]
    elif lookup_abb.upper() in abb_aliases:
        lookup_abb = abb_aliases[lookup_abb.upper()]

    # 1. Template explanations file.
    template_explanations_file = Path(__file__).parent / "template_explanations.json"
    if template_explanations_file.exists():
        try:
            with open(template_explanations_file, 'r', encoding='utf-8') as f:
                template_explanations = json_module.load(f)

            # Exact key first (some abbreviations such as TG/Tg differ only by
            # case), then the upper-cased key.
            abb_upper_lookup = lookup_abb.upper().strip()
            for candidate in (lookup_abb.strip(), abb_upper_lookup):
                if candidate in template_explanations:
                    pair = clinical_pair(template_explanations[candidate])
                    if pair:
                        return pair

            # Compare with all non-alphanumeric characters removed.
            abb_clean = ''.join(c for c in abb_upper_lookup if c.isalnum())
            for key, value in template_explanations.items():
                key_clean = ''.join(c for c in key.upper() if c.isalnum())
                if abb_clean == key_clean:
                    pair = clinical_pair(value)
                    if pair:
                        return pair

            # Finally the original abbreviation, before alias mapping.
            for candidate in (abb.strip(), abb.upper().strip()):
                if candidate in template_explanations:
                    pair = clinical_pair(template_explanations[candidate])
                    if pair:
                        return pair

        except (OSError, ValueError, AttributeError, TypeError):
            # Unreadable or malformed template file: fall through to the other
            # sources instead of failing the report.
            pass

    # 2. Cache.
    cache = load_deepseek_cache()
    cache_key = f"{abb}:{project_name}"

    if cache_key in cache.get('explanations', {}):
        return cache['explanations'][cache_key]

    # 3. DeepSeek, when an API key is available.
    if api_key:
        prompt = f"""请为以下医学检测项目生成临床意义说明,分别用英文和中文各一段(每段50-80字)。

严格要求:
1. 只描述该检测项目是什么、测量什么、在医学上的意义
2. 禁止分析具体检测结果或数值
3. 禁止给出诊断建议、健康建议或治疗建议
4. 禁止使用"如果升高/降低则..."、"异常时..."等条件分析语句
5. 禁止使用"可能"、"也许"、"建议"等词汇
6. 使用客观、专业的医学术语,陈述事实

正确示例:
- "白细胞计数反映机体免疫系统状态,是评估感染和炎症的重要指标。"
- "血红蛋白是红细胞中携带氧气的蛋白质,反映血液的携氧能力。"

错误示例(禁止):
- "白细胞升高可能提示感染..."(禁止分析结果)
- "建议定期复查..."(禁止给建议)

项目缩写: {abb}
项目名称: {project_name}

请严格按照以下JSON格式返回,不要其他内容:
{{"en": "英文临床意义说明", "cn": "中文临床意义说明"}}"""

        response = call_deepseek_api(prompt, api_key, max_tokens=500)
        if response:
            # Strip an optional Markdown code fence before parsing.
            clean_response = response.strip()
            if '```json' in clean_response:
                clean_response = clean_response.split('```json')[1].split('```')[0]
            elif '```' in clean_response:
                clean_response = clean_response.split('```')[1].split('```')[0]

            try:
                data = json_module.loads(clean_response.strip())
            except ValueError:
                data = None  # not JSON: fall back to the built-in texts

            if isinstance(data, dict) and 'en' in data and 'cn' in data:
                # The section may be absent in an older cache file.
                cache.setdefault('explanations', {})[cache_key] = data
                try:
                    save_deepseek_cache()
                except OSError as e:
                    # A cache write failure must not discard a valid answer.
                    print(f" ⚠️ 保存DeepSeek缓存失败: {e}")
                return data

    # 4. Fallback: built-in texts.
    templates = {
        'WBC': {'en': 'White blood cell count reflects immune system status and is an important indicator for evaluating infection and inflammation.',
                'cn': '白细胞计数反映机体免疫系统状态,是评估感染和炎症的重要指标。'},
        'RBC': {'en': 'Red blood cell count reflects the oxygen-carrying capacity of blood and is used to evaluate anemia status.',
                'cn': '红细胞计数反映血液的携氧能力,用于评估贫血状况。'},
        'HB': {'en': 'Hemoglobin is the oxygen-carrying protein in red blood cells, reflecting the oxygen transport capacity of blood.',
               'cn': '血红蛋白是红细胞中携带氧气的蛋白质,反映血液的携氧能力。'},
        'PLT': {'en': 'Platelet count reflects the blood clotting function and hemostatic capacity.',
                'cn': '血小板计数反映血液的凝血功能和止血能力。'},
        'ALT': {'en': 'Alanine aminotransferase (ALT) is an enzyme primarily found in liver cells, reflecting liver cell integrity.',
                'cn': '谷丙转氨酶(ALT)主要存在于肝细胞中,反映肝细胞的完整性。'},
        'AST': {'en': 'Aspartate aminotransferase (AST) is an enzyme found in liver and heart muscle cells, reflecting tissue integrity.',
                'cn': '谷草转氨酶(AST)存在于肝脏和心肌细胞中,反映组织的完整性。'},
        'TC': {'en': 'Total cholesterol is a lipid component in blood, important for cardiovascular health assessment.',
               'cn': '总胆固醇是血液中的脂质成分,对心血管健康评估具有重要意义。'},
        'TG': {'en': 'Triglycerides are the main form of fat storage in the body, reflecting lipid metabolism status.',
               'cn': '甘油三酯是体内脂肪储存的主要形式,反映脂质代谢状况。'},
        'GLU': {'en': 'Blood glucose is the primary energy source for cells, essential for diabetes screening and metabolic assessment.',
                'cn': '血糖是细胞的主要能量来源,是糖尿病筛查和代谢评估的重要指标。'},
        'TSH': {'en': 'TSH level reflects thyroid function and helps diagnose thyroid disorders.',
                'cn': 'TSH水平反映甲状腺功能,有助于诊断甲状腺疾病。'},
    }

    if abb.upper() in templates:
        return templates[abb.upper()]

    # Generic sentence for anything else.
    return {
        "en": f"{project_name} ({abb}) is a medical test indicator used for health assessment and disease screening.",
        "cn": f"{project_name}({abb})是一项医学检测指标,用于健康评估和疾病筛查。"
    }
|
||
|
||
def find_module_end_position(doc, module_name):
    """
    Locate the last table that belongs to a report module.

    A module starts at the first table whose first or second row contains one
    of the module's title strings, and ends just before the next table whose
    first or second row contains a title of a different module (or at the last
    table of the document).

    Args:
        doc: Document object; ``doc.tables`` and ``doc.element.body`` are read.
        module_name: Module name such as ``'Liver Function'``. Names without an
            entry in the title table are matched by their lower-cased name.

    Returns:
        Index of the module's last table among the children of
        ``doc.element.body``, or ``-1`` when the module title is not found.
    """
    # Title strings per module (lower case); matched as substrings of a row's text.
    module_titles = {
        'Urine Detection': ['urine detection', '尿液检测'],
        'Complete Blood Count': ['complete blood count', '血常规'],
        'Heavy Metals': ['heavy metal', '重金属', 'trace element', '微量元素', 'microelement'],
        'Infectious Disease': ['infectious disease', '传染病', 'hepatitis', '肝炎'],
        'Kidney Function': ['kidney function', '肾功能'],
        'Liver Function': ['liver function', '肝功能'],
        'Lipid Panel': ['lipid panel', '血脂'],
        'Thyroid': ['thyroid function', '甲状腺功能'],
        'Hormone': ['hormone', '激素', 'female hormone', 'male hormone'],
        'Tumor Markers': ['tumor marker', '肿瘤标志物'],
        'Electrolytes': ['electrolyte', '电解质'],
        'Glucose': ['glucose metabolism', '糖代谢'],
        'Coagulation': ['coagulation', '凝血'],
        'Immune Function': ['immune function', '免疫功能', 'humoral immunity', '体液免疫'],
        'Bone Metabolism': ['bone metabolism', '骨代谢'],
    }

    titles = module_titles.get(module_name, [module_name.lower()])
    other_titles = [
        title
        for group in module_titles.values()
        for title in group
        if title not in titles
    ]

    # Read the table list once: python-docx builds a new list on every
    # ``doc.tables`` access, so indexing it inside a loop is quadratic.
    tables = doc.tables

    def leading_rows_contain(table, candidates):
        """True when the first or second row of ``table`` contains any candidate."""
        rows = table.rows
        for row_idx in range(min(2, len(rows))):
            row_text = ' '.join(c.text.lower().strip() for c in rows[row_idx].cells)
            if any(candidate in row_text for candidate in candidates):
                return True
        return False

    # Step 1: first table that carries this module's title.
    module_start_table_idx = next(
        (i for i, table in enumerate(tables) if leading_rows_contain(table, titles)),
        -1,
    )
    if module_start_table_idx < 0:
        return -1

    # Step 2: first later table that carries another module's title.
    next_module_table_idx = next(
        (
            i
            for i in range(module_start_table_idx + 1, len(tables))
            if leading_rows_contain(tables[i], other_titles)
        ),
        len(tables),
    )

    # Step 3: body position of the table just before the next module. This is
    # never before the start table because the search began one past it.
    tbl_element = tables[next_module_table_idx - 1]._tbl
    for idx, child in enumerate(doc.element.body):
        if child is tbl_element:
            return idx

    return -1
|
||
|
||
def insert_table_after_position(doc, position, abb, project_name, result, clinical_en, clinical_cn,
                                point='', reference='', unit='', include_header=False):
    """Create a one-item result table and move it to just after a body position.

    Layout without a header:
        Row 0: Abb | Project | Result | Point | Refer | Unit   (data row)
        Row 1: clinical significance, merged across all six columns

    Layout with ``include_header=True``:
        Row 0: bilingual column header
        Row 1: data row
        Row 2: clinical significance, merged across all six columns

    Args:
        doc: Document object; the table is created with ``doc.add_table`` and
            repositioned inside ``doc.element.body``.
        position: Index into ``doc.element.body``. When it is >= 0 the table
            is moved to ``position + 1`` and an empty paragraph is inserted at
            ``position + 2``. A negative value leaves the table where
            ``doc.add_table`` created it and adds no spacer paragraph.
        abb: Text for the Abb column.
        project_name: Text for the Project column.
        result: Result value; always rendered through ``str()``.
        clinical_en: English clinical-significance text.
        clinical_cn: Chinese clinical-significance text.
        point: Optional Point column value; skipped when falsy.
        reference: Optional reference range; passed through
            ``clean_reference_range`` first, skipped when falsy.
        unit: Optional Unit column value; skipped when falsy.
        include_header: Whether to add the header row above the data row.

    Returns:
        The table object that was created.
    """
    # Normalise the reference-range text before it is rendered.
    reference = clean_reference_range(reference)

    def apply_font(run, latin_font, east_asia_font, size_pt, bold=False):
        """Set weight, size and the Latin / East Asian typefaces of a run."""
        run.bold = bold
        run.font.name = latin_font
        run.font.size = Pt(size_pt)
        run._element.rPr.rFonts.set(qn('w:eastAsia'), east_asia_font)

    def fill_cell(cell, text, bold=False, font_size=10.5, write=True):
        """Centre the cell's first paragraph and, unless ``write`` is false, add a styled run."""
        paragraph = cell.paragraphs[0]
        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        if write:
            apply_font(paragraph.add_run(text), 'Times New Roman', '宋体', font_size, bold)

    def fill_clinical(paragraph, label, text):
        """Write a bold label followed by the explanation text (华文楷体, 11pt), left aligned."""
        paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
        apply_font(paragraph.add_run(label), '华文楷体', '华文楷体', 11, bold=True)
        apply_font(paragraph.add_run(text), '华文楷体', '华文楷体', 11)

    # One optional header row, one data row, one explanation row.
    header_rows = 1 if include_header else 0
    table = doc.add_table(rows=header_rows + 2, cols=6)
    table.alignment = WD_TABLE_ALIGNMENT.CENTER
    table.autofit = False

    # Fixed column widths, applied to every row.
    widths = [Cm(2.5), Cm(3.5), Cm(2.5), Cm(2.5), Cm(2.5), Cm(2.5)]
    rows = list(table.rows)
    for row in rows:
        for cell, width in zip(row.cells, widths):
            cell.width = width

    # === Header row ===
    if include_header:
        headers = [
            ('Abb', '简称'), ('Project', '项目'), ('Result', '结果'),
            ('Point', '提示'), ('Refer', '参考'), ('Unit', '单位')
        ]
        for cell, (en, cn) in zip(rows[0].cells, headers):
            fill_cell(cell, f'{en}\n{cn}', bold=True, font_size=9)

    # === Data row ===
    data_cells = rows[header_rows].cells
    fill_cell(data_cells[0], abb, bold=True)
    fill_cell(data_cells[1], project_name, bold=True)
    fill_cell(data_cells[2], str(result))
    # Optional columns are always centred but only receive text when a value
    # is present; str() keeps non-string values from breaking add_run().
    for column, value in enumerate((point, reference, unit), start=3):
        fill_cell(data_cells[column], str(value), write=bool(value))

    # === Clinical-significance row ===
    sig_row = rows[header_rows + 1]
    top_cell = sig_row.cells[0]
    for column in range(1, 6):
        top_cell.merge(sig_row.cells[column])

    # English and Chinese explanations live in separate paragraphs.
    fill_clinical(top_cell.paragraphs[0], 'Clinical Significance: ', clinical_en)
    fill_clinical(top_cell.add_paragraph(), '临床意义:', clinical_cn)

    # === Borders ===
    # Solid black line on top of the first row, dashed grey everywhere else.
    border_solid = {'val': 'single', 'sz': 4, 'color': '000000', 'space': 0}
    border_dashed = {'val': 'dashed', 'sz': 4, 'color': 'AAAAAA', 'space': 0}
    for row_index, row in enumerate(rows):
        top_border = border_solid if row_index == 0 else border_dashed
        for cell in row.cells:
            set_cell_border(cell, top=top_border, bottom=border_dashed,
                            left=border_dashed, right=border_dashed)
            # Vertically centre the cell content.
            cell.vertical_alignment = 1

    # === Placement ===
    if position >= 0:
        body = doc.element.body
        tbl_element = table._tbl
        # Detach the table from where add_table put it, then re-insert it
        # right after the requested position, followed by a blank paragraph.
        body.remove(tbl_element)
        body.insert(position + 1, tbl_element)
        body.insert(position + 2, OxmlElement('w:p'))

    return table
|
||
|
||
|
||
def insert_paired_items_table(doc, position,
                              abb, name_cn, result, clinical_en, clinical_cn,
                              point='', reference='', unit='',
                              include_header=False):
    """Create a table for a paired item (e.g. EOS and EOS%) sharing one explanation.

    ``get_paired_item(abb)`` is unpacked as
    ``(paired_abb, is_base, base_cn, percent_cn)``. When it reports no paired
    abbreviation, the call is delegated to ``insert_table_after_position``.

    Layout without a header:
        Row 0: base item       ABB | name | Result | Point | Refer | Unit
        Row 1: percentage item ABB | name | Result | Point | Refer | Unit
        Row 2: clinical significance, merged across all six columns

    With ``include_header=True`` a bilingual header row is added on top and
    the rows above shift down by one.

    Only one of the two data rows receives the measured values: the base row
    when ``abb`` is the base item, otherwise the percentage row. The other row
    shows just its abbreviation and name.

    Args:
        doc: Document object that receives the table.
        position: Index into ``doc.element.body``. When >= 0 the table is
            moved to ``position + 1`` and an empty paragraph is inserted at
            ``position + 2``; a negative value leaves the table where
            ``doc.add_table`` created it.
        abb: Abbreviation of the item whose values are supplied.
        name_cn: Item name; only used on the non-paired fallback path.
        result: Result value for the supplied item.
        clinical_en: English clinical-significance text.
        clinical_cn: Chinese clinical-significance text.
        point: Optional Point column value.
        reference: Optional reference range; cleaned with
            ``clean_reference_range`` like the other table builders do.
        unit: Optional Unit column value.
        include_header: Whether to add the header row.

    Returns:
        The table object that was created.
    """
    abb_upper = abb.upper().strip()
    paired_abb, is_base, base_cn, percent_cn = get_paired_item(abb)

    if not paired_abb:
        # Not a paired item: fall back to the ordinary single-item table.
        return insert_table_after_position(doc, position, abb, name_cn, result,
                                           clinical_en, clinical_cn,
                                           point=point, reference=reference, unit=unit,
                                           include_header=include_header)

    # Keep the reference format consistent with the other table builders.
    reference = clean_reference_range(reference)

    # Base item always occupies the first data row, percentage item the second;
    # the measured values go to whichever row matches the ABB that was passed in.
    measured = (result, point, reference, unit)
    blank = ('', '', '', '')
    if is_base:
        data_rows = [(abb_upper, base_cn, measured), (paired_abb, percent_cn, blank)]
    else:
        data_rows = [(paired_abb, base_cn, blank), (abb_upper, percent_cn, measured)]

    def apply_font(run, latin_font, east_asia_font, size_pt, bold=False):
        """Set weight, size and the Latin / East Asian typefaces of a run."""
        run.bold = bold
        run.font.name = latin_font
        run.font.size = Pt(size_pt)
        run._element.rPr.rFonts.set(qn('w:eastAsia'), east_asia_font)

    def fill_cell(cell, text, bold=False, font_size=10.5, write=True):
        """Centre the cell's first paragraph and, unless ``write`` is false, add a styled run."""
        paragraph = cell.paragraphs[0]
        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        if write:
            apply_font(paragraph.add_run(text), 'Times New Roman', '宋体', font_size, bold)

    def fill_data_row(row, abb_text, name_text, values):
        """Fill one data row; empty value slots are centred but left without text."""
        cells = row.cells
        row_result, *optional_values = values
        fill_cell(cells[0], abb_text, bold=True)
        fill_cell(cells[1], name_text, bold=True)
        # Test for "no value" explicitly so that a numeric 0 result is still shown.
        has_result = row_result is not None and row_result != ''
        fill_cell(cells[2], str(row_result), write=has_result)
        for column, value in enumerate(optional_values, start=3):
            fill_cell(cells[column], str(value), write=bool(value))

    def fill_clinical(paragraph, label, text):
        """Write a bold label followed by the explanation text (华文楷体, 11pt), left aligned."""
        paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
        apply_font(paragraph.add_run(label), '华文楷体', '华文楷体', 11, bold=True)
        apply_font(paragraph.add_run(text), '华文楷体', '华文楷体', 11)

    # One optional header row, two data rows, one explanation row.
    header_rows = 1 if include_header else 0
    table = doc.add_table(rows=header_rows + 3, cols=6)
    table.alignment = WD_TABLE_ALIGNMENT.CENTER
    table.autofit = False

    # Fixed column widths, applied to every row.
    widths = [Cm(2.5), Cm(3.5), Cm(2.5), Cm(2.5), Cm(2.5), Cm(2.5)]
    rows = list(table.rows)
    for row in rows:
        for cell, width in zip(row.cells, widths):
            cell.width = width

    # === Header row ===
    if include_header:
        headers = [
            ('Abb', '简称'), ('Project', '项目'), ('Result', '结果'),
            ('Point', '提示'), ('Refer', '参考'), ('Unit', '单位')
        ]
        for cell, (en, cn) in zip(rows[0].cells, headers):
            fill_cell(cell, f'{en}\n{cn}', bold=True, font_size=9)

    # === Data rows (base item first, percentage item second) ===
    for offset, (abb_text, name_text, values) in enumerate(data_rows):
        fill_data_row(rows[header_rows + offset], abb_text, name_text, values)

    # === Clinical-significance row ===
    sig_row = rows[header_rows + 2]
    top_cell = sig_row.cells[0]
    for column in range(1, 6):
        top_cell.merge(sig_row.cells[column])

    # English and Chinese explanations live in separate paragraphs.
    fill_clinical(top_cell.paragraphs[0], 'Clinical Significance: ', clinical_en)
    fill_clinical(top_cell.add_paragraph(), '临床意义:', clinical_cn)

    # === Borders ===
    # Solid black line on top of the first row, dashed grey everywhere else.
    border_solid = {'val': 'single', 'sz': 4, 'color': '000000', 'space': 0}
    border_dashed = {'val': 'dashed', 'sz': 4, 'color': 'AAAAAA', 'space': 0}
    for row_index, row in enumerate(rows):
        top_border = border_solid if row_index == 0 else border_dashed
        for cell in row.cells:
            set_cell_border(cell, top=top_border, bottom=border_dashed,
                            left=border_dashed, right=border_dashed)
            # NOTE(review): 1 appears to select vertical centring in python-docx - confirm.
            cell.vertical_alignment = 1

    # === Placement ===
    if position >= 0:
        body = doc.element.body
        tbl_element = table._tbl
        # Detach the table from where add_table put it, then re-insert it
        # right after the requested position, followed by a blank paragraph.
        body.remove(tbl_element)
        body.insert(position + 1, tbl_element)
        body.insert(position + 2, OxmlElement('w:p'))

    return table
|
||
|
||
|
||
def insert_paired_items_table_with_both_data(doc, position,
                                             base_abb, percent_abb,
                                             base_cn, percent_cn,
                                             base_result, base_point, base_reference, base_unit,
                                             percent_result, percent_point, percent_reference, percent_unit,
                                             clinical_en, clinical_cn,
                                             include_header=False):
    """Create a paired-item table in which both data rows carry measured values.

    Layout without a header:
        Row 0: base item       ABB | name | Result | Point | Refer | Unit
        Row 1: percentage item ABB | name | Result | Point | Refer | Unit
        Row 2: clinical significance, merged across all six columns

    With ``include_header=True`` a bilingual header row is added on top and
    the rows above shift down by one.

    Args:
        doc: Document object that receives the table.
        position: Index into ``doc.element.body``. When >= 0 the table is
            moved to ``position + 1`` and an empty paragraph is inserted at
            ``position + 2``; a negative value leaves the table where
            ``doc.add_table`` created it.
        base_abb, percent_abb: Abbreviations of the two rows.
        base_cn, percent_cn: Names shown in the Project column.
        base_result, base_point, base_reference, base_unit: Values of the
            base row.
        percent_result, percent_point, percent_reference, percent_unit:
            Values of the percentage row.
        clinical_en: English clinical-significance text shared by both rows.
        clinical_cn: Chinese clinical-significance text shared by both rows.
        include_header: Whether to add the header row.

    Both reference ranges are passed through ``clean_reference_range``.
    Result slots that are ``None`` or ``''`` stay empty; Point/Refer/Unit
    slots stay empty when falsy.

    Returns:
        The table object that was created.
    """
    # Normalise both reference ranges before rendering.
    base_reference = clean_reference_range(base_reference)
    percent_reference = clean_reference_range(percent_reference)

    data_rows = [
        (base_abb, base_cn, (base_result, base_point, base_reference, base_unit)),
        (percent_abb, percent_cn, (percent_result, percent_point, percent_reference, percent_unit)),
    ]

    def apply_font(run, latin_font, east_asia_font, size_pt, bold=False):
        """Set weight, size and the Latin / East Asian typefaces of a run."""
        run.bold = bold
        run.font.name = latin_font
        run.font.size = Pt(size_pt)
        run._element.rPr.rFonts.set(qn('w:eastAsia'), east_asia_font)

    def fill_cell(cell, text, bold=False, font_size=10.5, write=True):
        """Centre the cell's first paragraph and, unless ``write`` is false, add a styled run."""
        paragraph = cell.paragraphs[0]
        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        if write:
            apply_font(paragraph.add_run(text), 'Times New Roman', '宋体', font_size, bold)

    def fill_data_row(row, abb_text, name_text, values):
        """Fill one data row; empty value slots are centred but left without text."""
        cells = row.cells
        row_result, *optional_values = values
        fill_cell(cells[0], abb_text, bold=True)
        fill_cell(cells[1], name_text, bold=True)
        # Test for "no value" explicitly so that a numeric 0 result is still shown.
        has_result = row_result is not None and row_result != ''
        fill_cell(cells[2], str(row_result), write=has_result)
        for column, value in enumerate(optional_values, start=3):
            fill_cell(cells[column], str(value), write=bool(value))

    def fill_clinical(paragraph, label, text):
        """Write a bold label followed by the explanation text (华文楷体, 11pt), left aligned."""
        paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
        apply_font(paragraph.add_run(label), '华文楷体', '华文楷体', 11, bold=True)
        apply_font(paragraph.add_run(text), '华文楷体', '华文楷体', 11)

    # One optional header row, two data rows, one explanation row.
    header_rows = 1 if include_header else 0
    table = doc.add_table(rows=header_rows + 3, cols=6)
    table.alignment = WD_TABLE_ALIGNMENT.CENTER
    table.autofit = False

    # Fixed column widths, applied to every row.
    widths = [Cm(2.5), Cm(3.5), Cm(2.5), Cm(2.5), Cm(2.5), Cm(2.5)]
    rows = list(table.rows)
    for row in rows:
        for cell, width in zip(row.cells, widths):
            cell.width = width

    # === Header row ===
    if include_header:
        headers = [
            ('Abb', '简称'), ('Project', '项目'), ('Result', '结果'),
            ('Point', '提示'), ('Refer', '参考'), ('Unit', '单位')
        ]
        for cell, (en, cn) in zip(rows[0].cells, headers):
            fill_cell(cell, f'{en}\n{cn}', bold=True, font_size=9)

    # === Data rows (base item first, percentage item second) ===
    for offset, (abb_text, name_text, values) in enumerate(data_rows):
        fill_data_row(rows[header_rows + offset], abb_text, name_text, values)

    # === Clinical-significance row ===
    sig_row = rows[header_rows + 2]
    top_cell = sig_row.cells[0]
    for column in range(1, 6):
        top_cell.merge(sig_row.cells[column])

    # English and Chinese explanations live in separate paragraphs.
    fill_clinical(top_cell.paragraphs[0], 'Clinical Significance: ', clinical_en)
    fill_clinical(top_cell.add_paragraph(), '临床意义:', clinical_cn)

    # === Borders ===
    # Solid black line on top of the first row, dashed grey everywhere else.
    border_solid = {'val': 'single', 'sz': 4, 'color': '000000', 'space': 0}
    border_dashed = {'val': 'dashed', 'sz': 4, 'color': 'AAAAAA', 'space': 0}
    for row_index, row in enumerate(rows):
        top_border = border_solid if row_index == 0 else border_dashed
        for cell in row.cells:
            set_cell_border(cell, top=top_border, bottom=border_dashed,
                            left=border_dashed, right=border_dashed)
            # NOTE(review): 1 appears to select vertical centring in python-docx - confirm.
            cell.vertical_alignment = 1

    # === Placement ===
    if position >= 0:
        body = doc.element.body
        tbl_element = table._tbl
        # Detach the table from where add_table put it, then re-insert it
        # right after the requested position, followed by a blank paragraph.
        body.remove(tbl_element)
        body.insert(position + 1, tbl_element)
        body.insert(position + 2, OxmlElement('w:p'))

    return table
|
||
|
||
|
||
def add_missing_items_table(doc, unfilled_abbs, matched_data, api_key=None):
    """Append tables for report items that have no slot in the template.

    Steps:
        1. Assign every unfilled abbreviation to a module: first from the
           ``abb_to_module`` config mapping, otherwise via
           ``classify_abb_module`` followed by ``normalize_module_name``.
        2. For each module, insert one table per item (with placeholder
           explanation text) after the position reported by
           ``find_module_title_position``. Modules listed in the standard
           order are handled first, in that order; any others follow.
        3. When ``api_key`` is given, fetch an explanation for each added
           item with ``get_ai_explanation`` and replace the placeholder.

    Args:
        doc: Document object that is modified in place.
        unfilled_abbs: Iterable of abbreviations that were not filled in.
            Nothing happens when it is empty or ``None``.
        matched_data: Mapping from abbreviation to a dict read with the keys
            ``result``, ``project``, ``project_cn``, ``point``, ``reference``
            and ``unit``. Items whose ``result`` is falsy are ignored.
        api_key: Forwarded to the classification, translation and
            explanation helpers; step 3 is skipped when it is falsy.

    Returns:
        None.
    """
    if not unfilled_abbs:
        print("\n ✓ 没有缺失项目需要添加")
        return

    # Load module information and the standard module order from the config.
    from config import load_abb_config, get_standard_module_order, sort_items_by_standard_order, normalize_abb, normalize_module_name
    abb_config = load_abb_config()
    abb_to_module = abb_config.get('abb_to_module', {})
    abb_to_info = abb_config.get('abb_to_info', {})
    standard_module_order = get_standard_module_order()

    def lookup_by_abb(mapping, abb, default):
        """Look an ABB up by normalised name, raw name, then their upper-case forms."""
        normalized = normalize_abb(abb, abb_config)
        return (mapping.get(normalized, default)
                or mapping.get(abb, default)
                or mapping.get(normalized.upper(), default)
                or mapping.get(abb.upper(), default))

    print(f"\n 📋 开始处理 {len(unfilled_abbs)} 个缺失项目...")

    # ===== Step 1: decide which module every missing item belongs to =====
    print("\n 🔍 步骤1: 分析缺失项目所属模块...")

    by_module = {}          # {module: [(abb, data), ...]}
    items_to_classify = []  # items the config cannot place

    for abb in unfilled_abbs:
        data = matched_data.get(abb, {})
        if not data.get('result', ''):
            continue

        module = lookup_by_abb(abb_to_module, abb, '')
        if module:
            by_module.setdefault(module, []).append((abb, data))
            print(f" ✓ {abb} → [{module}] (配置文件)")
        else:
            items_to_classify.append((abb, data, data.get('project', abb)))

    # Classify the items the config does not know about.
    if items_to_classify:
        print(f"\n 🤖 调用DeepSeek分类 {len(items_to_classify)} 个未知项目...")
        for abb, data, project_name in items_to_classify:
            original_module = classify_abb_module(abb, project_name, api_key)
            module = normalize_module_name(original_module, abb_config)
            if original_module != module:
                print(f" ✓ {abb} → [{original_module}] → [{module}] (DeepSeek)")
            else:
                print(f" ✓ {abb} → [{module}] (DeepSeek)")
            by_module.setdefault(module, []).append((abb, data))

    # Report the grouping: standard modules first, then the rest.
    print(f"\n 📊 分组结果:")
    for module in standard_module_order:
        if module in by_module:
            items = by_module[module]
            print(f" [{module}]: {len(items)} 个项目 - {[i[0] for i in items]}")
    for module, items in by_module.items():
        if module not in standard_module_order:
            print(f" [{module}] (额外): {len(items)} 个项目 - {[i[0] for i in items]}")

    # ===== Step 2: add the tables module by module =====
    print(f"\n 📝 步骤2: 按标准顺序在对应模块尾部添加表格...")

    # Resolve the insertion anchor of every module; modules without one are skipped.
    module_positions = {}
    for module in by_module:
        pos = find_module_title_position(doc, module)
        if pos < 0:
            print(f" ⚠️ 模块 [{module}] 找不到标题位置,将跳过")
        else:
            module_positions[module] = pos
            print(f" 📍 模块 [{module}] 标题位置: {pos}")

    added_items = []

    def add_tables_for(module, heading):
        """Insert one table per item of ``module`` after the module's anchor."""
        sorted_items = sort_items_by_standard_order(by_module[module], module, abb_config)
        print(f"\n 📁 {heading} [{module}] ({len(sorted_items)} 个项目)...")

        insert_pos = module_positions[module]
        for abb, data in sorted_items:
            result = data.get('result', '')
            point = data.get('point', '')
            reference = data.get('reference', '')
            unit = data.get('unit', '')

            info = lookup_by_abb(abb_to_info, abb, {})
            # Prefer the Chinese name from the config, then the one in the data;
            # translate the English name only when neither exists.
            name = info.get('project_cn') or data.get('project_cn')
            if not name:
                english_name = info.get('project') or data.get('project', abb)
                name = translate_project_name_to_chinese(abb, english_name, api_key)

            # The table is created with placeholder text that step 3 replaces.
            placeholder_en = "[Generating clinical significance...]"
            placeholder_cn = "[正在生成临床意义...]"

            try:
                insert_table_after_position(
                    doc, insert_pos, abb, name, result,
                    placeholder_en, placeholder_cn,
                    point=point, reference=reference, unit=unit,
                    include_header=False
                )
                print(f" ✓ 添加表格: {abb} ({name}) = {result}")
                added_items.append((abb, name, result))
                # The insert added two body elements (table + spacer paragraph)
                # right after insert_pos. Anchors of other modules that lie
                # further down the body were resolved before this insert, so
                # move them along; otherwise later tables land too early.
                for other in list(module_positions):
                    if other != module and module_positions[other] > insert_pos:
                        module_positions[other] += 2
                insert_pos += 2
            except Exception as e:
                print(f" ✗ 添加 {abb} 失败: {e}")

    # Modules in the standard order first ...
    for module in standard_module_order:
        if module in module_positions:
            add_tables_for(module, '处理模块')

    # ... then the modules the standard order does not list.
    for module in by_module:
        if module not in standard_module_order and module in module_positions:
            add_tables_for(module, '处理额外模块')

    added_count = len(added_items)
    print(f"\n ✓ 已添加 {added_count} 个表格")

    # ===== Step 3: replace the placeholders with generated explanations =====
    if added_items and api_key:
        print(f"\n 🤖 步骤3: 调用DeepSeek生成Clinical Significance解释...")

        # NOTE(review): the replacement text is written in Times New Roman 9pt
        # inside one paragraph, whereas insert_table_after_position renders
        # the placeholder in 华文楷体 11pt across two paragraphs - confirm
        # which format is intended.
        def set_font(run, bold=False, font_size=9):
            run.bold = bold
            run.font.name = 'Times New Roman'
            run.font.size = Pt(font_size)
            run._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')

        for abb, name, result in added_items:
            print(f" 🤖 生成 {abb} 的临床意义解释...")
            ai_explanation = get_ai_explanation(abb, name, result, api_key)
            target = abb.upper()

            for table in doc.tables:
                rows = list(table.rows)
                # Pair every row with its successor instead of re-locating the
                # row through list(table.rows).index(row), which depends on
                # row proxies from separate iterations comparing equal.
                for row, next_row in zip(rows, rows[1:]):
                    cells = row.cells
                    if len(cells) == 0 or cells[0].text.strip().upper() != target:
                        continue

                    # The explanation row directly follows the ABB row.
                    sig_cell = next_row.cells[0]
                    sig_text = sig_cell.text
                    if 'Generating' not in sig_text and '正在生成' not in sig_text:
                        continue

                    # Replace the placeholder with the generated explanation.
                    sig_cell.text = ''
                    p = sig_cell.paragraphs[0]
                    p.alignment = WD_ALIGN_PARAGRAPH.LEFT

                    set_font(p.add_run('Clinical Significance: '), bold=True)
                    set_font(p.add_run(ai_explanation['en']))
                    p.add_run('\n')
                    set_font(p.add_run('临床意义:'), bold=True)
                    set_font(p.add_run(ai_explanation['cn']))
                    print(f" ✓ 已更新 {abb} 的解释")
                    # Done with this table; later tables are still scanned.
                    break

    print(f"\n ✅ 缺失项目处理完成,共添加 {added_count} 个项目")
|
||
|
||
|
||
def clean_empty_rows(doc_path: str, output_path: str, patient_info: dict = None):
|
||
"""清理空白数据行,并将数据表格合并到表头下
|
||
|
||
规则:
|
||
1. 删除空数据行(ABB有内容但Result为空)
|
||
2. 如果表头下只有描述没有数据,删除描述,将下方数据表格内容移上来
|
||
|
||
重要:跳过保护区域(前四页)和"客户功能医学检测档案"区域的所有表格
|
||
|
||
Args:
|
||
doc_path: 文档路径
|
||
output_path: 输出路径
|
||
patient_info: 患者信息字典,包含gender字段(从OCR文本提取),用于模块清理
|
||
"""
|
||
from docx import Document
|
||
from lxml import etree
|
||
import re
|
||
import copy
|
||
from xml_safe_save import safe_save
|
||
|
||
template_path = Path(__file__).parent / "template_complete.docx"
|
||
|
||
doc = Document(doc_path)
|
||
|
||
# 获取保护边界位置
|
||
protection_boundary = find_health_program_boundary(doc)
|
||
print(f" [保护] 清理空行时跳过前 {protection_boundary} 个元素")
|
||
|
||
# 获取"客户功能医学检测档案"区域位置
|
||
exam_file_start, exam_file_end = find_examination_file_region(doc)
|
||
if exam_file_start >= 0:
|
||
print(f" [保护] 清理空行时跳过'客户功能医学检测档案'区域: {exam_file_start}-{exam_file_end}")
|
||
|
||
def is_in_protected_region(idx):
    """Tell whether body index ``idx`` lies in a region that must not be edited.

    Two regions count as protected: every index below ``protection_boundary``
    and, when ``exam_file_start`` is non-negative, the half-open range
    ``[exam_file_start, exam_file_end)``.
    """
    # Anything in front of the protection boundary is off limits.
    if idx < protection_boundary:
        return True
    # Without a detected examination-file region there is nothing else to protect.
    if exam_file_start < 0:
        return False
    return exam_file_start <= idx < exam_file_end
|
||
|
||
# 构建保护区域内的表格集合(包括前四页和"客户功能医学检测档案"区域)
|
||
body = doc.element.body
|
||
body_children = list(body)
|
||
protected_tables = set()
|
||
for i, elem in enumerate(body_children):
|
||
if is_in_protected_region(i):
|
||
if elem.tag.endswith('}tbl'):
|
||
for t in doc.tables:
|
||
if t._tbl is elem:
|
||
protected_tables.add(id(t))
|
||
break
|
||
print(f" [保护] 保护区域内有 {len(protected_tables)} 个表格将被跳过")
|
||
|
||
removed_rows = 0
|
||
merged_count = 0
|
||
|
||
def has_data_in_row(cells):
    """Report whether a row carries a usable value in its Result column.

    Only the Result column is inspected, so digits in a reference range do
    not make an empty row look filled. The candidate Result columns depend on
    the row width:

        - 11 or more cells: columns 3 and 4
        - 6 to 10 cells:    columns 2 and 3
        - fewer cells:      column 2

    The first non-blank candidate is the result text. It counts as data when
    it contains a digit or is one of the accepted qualitative words; bare
    punctuation, ``{{`` template markers and "low-high" range shapes do not
    count.
    """
    qualitative_words = (
        'negative', 'positive', 'normal', 'reactive', 'non-reactive',
        'a', 'b', 'ab', 'o',  # blood groups
        'yellow', 'amber', 'straw', 'colorless', 'red', 'brown', 'dark', 'clear'  # colours
    )
    non_values = ('', '-', '/', ' ', '.', ':', '{{', '}}')

    width = len(cells)
    if width >= 11:
        candidate_columns = (3, 4)
    elif width >= 6:
        candidate_columns = (2, 3)
    else:
        candidate_columns = (2,)

    # Take the first candidate column that holds any text.
    result_text = ''
    for column in candidate_columns:
        if column >= width:
            continue
        candidate = (cells[column].text or '').strip()
        if candidate:
            result_text = candidate
            break

    if not result_text or result_text in non_values:
        return False
    if result_text.startswith('{{'):
        return False

    # A "low-high" range belongs in the Refer column; when the template is
    # misaligned it can end up here and must not be taken for a result.
    if re.match(r'^[\(\[]?\s*[-+]?\d+(?:\.\d+)?\s*[-–~]\s*[-+]?\d+(?:\.\d+)?\s*[\)\]]?$', result_text):
        return False

    if re.search(r'\d', result_text):
        return True
    return result_text.lower() in qualitative_words
|
||
|
||
def is_header_row(row_text, cells=None):
    """Recognise a column-header row.

    ``row_text`` is matched as given (the caller passes lower-cased text). A
    header must mention an abbreviation label, a project label and a result
    label, in English or Chinese, and must not be an explanation row. When
    ``cells`` is supplied and non-empty, at least two cells must hold text
    and no cell may be longer than 30 characters.
    """
    # Explanation rows are ruled out first so they are never taken for headers.
    for marker in ('clinical significance', '临床意义'):
        if marker in row_text:
            return False

    # Every label group needs at least one of its spellings in the row text.
    label_groups = (('abb', '简称'), ('project', '项目'), ('result', '结果'))
    for group in label_groups:
        if not any(label in row_text for label in group):
            return False

    if cells:
        stripped_texts = [cell.text.strip() for cell in cells]
        # A header spans several columns ...
        if sum(1 for text in stripped_texts if text) < 2:
            return False
        # ... and its cells are short (at most 30 characters).
        if any(len(text) > 30 for text in stripped_texts):
            return False

    return True
|
||
|
||
def is_title_row(row_text, cells=None):
    """Recognise a module title row such as "Blood Type 血型".

    ``row_text`` is matched as given (the caller passes lower-cased text). An
    explanation row is never a title. Otherwise the text must contain one of
    the title keywords. When ``cells`` is supplied and non-empty, the row
    must additionally look like a title: at most two distinct non-empty cell
    texts (merged cells repeat the same text) or at most two non-empty cells.
    """
    # Explanation rows are ruled out first so they are never taken for titles.
    if 'clinical significance' in row_text or '临床意义' in row_text:
        return False

    # Keywords covering the standard module titles.
    title_keywords = [
        # English keywords
        'blood count', 'blood type', 'blood sugar', 'blood coagulation',
        'function', 'profile', 'panel', 'test', 'detection',
        'examination', 'analysis', 'screening', 'marker', 'hormone',
        'infectious', 'disease', 'immunoglobulin', 'complement', 'lipid',
        'electrolyte', 'coagulation', 'metabolism', 'microelement', 'trace element',
        'lymphocyte', 'humoral', 'immunity', 'inflammatory', 'autoantibody',
        'thromboembolism', 'imaging', 'gynecological', 'female-specific',
        'myocardial', 'enzyme', 'cardiac',  # myocardial enzyme panel
        # Chinese keywords
        '血常规', '血型', '血糖', '凝血', '肝功能', '肾功能', '血脂', '甲状腺',
        '检查', '检测', '传染病', '电解质', '骨代谢', '微量元素', '重金属',
        '淋巴细胞', '体液免疫', '免疫功能', '炎症', '自身抗体', '心脑血管',
        '影像', '妇科', '女性专项', '肿瘤标记物', '肿瘤标志物', '荷尔蒙',
        '心肌酶', '心肌酶谱'  # myocardial enzyme panel
    ]

    mentions_keyword = False
    for keyword in title_keywords:
        if keyword in row_text:
            mentions_keyword = True
            break
    if not mentions_keyword:
        return False

    # Without cell information the keyword alone decides.
    if not cells:
        return True

    filled_texts = [cell.text.strip() for cell in cells if cell.text.strip()]
    distinct_texts = set(filled_texts)
    # Title rows repeat one or two texts across merged cells, or have very
    # few filled cells.
    return len(distinct_texts) <= 2 or len(filled_texts) <= 2
|
||
|
||
def is_description_row(row_text):
    """Return True when the row text contains a clinical-significance marker."""
    for marker in ('clinical significance', '临床意义'):
        if marker in row_text:
            return True
    return False
|
||
|
||
def is_data_row(first_cell):
    """Return True when ``first_cell`` looks like an item abbreviation.

    The text must be 2 to 15 characters long. After dropping the characters
    ``-``, ``/``, ``%``, ``(``, ``)`` and spaces, something must remain, and
    that remainder (ignoring dots) must be alphanumeric.

    Always returns a ``bool``; an empty or ``None`` argument yields False.
    """
    if not first_cell or not 2 <= len(first_cell) <= 15:
        return False
    # Separators that may legitimately appear inside an abbreviation.
    cleaned = first_cell.translate(str.maketrans('', '', '-/%() '))
    # bool() keeps the return type consistent when nothing is left.
    return bool(cleaned) and cleaned.replace('.', '').isalnum()
|
||
|
||
def is_special_table(table):
    """Detect an auto-generated item table so that it is not merged away.

    A table counts as special when all of the following hold:
        1. it has 2 to 4 rows;
        2. its last row contains "Clinical Significance" or "临床意义";
        3. its first row is not a module title, i.e. it is not the case that
           the first cell's text (longer than 3 characters) appears in every
           cell of the first row.

    Any error raised while reading the rows or cells makes the function
    return False.
    """
    row_count = len(table.rows)
    if row_count < 2 or row_count > 4:
        return False

    try:
        # The last row must be the explanation row.
        last_row_text = ' '.join([c.text for c in table.rows[-1].cells]).lower()
        if 'clinical significance' not in last_row_text and '临床意义' not in last_row_text:
            return False

        # A module title repeats the same text in every cell of the first
        # row; such a table is not a generated item table.
        first_row_cells = table.rows[0].cells
        first_cell = first_row_cells[0].text.strip()
        if first_cell and len(first_cell) > 3:
            if all(first_cell in c.text for c in first_row_cells):
                return False

        return True
    except Exception:
        # Best effort: a table that cannot be read is treated as not special.
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return False
|
||
|
||
def analyze_table(table):
    """Classify every row of a table and report the row indices per class.

    Rows with fewer than two cells are ignored. Each remaining row is tested
    in this order, and the first match wins: header row, title row,
    description row, data row. Data rows are further split by
    ``has_data_in_row``. The header/title/has-data predicates are defined
    outside this block; their exact criteria are not restated here.

    Args:
        table: A table whose ``rows`` can be enumerated; each row exposes
            ``cells`` and each cell exposes ``text``.

    Returns:
        dict with the keys:
            'header_idx': index of the last row recognised as header, or -1;
            'title_idx': index of the last row recognised as title, or -1;
            'desc_indices': indices of description rows;
            'data_with_result': indices of data rows that have data;
            'data_without_result': indices of data rows that have none;
            'is_special': result of ``is_special_table(table)``.
    """
    special = is_special_table(table)
    header_at = -1
    title_at = -1
    descriptions = []
    filled_rows = []
    blank_rows = []

    for position, row in enumerate(table.rows):
        row_cells = row.cells
        if len(row_cells) < 2:
            continue

        # The predicates expect the lower-cased, space-joined row text.
        joined = ' '.join([cell.text.strip().lower() for cell in row_cells])
        leading = row_cells[0].text.strip()

        if is_header_row(joined, row_cells):
            header_at = position
            continue
        if is_title_row(joined, row_cells):
            title_at = position
            continue
        if is_description_row(joined):
            descriptions.append(position)
            continue
        if not is_data_row(leading):
            continue

        bucket = filled_rows if has_data_in_row(row_cells) else blank_rows
        bucket.append(position)

    return {
        'header_idx': header_at,
        'title_idx': title_at,
        'desc_indices': descriptions,
        'data_with_result': filled_rows,
        'data_without_result': blank_rows,
        'is_special': special,
    }
|
||
|
||
def special_table_has_data(table):
    """Report whether a special table contains at least one usable data row.

    Supported layouts:
      1. Regular item table: 2-3 rows, cells[0]=ABB, cells[1]=item name,
         cells[2]=Result.
      2. Paired item table: 3-4 rows with two data rows sharing one
         clinical-significance row; the ABB column (cells[0]) may be empty
         while the item name sits in cells[1].
      3. 11-column template table: cells[0]=ABB, cells[1]=item name,
         cells[2] may repeat the item name.

    Rows with fewer than two cells, clinical-significance rows and header
    rows (first cell starting with "abb", or a row containing both "project"
    and "项目") are skipped. A remaining row counts as data when its ABB
    cell, item-name cell or Result cell holds text that is not a ``{{...``
    placeholder; for the Result cell a lone '-' or '/' does not count.

    Args:
        table: A table whose ``rows`` expose ``cells`` with a ``text``
            attribute.

    Returns:
        bool: True if any row qualifies. False when the table has fewer
        than two rows, when no row qualifies (the table should then be
        removed), or when an ordinary error occurs while reading it.
        KeyboardInterrupt and SystemExit are no longer swallowed.
    """
    try:
        rows = table.rows
        if len(rows) < 2:
            return False

        for row in rows:
            cells = row.cells
            if len(cells) < 2:
                continue

            abbr = (cells[0].text or '').strip()
            item_name = (cells[1].text or '').strip()
            result = (cells[2].text or '').strip() if len(cells) > 2 else ''
            row_text = ' '.join(c.text or '' for c in cells).lower()

            # Skip the clinical-significance row.
            if 'clinical significance' in row_text or '临床意义' in row_text:
                continue
            # Skip the column-header row.
            if abbr.lower().startswith('abb') or ('project' in row_text and '项目' in row_text):
                continue

            # The values are already stripped, so only emptiness, template
            # placeholders and (for the Result column) dash/slash fillers
            # need to be ruled out.
            candidates = ((abbr, ()), (item_name, ()), (result, ('-', '/')))
            for value, fillers in candidates:
                if value and value not in fillers and not value.startswith('{{'):
                    return True

        return False
    except Exception:
        # Best effort: an unreadable table is reported as having no data.
        return False
|
||
|
||
def table_has_any_data(table):
    """Check whether a table holds any usable data (used for module removal).

    Special tables are judged by ``special_table_has_data``; every other
    table is judged by whether ``analyze_table`` found at least one data row
    with a result.

    Args:
        table: The table to inspect.

    Returns:
        bool: True if the table contains data, otherwise False.
    """
    # Special tables use their own layout-aware check.
    if is_special_table(table):
        return special_table_has_data(table)

    # Ordinary tables: any data row with a result is enough.
    return bool(analyze_table(table)['data_with_result'])
|
||
|
||
# 0. 先删除“特殊表格”中没有结果的整张表(否则后续逻辑会跳过它们)
|
||
removed_special_tables = 0
|
||
for table in list(doc.tables):
|
||
# 跳过保护区域内的表格
|
||
if id(table) in protected_tables:
|
||
continue
|
||
info = analyze_table(table)
|
||
if info['is_special'] and not special_table_has_data(table):
|
||
try:
|
||
table._tbl.getparent().remove(table._tbl)
|
||
removed_special_tables += 1
|
||
except:
|
||
pass
|
||
|
||
# 获取body中表格的顺序(只处理保护区域外的表格)
|
||
body = doc._body._body
|
||
table_order = []
|
||
for elem in body:
|
||
if elem.tag.endswith('}tbl'):
|
||
for t in doc.tables:
|
||
if t._tbl is elem:
|
||
# 跳过保护区域内的表格
|
||
if id(t) not in protected_tables:
|
||
table_order.append(t)
|
||
break
|
||
|
||
# 第一遍:合并表格(表头下无数据,向后搜索找第一个有数据的表格)
|
||
tables_to_remove = set()
|
||
|
||
for i in range(len(table_order)):
|
||
if table_order[i] in tables_to_remove:
|
||
continue
|
||
|
||
t1 = table_order[i]
|
||
info1 = analyze_table(t1)
|
||
|
||
# 如果t1本身就是特殊表格,不要往里合并东西
|
||
if info1['is_special']:
|
||
continue
|
||
|
||
# 条件:t1有表头但无数据
|
||
if info1['header_idx'] >= 0 and len(info1['data_with_result']) == 0:
|
||
# 只在“下一个表头表格”之前搜索,避免跨模块吸走数据
|
||
next_header_pos = None
|
||
for k in range(i + 1, len(table_order)):
|
||
if table_order[k] in tables_to_remove:
|
||
continue
|
||
k_info = analyze_table(table_order[k])
|
||
|
||
# 如果遇到特殊表格,视为边界,停止搜索
|
||
if k_info['is_special']:
|
||
next_header_pos = k
|
||
break
|
||
|
||
# 以“有表头但无数据”的表作为模块边界(数据表可能也带表头,不能当边界)
|
||
if k_info['header_idx'] >= 0 and len(k_info['data_with_result']) == 0:
|
||
next_header_pos = k
|
||
break
|
||
search_end = next_header_pos if next_header_pos is not None else len(table_order)
|
||
|
||
# 在范围内收集所有“有数据且无表头”的表格
|
||
candidates = []
|
||
for j in range(i + 1, search_end):
|
||
if table_order[j] in tables_to_remove:
|
||
continue
|
||
candidate = table_order[j]
|
||
candidate_info = analyze_table(candidate)
|
||
|
||
# 跳过特殊表格(不作为被合并对象)
|
||
if candidate_info['is_special']:
|
||
continue
|
||
|
||
if len(candidate_info['data_with_result']) > 0:
|
||
candidates.append((candidate, candidate_info))
|
||
|
||
if not candidates:
|
||
continue
|
||
|
||
# 用第一个候选数据表的“项目名”作为标题,覆盖t1标题(避免出现空标题)
|
||
title_text = ''
|
||
try:
|
||
first_candidate, first_candidate_info = candidates[0]
|
||
if first_candidate_info.get('data_with_result'):
|
||
data_row_idx = first_candidate_info['data_with_result'][0]
|
||
if len(first_candidate.rows[data_row_idx].cells) > 1:
|
||
title_text = first_candidate.rows[data_row_idx].cells[1].text.strip()
|
||
if not title_text:
|
||
title_text = first_candidate.rows[data_row_idx].cells[0].text.strip()
|
||
except:
|
||
title_text = ''
|
||
|
||
# 清空t1(保留表头行)
|
||
header_idx = info1['header_idx']
|
||
title_row_idx = header_idx + 1
|
||
|
||
# 清空:删除表头行之后所有旧行,但尽量保留表头下一行作为“标题行结构”
|
||
keep_title_row = title_row_idx < len(t1.rows)
|
||
delete_from = (title_row_idx + 1) if keep_title_row else (header_idx + 1)
|
||
for ridx in range(len(t1.rows) - 1, delete_from - 1, -1):
|
||
try:
|
||
t1._tbl.remove(t1.rows[ridx]._tr)
|
||
removed_rows += 1
|
||
except:
|
||
pass
|
||
|
||
# 确保存在标题行:没有则插入一行(插入后重新通过t1.rows获取)
|
||
if not keep_title_row:
|
||
try:
|
||
new_tr = copy.deepcopy(t1.rows[header_idx]._tr)
|
||
t1._tbl.insert(title_row_idx, new_tr)
|
||
except:
|
||
pass
|
||
|
||
# 写入标题:只在第一列写入“第一条数据项目名”,其余列清空
|
||
try:
|
||
if title_row_idx < len(t1.rows):
|
||
title_row = t1.rows[title_row_idx]
|
||
for c in title_row.cells:
|
||
c.text = ''
|
||
if title_text:
|
||
title_row.cells[0].text = title_text
|
||
except:
|
||
pass
|
||
|
||
# 将候选表格的标题/数据/描述复制到t1,并删除候选表格
|
||
for candidate, candidate_info in candidates:
|
||
rows_to_copy = []
|
||
rows_to_copy.extend(candidate_info['data_with_result'])
|
||
rows_to_copy.extend(candidate_info['desc_indices'])
|
||
|
||
for row_idx in sorted(rows_to_copy):
|
||
src_row = candidate.rows[row_idx]
|
||
new_tr = copy.deepcopy(src_row._tr)
|
||
t1._tbl.append(new_tr)
|
||
|
||
tables_to_remove.add(candidate)
|
||
merged_count += 1
|
||
|
||
# 删除被合并的表格
|
||
for t in tables_to_remove:
|
||
try:
|
||
t._tbl.getparent().remove(t._tbl)
|
||
except:
|
||
pass
|
||
|
||
# 第二遍:删除剩余的空数据行(跳过特殊表格和保护区域)
|
||
# 同时删除紧随其后的"Clinical Significance/临床意义"描述行,避免留下孤儿解释块
|
||
for table in doc.tables:
|
||
# 跳过保护区域内的表格
|
||
if id(table) in protected_tables:
|
||
continue
|
||
info = analyze_table(table)
|
||
# 跳过特殊表格
|
||
if info['is_special']:
|
||
continue
|
||
|
||
rows_to_remove = set()
|
||
for row_idx in info['data_without_result']:
|
||
rows_to_remove.add(row_idx)
|
||
# 检查后续行是否是描述行(可能有多行描述)
|
||
next_idx = row_idx + 1
|
||
while next_idx < len(table.rows):
|
||
try:
|
||
next_cells = table.rows[next_idx].cells
|
||
next_text = ' '.join([(c.text or '').strip().lower() for c in next_cells])
|
||
# 检查是否是描述行
|
||
if is_description_row(next_text):
|
||
rows_to_remove.add(next_idx)
|
||
next_idx += 1
|
||
continue
|
||
# 也检查是否是空行或只有少量文字的行(可能是格式化问题)
|
||
if not next_text.strip() or len(next_text.strip()) < 5:
|
||
rows_to_remove.add(next_idx)
|
||
next_idx += 1
|
||
continue
|
||
except:
|
||
pass
|
||
break
|
||
|
||
# 额外检查:删除所有孤立的描述行(前面没有对应数据行的描述)
|
||
kept_data_rows = set(info['data_with_result']) - rows_to_remove
|
||
for desc_idx in info['desc_indices']:
|
||
# 检查这个描述行前面是否有保留的数据行
|
||
has_data_before = False
|
||
for data_idx in kept_data_rows:
|
||
if data_idx < desc_idx:
|
||
# 检查data_idx和desc_idx之间是否没有其他数据行
|
||
intervening_data = [d for d in kept_data_rows if data_idx < d < desc_idx]
|
||
if not intervening_data:
|
||
has_data_before = True
|
||
break
|
||
if not has_data_before:
|
||
rows_to_remove.add(desc_idx)
|
||
|
||
for row_idx in sorted(rows_to_remove, reverse=True):
|
||
try:
|
||
table._tbl.remove(table.rows[row_idx]._tr)
|
||
removed_rows += 1
|
||
except:
|
||
pass
|
||
|
||
# 第二点五遍:补全合并后的标题行(表头下一行为空时,跳过特殊表格和保护区域)
|
||
for table in doc.tables:
|
||
# 跳过保护区域内的表格
|
||
if id(table) in protected_tables:
|
||
continue
|
||
info = analyze_table(table)
|
||
# 跳过特殊表格
|
||
if info['is_special']:
|
||
continue
|
||
if info['header_idx'] < 0:
|
||
continue
|
||
if len(info['data_with_result']) == 0:
|
||
continue
|
||
|
||
title_row_idx = info['header_idx'] + 1
|
||
if title_row_idx >= len(table.rows):
|
||
continue
|
||
|
||
try:
|
||
title_row = table.rows[title_row_idx]
|
||
# 如果表头下一行本身就是数据行,则插入一个“空标题行”(复制表头行结构)
|
||
try:
|
||
first_cell = title_row.cells[0].text.strip() if title_row.cells else ''
|
||
if is_data_row(first_cell) and has_data_in_row(title_row.cells):
|
||
extracted_title = ''
|
||
try:
|
||
if len(title_row.cells) > 1:
|
||
extracted_title = title_row.cells[1].text.strip()
|
||
if not extracted_title:
|
||
extracted_title = title_row.cells[0].text.strip()
|
||
except:
|
||
extracted_title = ''
|
||
|
||
header_tr = copy.deepcopy(table.rows[info['header_idx']]._tr)
|
||
table._tbl.insert(title_row_idx, header_tr)
|
||
title_row = table.rows[title_row_idx]
|
||
try:
|
||
for c in title_row.cells:
|
||
c.text = ''
|
||
if extracted_title:
|
||
title_row.cells[0].text = extracted_title
|
||
except:
|
||
pass
|
||
continue
|
||
except:
|
||
pass
|
||
|
||
# 若标题行已有内容且不是空行,则不覆盖
|
||
if any((c.text or '').strip() for c in title_row.cells):
|
||
continue
|
||
|
||
first_data_idx = info['data_with_result'][0]
|
||
if first_data_idx >= len(table.rows):
|
||
continue
|
||
data_row = table.rows[first_data_idx]
|
||
|
||
title_text = ''
|
||
if len(data_row.cells) > 1:
|
||
title_text = data_row.cells[1].text.strip()
|
||
if not title_text:
|
||
title_text = data_row.cells[0].text.strip()
|
||
if not title_text:
|
||
continue
|
||
|
||
for c in title_row.cells:
|
||
c.text = ''
|
||
title_row.cells[0].text = title_text
|
||
except:
|
||
pass
|
||
|
||
# 第三遍:删除所有没有数据的表格
|
||
# 重要:跳过保护区域内的表格
|
||
# 重要:保留模块标题表格(title_idx >= 0)
|
||
# 重要:保留表头表格(包含 Abb/Project/Result)
|
||
removed_tables = 0
|
||
for table in list(doc.tables):
|
||
# 跳过保护区域内的表格
|
||
if id(table) in protected_tables:
|
||
continue
|
||
info = analyze_table(table)
|
||
# 跳过特殊表格 - 这些是新生成的独立表格,必须保留
|
||
if info['is_special']:
|
||
continue
|
||
# 跳过模块标题表格 - 这些是模块的标题行,必须保留
|
||
if info['title_idx'] >= 0:
|
||
continue
|
||
# 跳过表头表格 - 这些是数据表格的表头,必须保留
|
||
if info['header_idx'] >= 0:
|
||
continue
|
||
# 只要没有数据就删除整个表格
|
||
if len(info['data_with_result']) == 0:
|
||
try:
|
||
table._tbl.getparent().remove(table._tbl)
|
||
removed_tables += 1
|
||
except:
|
||
pass
|
||
|
||
# 第3.5遍:删除重复的模块标题表格
|
||
# 模块标题表格特征:只有1行,包含重复的模块名称
|
||
seen_module_titles = set()
|
||
removed_duplicate_titles = 0
|
||
for table in list(doc.tables):
|
||
if id(table) in protected_tables:
|
||
continue
|
||
# 检查是否是模块标题表格(只有1行,内容重复)
|
||
if len(table.rows) == 1:
|
||
row_text = ' '.join([c.text.strip() for c in table.rows[0].cells]).lower()
|
||
# 检查是否包含模块关键词且重复出现
|
||
for kw in ['imaging', 'urine', 'blood count', 'blood type', 'coagulation',
|
||
'infectious', 'electrolyte', 'liver', 'kidney', 'myocardial',
|
||
'thyroid', 'lipid', 'blood sugar', 'thromboembolism', 'bone',
|
||
'microelement', 'lymphocyte', 'humoral', 'inflammatory',
|
||
'autoantibody', 'tumor', 'female hormone', 'male hormone',
|
||
'female-specific', '影像', '尿液', '血常规', '血型', '凝血',
|
||
'传染病', '电解质', '肝功能', '肾功能', '心肌酶', '甲状腺',
|
||
'血脂', '血糖', '心脑血管', '骨代谢', '微量元素', '淋巴细胞',
|
||
'体液免疫', '炎症', '自身抗体', '肿瘤', '女性激素', '男性激素', '女性专项']:
|
||
if kw in row_text and row_text.count(kw) >= 2:
|
||
# 这是模块标题表格
|
||
if kw in seen_module_titles:
|
||
# 重复的标题表格,删除
|
||
try:
|
||
table._tbl.getparent().remove(table._tbl)
|
||
removed_duplicate_titles += 1
|
||
except:
|
||
pass
|
||
else:
|
||
seen_module_titles.add(kw)
|
||
break
|
||
|
||
if removed_duplicate_titles > 0:
|
||
print(f" [清理] 删除 {removed_duplicate_titles} 个重复的模块标题表格")
|
||
|
||
# 重要:在模块清理之前,先保存并重新加载文档,确保索引正确
|
||
safe_save(doc, output_path, template_path)
|
||
doc = Document(output_path)
|
||
|
||
# 第四遍:删除无数据的模块(包括标题、文字、图片等)
|
||
from docx.text.paragraph import Paragraph
|
||
|
||
module_keywords_cleanup = [
|
||
'urine detection', 'urine test', '尿液检测', 'complete blood count', '血常规',
|
||
'blood sugar', 'glucose', '血糖', 'lipid panel', 'lipid profile', '血脂',
|
||
'blood type', '血型', 'coagulation', 'blood coagulation', '凝血',
|
||
'infectious disease', 'four infectious', '传染病', '传染病四项',
|
||
'electrolyte', 'serum electrolyte', '电解质', '血清电解质',
|
||
'liver function', '肝功能', 'kidney function', '肾功能',
|
||
'cardiac enzyme', 'myocardial enzyme', 'enzyme spectrum', '心肌酶', '心肌酶谱',
|
||
'thyroid', 'thyroid function', '甲状腺', '甲状腺功能',
|
||
'cardiovascular', 'thromboembolism', '心血管', '心脑血管',
|
||
'bone metabolism', '骨代谢',
|
||
'trace element', 'heavy metal', 'microelement', '微量元素', '重金属',
|
||
'lymphocyte', 'lymphocyte subpopulation', '淋巴细胞', '淋巴细胞亚群',
|
||
'humoral immunity', '体液免疫', 'immune function', '免疫功能',
|
||
'inflammation', 'inflammatory', '炎症', '炎症反应',
|
||
'autoantibody', 'autoimmune', '自身抗体', '自身免疫',
|
||
'female hormone', '女性激素', '女性荷尔蒙', 'male hormone', '男性激素', '男性荷尔蒙',
|
||
'gynecological', 'female-specific', '妇科', '女性专项',
|
||
'tumor marker', '肿瘤标记物', '肿瘤标志物',
|
||
'imaging', '影像',
|
||
]
|
||
exclude_keywords_cleanup = ['health program', 'health report', 'abnormal', '异常', 'overall', 'assessment', 'clinical significance', '临床意义', 'functional medical health advice', '功能医学健康建议', 'medical intervention', '医学干预', 'nutrition', '营养', 'exercise', '运动', 'sleep', '睡眠', 'lifestyle', '生活方式', 'follow-up', '随访', 'functional medical team', '功能医学团队',
|
||
'(一)', '(二)', '(三)', '(四)', '(五)', '(六)',
|
||
'复查', '监测', '标志物', '血液学', '状态',
|
||
'bhrt', 'ivnt', 'msc', '干细胞', '静脉营养', '激素替代',
|
||
'建议', '方案', '治疗', '调理', '改善', '优化']
|
||
|
||
protected_section_keywords = ['functional medical health advice', '功能医学健康建议',
|
||
'overall health assessment', '整体健康状况',
|
||
'abnormal index', '异常指标',
|
||
'health report analysis', '健康报告分析',
|
||
'medical intervention', '医学干预',
|
||
'nutrition intervention', '营养干预',
|
||
'exercise intervention', '运动干预',
|
||
'sleep', '睡眠', 'lifestyle', '生活方式',
|
||
'follow-up', '随访', 'functional medical team', '功能医学团队']
|
||
|
||
def is_protected_section_cleanup(text):
    """Tell whether a title belongs to a section that must never be removed.

    The text is lower-cased and stripped, then searched for any entry of
    ``protected_section_keywords`` (a list defined in the enclosing scope).

    Args:
        text: Title text to test; empty or None is accepted.

    Returns:
        bool: True if a protected keyword occurs in the text, else False.
    """
    if text:
        haystack = text.lower().strip()
        for keyword in protected_section_keywords:
            if keyword in haystack:
                return True
    return False
|
||
|
||
def is_module_title_para_cleanup(text):
    """Decide whether a paragraph's text is a module title.

    A paragraph qualifies when it is non-empty, at most 100 characters long,
    does not start with a chapter numbering prefix ('(i)', '(ii)', '(iii)',
    'i.', 'ii.', 'iii.'), contains none of ``exclude_keywords_cleanup`` and
    contains at least one of ``module_keywords_cleanup``. Both keyword lists
    come from the enclosing scope; matching is done on the lower-cased,
    stripped text.

    Args:
        text: Paragraph text; empty or None is accepted.

    Returns:
        bool: True if the paragraph looks like a module title.
    """
    if not text or len(text) > 100:
        return False

    normalized = text.lower().strip()

    # Chapter headings carry a roman-numeral prefix and are never module titles.
    chapter_prefixes = ('(i)', '(ii)', '(iii)', 'i.', 'ii.', 'iii.')
    if normalized.startswith(chapter_prefixes):
        return False

    for excluded in exclude_keywords_cleanup:
        if excluded in normalized:
            return False

    for keyword in module_keywords_cleanup:
        if keyword in normalized:
            return True
    return False
|
||
|
||
def is_module_title_table_cleanup(table):
    """Decide whether a table is a module title table.

    A module title table has one or two rows, contains neither a
    clinical-significance marker nor the Abb/Project/Result column header
    anywhere in its text, and names a known module in its first row. A
    single occurrence of the module name is enough.

    Args:
        table: A table whose ``rows`` expose ``cells`` with a ``text``
            attribute.

    Returns:
        bool: True for a module title table. Ordinary errors raised while
        reading the cells yield False; KeyboardInterrupt and SystemExit are
        no longer swallowed.
    """
    rows = table.rows
    if not 1 <= len(rows) <= 2:
        return False

    try:
        full_text = ' '.join(
            c.text.strip() for row in rows for c in row.cells
        ).lower()
        # Description tables and column-header tables are not titles.
        if 'clinical significance' in full_text or '临床意义' in full_text:
            return False
        if 'abb' in full_text and 'project' in full_text and 'result' in full_text:
            return False

        # Module title names, including spelling variants.
        module_title_names = (
            'urine detection', 'urine test', '尿液检测',
            'complete blood count', '血常规',
            'blood sugar', '血糖', 'lipid profile', '血脂', 'blood type', '血型',
            'blood coagulation', '凝血功能', 'four infectious diseases', '传染病四项',
            'serum electrolytes', '血电解质', 'liver function', '肝功能',
            'kidney function', '肾功能', 'myocardial enzyme', '心肌酶',
            'thyroid function', '甲状腺功能', 'thromboembolism', '心脑血管',
            'bone metabolism', '骨代谢', 'microelement', '微量元素',
            'humoral immunity', '体液免疫', 'inflammatory reaction', '炎症反应',
            'autoantibody', '自身抗体', 'female hormone', '女性激素',
            'male hormone', '男性激素', 'tumor markers', '肿瘤标记物',
            'lymphocyte', 'lymphocyto', '淋巴细胞', '淋巴细胞亚群',
            'imaging', '影像学', 'female-specific', '女性专项',
        )

        # Only the first row is searched for the module name.
        first_row_text = ' '.join(c.text.strip() for c in rows[0].cells).lower()
        return any(title in first_row_text for title in module_title_names)
    except Exception:
        # Best effort: a table that cannot be read is not a title table.
        return False
|
||
|
||
body = doc._body._body
|
||
body_children = list(body)
|
||
|
||
tbl_map = {}
|
||
for t in doc.tables:
|
||
tbl_map[id(t._tbl)] = t
|
||
|
||
# 精确识别模块ID(按优先级排列,female hormone 必须在 male hormone 之前匹配,避免子串冲突)
|
||
_MODULE_IDENTIFY_RULES = [
|
||
('female hormone', 'female hormone'), ('女性荷尔蒙', 'female hormone'), ('女性激素', 'female hormone'),
|
||
('male hormone', 'male hormone'), ('男性荷尔蒙', 'male hormone'), ('男性激素', 'male hormone'),
|
||
('female-specific', 'female-specific'), ('女性专项', 'female-specific'),
|
||
('urine detection', 'urine'), ('urine test', 'urine'), ('尿液检测', 'urine'),
|
||
('complete blood count', 'blood count'), ('血常规', 'blood count'),
|
||
('blood sugar', 'blood sugar'), ('血糖', 'blood sugar'),
|
||
('lipid profile', 'lipid'), ('血脂', 'lipid'),
|
||
('blood type', 'blood type'), ('血型', 'blood type'),
|
||
('blood coagulation', 'coagulation'), ('凝血功能', 'coagulation'), ('凝血', 'coagulation'),
|
||
('four infectious', 'infectious'), ('传染病', 'infectious'),
|
||
('serum electrolyte', 'electrolyte'), ('血电解质', 'electrolyte'), ('电解质', 'electrolyte'),
|
||
('liver function', 'liver'), ('肝功能', 'liver'),
|
||
('kidney function', 'kidney'), ('肾功能', 'kidney'),
|
||
('myocardial enzyme', 'myocardial'), ('心肌酶', 'myocardial'),
|
||
('thyroid function', 'thyroid'), ('甲状腺功能', 'thyroid'), ('甲状腺', 'thyroid'),
|
||
('thromboembolism', 'thrombo'), ('心脑血管', 'thrombo'),
|
||
('bone metabolism', 'bone'), ('骨代谢', 'bone'),
|
||
('microelement', 'microelement'), ('微量元素', 'microelement'),
|
||
('humoral immunity', 'humoral'), ('体液免疫', 'humoral'),
|
||
('inflammatory', 'inflammatory'), ('炎症', 'inflammatory'),
|
||
('autoantibody', 'autoantibody'), ('自身抗体', 'autoantibody'),
|
||
('tumor marker', 'tumor'), ('肿瘤标记', 'tumor'),
|
||
('lymphocyte', 'lymphocyte'), ('lymphocyto', 'lymphocyte'), ('淋巴细胞', 'lymphocyte'),
|
||
('imaging', 'imaging'), ('影像', 'imaging'),
|
||
]
|
||
|
||
def identify_module_id(title_text):
    """Map a module title to its module ID.

    The lower-cased title is matched against ``_MODULE_IDENTIFY_RULES`` (a
    list of ``(pattern, module_id)`` pairs from the enclosing scope) in list
    order; the first pattern found as a substring decides the result, so the
    order of the rules is significant.

    Args:
        title_text: Title text of a module.

    Returns:
        The module ID of the first matching rule, or None if none matches.
    """
    lowered = title_text.lower()
    matches = (
        module_id
        for needle, module_id in _MODULE_IDENTIFY_RULES
        if needle in lowered
    )
    return next(matches, None)
|
||
|
||
# 找出所有模块标题表格及其位置(统一使用 is_module_title_table_cleanup + identify_module_id)
|
||
module_title_positions = [] # [(position, table, module_id)]
|
||
for i, elem in enumerate(body_children):
|
||
if elem.tag.endswith('}tbl'):
|
||
for t in doc.tables:
|
||
if t._tbl is elem:
|
||
if is_module_title_table_cleanup(t):
|
||
try:
|
||
title_text = ' '.join([c.text.strip() for c in t.rows[0].cells])
|
||
except:
|
||
title_text = ''
|
||
mid = identify_module_id(title_text)
|
||
if mid:
|
||
module_title_positions.append((i, t, mid))
|
||
break
|
||
|
||
# 检查每个模块是否有数据表格
|
||
modules_with_data = set()
|
||
for idx, (pos, title_table, module_id) in enumerate(module_title_positions):
|
||
next_pos = module_title_positions[idx + 1][0] if idx + 1 < len(module_title_positions) else len(body_children)
|
||
|
||
has_data = False
|
||
for j in range(pos + 1, next_pos):
|
||
elem = body_children[j]
|
||
if elem.tag.endswith('}tbl'):
|
||
for t in doc.tables:
|
||
if t._tbl is elem:
|
||
if not is_module_title_table_cleanup(t) and table_has_any_data(t):
|
||
has_data = True
|
||
break
|
||
if has_data:
|
||
break
|
||
|
||
if has_data:
|
||
modules_with_data.add(module_id)
|
||
|
||
print(f" [模块清理] 有数据的模块: {sorted(modules_with_data)}")
|
||
|
||
# 根据性别判断结果,决定删除哪个荷尔蒙模块
|
||
# 将中文"男性"/"女性"转换为英文"male"/"female"
|
||
gender_from_ocr = patient_info.get('gender', '') if patient_info else ''
|
||
if gender_from_ocr == '男性':
|
||
detected_gender = 'male'
|
||
elif gender_from_ocr == '女性':
|
||
detected_gender = 'female'
|
||
else:
|
||
# 如果没有从OCR提取到性别,使用默认值(女性)
|
||
detected_gender = 'female'
|
||
|
||
# 模块ID到描述段落搜索关键词的映射(用于清理文档中残留的描述段落)
|
||
module_desc_mapping = {
|
||
'urine': ('urine detection', '尿液检测'),
|
||
'blood count': ('complete blood count', '血常规'),
|
||
'blood sugar': ('blood sugar', '血糖'),
|
||
'lipid': ('lipid profile', '血脂'),
|
||
'blood type': ('blood type', '血型'),
|
||
'coagulation': ('blood coagulation', '凝血'),
|
||
'infectious': ('four infectious', '传染病'),
|
||
'electrolyte': ('serum electrolyte', '电解质'),
|
||
'liver': ('liver function', '肝功能'),
|
||
'kidney': ('kidney function', '肾功能'),
|
||
'myocardial': ('myocardial enzyme', '心肌酶'),
|
||
'thyroid': ('thyroid function', '甲状腺'),
|
||
'thrombo': ('thromboembolism', '心脑血管'),
|
||
'bone': ('bone metabolism', '骨代谢'),
|
||
'microelement': ('microelement', '微量元素'),
|
||
'humoral': ('humoral immunity', '体液免疫'),
|
||
'inflammatory': ('inflammatory', '炎症'),
|
||
'autoantibody': ('autoantibody', '自身抗体'),
|
||
'female hormone': ('female hormone', '女性荷尔蒙'),
|
||
'male hormone': ('male hormone', '男性荷尔蒙'),
|
||
'tumor': ('tumor marker', '肿瘤标记'),
|
||
'lymphocyte': ('lymphocyto', '淋巴细胞'),
|
||
'imaging': ('imaging', '影像'),
|
||
'female-specific': ('female-specific', '女性专项'),
|
||
}
|
||
|
||
# 荷尔蒙模块清理逻辑:根据性别判断结果,只保留一个荷尔蒙模块
|
||
if detected_gender == 'male':
|
||
if 'female hormone' in modules_with_data:
|
||
print(f" [模块清理] 性别为男性,强制删除女性荷尔蒙模块")
|
||
modules_with_data.discard('female hormone')
|
||
else: # female
|
||
if 'male hormone' in modules_with_data:
|
||
print(f" [模块清理] 性别为女性,强制删除男性荷尔蒙模块")
|
||
modules_with_data.discard('male hormone')
|
||
|
||
# 动态构建需要清理描述的空模块列表(所有没有数据的模块)
|
||
empty_modules_to_clean = []
|
||
for module_id, (en_title, cn_title) in module_desc_mapping.items():
|
||
if module_id not in modules_with_data:
|
||
empty_modules_to_clean.append((module_id, en_title, cn_title))
|
||
|
||
print(f" [模块清理] 需要删除描述的空模块: {[m[0] for m in empty_modules_to_clean]}")
|
||
|
||
removed_modules = 0
|
||
print(f" [模块清理] 找到 {len(module_title_positions)} 个模块起点")
|
||
for idx in range(len(module_title_positions) - 1, -1, -1):
|
||
start_i, _tbl, module_id = module_title_positions[idx]
|
||
end_i = module_title_positions[idx + 1][0] if idx + 1 < len(module_title_positions) else len(body_children)
|
||
try:
|
||
module_title = ' '.join([c.text.strip() for c in _tbl.rows[0].cells])[:40]
|
||
except:
|
||
module_title = 'Unknown'
|
||
|
||
module_elements = body_children[start_i:end_i]
|
||
|
||
if is_protected_section_cleanup(module_title):
|
||
continue
|
||
|
||
# 根据性别判断是否强制删除荷尔蒙模块(精确匹配module_id)
|
||
should_force_remove = False
|
||
if module_id == 'female hormone' and detected_gender == 'male':
|
||
should_force_remove = True
|
||
print(f" [模块清理] 性别为男性,强制删除女性荷尔蒙模块: {module_title}")
|
||
elif module_id == 'male hormone' and detected_gender == 'female':
|
||
should_force_remove = True
|
||
print(f" [模块清理] 性别为女性,强制删除男性荷尔蒙模块: {module_title}")
|
||
|
||
# 如果模块有数据且不需要强制删除,直接跳过
|
||
if not should_force_remove and module_id and module_id in modules_with_data:
|
||
continue
|
||
|
||
# 兜底检查:扫描模块内表格是否实际有数据
|
||
module_has_data = False
|
||
for e in module_elements:
|
||
if e.tag.endswith('}tbl'):
|
||
for t in doc.tables:
|
||
if t._tbl is e:
|
||
if not is_module_title_table_cleanup(t) and table_has_any_data(t):
|
||
module_has_data = True
|
||
break
|
||
|
||
if should_force_remove or not module_has_data:
|
||
# 安全边界(向后):从 start_i+1 往后扫描,找到下一个模块的标题段落,避免删除下一个模块的标题+描述
|
||
safe_end = end_i
|
||
for ei in range(start_i + 1, end_i):
|
||
elem = body_children[ei]
|
||
if elem.tag.endswith('}p'):
|
||
p_text = ''.join(elem.itertext()).strip()
|
||
if is_module_title_para_cleanup(p_text):
|
||
# 确认这个标题段落属于另一个模块(不是当前模块)
|
||
p_mid = identify_module_id(p_text)
|
||
if p_mid and p_mid != module_id:
|
||
safe_end = ei
|
||
break
|
||
|
||
# 安全边界(向前):从 start_i-1 往前扫描,找到当前模块的标题段落和描述段落
|
||
# 这些段落在标题表格之前,需要一起删除
|
||
safe_start = start_i
|
||
for ei in range(start_i - 1, -1, -1):
|
||
elem = body_children[ei]
|
||
if elem.tag.endswith('}tbl'):
|
||
# 遇到表格(上一个模块的数据表格),停止
|
||
break
|
||
if elem.tag.endswith('}p'):
|
||
p_text = ''.join(elem.itertext()).strip()
|
||
if is_module_title_para_cleanup(p_text):
|
||
p_mid = identify_module_id(p_text)
|
||
if p_mid and p_mid != module_id:
|
||
# 属于其他模块的标题段落,停止
|
||
break
|
||
safe_start = ei
|
||
|
||
removed_in_module = 0
|
||
for ei in range(safe_end - 1, safe_start - 1, -1):
|
||
try:
|
||
body_children[ei].getparent().remove(body_children[ei])
|
||
removed_in_module += 1
|
||
except:
|
||
pass
|
||
removed_modules += 1
|
||
if should_force_remove:
|
||
print(f" [模块清理] 删除荷尔蒙模块(根据性别): {module_title} ({removed_in_module} 个元素)")
|
||
else:
|
||
print(f" [模块清理] 删除空模块: {module_title} ({removed_in_module} 个元素)")
|
||
|
||
# 删除空模块的描述段落
|
||
if empty_modules_to_clean:
|
||
# 重新获取body_children(因为上面可能删除了一些元素)
|
||
body_children = list(body)
|
||
|
||
from docx.oxml.ns import qn
|
||
|
||
# 构建数据模块关键词集合(用于安全检查,防止误删有数据模块的内容)
|
||
data_module_keywords = set()
|
||
for mid in modules_with_data:
|
||
if mid in module_desc_mapping:
|
||
en, cn = module_desc_mapping[mid]
|
||
data_module_keywords.add(en.lower())
|
||
data_module_keywords.add(cn)
|
||
|
||
# 找到所有描述段落标题的位置
|
||
desc_title_positions = [] # [(position, module_id, title_text)]
|
||
for i, elem in enumerate(body_children):
|
||
if elem.tag.endswith('}p'):
|
||
text_parts = []
|
||
for t in elem.iter(qn('w:t')):
|
||
if t.text:
|
||
text_parts.append(t.text)
|
||
text = ''.join(text_parts).strip()
|
||
text_lower = text.lower()
|
||
|
||
# 检查是否是描述段落标题(包含模块名称)
|
||
# 注意:描述标题可能较长(如 "Thyroid Function Test Result Analysis 甲状腺功能检测结果分析"),放宽到200字符
|
||
if len(text) < 200:
|
||
for module_id, en_title, cn_title in empty_modules_to_clean:
|
||
if en_title in text_lower and cn_title in text:
|
||
desc_title_positions.append((i, module_id, text[:40]))
|
||
break
|
||
|
||
# 找到所有可能的描述段落标题(用于确定边界)
|
||
# 关键:必须检测所有模块的描述标题(包括有数据的模块),作为删除边界
|
||
all_desc_titles = [
|
||
'urine detection', 'complete blood count', 'blood sugar', 'lipid profile',
|
||
'blood type', 'blood coagulation', 'four infectious', 'serum electrolyte',
|
||
'liver function', 'kidney function', 'myocardial enzyme', 'thyroid function',
|
||
'thromboembolism', 'bone metabolism', 'microelement', 'humoral immunity',
|
||
'inflammatory', 'autoantibody', 'female hormone', 'male hormone',
|
||
'tumor marker', 'lymphocyte', 'lymphocyto', 'imaging', 'female-specific'
|
||
]
|
||
|
||
all_title_positions = []
|
||
for i, elem in enumerate(body_children):
|
||
if elem.tag.endswith('}p'):
|
||
text_parts = []
|
||
for t in elem.iter(qn('w:t')):
|
||
if t.text:
|
||
text_parts.append(t.text)
|
||
text = ''.join(text_parts).strip()
|
||
text_lower = text.lower()
|
||
|
||
# 放宽长度限制到200字符,避免遗漏长标题导致边界检测失败
|
||
if len(text) < 200:
|
||
for title in all_desc_titles:
|
||
if title in text_lower:
|
||
all_title_positions.append(i)
|
||
break
|
||
|
||
all_title_positions.sort()
|
||
print(f" [描述清理] 检测到 {len(desc_title_positions)} 个空模块描述标题, {len(all_title_positions)} 个边界标题")
|
||
|
||
# 删除空模块的描述段落
|
||
removed_desc = 0
|
||
for pos, module_id, title_text in sorted(desc_title_positions, reverse=True):
|
||
# 找到下一个描述标题的位置
|
||
next_pos = len(body_children)
|
||
for p in all_title_positions:
|
||
if p > pos:
|
||
next_pos = p
|
||
break
|
||
|
||
# 安全检查:扫描待删除范围,如果包含有数据模块的关键词则截断
|
||
safe_end = next_pos
|
||
for i in range(pos + 1, next_pos):
|
||
if i < len(body_children):
|
||
elem_text = ''.join(body_children[i].itertext()).strip().lower()
|
||
for dkw in data_module_keywords:
|
||
if dkw.lower() in elem_text:
|
||
# 发现有数据模块的内容,截断删除范围
|
||
safe_end = i
|
||
print(f" [描述清理] 安全截断: {title_text} 在位置 {i} 发现数据模块关键词 '{dkw}',从 {next_pos} 截断到 {safe_end}")
|
||
break
|
||
if safe_end != next_pos:
|
||
break
|
||
|
||
# 删除从当前标题到安全边界之间的所有元素
|
||
elements_to_remove = []
|
||
for i in range(pos, safe_end):
|
||
if i < len(body_children):
|
||
elements_to_remove.append(body_children[i])
|
||
|
||
for elem in reversed(elements_to_remove):
|
||
try:
|
||
elem.getparent().remove(elem)
|
||
removed_desc += 1
|
||
except:
|
||
pass
|
||
|
||
print(f" [描述清理] 删除空模块描述: {title_text} ({len(elements_to_remove)} 个元素, 范围 {pos}-{safe_end})")
|
||
|
||
# 使用安全保存
|
||
safe_save(doc, output_path, template_path)
|
||
print(f"\n✓ 清理完成: 删除 {removed_rows} 行, 合并 {merged_count} 对表格, 删除 {removed_tables} 个空表格, 删除 {removed_special_tables} 个空特殊表格")
|
||
print(f"✓ 模块清理: 删除 {removed_modules} 个无数据模块")
|
||
|
||
return doc
|
||
|
||
|
||
def format_document_structure(doc_path: str, output_path: str):
|
||
"""
|
||
整理Word文档结构:
|
||
1. 清理多余的空白段落(连续空段落只保留一个)
|
||
2. 在模块标题前插入分页符(确保每个模块从新页开始)
|
||
|
||
重要:跳过保护区域(前四页)和"客户功能医学检测档案"区域的所有元素
|
||
"""
|
||
from docx import Document
|
||
from docx.oxml.ns import qn
|
||
from docx.oxml import OxmlElement
|
||
from xml_safe_save import safe_save
|
||
|
||
template_path_local = Path(__file__).parent / "template_complete.docx"
|
||
|
||
doc = Document(doc_path)
|
||
body = doc.element.body
|
||
|
||
# 获取保护边界位置
|
||
protection_boundary = find_health_program_boundary(doc)
|
||
print(f" [保护] 格式整理时跳过前 {protection_boundary} 个元素")
|
||
|
||
# 获取"客户功能医学检测档案"区域位置
|
||
exam_file_start, exam_file_end = find_examination_file_region(doc)
|
||
if exam_file_start >= 0:
|
||
print(f" [保护] 格式整理时跳过'客户功能医学检测档案'区域: {exam_file_start}-{exam_file_end}")
|
||
|
||
# 模块标题关键词(与清理函数保持一致)
|
||
module_keywords = [
|
||
'urine detection', 'urine test', '尿液检测', 'complete blood count', '血常规',
|
||
'blood sugar', 'glucose', '血糖', 'lipid panel', 'lipid profile', '血脂',
|
||
'blood type', '血型', 'coagulation', 'blood coagulation', '凝血',
|
||
'infectious disease', 'four infectious', '传染病', '传染病四项',
|
||
'electrolyte', 'serum electrolyte', '电解质', '血清电解质',
|
||
'liver function', '肝功能', 'kidney function', '肾功能',
|
||
'cardiac enzyme', 'myocardial enzyme', 'enzyme spectrum', '心肌酶', '心肌酶谱',
|
||
'thyroid', 'thyroid function', '甲状腺', '甲状腺功能',
|
||
'cardiovascular', 'thromboembolism', '心血管', '心脑血管',
|
||
'bone metabolism', '骨代谢',
|
||
'trace element', 'heavy metal', 'microelement', '微量元素', '重金属',
|
||
'lymphocyte', 'lymphocyte subpopulation', '淋巴细胞', '淋巴细胞亚群',
|
||
'humoral immunity', '体液免疫', 'immune function', '免疫功能',
|
||
'inflammation', 'inflammatory', '炎症', '炎症反应',
|
||
'autoantibody', 'autoimmune', '自身抗体', '自身免疫',
|
||
'female hormone', '女性激素', '女性荷尔蒙', 'male hormone', '男性激素', '男性荷尔蒙',
|
||
'gynecological', 'female-specific', '妇科', '女性专项',
|
||
'tumor marker', '肿瘤标记物', '肿瘤标志物',
|
||
'imaging', '影像',
|
||
]
|
||
|
||
exclude_keywords = ['health program', 'health report', 'abnormal', '异常', 'overall', 'assessment',
|
||
'medical intervention', '医学干预', 'functional medical health advice', '功能医学健康建议']
|
||
|
||
def is_module_title_paragraph(text):
    """Check whether a paragraph's text is a module title.

    The text must be non-empty and no longer than 100 characters. After
    lower-casing and stripping it must not begin with a chapter prefix
    ('(i)', '(ii)', '(iii)', 'i.', 'ii.', 'iii.'), must not contain any of
    ``exclude_keywords`` and must contain at least one of
    ``module_keywords`` (both lists come from the enclosing scope).

    Args:
        text: Paragraph text; empty or None is accepted.

    Returns:
        bool: True if the paragraph is considered a module title.
    """
    if not text or len(text) > 100:
        return False

    candidate = text.lower().strip()

    # Chapter headings start with a roman numeral, bracketed or dotted.
    if candidate.startswith(('(i)', '(ii)', '(iii)')):
        return False
    if candidate.startswith(('i.', 'ii.', 'iii.')):
        return False

    is_excluded = any(word in candidate for word in exclude_keywords)
    if is_excluded:
        return False

    for word in module_keywords:
        if word in candidate:
            return True
    return False
|
||
|
||
def is_module_title_table(elem):
    """Return True when a table element looks like a module-title table.

    The table's flattened text must be non-empty and at most 200 characters,
    must not contain any entry of ``exclude_keywords``, and must contain some
    entry of ``module_keywords`` at least twice (title tables repeat the
    module name). Both keyword lists come from the enclosing scope.
    """
    flattened = ''.join(elem.itertext()).strip()
    if len(flattened) == 0 or len(flattened) > 200:
        return False

    lowered = flattened.lower()

    # Chapter headings and summary sections are never module titles.
    if any(excluded in lowered for excluded in exclude_keywords):
        return False

    # A keyword that occurs twice necessarily occurs once, so a single
    # count check covers both conditions.
    return any(lowered.count(keyword) >= 2 for keyword in module_keywords)
|
||
|
||
def is_in_protected_region(idx):
    """Return True when body index ``idx`` lies inside a protected region.

    Two regions are protected: everything before ``protection_boundary`` and
    the half-open range ``[exam_file_start, exam_file_end)``. The latter only
    applies when ``exam_file_start`` is non-negative. All three values are
    read from the enclosing scope.
    """
    return idx < protection_boundary or (
        0 <= exam_file_start <= idx < exam_file_end
    )
|
||
|
||
def create_page_break_paragraph():
    """Build and return a ``w:p`` element holding a single page break.

    The resulting structure is ``<w:p><w:r><w:br w:type="page"/></w:r></w:p>``.
    """
    page_break = OxmlElement('w:br')
    page_break.set(qn('w:type'), 'page')

    run = OxmlElement('w:r')
    run.append(page_break)

    paragraph = OxmlElement('w:p')
    paragraph.append(run)
    return paragraph
|
||
|
||
# 第一步:清理多余的空白段落和占位符段落(跳过保护区域)
|
||
removed_count = 0
|
||
children = list(body)
|
||
prev_was_empty_p = False
|
||
|
||
# 需要删除的占位符文本
|
||
placeholder_texts = ['testing result检测结果', 'testing result 检测结果']
|
||
|
||
for i, elem in enumerate(children):
|
||
# 跳过保护区域(包括前四页和"客户功能医学检测档案"区域)
|
||
if is_in_protected_region(i):
|
||
prev_was_empty_p = False # 重置状态,避免跨区域删除
|
||
continue
|
||
if elem.tag.endswith('}p'):
|
||
text = ''.join(elem.itertext()).strip()
|
||
text_lower = text.lower().replace(' ', '')
|
||
has_break = elem.find('.//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}br') is not None
|
||
|
||
# 删除 "Testing Result检测结果" 占位符段落
|
||
if any(ph.replace(' ', '') in text_lower for ph in placeholder_texts):
|
||
try:
|
||
body.remove(elem)
|
||
removed_count += 1
|
||
continue
|
||
except:
|
||
pass
|
||
|
||
if not text and not has_break:
|
||
if prev_was_empty_p:
|
||
try:
|
||
body.remove(elem)
|
||
removed_count += 1
|
||
except:
|
||
pass
|
||
else:
|
||
prev_was_empty_p = True
|
||
else:
|
||
prev_was_empty_p = False
|
||
else:
|
||
prev_was_empty_p = False
|
||
|
||
# 第二步:在模块标题前插入分页符(每个模块都需要,跳过保护区域)
|
||
# 注意:模块标题可能是段落(<p>)或表格(<tbl>)
|
||
# 重新计算保护区域边界(因为第一步删除元素后位置偏移)
|
||
protection_boundary = find_health_program_boundary(doc)
|
||
exam_file_start, exam_file_end = find_examination_file_region(doc)
|
||
pagebreak_count = 0
|
||
children = list(body) # 重新获取
|
||
|
||
for i, elem in enumerate(children):
|
||
# 跳过保护区域
|
||
if is_in_protected_region(i):
|
||
continue
|
||
|
||
is_title = False
|
||
|
||
# 检查段落类型的模块标题
|
||
if elem.tag.endswith('}p'):
|
||
text = ''.join(elem.itertext()).strip()
|
||
if is_module_title_paragraph(text):
|
||
is_title = True
|
||
|
||
# 检查表格类型的模块标题
|
||
elif elem.tag.endswith('}tbl'):
|
||
if is_module_title_table(elem):
|
||
is_title = True
|
||
|
||
if is_title:
|
||
# 检查前面是否已经有分页符
|
||
has_pagebreak_before = False
|
||
if i > 0:
|
||
prev_elem = children[i-1]
|
||
if prev_elem.tag.endswith('}p'):
|
||
prev_break = prev_elem.find('.//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}br')
|
||
if prev_break is not None and prev_break.get(qn('w:type')) == 'page':
|
||
has_pagebreak_before = True
|
||
|
||
if not has_pagebreak_before:
|
||
# 在模块标题前插入分页符
|
||
pb = create_page_break_paragraph()
|
||
elem.addprevious(pb)
|
||
pagebreak_count += 1
|
||
|
||
# 第2.3步:清理特定模块后的空白页
|
||
# 特殊处理:某些模块后面容易产生空白页(凝血功能、骨代谢等)
|
||
def clean_module_trailing_blanks(body, module_keywords, next_module_keywords):
    """Remove surplus blank paragraphs directly in front of a module's tables.

    For every table child of ``body`` whose lower-cased text contains one of
    ``module_keywords``, walk backwards over the immediately preceding
    paragraphs and:

    * remove paragraphs that have neither text nor a ``w:br`` element;
    * keep the text-less ``w:br`` paragraph closest to the table and remove
      any further text-less ``w:br`` paragraphs;
    * stop at the first paragraph with text or the first non-paragraph.

    Note that any ``w:br`` counts as a break here; its ``w:type`` is not
    inspected.

    Args:
        body: Parent element whose direct children are scanned and pruned.
        module_keywords: Lower-case keywords identifying the module's tables.
        next_module_keywords: Unused; kept so existing callers keep working.

    Returns:
        The number of elements removed from ``body``.
    """
    br_path = './/{http://schemas.openxmlformats.org/wordprocessingml/2006/main}br'

    # One snapshot for the whole pass. Indices into it stay valid while
    # elements are removed from ``body``; re-listing ``body`` mid-loop would
    # pair old indices with a shorter list and hit the wrong elements.
    snapshot = list(body)
    removed_count = 0

    for index, elem in enumerate(snapshot):
        if not elem.tag.endswith('}tbl'):
            continue
        table_text = ''.join(elem.itertext()).strip().lower()
        if not any(kw in table_text for kw in module_keywords):
            continue

        kept_break = False
        # The backward walk always stops at the previous table, so it never
        # reaches elements an earlier iteration has already removed.
        for prev_elem in reversed(snapshot[:index]):
            if not prev_elem.tag.endswith('}p'):
                break  # not a paragraph
            if ''.join(prev_elem.itertext()).strip():
                break  # paragraph with real content

            has_break = prev_elem.find(br_path) is not None
            if has_break and not kept_break:
                kept_break = True  # keep exactly one break before the table
                continue

            body.remove(prev_elem)
            removed_count += 1

    return removed_count
|
||
|
||
# 清理凝血功能模块数据表格前的空白
|
||
removed = clean_module_trailing_blanks(body, ['coagulation', '凝血'], ['infectious', '传染病'])
|
||
if removed > 0:
|
||
print(f" 🧹 清理凝血功能模块前 {removed} 个空白元素")
|
||
|
||
# 清理骨代谢模块数据表格前的空白
|
||
removed = clean_module_trailing_blanks(body, ['bone metabolism', '骨代谢'], ['microelement', '微量元素'])
|
||
if removed > 0:
|
||
print(f" 🧹 清理骨代谢模块前 {removed} 个空白元素")
|
||
|
||
# 清理骨代谢模块数据表格后、微量元素分页符前的空段落
|
||
def clean_between_modules(body, current_module_keywords, next_module_keywords):
    """Remove blank paragraphs directly in front of the next module's title.

    The first child of ``body`` whose lower-cased text contains one of
    ``next_module_keywords`` is taken as the next module's title. Walking
    backwards from it, paragraphs with neither text nor a ``w:br`` element
    are removed. The walk stops at the first table, the first paragraph with
    text, or the first paragraph containing any ``w:br``, which is kept.
    Children that are neither paragraphs nor tables are skipped over.

    Args:
        body: Parent element whose direct children are scanned and pruned.
        current_module_keywords: Unused; kept so existing callers keep working.
        next_module_keywords: Lower-case keywords identifying the next module.

    Returns:
        The number of paragraphs removed, or 0 when no child matches
        ``next_module_keywords``.
    """
    w_ns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    children = list(body)

    # Locate the first element that mentions the next module.
    next_module_pos = -1
    for pos, candidate in enumerate(children):
        candidate_text = ''.join(candidate.itertext()).strip().lower()
        if any(kw in candidate_text for kw in next_module_keywords):
            next_module_pos = pos
            break

    if next_module_pos < 0:
        return 0

    removed_count = 0
    for elem in reversed(children[:next_module_pos]):
        if elem.tag.endswith('}tbl'):
            break  # reached the previous module's table
        if not elem.tag.endswith('}p'):
            continue  # skip non-paragraph, non-table children

        has_text = bool(''.join(elem.itertext()).strip())
        has_break = elem.find(f'.//{w_ns}br') is not None
        if has_text or has_break:
            # Content, or the break that separates the modules: keep it and
            # stop. The walk never continues past a break, so there is no
            # "second page break" case to handle.
            break

        body.remove(elem)
        removed_count += 1

    return removed_count
|
||
|
||
removed = clean_between_modules(body, ['bone metabolism', '骨代谢'], ['microelement', '微量元素'])
|
||
if removed > 0:
|
||
print(f" 🧹 清理骨代谢模块后 {removed} 个空白元素")
|
||
|
||
# 第2.5步:在保护区域之后的所有图片前添加分页符
|
||
# 重要:只处理保护区域之后的图片,前四页的图片不能添加分页符
|
||
safe_save(doc, output_path, template_path_local)
|
||
doc = Document(output_path)
|
||
body = doc.element.body
|
||
children = list(body)
|
||
health_program_pos = find_health_program_boundary(doc)
|
||
|
||
print(f" [图片分页] 保护边界位置: {health_program_pos}")
|
||
|
||
# 模块标题关键词(用于判断图片是否是页面底部的logo图片)
|
||
module_keywords = [
|
||
'urine', 'blood', 'sugar', 'lipid', 'coagulation', 'infectious', 'electrolyte',
|
||
'liver', 'kidney', 'myocardial', 'thyroid', 'thromboembolism', 'bone', 'microelement',
|
||
'immunity', 'inflammatory', 'autoantibody', 'hormone', 'tumor', 'lymphocyte', 'imaging',
|
||
'尿液', '血常规', '血糖', '血脂', '凝血', '传染病', '电解质', '肝功能', '肾功能',
|
||
'心肌酶', '甲状腺', '血栓', '骨代谢', '微量元素', '免疫', '炎症', '自身抗体',
|
||
'激素', '肿瘤', '淋巴', '影像'
|
||
]
|
||
|
||
def is_logo_image(children, img_idx):
    """Return True when the image at ``img_idx`` is followed by a module title.

    Up to four elements after the image are inspected. If the lower-cased
    text of any of them contains an entry of ``module_keywords`` (read from
    the enclosing scope), the image is treated as a page-bottom logo.
    """
    upper_bound = min(img_idx + 5, len(children))
    return any(
        any(
            keyword in ''.join(children[pos].itertext()).strip().lower()
            for keyword in module_keywords
        )
        for pos in range(img_idx + 1, upper_bound)
    )
|
||
|
||
# 先收集所有需要添加分页符的图片元素
|
||
# 注意:不再在图片前添加分页符,因为这会导致空白页
|
||
# 分页符应该在模块标题前添加,而不是在logo图片前
|
||
images_need_pagebreak = []
|
||
# 暂时禁用图片分页符功能,因为它会导致空白页
|
||
# for i, elem in enumerate(children):
|
||
# ...
|
||
|
||
# 然后统一添加分页符(避免循环中修改列表导致的问题)
|
||
image_pagebreak_count = 0
|
||
for elem in images_need_pagebreak:
|
||
pb = create_page_break_paragraph()
|
||
elem.addprevious(pb)
|
||
image_pagebreak_count += 1
|
||
|
||
if image_pagebreak_count > 0:
|
||
print(f" 📷 在 {image_pagebreak_count} 个图片前插入分页符")
|
||
|
||
# 第三步:清理文档末尾的空白内容(空段落、分页符、空表格)
|
||
# 从后往前删除,直到遇到有内容的元素
|
||
children = list(body)
|
||
removed_tail = 0
|
||
for i in range(len(children) - 1, -1, -1):
|
||
elem = children[i]
|
||
tag = elem.tag.split('}')[-1]
|
||
|
||
# 跳过sectPr(文档设置)
|
||
if tag == 'sectPr':
|
||
continue
|
||
|
||
# 检查是否是空段落或只有分页符的段落
|
||
if tag == 'p':
|
||
text = ''.join(elem.itertext()).strip()
|
||
if not text:
|
||
try:
|
||
body.remove(elem)
|
||
removed_tail += 1
|
||
continue
|
||
except:
|
||
pass
|
||
else:
|
||
break # 遇到有内容的段落,停止
|
||
|
||
# 检查是否是空表格(只有标题行没有数据)
|
||
elif tag == 'tbl':
|
||
# 找到对应的Table对象
|
||
is_empty_table = True
|
||
for t in doc.tables:
|
||
if t._tbl is elem:
|
||
# 检查表格是否有实际数据
|
||
for row in t.rows:
|
||
row_text = ' '.join([c.text.strip() for c in row.cells]).lower()
|
||
if row_text and 'clinical significance' not in row_text:
|
||
# 检查是否是数据行(包含数字或结果)
|
||
import re
|
||
if re.search(r'\d', row_text) or any(kw in row_text for kw in ['positive', 'negative', 'normal']):
|
||
is_empty_table = False
|
||
break
|
||
break
|
||
|
||
if is_empty_table:
|
||
try:
|
||
body.remove(elem)
|
||
removed_tail += 1
|
||
continue
|
||
except:
|
||
pass
|
||
else:
|
||
break # 遇到有数据的表格,停止
|
||
else:
|
||
break # 遇到其他类型元素,停止
|
||
|
||
if removed_tail > 0:
|
||
print(f" 🧹 清理文档末尾 {removed_tail} 个空白元素")
|
||
|
||
# 第三步:清理连续的分页符(避免空白页)
|
||
# 重新加载文档
|
||
safe_save(doc, output_path, template_path_local)
|
||
doc = Document(output_path)
|
||
body = doc.element.body
|
||
children = list(body)
|
||
w_ns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
|
||
|
||
removed_pagebreaks = 0
|
||
|
||
# 清理分页符前面的空段落(这会导致空白页)
|
||
i = 0
|
||
while i < len(children):
|
||
elem = children[i]
|
||
if elem.tag.endswith('}p'):
|
||
br = elem.find(f'.//{w_ns}br')
|
||
if br is not None and br.get(f'{w_ns}type') == 'page':
|
||
text = ''.join(elem.itertext()).strip()
|
||
if not text: # 这是一个分页符段落
|
||
# 检查前面是否有空段落,如果有就删除
|
||
if i > 0:
|
||
prev_elem = children[i - 1]
|
||
if prev_elem.tag.endswith('}p'):
|
||
prev_text = ''.join(prev_elem.itertext()).strip()
|
||
prev_br = prev_elem.find(f'.//{w_ns}br')
|
||
if not prev_text and prev_br is None:
|
||
# 前面是空段落,删除它
|
||
try:
|
||
body.remove(prev_elem)
|
||
children = list(body)
|
||
removed_pagebreaks += 1
|
||
continue # 不增加i,继续检查
|
||
except:
|
||
pass
|
||
i += 1
|
||
|
||
# 清理连续的分页符
|
||
children = list(body)
|
||
i = 0
|
||
while i < len(children) - 1:
|
||
elem = children[i]
|
||
next_elem = children[i + 1]
|
||
|
||
if elem.tag.endswith('}p'):
|
||
br = elem.find(f'.//{w_ns}br')
|
||
if br is not None and br.get(f'{w_ns}type') == 'page':
|
||
text = ''.join(elem.itertext()).strip()
|
||
if not text:
|
||
if next_elem.tag.endswith('}p'):
|
||
next_br = next_elem.find(f'.//{w_ns}br')
|
||
next_text = ''.join(next_elem.itertext()).strip()
|
||
|
||
if next_br is not None and next_br.get(f'{w_ns}type') == 'page' and not next_text:
|
||
try:
|
||
body.remove(elem)
|
||
children = list(body)
|
||
removed_pagebreaks += 1
|
||
continue
|
||
except:
|
||
pass
|
||
|
||
elif not next_text and next_br is None:
|
||
try:
|
||
body.remove(next_elem)
|
||
children = list(body)
|
||
removed_pagebreaks += 1
|
||
continue
|
||
except:
|
||
pass
|
||
i += 1
|
||
|
||
# 第四步:删除表头前面的多余分页符
|
||
# 表头前面不应该有分页符(分页符应该在模块标题前面)
|
||
children = list(body)
|
||
removed_header_pagebreaks = 0
|
||
i = 1
|
||
while i < len(children):
|
||
elem = children[i]
|
||
if elem.tag.endswith('}tbl'):
|
||
# 检查是否是表头表格
|
||
text = ''.join(elem.itertext()).strip().lower()
|
||
if 'abb' in text and 'project' in text and 'result' in text:
|
||
# 这是表头表格,检查前面是否有分页符
|
||
if i > 0:
|
||
prev_elem = children[i - 1]
|
||
if prev_elem.tag.endswith('}p'):
|
||
br = prev_elem.find(f'.//{w_ns}br')
|
||
if br is not None and br.get(f'{w_ns}type') == 'page':
|
||
prev_text = ''.join(prev_elem.itertext()).strip()
|
||
if not prev_text:
|
||
try:
|
||
body.remove(prev_elem)
|
||
children = list(body)
|
||
removed_header_pagebreaks += 1
|
||
continue # 不增加i
|
||
except:
|
||
pass
|
||
i += 1
|
||
|
||
if removed_pagebreaks > 0:
|
||
print(f" 🧹 清理 {removed_pagebreaks} 个连续分页符")
|
||
if removed_header_pagebreaks > 0:
|
||
print(f" 🧹 清理表头前 {removed_header_pagebreaks} 个多余分页符")
|
||
|
||
# 使用安全保存
|
||
safe_save(doc, output_path, template_path_local)
|
||
print(f"\n✓ 格式整理完成: 清理了 {removed_count} 个多余空白段落, 插入 {pagebreak_count} 个模块间分页符")
|
||
|
||
return doc
|
||
|
||
|
||
def _print_banner(title, leading_newline=True):
    """Print a step title framed by two 60-character '=' rules."""
    rule = "=" * 60
    print(("\n" + rule) if leading_newline else rule)
    print(title)
    print(rule)


def _load_valid_cache(extracted_file, pdf_files_info):
    """Return the cached extraction dict, or None when it must be rebuilt.

    The cache is rejected when the file is missing, cannot be read or parsed,
    lists a different set of PDF names than ``pdf_files_info``, or records a
    different modification time for any PDF.
    """
    if not extracted_file.exists():
        return None

    try:
        with open(extracted_file, 'r', encoding='utf-8') as f:
            cached_data = json.load(f)
        if not isinstance(cached_data, dict):
            raise ValueError("cache root is not a JSON object")
    except (OSError, ValueError) as exc:
        # A damaged cache must not abort the run; fall back to re-extraction.
        print(f" ⚠️ 缓存文件读取失败,将重新提取: {exc}")
        return None

    cached_pdf_info = cached_data.get('pdf_files', {})
    current_names = set(pdf_files_info)
    cached_names = set(cached_pdf_info)

    if current_names != cached_names:
        new_files = current_names - cached_names
        removed_files = cached_names - current_names
        if new_files:
            print(f" 📄 检测到新增PDF文件: {', '.join(new_files)}")
        if removed_files:
            print(f" 📄 检测到删除PDF文件: {', '.join(removed_files)}")
        return None

    for fname, mtime in pdf_files_info.items():
        # Any difference invalidates the cache, including a file that was
        # replaced by one with an older timestamp.
        if mtime != cached_pdf_info[fname]:
            print(f" 📄 检测到PDF文件已更新: {fname}")
            return None

    return cached_data


def _collect_abnormal_items(matched_data):
    """Return summary dicts for every matched item flagged as high or low."""
    abnormal_flags = ('↑', '↓', 'H', 'L', '高', '低')
    return [
        {
            'abb': abb,
            'name': data.get('project', abb),
            'result': data.get('result', ''),
            'point': data.get('point', ''),
            'reference': data.get('reference', ''),
            'unit': data.get('unit', ''),
        }
        for abb, data in matched_data.items()
        if data.get('point', '') in abnormal_flags
    ]


def main(force_extract=False, use_deepseek=False, deepseek_api_key=None):
    """Run the whole pipeline: extract PDF data and fill the Word template.

    The steps are: detect the protected region of the template, load cached
    extraction data or re-run OCR, match items against the template config,
    optionally enrich them with DeepSeek, fill the template, add the abnormal
    index summary, clean and reformat the document, restore the protected
    region, fill in patient information, optionally generate health advice,
    and repair the footer.

    Args:
        force_extract: Ignore the cache and always re-extract the PDF data.
        use_deepseek: Use DeepSeek to supplement and analyse the data.
        deepseek_api_key: DeepSeek API key; when truthy it also replaces the
            module-level ``DEEPSEEK_API_KEY``.

    NOTE(review): the PDF input directory is a hard-coded absolute Windows
    path; confirm it is valid on the machine running this.
    """
    global DEEPSEEK_API_KEY

    from datetime import datetime

    # ---- Path configuration ----
    base_dir = Path(__file__).parent
    pdf_dir = r"c:\Users\UI\Desktop\医疗报告\医疗报告智能体"
    template_config_path = base_dir / "abb_mapping_config.json"
    word_template_path = base_dir / "template_complete.docx"
    reports_dir = base_dir / "reports"
    reports_dir.mkdir(exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = reports_dir / f"filled_report_{timestamp}.docx"
    extracted_file = base_dir / "extracted_medical_data.json"

    # ---- Protected-region boundary ----
    # Nothing is backed up here; later steps skip the protected region and
    # step 8 copies it back from the template.
    _print_banner('[PROTECT] 检测保护区域边界(前四页)')
    template_doc = Document(word_template_path)
    protection_boundary = find_health_program_boundary(template_doc)
    print(f' 保护边界位置: {protection_boundary}')
    print(' 说明: 保护区域内的元素将在各处理步骤中被跳过')
    del template_doc  # release the template document

    # ---- Step 1: obtain the extracted data (cache or OCR) ----
    _print_banner("步骤1: 获取检测数据 (百度OCR)", leading_newline=False)

    pdf_files = list(Path(pdf_dir).glob("*.pdf"))
    pdf_files_info = {str(f.name): f.stat().st_mtime for f in pdf_files}

    cached_data = None
    if not force_extract:
        cached_data = _load_valid_cache(extracted_file, pdf_files_info)

    if cached_data is not None:
        print(f" ✓ 发现缓存数据: {extracted_file}")
        extracted_items = cached_data.get('items', [])
        patient_info = cached_data.get('patient_info', {})
        print(f" ✓ 从缓存读取 {len(extracted_items)} 个检测项")
        if patient_info:
            print(f" ✓ 从缓存读取患者信息: {patient_info.get('name', '未知')}")
        print(" 💡 如需重新提取,请删除缓存文件或使用 --force 参数")
    else:
        if force_extract:
            print(" 📄 强制重新提取...")
        else:
            print(" 📄 检测到文件变化,开始OCR提取...")

        # The OCR text is returned alongside the items so it can be reused
        # below without a second OCR pass.
        extracted_items, ocr_texts = extract_all_pdfs(pdf_dir)
        print(f"\n共提取 {len(extracted_items)} 个检测项")

        # Patient information is taken from the first OCR text only.
        patient_info = {}
        if ocr_texts:
            print("\n 📋 提取患者基本信息...")
            first_ocr_text = next(iter(ocr_texts.values()))
            patient_info = extract_patient_info(first_ocr_text)
            print(f" 姓名: {patient_info.get('name', '未提取')}")
            print(f" 性别: {patient_info.get('gender', '未提取')}")
            print(f" 年龄: {patient_info.get('age', '未提取')}")
            print(f" 体检时间: {patient_info.get('exam_time', '未提取')}")
            print(f" 报告时间: {patient_info.get('report_time', '未提取')}")

        # Persist items together with the PDF names and mtimes so the next
        # run can decide whether the cache is still valid.
        with open(extracted_file, 'w', encoding='utf-8') as f:
            json.dump({
                'total_items': len(extracted_items),
                'items': extracted_items,
                'pdf_files': pdf_files_info,
                'patient_info': patient_info,
            }, f, ensure_ascii=False, indent=2)
        print(f"✓ 数据已保存到: {extracted_file}")

    # Publish the API key through the module-level global.
    if deepseek_api_key:
        DEEPSEEK_API_KEY = deepseek_api_key

    # ---- Step 2: match against the template structure ----
    _print_banner("步骤2: 与模板结构匹配")
    with open(template_config_path, 'r', encoding='utf-8') as f:
        template_config = json.load(f)
    matched_data = match_with_template(extracted_items, template_config)

    # ---- Step 2.5: DeepSeek adds reference ranges / abnormal flags ----
    if use_deepseek and deepseek_api_key:
        _print_banner("步骤2.5: 智能补充参考范围和异常判断")
        matched_data = enhance_data_with_deepseek(matched_data, deepseek_api_key)

    # ---- Step 3: fill the Word template ----
    _print_banner("步骤3: 填入Word模板")
    fill_word_template(word_template_path, matched_data, output_path, deepseek_api_key, patient_info)

    # ---- Step 4: extra items ----
    # Deliberately disabled: step 3 already classifies these items through
    # DeepSeek. Re-enabling means calling
    # extra_items_handler.process_extra_items(extracted_items,
    # str(output_path), deepseek_api_key) after teaching that module to skip
    # items already handled in step 3.
    _print_banner("步骤4: 处理额外检测项目")
    print(" ℹ️ 额外项目已在步骤3中通过DeepSeek分类处理")

    # ---- Step 5: abnormal index summary ----
    _print_banner("步骤5: 填充异常指标汇总")
    abnormal_items = _collect_abnormal_items(matched_data)

    if abnormal_items:
        print(f" 发现 {len(abnormal_items)} 个异常项目")
        doc = Document(output_path)
        from health_content_generator import fill_abnormal_index_summary, generate_item_explanations

        # The DeepSeek callable is only handed over when DeepSeek is enabled.
        item_explanations = generate_item_explanations(
            abnormal_items,
            deepseek_api_key,
            call_deepseek_api if use_deepseek else None,
        )

        fill_abnormal_index_summary(doc, abnormal_items, item_explanations)
        from xml_safe_save import safe_save
        safe_save(doc, output_path, word_template_path)
    else:
        print(" 没有异常项目")

    # ---- Step 6: remove empty data rows ----
    _print_banner("步骤6: 清理空白数据行")
    clean_empty_rows(output_path, output_path, patient_info)

    # ---- Step 7: spacing and page breaks ----
    _print_banner("步骤7: 格式整理(表格间空行 + 模块间分页符)")
    format_document_structure(output_path, output_path)

    # ---- Step 8: restore the protected region from the template ----
    _print_banner("步骤8: 修复保护区域(前四页)")
    print(" 策略: 从原始模板复制前四页,保留所有图片和布局")
    copy_protected_region_from_template(word_template_path, output_path, protection_boundary)

    # ---- Step 8.5: patient information ----
    _print_banner("步骤8.5: 填充患者基本信息")
    if patient_info and any(patient_info.values()):
        doc = Document(output_path)
        fill_patient_info_in_template(doc, patient_info)
        doc.save(output_path)
        print(" ✓ 患者信息已填充")
    else:
        print(" ⚠️ 未提取到患者信息,跳过填充")

    # ---- Step 9: health assessment and advice (optional) ----
    # Must run after step 8, which re-copies the first pages from the
    # template.
    if use_deepseek and deepseek_api_key:
        _print_banner("步骤9: 生成健康评估与建议内容")
        doc = Document(output_path)
        from health_content_generator import generate_and_fill_health_content as gen_health
        gen_health(doc, matched_data, deepseek_api_key, call_deepseek_api)
        # Saved directly rather than through safe_save so the page breaks
        # are not overwritten.
        doc.save(output_path)
        print(" ✓ 健康内容已保存")

    # ---- Step 10: footer repair ----
    _print_banner("步骤10: 修复页脚")
    fix_footer_reference(word_template_path, output_path)

    print("\n" + "=" * 60)
    print("✅ 全部完成!")
    print(f"✅ 输出文件: {output_path}")
    print("=" * 60)
|
||
|
||
if __name__ == '__main__':
    import os

    cli_args = sys.argv

    force = any(flag in cli_args for flag in ('--force', '-f'))
    # DeepSeek analysis is on unless explicitly switched off.
    use_deepseek = '--no-deepseek' not in cli_args

    # API key lookup: the module-level DEEPSEEK_API_KEY, else the environment
    # variable; an --api-key / -k argument, when present, overrides both.
    deepseek_key = DEEPSEEK_API_KEY or os.environ.get('DEEPSEEK_API_KEY', '')
    for flag, value in zip(cli_args, cli_args[1:]):
        if flag in ('--api-key', '-k'):
            deepseek_key = value
            break

    if use_deepseek and not deepseek_key:
        for message in (
            "⚠️ 使用DeepSeek需要提供API Key",
            " 方法1: 在代码中设置 DEEPSEEK_API_KEY",
            " 方法2: 设置环境变量 DEEPSEEK_API_KEY",
            " 方法3: 使用参数 --api-key YOUR_KEY",
        ):
            print(message)
        sys.exit(1)

    rule = "=" * 60
    print(rule)
    print(" 医疗报告智能提取与填充系统")
    print(rule)
    print(" OCR提取: 百度高精度OCR")
    print(f" 智能分析: {'DeepSeek ✓' if use_deepseek else '关闭'}")
    print(f" 强制刷新: {'是' if force else '否'}")
    print(rule)

    main(force_extract=force, use_deepseek=use_deepseek, deepseek_api_key=deepseek_key)
|