1045 lines
39 KiB
Python
1045 lines
39 KiB
Python
|
|
"""
|
|||
|
|
使用docxtpl填充Word模板
|
|||
|
|
"""
|
|||
|
|
from docxtpl import DocxTemplate
|
|||
|
|
import json
|
|||
|
|
from pathlib import Path
|
|||
|
|
|
|||
|
|
|
|||
|
|
def clean_extracted_data(items: list) -> list:
    """Clean extracted lab items: split unit/reference text and drop invalid rows.

    Each item is a dict read with the keys ``result``, ``unit``, ``reference``,
    ``project`` and ``abb``. Items are modified in place (``result``, ``unit``
    and ``reference`` may be rewritten) and the surviving items are returned in
    their original order.

    Args:
        items: list of item dicts.

    Returns:
        The list of items that were kept.
    """
    import re

    # Values that mean "no real result was captured".
    placeholder_results = ['.', ':', '-', '/', '', None]
    # Project-name fragments that indicate a mis-recognised (OCR) row ...
    suspicious_project_keywords = ['phase', 'antibody', 'treponema']
    # ... when combined with one of these easily mis-matched abbreviations.
    easily_mismatched_abbs = ['PH', 'CU', 'CL', 'CA']

    cleaned = []

    for item in items:
        result = item.get('result', '')
        unit = item.get('unit', '')
        reference = item.get('reference', '')
        # The value may be null in the JSON input; treat that as empty text
        # instead of crashing on ``None.lower()``.
        project = str(item.get('project') or '')

        if result in placeholder_results:
            # The real result may have been captured at the start of the unit
            # text, e.g. "Yellow [Normal...]".
            if not unit:
                continue
            result_in_unit = re.match(r'^([A-Za-z]+)\s*\[', unit)
            if not result_in_unit:
                continue  # nothing usable: skip the row
            item['result'] = result_in_unit.group(1)
            unit = re.sub(r'^[A-Za-z]+\s*', '', unit)

        # Skip rows whose project name is obviously wrong for the abbreviation.
        project_lower = project.lower()
        if any(kw in project_lower for kw in suspicious_project_keywords):
            abb = str(item.get('abb') or '').upper()
            if abb in easily_mismatched_abbs:
                continue

        # Move "[Normal : ...]" / "(...)" range information out of the unit.
        if unit:
            normal_match = re.search(r'\[Normal\s*[::]\s*([^\]]+)\]', unit, re.IGNORECASE)
            if normal_match:
                if not reference:
                    item['reference'] = normal_match.group(1).strip()
                unit = re.sub(r'\[Normal\s*[::][^\]]+\]', '', unit, flags=re.IGNORECASE).strip()

            # A parenthesised numeric range such as "(1.0-5.0)".
            range_match = re.search(r'\([\d\.\-<>]+\)', unit)
            if range_match and not reference:
                item['reference'] = range_match.group(0)
                unit = re.sub(r'\([\d\.\-<>]+\)', '', unit).strip()

            # Drop a leading number, which is likely a parsing artefact.
            unit = re.sub(r'^-?\d+\s*', '', unit).strip()

            item['unit'] = unit

        cleaned.append(item)

    return cleaned
|
|||
|
|
|
|||
|
|
|
|||
|
|
def select_best_match(items_by_abb: dict) -> dict:
    """Pick the best entry for every abbreviation that has several candidates.

    Scoring per candidate:
      * +10 if the result contains a digit,
      * +8  if the result is a known qualitative word (negative, positive, ...),
      * +5  if a reference value is present,
      * +3  if a unit is present and shorter than 20 characters.

    The highest score wins; on a tie the earliest candidate is kept.

    Args:
        items_by_abb: mapping of abbreviation -> list of candidate item dicts.

    Returns:
        Mapping of abbreviation -> chosen item. Abbreviations whose candidate
        list is empty are omitted.
    """
    import re

    qualitative_results = ['negative', 'positive', 'normal', 'reactive', 'non-reactive']

    def score_item(item) -> int:
        """Return the quality score of one candidate item."""
        raw_result = item.get('result', '')
        # The result may be null or a JSON number; score it on its text form.
        result = '' if raw_result is None else str(raw_result)

        score = 0
        if re.search(r'\d+\.?\d*', result):
            score += 10
        if item.get('reference'):
            score += 5
        unit = item.get('unit')
        if unit and len(unit) < 20:
            score += 3
        if result.lower() in qualitative_results:
            score += 8
        return score

    best = {}
    for abb, items in items_by_abb.items():
        if not items:
            # Nothing to choose from.
            continue
        if len(items) == 1:
            best[abb] = items[0]
            continue
        # max() returns the first maximal element, which matches a stable
        # descending sort followed by taking index 0.
        best[abb] = max(items, key=score_item)

    return best
|
|||
|
|
|
|||
|
|
|
|||
|
|
def build_context(matched_data: dict) -> dict:
    """Convert matched lab data into a flat docxtpl rendering context.

    For every abbreviation the keys ``<VAR>_result``, ``<VAR>_point``,
    ``<VAR>_refer`` and ``<VAR>_unit`` are written, where ``<VAR>`` is the
    abbreviation reduced to letters, digits and underscores (prefixed with
    ``V_`` when it would be empty or start with a digit). The same four keys
    are additionally written under:
      * an alias name, when the abbreviation is listed in the alias table;
      * a ``<MODULE>_<VAR>`` name, when the abbreviation is one of the
        duplicated abbreviations and a module could be inferred.

    Args:
        matched_data: {ABB: {result, unit, reference, point, project}}

    Returns:
        docxtpl context dict.
    """
    import re

    context = {}

    def get_module(abb, project, result):
        """Infer the module prefix from the abbreviation, project and result.

        Returns 'URINE', 'CBC' or '' when nothing could be inferred.
        """
        abb_upper = abb.upper()
        # project/result may be null or non-text in the JSON input.
        project_text = str(project or '')
        project_lower = project_text.lower()
        result_lower = str(result).lower() if result else ''

        # Project names that only occur in urinalysis.
        urine_projects = ['color', 'specific gravity', 'protein', 'glucose', 'ketone',
                          'nitrite', 'turbidity', '颜色', '比重', '蛋白', '糖', '酮体', '亚硝酸']
        if any(kw in project_lower for kw in urine_projects):
            return 'URINE'

        # A WBC row whose project is literally "WBC" is treated as urine;
        # a project mentioning "total" is treated as blood count.
        if abb_upper == 'WBC' and project_lower == 'wbc':
            return 'URINE'
        if abb_upper == 'WBC' and 'total' in project_lower:
            return 'CBC'

        # pH with a short project name is treated as urine.
        if abb_upper == 'PH' and 'ph' in project_lower and len(project_text) < 20:
            return 'URINE'

        # Qualitative results for these abbreviations are treated as urine.
        if abb_upper in ['PRO', 'GLU', 'KET', 'NIT', 'BLD'] and result_lower in ['negative', 'positive', 'trace']:
            return 'URINE'

        return ''

    def put_fields(prefix, data):
        """Write the four template variables for ``prefix`` into the context."""
        context[f"{prefix}_result"] = data.get('result', '')
        context[f"{prefix}_point"] = data.get('point', '')
        context[f"{prefix}_refer"] = data.get('reference', '')
        context[f"{prefix}_unit"] = data.get('unit', '')

    # Abbreviations that appear in more than one module.
    duplicate_abbs = ['PRO', 'WBC', 'COLOR', 'PH', 'GLU', 'SG', 'NIT', 'KET', 'BLD', 'ERY']

    # Alias table: extracted abbreviation -> template variable name, covering
    # format differences such as CA153 vs CA15_3 and CA199 vs CA19_9.
    abb_aliases = {
        'CA153': 'CA15_3',
        'CA199': 'CA19_9',
        'ABO': 'BLOODTYPE',  # ABO blood type
        'RH': 'BLOODTYPERH',  # Rh blood type
        'CKMB': 'CK_MB',  # cardiac enzyme
    }

    for abb, data in matched_data.items():
        # Normalise to a valid variable name (letters, digits, underscore).
        var_name = abb.replace('-', '_').replace('/', '_').replace('%', 'pct')
        var_name = re.sub(r'[^a-zA-Z0-9_]', '', var_name)

        abb_upper = abb.upper()

        # Also expose the values under the alias name, if any.
        if abb_upper in abb_aliases:
            put_fields(abb_aliases[abb_upper], data)

        if not var_name or var_name[0].isdigit():
            var_name = 'V_' + var_name

        # For duplicated abbreviations add a module-prefixed variant.
        if abb_upper in duplicate_abbs:
            module = get_module(abb, data.get('project', ''), data.get('result', ''))
            if module:
                put_fields(f"{module}_{var_name}", data)

        # Always keep the un-prefixed name for compatibility.
        put_fields(var_name, data)

    return context
|
|||
|
|
|
|||
|
|
|
|||
|
|
def fill_template(template_path: str, matched_data: dict, output_path: str):
    """Render a docxtpl template with the matched data and save the result.

    Args:
        template_path: path of the docxtpl-format template.
        matched_data: matched data, converted to a context by build_context().
        output_path: path the rendered document is saved to.

    Returns:
        The rendered DocxTemplate object.
    """
    template = DocxTemplate(template_path)

    # Flatten the matched data into template variables.
    render_context = build_context(matched_data)
    print(f"准备填充 {len(render_context)} 个变量")

    # Render, then write the document to disk.
    template.render(render_context)
    template.save(output_path)
    print(f"[OK] 已保存到: {output_path}")

    return template
|
|||
|
|
|
|||
|
|
|
|||
|
|
def clean_empty_rows(doc_path: str, output_path: str):
    """Remove empty data rows/tables from a filled report and tidy its layout.

    The document at ``doc_path`` is processed in these phases and then saved
    to ``output_path``:

      1. Delete every table located before the first detection-module heading.
      2. Delete 3-row "special" tables (third row is a clinical-significance
         row) whose result cell is empty.
      3. Merge data tables into the preceding header-only table.
      4. Delete data rows without a result, and tables with no result at all.
      5. Make sure the row after each header acts as a title row.
      6. Delete tables with neither a header nor data.
      7. Delete whole modules (heading + tables) that contain no data.
      8. Ensure a visible blank paragraph between adjacent data tables.

    Args:
        doc_path: path of the document to clean.
        output_path: path the cleaned document is saved to.

    Returns:
        The python-docx Document object after saving.
    """
    from docx import Document
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn as oxml_qn
    from docx.text.paragraph import Paragraph
    import re
    import copy

    doc = Document(doc_path)

    # Module-title keywords (24 module categories). These are defined up
    # front because the merge phase needs them: previously they were assigned
    # only near the end of the function, so the earlier lookup raised a
    # NameError that a bare ``except`` swallowed, and the merge phase was
    # silently skipped.
    module_keywords = [
        # 1. Urine
        'urine detection', 'urine analysis', 'urinalysis', '尿液检测', '尿常规',
        # 2. Complete blood count
        'complete blood count', 'blood routine', 'cbc', '血常规',
        # 3. Blood sugar
        'blood sugar', 'glucose', 'blood glucose', '血糖', '糖代谢',
        # 4. Blood lipids
        'lipid panel', 'lipid profile', 'blood lipid', '血脂',
        # 5. Blood type
        'blood type', 'blood group', 'abo', '血型',
        # 6. Coagulation
        'coagulation', 'clotting', '凝血功能', '凝血',
        # 7. Infectious diseases
        'infectious disease', 'hepatitis', '传染病四项', '传染病',
        # 8. Electrolytes
        'electrolyte', 'serum electrolyte', '血电解质', '电解质',
        # 9. Liver function
        'liver function', 'hepatic function', '肝功能',
        # 10. Kidney function
        'kidney function', 'renal function', '肾功能',
        # 11. Cardiac enzymes
        'cardiac enzyme', 'myocardial enzyme', '心肌酶谱', '心肌酶',
        # 12. Thyroid function
        'thyroid function', 'thyroid', '甲状腺功能', '甲状腺',
        # 13. Cardio-/cerebrovascular risk factors
        'cardiovascular risk', 'cerebrovascular', '心脑血管风险因子', '心脑血管', '心血管',
        # 14. Bone metabolism
        'bone metabolism', 'bone marker', '骨代谢',
        # 15. Trace elements
        'trace element', 'microelement', 'heavy metal', '微量元素', '重金属',
        # 16. Lymphocyte subsets
        'lymphocyte subsets', 'lymphocyte subpopulation', '淋巴细胞亚群',
        # 17. Humoral immunity
        'humoral immunity', 'immunoglobulin', '体液免疫',
        # 18. Inflammation
        'inflammation', 'inflammatory', '炎症反应', '炎症',
        # 19. Autoantibodies
        'autoantibody', 'autoimmune', '自身抗体', '自身免疫',
        # 20. Female hormones
        'female hormone', 'estrogen', 'progesterone', '女性荷尔蒙', '女性激素',
        # 21. Male hormones
        'male hormone', 'testosterone', 'androgen', '男性荷尔蒙', '男性激素',
        # 22. Tumor markers
        'tumor marker', 'cancer marker', '肿瘤标记物', '肿瘤标志物',
        # 23. Imaging
        'imaging', 'radiology', 'ultrasound', 'x-ray', 'ct', 'mri', '影像学检查', '影像',
        # 24. Female-specific examinations
        'female specific', 'gynecological', 'gynecology', '女性专项检查', '妇科',
    ]

    # Texts containing any of these are never treated as module titles.
    exclude_keywords = [
        'client health program', '客户健康方案',
        'health report', '健康报告',
        'overall health', '整体健康',
        'health assessment', '健康评估',
        'abnormal index', '异常指标',
        'be.u', 'wellness center',
        'name', 'gender', 'age', 'nation',  # user-information fields
        '姓名', '性别', '年龄', '国籍',
    ]

    # === Phase 1: delete all tables before the first detection module ===
    # (the "abnormal index summary" area).
    body_early = doc._body._body
    children_early = list(body_early)

    # Keywords for the first-module search in this phase.
    detection_kw = ['urine detection', '尿液检测', 'complete blood count', '血常规',
                    'blood sugar', '血糖', 'blood lipid', '血脂', 'liver function', '肝功能',
                    'kidney function', '肾功能', 'thyroid', '甲状腺', 'coagulation', '凝血',
                    'infectious', '传染病', 'electrolyte', '电解质']
    exclude_kw = ['health program', '健康方案', 'health report', '健康报告',
                  'abnormal', '异常', 'overall', '整体', 'assessment', '评估',
                  'blood glucose', 'hematology', 'hormonal', 'immunology', 'nutrition']

    # NOTE(review): when no module heading is found this index stays at the
    # end of the body, so every top-level table is deleted below - confirm
    # that this is intended.
    first_module_idx = len(children_early)
    for idx, elem in enumerate(children_early):
        if elem.tag.endswith('}p'):
            try:
                txt = Paragraph(elem, doc).text.strip().lower()
                # Module titles are short texts containing a detection keyword.
                if txt and len(txt) < 80:
                    is_mod = any(k in txt for k in detection_kw)
                    is_exc = any(k in txt for k in exclude_kw)
                    if is_mod and not is_exc:
                        first_module_idx = idx
                        print(f" 找到第一个检测模块: 位置{idx}")
                        break
            except Exception:
                pass

    removed_early = 0
    for idx, elem in enumerate(children_early):
        if idx >= first_module_idx:
            break
        if elem.tag.endswith('}tbl'):
            try:
                elem.getparent().remove(elem)
                removed_early += 1
            except Exception:
                pass

    if removed_early > 0:
        print(f"[OK] 删除异常指标汇总区域表格: {removed_early} 个")

    removed_rows = 0
    merged_count = 0

    def has_data_in_row(cells):
        """Return True when the row's result column holds a real result."""
        valid_qualitative = [
            'negative', 'positive', 'normal', 'reactive', 'non-reactive',
            'trace', 'clear', 'cloudy', 'turbid',
            'yellow', 'pale yellow', 'dark yellow', 'amber', 'straw',  # urine colour
            'red', 'brown', 'green', 'orange',
            'a', 'b', 'ab', 'o', 'rh+', 'rh-',  # blood type
            'detected', 'not detected', 'present', 'absent'
        ]

        # Only the result column decides, so Project/Refer text is never
        # mistaken for a result. Expected layouts:
        # - 11 cells: 0 ABB, 1-2 Project, 3-4 Result, 5-6 Point, 7-8 Refer, 9-10 Unit
        # - 6 cells:  0 ABB, 1 Project, 2 Result, 3 Point, 4 Refer, 5 Unit
        if len(cells) >= 11:
            result_col_candidates = [3, 4]
        elif len(cells) >= 6:
            result_col_candidates = [2, 3]
        else:
            result_col_candidates = [2]

        result_candidates = []
        for col_idx in result_col_candidates:
            if col_idx < len(cells):
                txt = (cells[col_idx].text or '').strip()
                if txt:
                    result_candidates.append(txt)
        result_text = result_candidates[0] if result_candidates else ''

        if not result_text:
            return False
        if result_text in ['', '-', '/', ' ', '.', ':', '{{', '}}']:
            return False
        # An unrendered template placeholder is not data.
        if result_text.startswith('{{'):
            return False

        # A bare numeric range (normally Refer content) is not a result.
        if re.match(r'^[\(\[]?\s*[-+]?\d+(?:\.\d+)?\s*[-–~]\s*[-+]?\d+(?:\.\d+)?\s*[\)\]]?$', result_text):
            return False

        if re.search(r'\d', result_text):
            return True
        if result_text.lower() in valid_qualitative:
            return True
        if len(result_text) > 2 and result_text.isalpha():
            return True
        return False

    def is_header_row(row_text, cells=None):
        """Return True when the (lower-cased) row text is a table header."""
        # Description rows are excluded first.
        if 'clinical significance' in row_text or '临床意义' in row_text:
            return False

        # A header needs Abb + Project + Result (English or Chinese).
        has_abb = ('abb' in row_text) or ('简称' in row_text)
        has_project = ('project' in row_text) or ('项目' in row_text)
        has_result = ('result' in row_text) or ('结果' in row_text)
        if not (has_abb and has_project and has_result):
            return False

        if cells:
            non_empty_cells = [c for c in cells if c.text.strip()]
            if len(non_empty_cells) < 2:
                return False
            if any(len(c.text.strip()) > 30 for c in cells):
                return False

        return True

    def is_description_row(row_text):
        """Return True for a clinical-significance (description) row."""
        return 'clinical significance' in row_text or '临床意义' in row_text

    def is_data_row(first_cell):
        """Return True when the first cell looks like an abbreviation."""
        if first_cell and 1 <= len(first_cell) <= 20:
            clean = re.sub(r'[^a-zA-Z0-9]', '', first_cell)
            return bool(clean) and clean.isalnum()
        return False

    def analyze_table(table):
        """Classify the rows of a table.

        Returns a dict with the header row index (-1 if none; the last match
        wins) and the row indices of description rows, data rows with a result
        and data rows without a result.
        """
        info = {'header_idx': -1, 'desc_indices': [], 'data_with_result': [], 'data_without_result': []}
        for row_idx, row in enumerate(table.rows):
            cells = row.cells
            if len(cells) < 2:
                continue
            row_text = ' '.join([c.text.strip().lower() for c in cells])
            first_cell = cells[0].text.strip()

            if is_header_row(row_text, cells):
                info['header_idx'] = row_idx
            elif is_description_row(row_text):
                info['desc_indices'].append(row_idx)
            elif is_data_row(first_cell):
                if has_data_in_row(cells):
                    info['data_with_result'].append(row_idx)
                else:
                    info['data_without_result'].append(row_idx)
        return info

    def is_special_table(table):
        """Return True for a 3-row table whose third row is a description row."""
        try:
            if len(table.rows) != 3:
                return False
            row2_text = ' '.join([c.text for c in table.rows[2].cells]).lower()
            return ('clinical significance' in row2_text) or ('临床意义' in row2_text)
        except Exception:
            return False

    def special_table_has_data(table):
        """Return True when the third cell of the second row holds a result."""
        try:
            if len(table.rows) < 2:
                return False
            cells = table.rows[1].cells
            if len(cells) < 3:
                return False
            result_text = (cells[2].text or '').strip()
            if not result_text:
                return False
            if result_text in ['', '-', '/', '.', ':']:
                return False
            if result_text.startswith('{{'):
                return False
            return True
        except Exception:
            return False

    # === Phase 2: delete empty special tables ===
    removed_special_tables = 0
    for table in list(doc.tables):
        if is_special_table(table) and not special_table_has_data(table):
            try:
                table._tbl.getparent().remove(table._tbl)
                removed_special_tables += 1
            except Exception:
                pass

    # === Phase 3: merge data tables into the preceding header-only table ===
    # Record the tables in body order together with their body element index.
    body = doc._body._body
    table_order = []
    table_elem_indices = {}
    body_children = list(body)
    tables_by_elem = {t._tbl: t for t in doc.tables}
    for idx, elem in enumerate(body_children):
        if elem.tag.endswith('}tbl'):
            t = tables_by_elem.get(elem)
            if t is not None:
                table_order.append(t)
                table_elem_indices[t] = idx

    # Position of the first detection-module heading; tables before it are
    # not merge targets.
    first_module_elem_idx = len(body_children)  # default: end of body
    for idx, elem in enumerate(body_children):
        if elem.tag.endswith('}p'):
            try:
                txt = Paragraph(elem, doc).text.strip().lower()
                if txt and len(txt) < 50:
                    is_module = any(kw in txt for kw in module_keywords)
                    is_exclude = any(kw in txt for kw in exclude_keywords)
                    if is_module and not is_exclude:
                        first_module_elem_idx = idx
                        break
            except Exception:
                pass

    # Candidates are searched only up to the next header-only table so data
    # is not pulled across modules.
    tables_to_remove = set()
    for i, t1 in enumerate(table_order):
        if t1 in tables_to_remove:
            continue

        # Skip tables before the first detection module.
        if table_elem_indices.get(t1, 0) < first_module_elem_idx:
            continue

        info1 = analyze_table(t1)

        if info1['header_idx'] >= 0 and len(info1['data_with_result']) == 0:
            next_header_pos = None
            for k in range(i + 1, len(table_order)):
                if table_order[k] in tables_to_remove:
                    continue
                k_info = analyze_table(table_order[k])
                if k_info['header_idx'] >= 0 and len(k_info['data_with_result']) == 0:
                    next_header_pos = k
                    break
            search_end = next_header_pos if next_header_pos is not None else len(table_order)

            candidates = []
            for j in range(i + 1, search_end):
                if table_order[j] in tables_to_remove:
                    continue
                candidate = table_order[j]
                candidate_info = analyze_table(candidate)
                if len(candidate_info['data_with_result']) > 0:
                    candidates.append((candidate, candidate_info))

            if not candidates:
                continue

            # The project name of the first data row becomes the title.
            title_text = ''
            try:
                first_candidate, first_candidate_info = candidates[0]
                if first_candidate_info.get('data_with_result'):
                    data_row_idx = first_candidate_info['data_with_result'][0]
                    if len(first_candidate.rows[data_row_idx].cells) > 1:
                        title_text = first_candidate.rows[data_row_idx].cells[1].text.strip()
                    if not title_text:
                        title_text = first_candidate.rows[data_row_idx].cells[0].text.strip()
            except Exception:
                title_text = ''

            # Clear old rows after the header, keeping the row right after
            # the header (if any) as the title-row structure.
            header_idx = info1['header_idx']
            title_row_idx = header_idx + 1
            keep_title_row = title_row_idx < len(t1.rows)
            delete_from = (title_row_idx + 1) if keep_title_row else (header_idx + 1)
            for ridx in range(len(t1.rows) - 1, delete_from - 1, -1):
                try:
                    t1._tbl.remove(t1.rows[ridx]._tr)
                    removed_rows += 1
                except Exception:
                    pass

            if not keep_title_row:
                try:
                    # Place the copied row directly after the header row.
                    # (Inserting by row index into the tbl element would count
                    # its non-row children as well.)
                    header_tr = t1.rows[header_idx]._tr
                    header_tr.addnext(copy.deepcopy(header_tr))
                except Exception:
                    pass

            try:
                if title_row_idx < len(t1.rows):
                    title_row = t1.rows[title_row_idx]
                    for c in title_row.cells:
                        c.text = ''
                    if title_text:
                        title_row.cells[0].text = title_text
            except Exception:
                pass

            for candidate, candidate_info in candidates:
                for row_idx in candidate_info['data_with_result'] + candidate_info['desc_indices']:
                    new_tr = copy.deepcopy(candidate.rows[row_idx]._tr)
                    t1._tbl.append(new_tr)

                tables_to_remove.add(candidate)
                merged_count += 1

    for t in tables_to_remove:
        try:
            t._tbl.getparent().remove(t._tbl)
        except Exception:
            pass

    # === Phase 4: delete rows/tables without data ===
    # 1. no data row has a result -> delete the whole table
    # 2. some rows have a result  -> delete only the rows without one
    tables_to_delete = []

    for table in doc.tables:
        info = analyze_table(table)
        data_with = info['data_with_result']
        data_without = info['data_without_result']

        if len(data_with) == 0 and len(data_without) > 0:
            tables_to_delete.append(table)
            continue

        if len(data_with) > 0 and len(data_without) > 0:
            for row_idx in sorted(data_without, reverse=True):
                try:
                    table._tbl.remove(table.rows[row_idx]._tr)
                    removed_rows += 1
                except Exception:
                    pass

    for table in tables_to_delete:
        try:
            table._tbl.getparent().remove(table._tbl)
            removed_rows += 1
        except Exception:
            pass

    # === Phase 5: make sure the row after the header is a title row ===
    for table in doc.tables:
        info = analyze_table(table)
        if info['header_idx'] < 0:
            continue
        if len(info['data_with_result']) == 0:
            continue

        title_row_idx = info['header_idx'] + 1
        if title_row_idx >= len(table.rows):
            continue

        try:
            title_row = table.rows[title_row_idx]
            # If the row after the header is itself a data row, insert a
            # separate title row in front of it.
            try:
                first_cell = title_row.cells[0].text.strip() if title_row.cells else ''
                if is_data_row(first_cell) and has_data_in_row(title_row.cells):
                    extracted_title = ''
                    try:
                        if len(title_row.cells) > 1:
                            extracted_title = title_row.cells[1].text.strip()
                        if not extracted_title:
                            extracted_title = title_row.cells[0].text.strip()
                    except Exception:
                        extracted_title = ''

                    # Copy the header row and place the copy right after it.
                    header_tr = table.rows[info['header_idx']]._tr
                    header_tr.addnext(copy.deepcopy(header_tr))
                    title_row = table.rows[title_row_idx]
                    try:
                        for c in title_row.cells:
                            c.text = ''
                        if extracted_title:
                            title_row.cells[0].text = extracted_title
                    except Exception:
                        pass
                    continue
            except Exception:
                pass

            # A non-empty row after the header is left untouched.
            if any((c.text or '').strip() for c in title_row.cells):
                continue

            first_data_idx = info['data_with_result'][0]
            if first_data_idx >= len(table.rows):
                continue
            data_row = table.rows[first_data_idx]

            title_text = ''
            if len(data_row.cells) > 1:
                title_text = data_row.cells[1].text.strip()
            if not title_text:
                title_text = data_row.cells[0].text.strip()
            if not title_text:
                continue

            for c in title_row.cells:
                c.text = ''
            title_row.cells[0].text = title_text
        except Exception:
            pass

    # === Phase 6: delete tables with neither data nor a header ===
    removed_tables = 0
    for table in list(doc.tables):
        info = analyze_table(table)
        if len(info['data_with_result']) == 0 and info['header_idx'] < 0:
            try:
                table._tbl.getparent().remove(table._tbl)
                removed_tables += 1
            except Exception:
                pass

    # === Phase 7/8 helpers: document structure ===
    def contains_exclude_keyword(text: str) -> bool:
        """Return True when the text contains an exclude keyword."""
        text_lower = text.lower()
        return any(kw in text_lower for kw in exclude_keywords)

    def is_module_title_table(table):
        """Return True when one of the first two rows names a module."""
        if len(table.rows) < 1:
            return False
        try:
            for row_idx in range(min(2, len(table.rows))):
                row_text = ' '.join([c.text.lower().strip() for c in table.rows[row_idx].cells])
                # Exclude keywords take precedence.
                if contains_exclude_keyword(row_text):
                    return False
                for kw in module_keywords:
                    if kw in row_text:
                        return True
        except Exception:
            pass
        return False

    def table_has_data(table):
        """Return True when the table has at least one data row with a result."""
        info = analyze_table(table)
        return len(info['data_with_result']) > 0

    def is_module_title_paragraph(p_text: str) -> bool:
        """Return True when the paragraph text is a module title."""
        if not p_text:
            return False
        text = p_text.strip().lower()
        if not text:
            return False
        # Titles are short; longer text is treated as body text.
        if len(text) > 40:
            return False
        if contains_exclude_keyword(text):
            return False
        return any(kw in text for kw in module_keywords)

    body = doc._body._body
    body_children = list(body)

    tbl_map = {t._tbl: t for t in doc.tables}

    def get_table_from_elem(elem):
        """Return the Table wrapper for a body tbl element, or None."""
        return tbl_map.get(elem)

    def is_blank_paragraph_elem(elem):
        """Return True when the paragraph element has no visible text."""
        try:
            p = Paragraph(elem, doc)
            return p.text.strip() == ''
        except Exception:
            return False

    def create_visible_blank_paragraph():
        """Create a paragraph with zero spacing and a single-space run."""
        p = OxmlElement('w:p')
        pPr = OxmlElement('w:pPr')
        spacing = OxmlElement('w:spacing')
        spacing.set(oxml_qn('w:after'), '0')
        spacing.set(oxml_qn('w:before'), '0')
        pPr.append(spacing)
        p.append(pPr)

        r = OxmlElement('w:r')
        t = OxmlElement('w:t')
        t.text = ' '
        r.append(t)
        p.append(r)
        return p

    def is_module_start_elem(elem):
        """Return True when the body element starts a module (table or paragraph title)."""
        if elem.tag.endswith('}tbl'):
            t = get_table_from_elem(elem)
            return bool(t) and is_module_title_table(t)
        if elem.tag.endswith('}p'):
            try:
                p = Paragraph(elem, doc)
                return is_module_title_paragraph(p.text)
            except Exception:
                return False
        return False

    # Body indices of all module starts.
    module_start_indices = [i for i, e in enumerate(body_children) if is_module_start_elem(e)]

    # === Phase 7: delete modules without any data ===
    # When no table of a module has data, the module title and everything up
    # to the next module start is removed.
    removed_modules = 0
    elements_removed_in_modules = 0

    if module_start_indices:
        # Back to front; slices refer to the body_children snapshot.
        for idx in range(len(module_start_indices) - 1, -1, -1):
            start_i = module_start_indices[idx]
            end_i = module_start_indices[idx + 1] if idx + 1 < len(module_start_indices) else len(body_children)

            module_elements = body_children[start_i:end_i]

            module_has_data = False
            module_tables = []
            for e in module_elements:
                if e.tag.endswith('}tbl'):
                    t = get_table_from_elem(e)
                    if t:
                        module_tables.append(e)
                        if table_has_data(t):
                            module_has_data = True

            if not module_has_data and module_tables:
                for e in reversed(module_elements):
                    # Keep the body's section properties: the last module's
                    # range runs to the end of the body and would otherwise
                    # take the sectPr element with it.
                    if e.tag.endswith('}sectPr'):
                        continue
                    try:
                        e.getparent().remove(e)
                        elements_removed_in_modules += 1
                    except Exception:
                        pass
                removed_modules += 1

    # Re-read the body: indices changed after the deletions.
    body = doc._body._body
    body_children = list(body)

    # === Phase 8: blank line between adjacent data tables of a module ===
    space_count = 0
    current_module_started = False

    i = 0
    while i < len(body_children):
        elem = body_children[i]

        if is_module_start_elem(elem):
            current_module_started = True
            i += 1
            continue

        if current_module_started and elem.tag.endswith('}tbl'):
            t = get_table_from_elem(elem)
            is_title = bool(t) and is_module_title_table(t)
            is_data = bool(t) and (not is_title) and table_has_data(t)

            if is_data:
                # Walk back over blank paragraphs to the previous real element.
                j = i - 1
                while j >= 0 and body_children[j].tag.endswith('}p') and is_blank_paragraph_elem(body_children[j]):
                    j -= 1

                prev_is_data_table = False
                if j >= 0 and body_children[j].tag.endswith('}tbl'):
                    prev_t = get_table_from_elem(body_children[j])
                    if prev_t and (not is_module_title_table(prev_t)) and table_has_data(prev_t):
                        prev_is_data_table = True

                if prev_is_data_table:
                    prev_elem = body_children[i - 1] if i - 1 >= 0 else None

                    if not (prev_elem is not None and prev_elem.tag.endswith('}p') and is_blank_paragraph_elem(prev_elem)):
                        # Case 1: the table directly follows the previous
                        # table -> insert a visible blank paragraph.
                        empty_p = create_visible_blank_paragraph()
                        body.insert(i, empty_p)
                        space_count += 1
                        body_children = list(body)
                        i += 1  # the current table moved one slot down
                    else:
                        # Case 2: a blank paragraph exists but has no run ->
                        # add a single-space run to it.
                        try:
                            p_elem = prev_elem
                            has_run = any(c.tag.endswith('}r') for c in list(p_elem))
                            if not has_run:
                                r = OxmlElement('w:r')
                                tt = OxmlElement('w:t')
                                tt.text = ' '
                                r.append(tt)
                                p_elem.append(r)
                                space_count += 1
                        except Exception:
                            pass

        # Paragraphs and all other elements need no action here.
        i += 1

    doc.save(output_path)
    print(f"[OK] 清理完成: 删除 {removed_rows} 行, 合并 {merged_count} 对表格, 删除 {removed_tables} 个空表格")
    print(f"[OK] 清理特殊表格: 删除 {removed_special_tables} 个空特殊表格")
    print(f"[OK] 结构整理: 删除 {removed_modules} 个无数据模块, 删除 {elements_removed_in_modules} 个模块元素, 插入 {space_count} 个表格间空行")
    return doc
|
|||
|
|
|
|||
|
|
def main():
    """Fill the Word template with extracted data and save a new versioned report.

    Data source priority:

    1. ``deepseek_processed_data.json`` next to this file: loaded and passed
       to ``fill_template`` as-is, skipping the local matching step.
    2. ``extracted_medical_data.json`` next to this file: items are cleaned
       with ``clean_extracted_data``, grouped by upper-cased ABB, reduced with
       ``select_best_match`` and matched against the ABB configuration.

    If neither file exists an error message is printed and nothing is written.
    The result is saved as ``reports/filled_report_v<N>.docx`` where ``N`` is
    one more than the highest existing version number.
    """
    # Path configuration.
    # NOTE(review): these two paths are absolute and machine-specific, while
    # every other path below is derived from this file's location - confirm
    # whether they should be made relative as well.
    template_path = r"c:\Users\UI\Desktop\医疗报告\template_docxtpl.docx"
    filled_path = r"c:\Users\UI\Desktop\医疗报告\backend\reports\filled_docxtpl_temp.docx"
    base_dir = Path(__file__).parent
    reports_dir = base_dir / "reports"
    reports_dir.mkdir(parents=True, exist_ok=True)

    def get_next_output_path() -> str:
        """Return the path of the next unused ``filled_report_v<N>.docx``."""
        prefix = "filled_report_v"
        max_v = 0
        for report in reports_dir.glob("filled_report_v*.docx"):
            try:
                version = int(report.stem.split(prefix, 1)[1])
            except (IndexError, ValueError):
                # Suffix is not an integer, or the prefix differs in case
                # (glob can be case-insensitive on some filesystems).
                continue
            max_v = max(max_v, version)
        return str(reports_dir / f"{prefix}{max_v + 1}.docx")

    def fill_and_clean(matched) -> None:
        """Fill the template, clean the result, and always drop the temp file."""
        try:
            print("\n步骤1: 填充数据...")
            fill_template(template_path, matched, filled_path)

            print("\n步骤2: 清理空白行...")
            clean_empty_rows(filled_path, output_path)
        finally:
            # Remove the intermediate document even when a step above fails,
            # so a failed run does not leave a stale temp file behind.
            temp_file = Path(filled_path)
            if temp_file.exists():
                temp_file.unlink()

        print(f"\n[SUCCESS] 完成! 输出: {output_path}")

    output_path = get_next_output_path()

    # Prefer the DeepSeek-processed data when it is available.
    deepseek_file = base_dir / "deepseek_processed_data.json"
    extracted_file = base_dir / "extracted_medical_data.json"

    # Load the ABB configuration (only consumed by the local-matching path,
    # but loaded up front exactly as before).
    from config import load_abb_config
    abb_config = load_abb_config()

    if deepseek_file.exists():
        print("使用DeepSeek处理后的数据")
        with open(deepseek_file, 'r', encoding='utf-8') as f:
            matched_data = json.load(f)
        print(f"加载 {len(matched_data)} 个匹配项")

        # Data is already matched: fill directly and skip the matching step.
        fill_and_clean(matched_data)
        return

    # Fallback: local processing of the raw extracted data.
    if not extracted_file.exists():
        print("[ERROR] 未找到提取数据,请先运行 deepseek_process.py 或 extract_and_fill_report.py")
        return

    with open(extracted_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # The file holds either {"items": [...]} or a bare list of items.
    extracted_items = data.get('items', []) if isinstance(data, dict) else data

    # Clean the data (separates units from reference ranges).
    extracted_items = clean_extracted_data(extracted_items)

    print(f"加载 {len(extracted_items)} 个提取项")

    # Build the ABB lookup; an ABB such as "A/B" is also reachable through
    # each of its parts.
    template_abbs = {}
    for abb_upper, info in abb_config.get('abb_to_info', {}).items():
        template_abbs[abb_upper] = info
        if '/' in abb_upper:
            for part in abb_upper.split('/'):
                part = part.strip()
                # An empty part (e.g. from "A/") would substring-match every
                # extracted ABB in the fuzzy step below, so leave it out.
                if part:
                    template_abbs[part] = info

    # Group the extracted items by upper-cased ABB.
    items_by_abb = {}
    for item in extracted_items:
        abb = (item.get('abb') or '').upper()
        if not abb:
            # No usable ABB: it cannot be matched reliably, and an empty
            # string would substring-match the first template ABB.
            continue
        items_by_abb.setdefault(abb, []).append(item)

    # Pick the best candidate for every ABB.
    best_items = select_best_match(items_by_abb)

    # Match against the template: exact ABB first, then the first template
    # ABB that contains, or is contained in, the extracted ABB.
    matched_data = {}
    for abb, item in best_items.items():
        if abb in template_abbs:
            matched_data[abb] = item
            continue
        for t_abb in template_abbs:
            if abb and t_abb and (abb in t_abb or t_abb in abb):
                matched_data[t_abb] = item
                break

    print(f"清理后 {len(best_items)} 个有效项, 匹配 {len(matched_data)} 个")

    fill_and_clean(matched_data)


if __name__ == "__main__":
    main()