Files
yiliao/backend/extract_template_explanations.py

141 lines
4.8 KiB
Python
Raw Permalink Normal View History

"""
从模板文件中提取所有检测项目的临床意义
"""
from docx import Document
import json
import re
def extract_all_explanations(template_path='template_complete.docx', document=None):
    """Extract the clinical-significance text of every test item in the template.

    Every table with at least two rows is scanned row by row. A short first
    cell (fewer than 40 characters, not starting with 'Clinical' / '临床')
    is remembered as the current abbreviation. Any cell that contains both
    the 'Clinical Significance:' and '临床意义:' markers is split into its
    English and Chinese halves and stored under that abbreviation.

    Args:
        template_path: Path of the .docx template to open. Only used when
            ``document`` is None.
        document: Optional already-loaded document. Any object exposing
            ``tables`` -> ``rows`` -> ``cells`` -> ``text`` is accepted;
            when given, no file is opened.

    Returns:
        dict mapping the normalized abbreviation (upper-cased, ' - '
        collapsed to '-', full-width parentheses converted to ASCII) to
        ``{'clinical_en': ..., 'clinical_cn': ...}``. When an abbreviation
        occurs more than once, the first occurrence is kept.
    """
    if document is None:
        document = Document(template_path)

    # Full-width parentheses are written as escapes so the literals cannot be
    # silently lost; replacing the empty string instead would insert a
    # parenthesis between every character of the key.
    fullwidth_parens = str.maketrans({'\uff08': '(', '\uff09': ')'})

    explanations = {}
    for table in document.tables:
        rows = table.rows
        if len(rows) < 2:
            continue

        # The abbreviation row precedes its clinical-significance row, so the
        # last abbreviation seen is carried across rows of the same table.
        current_abb = None
        for row in rows:
            cells = row.cells
            if not cells:
                continue

            first_cell_text = cells[0].text.strip()

            # Skip header rows.
            if 'Abb' in first_cell_text or '简称' in first_cell_text:
                continue

            # A short first cell that is not itself a clinical-significance
            # paragraph is taken to be an abbreviation.
            if (
                first_cell_text
                and len(first_cell_text) < 40
                and not first_cell_text.startswith(('Clinical', '临床'))
            ):
                current_abb = first_cell_text

            for cell in cells:
                text = cell.text.strip()
                if 'Clinical Significance:' not in text or '临床意义:' not in text:
                    continue

                # Exactly one Chinese marker is required so that the text
                # splits cleanly into an English and a Chinese half.
                parts = text.split('临床意义:')
                if len(parts) != 2:
                    continue

                en = parts[0].replace('Clinical Significance:', '').strip()
                cn = parts[1].strip()
                if not (current_abb and en and cn):
                    continue

                abb_key = (
                    current_abb.upper()
                    .strip()
                    .replace(' - ', '-')
                    .translate(fullwidth_parens)
                )
                if abb_key not in explanations:
                    explanations[abb_key] = {
                        'clinical_en': en,
                        'clinical_cn': cn,
                    }
                    print(f'提取: {abb_key}')

    return explanations
def main():
    """Refresh template_explanations.json from the template and report coverage.

    Steps:
      1. Extract the clinical-significance texts from the template.
      2. Merge them into the existing template_explanations.json; on a
         conflict the template entry replaces the stored one.
      3. Write the merged result back to template_explanations.json.
      4. Read abb_mapping_config.json and print every configured
         abbreviation that has no explanation, after trying a few spelling
         variants.

    Raises:
        OSError, ValueError: if abb_mapping_config.json cannot be read or
            parsed. A missing or unreadable template_explanations.json is
            not an error; the file is created from scratch.
    """
    print('从模板提取临床意义...')
    print('=' * 60)

    template_explanations = extract_all_explanations()
    print(f'\n从模板提取了 {len(template_explanations)} 个项目')

    # Load the existing file. Only I/O and decoding/parsing failures are
    # treated as "no usable file"; anything else (e.g. KeyboardInterrupt)
    # propagates instead of being swallowed.
    try:
        with open('template_explanations.json', 'r', encoding='utf-8') as f:
            loaded = json.load(f)
    except (OSError, ValueError):
        loaded = None

    # A JSON file whose top-level value is not an object cannot be merged
    # into, so it is treated the same as a missing file.
    if isinstance(loaded, dict):
        existing = loaded
        print(f'现有文件中有 {len(existing)} 个项目')
    else:
        existing = {}
        print('创建新文件')

    # Update with the template content (the template takes precedence).
    updated_count = 0
    for abb, exp in template_explanations.items():
        if abb not in existing or existing[abb] != exp:
            existing[abb] = exp
            updated_count += 1

    with open('template_explanations.json', 'w', encoding='utf-8') as f:
        json.dump(existing, f, ensure_ascii=False, indent=2)

    print(f'\n更新了 {updated_count} 个项目')
    print(f'最终文件包含 {len(existing)} 个项目')

    # Check that every item in the config file has a clinical significance.
    print('\n' + '=' * 60)
    print('检查配置文件中的项目覆盖情况...')

    with open('abb_mapping_config.json', 'r', encoding='utf-8') as f:
        config = json.load(f)

    # Full-width parentheses are written as escapes so the literals cannot be
    # silently lost; replacing the empty string instead would insert a
    # parenthesis between every character of the abbreviation.
    fullwidth_parens = str.maketrans({'\uff08': '(', '\uff09': ')'})

    config_abbs = set()
    for module_data in config.get('modules', {}).values():
        for item in module_data.get('items', []):
            abb = item.get('abb', '').upper().strip()
            abb = abb.replace(' - ', '-').translate(fullwidth_parens)
            config_abbs.add(abb)

    missing = []
    for abb in config_abbs:
        if abb in existing:
            continue
        # Try a few spelling variants before declaring the item missing.
        variants = (
            abb.replace('-', ' '),
            abb.replace(' ', '-'),
            abb.replace('%', ''),
            abb + ' COUNT',
            abb + ' TYPE',
        )
        if not any(variant in existing for variant in variants):
            missing.append(abb)

    if missing:
        print(f'\n缺失临床意义的项目 ({len(missing)}):')
        for abb in sorted(missing):
            print(f'  {abb}')
    else:
        print('\n所有配置项目都有临床意义!')
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()