"""
Optimized medical-data parsing module: handles multiple OCR report formats
(English and Chinese).
"""
|
|||
|
|
import re
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ============================================================
# Chinese health-check report parsing
# (row layout: test name, result, reference value, unit)
# ============================================================

# Chinese test name -> standard abbreviation (ABB).
# NOTE: the insertion order of this dict is NOT length-sorted. Substring
# lookups must iterate CN_SORTED_KEYS (longest name first) so that a short
# name such as '胆红素' never shadows a longer one such as '总胆红素'.
CN_NAME_TO_ABB = {
    # Urinalysis
    '颜色': 'Color', '透明度': 'Clarity', '比重': 'SG', '酸碱度': 'pH',
    '蛋白质': 'PRO', '葡萄糖': 'GLU', '酮体': 'KET', '胆红素': 'BIL',
    '尿胆原': 'URO', '亚硝酸盐': 'NIT', '白细胞酯酶': 'LEU', '隐血': 'BLD',
    # Blood group
    'ABO血型': 'ABO', 'Rh(D)血型': 'Rh(D)',
    # Complete blood count
    '中性粒细胞百分率(NEUT%)': 'NEUT%', '中性粒细胞数(NEUT#)': 'NEUT',
    '淋巴细胞百分率(LYMPH%)': 'LYMPH%', '淋巴细胞数(LYMPH#)': 'LYMPH',
    '单核细胞百分率(MONO%)': 'MONO%', '单核细胞数(MONO#)': 'MONO',
    '嗜酸性粒细胞百分率(EO%)': 'EOS%', '嗜酸性粒细胞数(EO#)': 'EOS',
    '嗜碱性粒细胞百分率(BASO%)': 'BAS%', '嗜碱性粒细胞数(BASO#)': 'BAS',
    '白细胞计数(WBC)': 'WBC', '白细胞计数': 'WBC',
    '红细胞计数(RBC)': 'RBC', '红细胞计数': 'RBC',
    '血红蛋白量(HGB)': 'Hb', '血红蛋白量': 'Hb', '血红蛋白': 'Hb',
    '红细胞比积(HCT)': 'HCT', '红细胞比积': 'HCT', '红细胞压积': 'HCT',
    '平均红细胞体积(MCV)': 'MCV', '平均红细胞体积': 'MCV',
    '平均红细胞血红蛋白量(MCH)': 'MCH', '平均红细胞血红蛋白量': 'MCH',
    '平均红细胞血红蛋白浓度(MCHC)': 'MCHC', '平均红细胞血红蛋白浓度': 'MCHC',
    '红细胞分布宽度-标准差(RDW-SD)': 'RDW-SD', '红细胞分布宽度-变异系数(RDW-CV)': 'RDW',
    '血小板计数(PLT)': 'PLT', '血小板计数': 'PLT',
    '血小板比积(PCT)': 'PCT', '平均血小板体积(MPV)': 'MPV',
    '血小板分布宽度(PDW)': 'PDW', '大型血小板比率(P-LCR)': 'P-LCR',
    # Liver function
    '总胆红素': 'TBil', '直接胆红素': 'DBil', '间接胆红素': 'IBil',
    '总蛋白': 'TP', '白蛋白': 'ALB', '球蛋白': 'GLB', '白球比值': 'A/G',
    '谷丙转氨酶': 'ALT', '谷草转氨酶': 'AST',
    'γ-谷氨酰基转移酶': 'GGT', 'γ-谷氨酰转移酶': 'GGT',
    '碱性磷酸酶': 'ALP',
    '乳酸脱氢酶': 'LDH', '转铁蛋白': 'Tf',
    '胆碱酯酶': 'CHE',
    # Kidney function
    '尿素': 'BUN', '肌酐': 'Scr', '尿酸': 'UA',
    '胱抑素C': 'CysC', '血清β2微球蛋白': 'β2-MG',
    # Blood lipids
    '甘油三酯': 'TG', '总胆固醇': 'TC',
    '高密度脂蛋白胆固醇': 'HDL', '低密度脂蛋白胆固醇': 'LDL',
    '游离脂肪酸': 'FFA', '脂蛋白(a)': 'Lp(a)',
    # Blood glucose
    '葡萄糖(空腹)': 'FBS', '胰岛素(空腹)': 'INS',
    '糖化血红蛋白': 'HbA1C',
    # Cardiac enzymes
    '肌酸激酶同工酶MB': 'CK-MB', '肌酸激酶': 'CK',
    # Cardiovascular risk factors
    '超敏C反应蛋白': 'hs-CRP', '同型半胱氨酸': 'Hcy',
    # Thyroid
    '三碘甲状腺原氨酸T3': 'T3', '甲状腺素T4': 'T4',
    '游离三碘甲状腺原氨酸FT3': 'FT3', '游离甲状腺素FT4': 'FT4',
    '促甲状腺素TSH': 'TSH', '甲状腺球蛋白': 'Tg',
    '抗甲状腺球蛋白抗体': 'TgAb', '抗甲状腺过氧化物酶抗体': 'TPO-Ab',
    # Gastric function
    '胃蛋白酶原I': 'PGI', '胃蛋白酶原Ⅱ': 'PGII', '胃蛋白酶原比值': 'PGR',
    # The numeral may arrive as ASCII letters or as a Roman-numeral code
    # point. Without the ASCII 'II' key, '胃蛋白酶原II' would match the
    # shorter '...I' key and leave a stray 'I' in front of the result.
    '胃蛋白酶原II': 'PGII', '胃蛋白酶原Ⅰ': 'PGI',
    '胃泌素-17': 'G-17',
    # Infectious disease
    '乙肝表面抗原': 'HBsAg', '乙肝表面抗体': 'HBsAb',
    '乙肝e抗原': 'HBeAg', '乙肝e抗体': 'HBeAb', '乙肝核心抗体': 'HBcAb',
    # Rheumatology / immunology
    'C反应蛋白': 'CRP', '抗链球菌溶血素"0"': 'ASO', '抗链球菌溶血素': 'ASO',
    '抗核抗体': 'ANA', '类风湿因子': 'RF',
    # Electrolytes
    '钾': 'K', '钠': 'Na', '氯': 'Cl', '总钙': 'Ca', '磷': 'P',
    # Bone metabolism
    '甲状旁腺素': 'PTH', '骨钙素': 'OST',
    # Anaemia / vitamins
    '维生素B12': 'VitB12', '血清铁蛋白': 'Fer',
    '维生素B9(叶酸)血药浓度测定': 'Folate', '叶酸': 'Folate',
    '25-羟基维生素D血药浓度测定': '25-OH-VD2+D3', '25-羟基维生素D': '25-OH-VD2+D3',
    '25-羟基维生素D3血药浓度测定': 'VD3', '25-羟基维生素D2血药浓度测定': 'VD2',
    '维生素A血药浓度测定': 'VitA', '维生素E血药浓度测定': 'VitE',
    '维生素K1血药浓度测定': 'VitK1',
    '维生素B1血药浓度测定': 'VitB1', '维生素B2血药浓度测定': 'VitB2',
    '维生素B3血药浓度测定': 'VitB3', '维生素B5血药浓度测定': 'VitB5',
    '维生素B6血药浓度测定': 'VitB6',
    # Tumour markers
    '甲胎蛋白': 'AFP', '癌胚抗原': 'CEA',
    '糖类抗原19-9': 'CA19-9', '糖类抗原72-4': 'CA72-4',
    '糖类抗原24-2': 'CA24-2', '糖类抗原50': 'CA50',
    '糖类抗原125': 'CA125',
    '神经元特异性烯醇化酶': 'NSE', '细胞角蛋白19片段': 'CYFRA21-1',
    '鳞状细胞癌相关抗原': 'SCC',
    '胃泌素释放肽前体': 'ProGRP',
    '总前列腺特异抗原': 'TPSA', '游离前列腺特异抗原': 'FPSA',
    '游离PSA/总PSA': 'F/TPSA',
    # Carbon-13 urea breath test
    '碳13尿素呼气试验DOB值': 'C13-DOB',
}

# Keys ordered longest-first so the most specific name wins a substring match.
# sorted() is stable, so equally long keys keep their dict insertion order.
CN_SORTED_KEYS = sorted(CN_NAME_TO_ABB, key=len, reverse=True)

# A line containing any of these substrings is never treated as a data row.
CN_SKIP_PATTERNS = [
    '检查名称', '检查结果', '参考值', '单位',  # table header
    '打印日期', '健康管理体检报告', '身份证',  # page header
    '科室小结', '医生建议', '检查医师', '检查日期',  # non-data rows
    '既往病史', '体检单号',
    '检查所见', '检查提示', '检查结论',  # imaging descriptions
    '近3次体检', '最高值', '最低值', '结果',  # trend charts
    '健康服务中心', 'EPIQTC', 'DlanYo', 'S5-1', 'HRes',  # ultrasound device info
    '彩色血流', 'MI1.2', 'MI 0.7', 'Generi', 'A/B Ratio',
]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _is_cn_report(lines: list) -> bool:
|
|||
|
|
"""检测是否是中文体检报告格式"""
|
|||
|
|
cn_markers = 0
|
|||
|
|
for line in lines[:50]: # 只检查前50行
|
|||
|
|
if '健康管理体检报告' in line:
|
|||
|
|
return True
|
|||
|
|
if '检查名称' in line and '检查结果' in line:
|
|||
|
|
return True
|
|||
|
|
if re.match(r'^[一二三四五六七八九十]+.*检查$', line):
|
|||
|
|
cn_markers += 1
|
|||
|
|
return cn_markers >= 2
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _parse_cn_data_line(line: str, source_file: str) -> "dict | None":
    """Parse one data row of a Chinese health-check report.

    Expected layout is ``<test name> <result> <reference> <unit>``, e.g.::

        白细胞计数(WBC) 5.1 3.5-9.5 x10^9/L
        甲胎蛋白 0.5 <=7.0 ng/ml
        维生素B1血药浓度测定 ↓ 1.67 2.4-9.02 ng/ml

    The test name is located by substring search over CN_SORTED_KEYS
    (longest name first); everything after it is split on whitespace.

    Args:
        line: One OCR text line.
        source_file: Stored unchanged under the 'source' key of the result.

    Returns:
        A dict with the keys 'abb', 'project', 'result', 'point', 'unit',
        'reference' and 'source', or None when the line holds no known test
        name, nothing follows the name, or the first token after it is
        neither a known qualitative term nor a number.
    """
    # OCR frequently emits full-width parentheses. The substitution is 1:1
    # per character, so an index found in the normalised copy is valid for
    # the original line as well.
    normalized = line.replace('(', '(').replace(')', ')')

    abb = None
    project = None
    rest = ''
    for cn_key in CN_SORTED_KEYS:
        pos = normalized.find(cn_key)
        if pos != -1:
            abb = CN_NAME_TO_ABB[cn_key]
            project = cn_key
            # Keep only what follows the test name.
            rest = line[pos + len(cn_key):].strip()
            break

    if not abb or not rest:
        return None

    # Drop a parenthesised abbreviation left behind by a key that lacks it,
    # e.g. "(WBC)", "(HDL-C)", "(A/G)". The first character must not be
    # '-' or '/', so a qualitative "(-)" is never mistaken for one.
    rest = re.sub(
        r'^\s*[((][A-Za-z0-9%#][A-Za-z0-9%#/\-]*[))]', '', rest
    ).strip()

    # An abnormality marker may precede the result; '*' carries no
    # direction and is simply discarded.
    point = ''
    if rest.startswith(('↓', '↑')):
        point = rest[0]
        rest = rest[1:].strip()
    elif rest.startswith('*'):
        rest = rest[1:].strip()

    # Qualitative results. Order matters: the first prefix match wins.
    qualitative = (
        '阴性', '阳性', '弱阳性', '正常', '未检出', '未提示',
        '深黄色', '浅黄色', '黄色', '清亮', '混浊',
        'A型', 'B型', 'AB型', 'O型',
        '拒检指检', '无', '肥胖',
    )
    term = next((q for q in qualitative if rest.startswith(q)), None)
    if term is not None:
        tail = rest[len(term):].split()
        return {
            'abb': abb, 'project': project, 'result': term,
            'point': point, 'unit': '', 'reference': tail[0] if tail else '',
            'source': source_file,
        }

    # Numeric results: <value> [reference] [unit].
    parts = rest.split()
    if not parts:
        return None

    result = parts[0]
    # Optional '<' / '>' prefix, then digits and dots with at least one
    # digit, so a token made only of dots (OCR leader residue) is rejected.
    if not re.match(r'^[<>]?\.*\d[\d.]*$', result):
        return None

    reference = ''
    unit = ''
    remaining = parts[1:]
    if remaining:
        head = remaining[0]
        # A reference starts with a digit, a comparison sign, '.' or '-',
        # or is the literal "no reference range" text.
        if re.match(r'^[<>=\d\.\-]+', head) or '参考范围' in head:
            reference = head
            if len(remaining) > 1:
                unit = remaining[1]
        else:
            # No reference column: the token is taken to be the unit.
            unit = head

    # Derive the flag only when the report did not print one itself.
    if not point and reference:
        point = _infer_cn_point(result, reference)

    return {
        'abb': abb, 'project': project, 'result': result,
        'point': point, 'unit': unit, 'reference': reference,
        'source': source_file,
    }


def _infer_cn_point(result: str, reference: str) -> str:
    """Return '↑' or '↓' when *result* lies outside *reference*, else ''.

    Supported reference shapes are "low-high", "<=x", ">=x", "<x" and ">x".
    Any other shape, or a number that cannot be converted, yields ''.
    """
    try:
        val = float(result.replace('<', '').replace('>', ''))

        bounds = re.match(r'^([\d\.]+)-([\d\.]+)$', reference)
        if bounds:
            low = float(bounds.group(1))
            high = float(bounds.group(2))
            if val < low:
                return '↓'
            if val > high:
                return '↑'
            return ''

        # The two-character operators must be tested before '<' and '>'.
        if reference.startswith('<='):
            return '↑' if val > float(reference[2:]) else ''
        if reference.startswith('>='):
            return '↓' if val < float(reference[2:]) else ''

        # Strict bounds: equality is already out of range. A result that
        # is itself censored in the same direction ("<v" against "<x")
        # cannot be shown to violate the bound, so it stays unflagged.
        if reference.startswith('<'):
            limit = float(reference[1:])
            if val >= limit and not result.startswith('<'):
                return '↑'
            return ''
        if reference.startswith('>'):
            limit = float(reference[1:])
            if val <= limit and not result.startswith('>'):
                return '↓'
            return ''
    except (ValueError, TypeError):
        # Best effort: an unparsable number just means no inferred flag.
        return ''
    return ''
|
|||
|
|
|
|||
|
|
|
|||
|
|
def parse_chinese_medical_data(text: str, source_file: str) -> list:
    """Parse a Chinese health-management check-up report.

    The text is split on newlines; blank lines and lines recognised as
    non-data are dropped, and every remaining line is handed to
    _parse_cn_data_line.

    Args:
        text: Full OCR text of the report.
        source_file: Passed through to each parsed item.

    Returns:
        The list of items (dicts) that _parse_cn_data_line produced, in
        line order. Empty when nothing could be parsed.
    """
    section_heading = re.compile(r'^[一二三四五六七八九十百]+[、.]')
    has_digit = re.compile(r'\d')
    bare_date = re.compile(r'^\d{4}-\d{2}-\d{2}$')
    clock_prefix = re.compile(r'^\d+:\d+')

    items = []
    for raw in text.split('\n'):
        line = raw.strip()
        if not line:
            continue

        # Headers, footers, imaging descriptions, device banners, ...
        if any(pattern in line for pattern in CN_SKIP_PATTERNS):
            continue

        # Page header carrying the patient name.
        if line.startswith('姓名'):
            continue

        # Department heading such as "十、血常规检查".
        if section_heading.match(line):
            continue

        # Lines without any digit are treated as free-text descriptions.
        # NOTE(review): this also discards rows whose result is purely
        # qualitative (e.g. "阴性 阴性") although _parse_cn_data_line can
        # parse them - confirm that this is intended.
        if not has_digit.search(line):
            continue

        # A lone date, as printed along trend-chart axes.
        if bare_date.match(line):
            continue

        # Ultrasound image annotation starting with "digits:digits".
        # (A bare "cm" line has no digit and was already skipped above.)
        if clock_prefix.match(line):
            continue

        item = _parse_cn_data_line(line, source_file)
        if item:
            items.append(item)

    return items
|
|||
|
|
|
|||
|
|
|
|||
|
|
def parse_medical_data_v2(text: str, source_file: str) -> list:
|
|||
|
|
"""从OCR文本中解析医疗检测数据 - 优化版,支持英文+中文格式"""
|
|||
|
|
lines = [l.strip() for l in text.split('\n') if l.strip()]
|
|||
|
|
|
|||
|
|
# 自动检测报告语言/格式
|
|||
|
|
if _is_cn_report(lines):
|
|||
|
|
print(" [检测] 中文体检报告格式,使用中文解析器")
|
|||
|
|
return parse_chinese_medical_data(text, source_file)
|
|||
|
|
|
|||
|
|
# 以下是原有的英文报告解析逻辑
|
|||
|
|
items = []
|
|||
|
|
|
|||
|
|
# 项目名称到ABB的映射
|
|||
|
|
name_to_abb = {
|
|||
|
|
# 血常规 - 添加更多变体
|
|||
|
|
'mean cell hb concentration': 'MCHC', 'mchc': 'MCHC',
|
|||
|
|
'mean corpuscular hemoglobin concentration': 'MCHC',
|
|||
|
|
'mean corpuscular hemoglobin': 'MCH', 'mean cell hemoglobin': 'MCH',
|
|||
|
|
'rbc distribution width': 'RDW', 'rdw': 'RDW',
|
|||
|
|
'red cell distribution width': 'RDW',
|
|||
|
|
'total wbc': 'WBC', 'white blood cell': 'WBC', 'wbc': 'WBC', 'white blood cells': 'WBC',
|
|||
|
|
'red blood cell': 'RBC', 'rbc count': 'RBC', 'total rbc': 'RBC', 'red blood cells': 'RBC',
|
|||
|
|
'hemoglobin(hb)': 'Hb', 'hemoglobin': 'Hb',
|
|||
|
|
'hematocrit(hct)': 'HCT', 'hematocrit': 'HCT', 'hct': 'HCT',
|
|||
|
|
'mean cell volume': 'MCV', 'mcv': 'MCV', 'mean corpuscular volume': 'MCV',
|
|||
|
|
'platelet count': 'PLT', 'platelet': 'PLT', 'plt': 'PLT', 'platelets': 'PLT',
|
|||
|
|
'mean platelet volume': 'MPV', 'mpv': 'MPV',
|
|||
|
|
'neutrophil': 'NEUT', 'neut': 'NEUT', 'neutrophils': 'NEUT',
|
|||
|
|
'lymphocyte': 'LYMPH', 'lymph': 'LYMPH', 'lymphocytes': 'LYMPH',
|
|||
|
|
'monocyte': 'MONO', 'mono': 'MONO', 'monocytes': 'MONO',
|
|||
|
|
'eosinophil': 'EOS', 'eos': 'EOS', 'eosinophils': 'EOS',
|
|||
|
|
'basophil': 'BAS', 'bas': 'BAS', 'basophils': 'BAS',
|
|||
|
|
'esr': 'ESR', 'erythrocyte sedimentation': 'ESR', 'esr 1 hour': 'ESR',
|
|||
|
|
'esr 1 hour': 'ESR', # 重复确保匹配
|
|||
|
|
|
|||
|
|
# 血糖 - 使用标准ABB: FBS, HbA1C
|
|||
|
|
'glucose(fasting)': 'FBS', 'fasting glucose': 'FBS', 'glucose': 'GLU',
|
|||
|
|
'fasting blood sugar': 'FBS', 'fbs': 'FBS',
|
|||
|
|
'hba1c': 'HbA1C', 'glycated hemoglobin': 'HbA1C', 'haemoglobin a1c': 'HbA1C',
|
|||
|
|
'haemoglobin alc': 'HbA1C', 'hemoglobin a1c': 'HbA1C',
|
|||
|
|
'estimated average glucose': 'EAG',
|
|||
|
|
|
|||
|
|
# 血脂
|
|||
|
|
'hdl-cholesterol': 'HDL', 'hdl cholesterol': 'HDL', 'hdl': 'HDL',
|
|||
|
|
'ldl-cholesterol': 'LDL', 'ldl cholesterol': 'LDL', 'ldl direct': 'LDL',
|
|||
|
|
'ldl-cholesterol(direct)': 'LDL',
|
|||
|
|
'vldl-cholesterol': 'VLDL', 'vldl': 'VLDL',
|
|||
|
|
'total cholesterol': 'TC', 'cholesterol': 'TC',
|
|||
|
|
'triglyceride': 'TG', 'tg': 'TG',
|
|||
|
|
'cholesterol/hdl-c ratio': 'TC/HDL', 'cholesterol/hdl ratio': 'TC/HDL',
|
|||
|
|
'ldl/hdl ratio': 'LDL/HDL',
|
|||
|
|
'lipoprotein(a)': 'Lp(a)', 'lipoprotein a': 'Lp(a)',
|
|||
|
|
'apolipoprotein a1': 'ApoA1', 'apolipoprotein a': 'ApoA1',
|
|||
|
|
'apolipoprotein b': 'ApoB',
|
|||
|
|
|
|||
|
|
# 肝功能 - 注意:ast/alt需要精确匹配,避免误匹配
|
|||
|
|
'alt(alanine transaminase)': 'ALT', 'alanine aminotransferase': 'ALT', 'sgpt': 'ALT',
|
|||
|
|
'ast(aspartate transaminase)': 'AST', 'aspartate aminotransferase': 'AST', 'sgot': 'AST',
|
|||
|
|
'gamma glutamyl transferase': 'GGT', 'gamma gt': 'GGT', 'ggt': 'GGT',
|
|||
|
|
'ggt( gamma gt)': 'GGT',
|
|||
|
|
'alp': 'ALP', 'alkaline phosphatase': 'ALP', 'alp(alkaline phosphatase)': 'ALP',
|
|||
|
|
'total bilirubin': 'TBil', 'bilirubin(total)': 'TBil',
|
|||
|
|
'direct bilirubin': 'DBil', 'bilirubin(direct)': 'DBil', 'bilirubin (direct)': 'DBil',
|
|||
|
|
'ldh': 'LDH', 'lactate dehydrogenase': 'LDH', 'ldh(lactate dehydrogenase)': 'LDH',
|
|||
|
|
'total protein': 'TP',
|
|||
|
|
'albumin': 'ALB', 'albumir': 'ALB', # OCR可能识别错误
|
|||
|
|
'globulin': 'GLB',
|
|||
|
|
|
|||
|
|
# 肾功能
|
|||
|
|
'bun': 'BUN', 'urea nitrogen': 'BUN', 'blood urea nitrogen': 'BUN',
|
|||
|
|
'creatinine': 'Scr',
|
|||
|
|
'uric acid': 'UA',
|
|||
|
|
'egfr': 'eGFR', 'egfr for thai': 'eGFR',
|
|||
|
|
|
|||
|
|
# 电解质
|
|||
|
|
'sodium': 'Na',
|
|||
|
|
'potassium': 'K',
|
|||
|
|
'chloride': 'Cl',
|
|||
|
|
'tco2': 'TCO2',
|
|||
|
|
'anion gap': 'AG',
|
|||
|
|
'calcium': 'Ca',
|
|||
|
|
'phosphorus': 'P', 'phosphate': 'P', 'inorganic phosphate': 'P',
|
|||
|
|
'magnesium': 'Mg', 'magnesium(mg)': 'Mg',
|
|||
|
|
|
|||
|
|
# 凝血功能 - 注意:partial thromboplastin 要在前面,避免被ast匹配
|
|||
|
|
'partial thromboplastin time': 'APTT', 'activated partial thromboplastin': 'APTT',
|
|||
|
|
'prothrombin time': 'PT', 'prothrombin time(pt)': 'PT',
|
|||
|
|
'thrombin time': 'TT', 'thrombin time(tt)': 'TT',
|
|||
|
|
'fibrinogen': 'FIB', 'fibrinogen level': 'FIB',
|
|||
|
|
'd-dimer': 'D-Dimer', 'fdp d-dimer': 'D-Dimer',
|
|||
|
|
'aptt': 'APTT',
|
|||
|
|
'inr': 'INR',
|
|||
|
|
|
|||
|
|
# 甲状腺
|
|||
|
|
'tsh': 'TSH', 'thyroid stimulating': 'TSH',
|
|||
|
|
'free t3': 'FT3', 'free t3(free triiodothyronine)': 'FT3',
|
|||
|
|
'free t4': 'FT4', 'free t4 (free thyroxine)': 'FT4',
|
|||
|
|
'total t3': 'T3', 'total t3(triiodothyronine)': 'T3',
|
|||
|
|
'total t4': 'T4', 'totalt4 (thyroxine)': 'T4',
|
|||
|
|
|
|||
|
|
# 性激素 - 使用标准ABB: T, COR, DHEAS
|
|||
|
|
'estradiol(e2)': 'E2', 'estradiol': 'E2', 'estrogen': 'E2',
|
|||
|
|
'progesterone': 'PROG',
|
|||
|
|
'testosterone': 'T', # 标准ABB是T
|
|||
|
|
'fsh': 'FSH', 'follicle stimulating': 'FSH', 'folicle stimulating hormone': 'FSH',
|
|||
|
|
'folicle stimulating hormone(fsh)': 'FSH', # OCR可能拼写错误
|
|||
|
|
'lh(luteinizing hormone)': 'LH', 'lh': 'LH', 'luteinizing hormone': 'LH',
|
|||
|
|
'prolactin': 'PRL',
|
|||
|
|
'cortisol': 'COR', # 标准ABB是COR
|
|||
|
|
'dhea-sulphate': 'DHEAS', 'dhea': 'DHEA', 'dhea-s': 'DHEAS', # 标准ABB是DHEAS
|
|||
|
|
'igf-1': 'IGF-1', 'igf1': 'IGF-1',
|
|||
|
|
'calcitonin': 'CT', # 标准ABB是CT
|
|||
|
|
|
|||
|
|
# 肿瘤标志物 - 使用标准ABB: CA15-3, CA19-9, TPSA
|
|||
|
|
'afp': 'AFP', 'alpha fetoprotein': 'AFP', 'afp(alpha fetoprotein)': 'AFP',
|
|||
|
|
'cea': 'CEA', 'carcinoembryonic': 'CEA', 'cea(carcinoembryonic antigen)': 'CEA',
|
|||
|
|
'ca125': 'CA125', 'ca 125': 'CA125', 'cancer antigen 125': 'CA125',
|
|||
|
|
'ca153': 'CA15-3', 'ca 15-3': 'CA15-3', 'carbohydrate antigen 15-3': 'CA15-3', 'cancer antigen 15-3': 'CA15-3', # 标准ABB
|
|||
|
|
'ca199': 'CA19-9', 'ca 19-9': 'CA19-9', 'carbohydrate antigen 19-9': 'CA19-9', # 标准ABB
|
|||
|
|
'psa': 'TPSA', 'total psa': 'TPSA', 'prostate specific antigen': 'TPSA', # 标准ABB是TPSA
|
|||
|
|
'free psa': 'FPSA', 'fpsa': 'FPSA',
|
|||
|
|
'nse': 'NSE', 'neuron specific enolase': 'NSE',
|
|||
|
|
'cyfra 21-1': 'CYFRA21-1', 'cyfra 21-1(nonsmall cell lung)': 'CYFRA21-1',
|
|||
|
|
'thyroglobulin': 'Tg', 'tg': 'Tg', # 甲状腺球蛋白
|
|||
|
|
|
|||
|
|
# 炎症指标
|
|||
|
|
'c-reactive protein(high sens)': 'hs-CRP', 'hs-crp': 'hs-CRP',
|
|||
|
|
'c-reactive protein high sens': 'hs-CRP', # 无括号版本
|
|||
|
|
'crp': 'CRP', 'c-reactive protein': 'CRP',
|
|||
|
|
'rf': 'RF', 'rheumatoid factor': 'RF',
|
|||
|
|
'anti streptolysin o titre(aso)': 'ASO', 'anti streptolysin o titre': 'ASO',
|
|||
|
|
'aso': 'ASO', 'anti-streptolysin': 'ASO',
|
|||
|
|
|
|||
|
|
# 免疫球蛋白
|
|||
|
|
'immunoglobulin g(igg)': 'IgG', 'immunoglobulin g': 'IgG', 'igg': 'IgG',
|
|||
|
|
'immunoglobulin a(iga)': 'IgA', 'immunoglobulin a': 'IgA', 'iga': 'IgA',
|
|||
|
|
'immunoglobulin m(igm)': 'IgM', 'immunoglobulin m': 'IgM', 'igm': 'IgM',
|
|||
|
|
'immunoglobulin e(ige)': 'IgE', 'immunoglobulin e': 'IgE', 'ige': 'IgE',
|
|||
|
|
'complement c3(b1c)': 'C3', 'complement c3': 'C3', 'c3': 'C3',
|
|||
|
|
'complement c4': 'C4', 'c4': 'C4', 'complement c4': 'C4',
|
|||
|
|
|
|||
|
|
# 淋巴细胞亚群
|
|||
|
|
'cd3+': 'CD3+', 'cd3': 'CD3+', 't lymphocyte': 'CD3+', 't-lymphocyte': 'CD3+',
|
|||
|
|
'cd3+ t lymphocyte': 'CD3+', 'cd3+t': 'CD3+',
|
|||
|
|
'cd4+': 'CD4+', 'cd4': 'CD4+', 'helper t cell': 'CD4+', 'cd4+ t helper': 'CD4+',
|
|||
|
|
'cd4+t': 'CD4+', 'cd4+ helper': 'CD4+',
|
|||
|
|
'cd8+': 'CD8+', 'cd8': 'CD8+', 'cytotoxic t cell': 'CD8+', 'cd8+ t cytotoxic': 'CD8+',
|
|||
|
|
'cd8+t': 'CD8+', 'suppressor t cell': 'CD8+',
|
|||
|
|
'cd4/cd8': 'CD4/CD8', 'cd4/cd8 ratio': 'CD4/CD8', 'cd4:cd8': 'CD4/CD8',
|
|||
|
|
'nk cell': 'NK', 'nk cells': 'NK', 'natural killer': 'NK', 'cd16+cd56+': 'NK',
|
|||
|
|
'cd16/cd56': 'NK', 'nk': 'NK', '% nk cell': 'NK', 'flowcytometry for nk cell': 'NK',
|
|||
|
|
'b lymphocyte': 'B-Lymph', 'b-lymphocyte': 'B-Lymph', 'b cell': 'B-Lymph',
|
|||
|
|
'cd19+': 'B-Lymph', 'cd19': 'B-Lymph',
|
|||
|
|
't lymphocyte count': 'T-Lymph', 't-lymphocyte count': 'T-Lymph',
|
|||
|
|
|
|||
|
|
# 自身抗体
|
|||
|
|
'ana': 'ANA', 'antinuclear antibody': 'ANA',
|
|||
|
|
'thyroglobulin antibody': 'TgAb',
|
|||
|
|
|
|||
|
|
# 传染病 - 使用标准ABB: HCV
|
|||
|
|
'hbsag(hepatitis b surface antigen)': 'HBsAg', 'hepatitis b surface antigen': 'HBsAg', 'hbsag': 'HBsAg',
|
|||
|
|
'hbsab(hepatitis b surface antibody)': 'HBsAb', 'hepatitis b surface antibody': 'HBsAb', 'hbsab': 'HBsAb',
|
|||
|
|
'hbe ag(hepatitis be antigen)': 'HBeAg', 'hepatitis be antigen': 'HBeAg', 'hbeag': 'HBeAg',
|
|||
|
|
'hbe ab(hepatitis be antibody)': 'HBeAb', 'hepatitis be antibody': 'HBeAb', 'hbeab': 'HBeAb',
|
|||
|
|
'hbcab(hepatitis b core antibody)': 'HBcAb', 'hepatitis b core antibody': 'HBcAb', 'hbcab': 'HBcAb',
|
|||
|
|
'hcv ab (hepatitis c antibody)': 'HCV', 'hepatitis c antibody': 'HCV', 'anti-hcv': 'HCV', # 标准ABB是HCV
|
|||
|
|
'hiv-1/hiv-2 antibody': 'HIV', 'hiv': 'HIV',
|
|||
|
|
'rpr (rapid plasma reagin)': 'TRUST', 'rapid plasma reagin': 'TRUST', 'rpr': 'TRUST',
|
|||
|
|
'rpr(rapid plasma reagin)': 'TRUST', # 无空格版本 # 标准ABB是TRUST
|
|||
|
|
'h.pylori': 'H.pylori', 'helicobacter': 'H.pylori',
|
|||
|
|
|
|||
|
|
# 血型 - Rh(D)是标准ABB
|
|||
|
|
'abo group': 'ABO', 'abo blood group': 'ABO',
|
|||
|
|
'rh group': 'Rh', 'rh blood group': 'Rh',
|
|||
|
|
'rh(d)': 'Rh(D)', 'rh factor': 'Rh(D)', 'rh-d': 'Rh(D)',
|
|||
|
|
|
|||
|
|
# 尿检
|
|||
|
|
'color': 'Color', 'colour': 'Color',
|
|||
|
|
'transparency': 'Clarity',
|
|||
|
|
'specific gravity': 'SG',
|
|||
|
|
'ph': 'pH',
|
|||
|
|
'protein': 'PRO',
|
|||
|
|
'ketone': 'KET',
|
|||
|
|
'bilirubin': 'BIL',
|
|||
|
|
'urobilinogen': 'URO',
|
|||
|
|
'nitrite': 'NIT',
|
|||
|
|
'leukocyte': 'LEU', 'leucocyte': 'LEU',
|
|||
|
|
'erythrocyte': 'ERY',
|
|||
|
|
'squamous epithelial cell': 'SEC', 'squamous epithelial': 'SEC',
|
|||
|
|
'calcium oxalate crystal': 'CRY', 'calcium oxalate': 'CRY', # 标准ABB是CRY
|
|||
|
|
|
|||
|
|
# 微量元素/重金属 - 使用标准ABB: Fer, 25-OH-VD2+D3, Hcy
|
|||
|
|
'iron': 'Fe', 'serum iron': 'Fe',
|
|||
|
|
'ferritin': 'Fer', # 标准ABB是Fer
|
|||
|
|
'zinc': 'Zn',
|
|||
|
|
'copper': 'Cu',
|
|||
|
|
'vitamin b12': 'VitB12', 'vit b12': 'VitB12',
|
|||
|
|
'folate': 'Folate', 'folic acid': 'Folate',
|
|||
|
|
'vitamin d(25-oh vitamin d total)': '25-OH-VD2+D3', 'vitamin d': '25-OH-VD2+D3', '25-oh vitamin d': '25-OH-VD2+D3', # 标准ABB
|
|||
|
|
'25-hydroxyvitamin d': '25-OH-VD2+D3', '25-oh-vitd': '25-OH-VD2+D3',
|
|||
|
|
'homocysteine': 'Hcy', # 标准ABB是Hcy
|
|||
|
|
'lead in blood': 'Pb', 'lead': 'Pb',
|
|||
|
|
'mercury in blood': 'Hg', 'mercury': 'Hg',
|
|||
|
|
'cadmium in blood': 'Cd', 'cadmium': 'Cd',
|
|||
|
|
'chromium in blood': 'Cr', 'chromium': 'Cr',
|
|||
|
|
'manganese in blood': 'Mn', 'manganese': 'Mn',
|
|||
|
|
'nickel in blood': 'Ni', 'nickel': 'Ni',
|
|||
|
|
|
|||
|
|
# 心肌酶
|
|||
|
|
'ck-mb': 'CK-MB', 'creatine kinase-mb': 'CK-MB',
|
|||
|
|
'creatine kinase': 'CK',
|
|||
|
|
|
|||
|
|
# 骨代谢 - 使用标准ABB: OST, TPINP, β-CTX
|
|||
|
|
'n-mid osteocalcin': 'OST', 'osteocalcin': 'OST', # 标准ABB是OST
|
|||
|
|
'p1np': 'TPINP', 'total procollagen': 'TPINP', # 标准ABB是TPINP
|
|||
|
|
'beta crosslap': 'β-CTX', 'ctx': 'β-CTX', 'b-ctx': 'β-CTX', 'beta-crosslaps': 'β-CTX', # 标准ABB是β-CTX
|
|||
|
|
'pth(intact)': 'PTH', 'pth': 'PTH', 'parathyroid hormone': 'PTH',
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# 跳过关键词
|
|||
|
|
skip_words = [
|
|||
|
|
'page ', 'patient name', 'doctor:', 'laboratory', 'specimen',
|
|||
|
|
'collected date', 'printed', 'bangkok', 'thailand',
|
|||
|
|
'tel.', 'fax.', 'email:', 'iso 15189', 'iso15189',
|
|||
|
|
'accreditation', 'lab no', 'mrn', 'requested date',
|
|||
|
|
'received date', 'address/', 'sex :', 'sex:', 'age :',
|
|||
|
|
'dob :', 'ref.no', 'copyright', 'reported by', 'authorised by',
|
|||
|
|
'print date', 'remark:', 'confidential', 'this report',
|
|||
|
|
'reference range', 'test name', 'result unit', 'edta',
|
|||
|
|
'morphology', 'adequate', 'differential count',
|
|||
|
|
'complete blood count', 'issue date', 'revision', 'normal range',
|
|||
|
|
'for 10-year', 'this equation', 'calculated by',
|
|||
|
|
'approved by', 'trimester', 'women(', 'female 21',
|
|||
|
|
'comment:', 'method:', 'method.', 'serum',
|
|||
|
|
'borderline', 'optimal', 'near optimal', 'very high',
|
|||
|
|
'low risk', 'average risk', 'high risk', 'aha', 'cdc',
|
|||
|
|
'national', 'healthcare', 'systems', 'new petchburi',
|
|||
|
|
'physical examination', 'chemical examination', 'urine sedimentation',
|
|||
|
|
'result comment', 'repeated result',
|
|||
|
|
'immunoturbidimetric', 'electrochemiluminescence',
|
|||
|
|
'macroscopic', 'sensitivity:', 'fta-abs', 'tpha',
|
|||
|
|
'reactive screening', 'gold standard', 'syphilis',
|
|||
|
|
'egfr comment', 'ckd - epi', 'kidney foundation',
|
|||
|
|
'ascvd', 'cardiovascular', 'diabetes mellitus', 'target should',
|
|||
|
|
]
|
|||
|
|
|
|||
|
|
# 按key长度排序
|
|||
|
|
sorted_keys = sorted(name_to_abb.keys(), key=len, reverse=True)
|
|||
|
|
|
|||
|
|
def find_abb(project_name):
|
|||
|
|
"""查找项目对应的ABB"""
|
|||
|
|
pl = project_name.lower().strip()
|
|||
|
|
# 移除点号和冒号
|
|||
|
|
pl = re.sub(r'[\.:\s]+$', '', pl)
|
|||
|
|
pl = re.sub(r'\.{2,}', '', pl)
|
|||
|
|
|
|||
|
|
for key in sorted_keys:
|
|||
|
|
if key in pl:
|
|||
|
|
return name_to_abb[key]
|
|||
|
|
|
|||
|
|
# 生成ABB
|
|||
|
|
words = [w for w in project_name.split() if len(w) > 0 and w[0].isalpha()]
|
|||
|
|
if words:
|
|||
|
|
return ''.join([w[0].upper() for w in words])[:6]
|
|||
|
|
return project_name[:6].upper()
|
|||
|
|
|
|||
|
|
def parse_value(text):
|
|||
|
|
"""解析数值,返回 (result, point, unit)"""
|
|||
|
|
text = text.strip()
|
|||
|
|
|
|||
|
|
# 特殊处理:跳过开头的单独点号(OCR可能把分隔符识别为点号)
|
|||
|
|
# 如 ". Negative" -> "Negative"
|
|||
|
|
if text.startswith('. ') or text.startswith('。 '):
|
|||
|
|
text = text[2:].strip()
|
|||
|
|
elif text == '.' or text == '。':
|
|||
|
|
return None, '', ''
|
|||
|
|
|
|||
|
|
# 特殊处理:如果结果只是连续的点号,说明OCR识别错误,返回None
|
|||
|
|
# 如 ".............." 应该被跳过
|
|||
|
|
if re.match(r'^\.{3,}$', text):
|
|||
|
|
return None, '', ''
|
|||
|
|
|
|||
|
|
# 特殊处理:如果包含冒号,可能是 "<20.0 pg/mL: Normal" 格式
|
|||
|
|
if ':' in text:
|
|||
|
|
parts = text.split(':')
|
|||
|
|
text = parts[0].strip() # 只取冒号前的部分
|
|||
|
|
|
|||
|
|
# 格式0: "5.95 *10^3/mm3" 或 "4.69 *10^6/mm3" 或 "209 10^3/mm3"
|
|||
|
|
m = re.match(r'^([<>]?[\d\.]+)\s*([HL])?\s*(\*?10\^[\d]+[/a-zA-Z0-9\^]+)', text, re.IGNORECASE)
|
|||
|
|
if m:
|
|||
|
|
result = m.group(1)
|
|||
|
|
point = ''
|
|||
|
|
if m.group(2):
|
|||
|
|
point = '↑' if m.group(2).upper() == 'H' else '↓'
|
|||
|
|
unit = m.group(3)
|
|||
|
|
return result, point, unit
|
|||
|
|
|
|||
|
|
# 格式0.5: "41.3 号" (OCR识别错误,号应该是%)
|
|||
|
|
m = re.match(r'^([<>]?[\d\.]+)\s*([HL])?\s*号', text, re.IGNORECASE)
|
|||
|
|
if m:
|
|||
|
|
result = m.group(1)
|
|||
|
|
point = ''
|
|||
|
|
if m.group(2):
|
|||
|
|
point = '↑' if m.group(2).upper() == 'H' else '↓'
|
|||
|
|
return result, point, '%'
|
|||
|
|
|
|||
|
|
# 格式1: "230H" 或 "5.7H%" 或 "140H mg/dL"
|
|||
|
|
m = re.match(r'^([<>]?[\d\.]+)\s*([HL])\s*(%)?(.*)$', text, re.IGNORECASE)
|
|||
|
|
if m:
|
|||
|
|
result = m.group(1)
|
|||
|
|
point = '↑' if m.group(2).upper() == 'H' else '↓'
|
|||
|
|
unit = (m.group(3) or '') + (m.group(4) or '').strip()
|
|||
|
|
return result, point, unit
|
|||
|
|
|
|||
|
|
# 格式2: "158.00mg/dL" (数值和单位连在一起)
|
|||
|
|
m = re.match(r'^([<>]?[\d\.]+)([a-zA-Z/%][a-zA-Z0-9/%\^\*]*)$', text)
|
|||
|
|
if m:
|
|||
|
|
return m.group(1), '', m.group(2)
|
|||
|
|
|
|||
|
|
# 格式3: "113.00 H IU/mL"
|
|||
|
|
m = re.match(r'^([<>]?[\d\.]+)\s+([HL])\s+(.+)$', text, re.IGNORECASE)
|
|||
|
|
if m:
|
|||
|
|
point = '↑' if m.group(2).upper() == 'H' else '↓'
|
|||
|
|
return m.group(1), point, m.group(3).strip()
|
|||
|
|
|
|||
|
|
# 格式4: 纯数值 "95" 或 "18.4" 或 "<20.0"
|
|||
|
|
m = re.match(r'^([<>]?[\d\.]+)$', text)
|
|||
|
|
if m:
|
|||
|
|
return m.group(1), '', ''
|
|||
|
|
|
|||
|
|
# 格式5: 带单位 "20 H mm/hr" 或 "5.07H mg/L" 或 "<20.0 pg/mL"
|
|||
|
|
m = re.match(r'^([<>]?[\d\.]+)\s*([HL])?\s*([a-zA-Z/%\*].*)$', text, re.IGNORECASE)
|
|||
|
|
if m:
|
|||
|
|
point = ''
|
|||
|
|
if m.group(2):
|
|||
|
|
point = '↑' if m.group(2).upper() == 'H' else '↓'
|
|||
|
|
return m.group(1), point, m.group(3).strip()
|
|||
|
|
|
|||
|
|
# 格式6: 定性结果
|
|||
|
|
qualitative = ['positive', 'negative', 'reactive', 'non reactive', 'non-reactive',
|
|||
|
|
'normal', 'abnormal', 'yellow', 'clear', 'straw', 'amber',
|
|||
|
|
'a', 'b', 'ab', 'o', 'less than']
|
|||
|
|
text_lower = text.lower()
|
|||
|
|
for q in qualitative:
|
|||
|
|
if text_lower.startswith(q):
|
|||
|
|
return text.split()[0], '', ''
|
|||
|
|
|
|||
|
|
# 格式7: 范围结果 "0-1 Cells/HPF" 或 "2-3 Cells/HPF"
|
|||
|
|
m = re.match(r'^(\d+\-\d+)\s*(.*)$', text)
|
|||
|
|
if m:
|
|||
|
|
return m.group(1), '', m.group(2).strip()
|
|||
|
|
|
|||
|
|
return None, '', ''
|
|||
|
|
|
|||
|
|
def is_project_line(line):
|
|||
|
|
"""判断是否是项目名行"""
|
|||
|
|
# 移除开头的(*)
|
|||
|
|
clean = re.sub(r'^\(\*\)', '', line).strip()
|
|||
|
|
|
|||
|
|
# 如果移除(*)后只剩下很短的文本,可能只是标题行,不是项目行
|
|||
|
|
# 如 "(*)ESR" -> "ESR",这种情况下应该继续查找下一行
|
|||
|
|
if len(clean) < 5 and '...' not in line and ':' not in line and ':' not in line:
|
|||
|
|
return False
|
|||
|
|
|
|||
|
|
# 包含连续点号的行
|
|||
|
|
if '...' in line or '...' in line:
|
|||
|
|
return True
|
|||
|
|
|
|||
|
|
# 以冒号结尾
|
|||
|
|
if line.endswith(':') or line.endswith(':'):
|
|||
|
|
return True
|
|||
|
|
|
|||
|
|
# 包含冒号且冒号后有内容(如 "项目名 : 结果")
|
|||
|
|
if ':' in line or ':' in line:
|
|||
|
|
return True
|
|||
|
|
|
|||
|
|
# 已知项目名
|
|||
|
|
line_lower = line.lower()
|
|||
|
|
for key in sorted_keys:
|
|||
|
|
if key in line_lower:
|
|||
|
|
return True
|
|||
|
|
|
|||
|
|
return False
|
|||
|
|
|
|||
|
|
def extract_project_and_result(line):
    """Split one report line into ``(project, result, point, unit, reference)``.

    The following layouts are tried in order; the first one that yields a
    value wins:

    0. ``FSH. 5.85 mIU/mL`` -- name, single dot, whitespace, number, unit.
    1. Dot leaders -- ``Color........ Yellow [Normal : Yellow]`` or
       ``pH......... 6.0 (4.5-8.0)``.
    2. Colon separated -- ``Hemoglobin(Hb) : 13.8 g/dL (13.0-18.0)``.
       Both the ASCII colon and the full-width colon (U+FF1A) are accepted.
    3. ``INR 0.93`` -- first token contains a name from ``sorted_keys``.

    ``parse_value`` and ``sorted_keys`` are taken from the surrounding
    scope; ``parse_value`` is called with the value text and unpacked as
    ``(result, point, unit)``.

    Args:
        line: one line of report text; a leading ``(*)`` marker is ignored.

    Returns:
        tuple: ``(project, result, point, unit, reference)``. When no value
        is found on the line, ``result`` is ``None``, the other fields are
        empty strings and ``project`` is the line with dot leaders and any
        trailing colon removed.
    """
    # Colon patterns accept ':' and the full-width colon U+FF1A.
    leading_colon = re.compile(r'^[:\uff1a]\s*')
    any_colon = re.compile(r'[:\uff1a]')
    trailing_colons = re.compile(r'[:\uff1a]+\s*$')
    normal_in_rest = re.compile(r'\[Normal\s*[:\uff1a]\s*([^\]]+)\]', re.IGNORECASE)
    normal_in_reference = re.compile(r'\[Normal[:\uff1a]\s*([^\]]+)\]', re.IGNORECASE)
    qualitative_vals = ['negative', 'positive', 'normal', 'yellow', 'clear', 'straw', 'amber', 'not found']

    def _split_reference(rest):
        """Peel a reference range off *rest*; return ``(rest, reference)``."""
        # "[Normal: xxx]" takes priority over a trailing "(xxx)".
        normal = normal_in_rest.search(rest)
        if normal:
            return rest[:normal.start()].strip(), f'[Normal: {normal.group(1).strip()}]'
        paren = re.search(r'\(([^\)]+)\)\s*$', rest)
        if paren:
            return rest[:paren.start()].strip(), f'({paren.group(1)})'
        return rest, ''

    # Drop the leading "(*)" marker.
    line = re.sub(r'^\(\*\)', '', line).strip()

    # Format 0: "FSH. 5.85 mIU/mL"
    m = re.match(r'^([A-Za-z][A-Za-z0-9\-\(\)]+)\.\s+([<>]?[\d\.]+)\s*([a-zA-Z/%].*)$', line)
    if m:
        project = m.group(1).strip()
        result, point, unit = parse_value(m.group(2) + ' ' + m.group(3))
        if result:
            return project, result, point, unit, ''

    # Format 1: dot leaders. Checked before the colon format because the
    # "[Normal: xxx]" suffix itself contains a colon.
    if '...' in line:
        parts = re.split(r'\.{2,}', line, maxsplit=1)
        if len(parts) == 2:
            project = parts[0].strip()
            # A colon may directly follow the dots (": 5.07H mg/L").
            rest = leading_colon.sub('', parts[1].strip())
            if rest:
                rest, reference = _split_reference(rest)
                result, point, unit = parse_value(rest)

                # No value was recognised but a "[Normal: xxx]" reference is
                # present: when the expected value is qualitative, use it as
                # the result.
                if not result and reference:
                    expected = normal_in_reference.search(reference)
                    if expected:
                        expected_val = expected.group(1).strip()
                        if expected_val.lower() in qualitative_vals:
                            result, point, unit = expected_val, '', ''

                if result:
                    return project, result, point, unit, reference

    # Format 2: "name: value unit (reference)"
    if any_colon.search(line):
        parts = any_colon.split(line, maxsplit=1)
        if len(parts) == 2:
            # Remove dot leaders left in front of the colon.
            project = re.sub(r'\.{2,}', '', parts[0].strip()).strip()
            rest = parts[1].strip()
            if rest:
                rest, reference = _split_reference(rest)
                result, point, unit = parse_value(rest)
                if result:
                    return project, result, point, unit, reference

    # Format 3: "name value" with neither dots nor colon, e.g. "INR 0.93".
    tokens = line.split()
    if len(tokens) >= 2:
        candidate = tokens[0]
        candidate_lower = candidate.lower()
        # Only the first token is compared against the known names.
        if any(key in candidate_lower for key in sorted_keys):
            result, point, unit = parse_value(' '.join(tokens[1:]))
            if result:
                return candidate, result, point, unit, ''

    # Nothing parsed: hand back the cleaned label so the caller can look
    # for the value on the following lines.
    project = re.sub(r'\.{2,}', '', line).strip()
    project = trailing_colons.sub('', project).strip()
    return project, None, '', '', ''
|
|||
|
|
|
|||
|
|
# 主解析循环
|
|||
|
|
i = 0
|
|||
|
|
while i < len(lines):
|
|||
|
|
line = lines[i].strip()
|
|||
|
|
line_lower = line.lower()
|
|||
|
|
|
|||
|
|
# 跳过无关行
|
|||
|
|
if any(w in line_lower for w in skip_words):
|
|||
|
|
i += 1
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
# 跳过空行和太短的行
|
|||
|
|
if len(line) < 2:
|
|||
|
|
i += 1
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
# 跳过纯数字参考范围行 如 "(0-15)" 或 "(<200)"
|
|||
|
|
if re.match(r'^\([<>]?[\d\.\-]+\)$', line):
|
|||
|
|
i += 1
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
# 跳过纯单位行
|
|||
|
|
if re.match(r'^[a-zA-Z/%\^]+$', line) and len(line) < 10:
|
|||
|
|
i += 1
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
# 检查是否是项目名行
|
|||
|
|
if is_project_line(line):
|
|||
|
|
# 尝试从同一行提取项目名和结果
|
|||
|
|
project, result, point, unit, reference = extract_project_and_result(line)
|
|||
|
|
|
|||
|
|
# 跳过太短或太长的项目名
|
|||
|
|
if len(project) < 2 or len(project) > 60:
|
|||
|
|
i += 1
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
# 跳过噪音项目
|
|||
|
|
if project.lower() in ['report by', 'reported by', 'health', 'age', 'high', 'low']:
|
|||
|
|
i += 1
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
abb = find_abb(project)
|
|||
|
|
|
|||
|
|
# 如果同一行没有结果,查找下一行
|
|||
|
|
if result is None:
|
|||
|
|
j = i + 1
|
|||
|
|
while j < len(lines) and j < i + 5:
|
|||
|
|
next_line = lines[j].strip()
|
|||
|
|
next_lower = next_line.lower()
|
|||
|
|
|
|||
|
|
# 跳过无关行
|
|||
|
|
if any(w in next_lower for w in skip_words):
|
|||
|
|
j += 1
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
# 如果是新的项目名(包含冒号或点号分隔),停止
|
|||
|
|
if is_project_line(next_line) and (':' in next_line or '...' in next_line):
|
|||
|
|
break
|
|||
|
|
|
|||
|
|
# 如果是 "FSH. 5.85 mIU/mL" 格式的行,也停止(让主循环处理)
|
|||
|
|
if re.match(r'^[A-Za-z][A-Za-z0-9\-\(\)]+\.\s+[<>]?[\d\.]+', next_line):
|
|||
|
|
break
|
|||
|
|
|
|||
|
|
# 参考范围行
|
|||
|
|
if next_line.startswith('(') and ')' in next_line:
|
|||
|
|
reference = next_line
|
|||
|
|
j += 1
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
# [Normal: xxx] 格式
|
|||
|
|
if next_line.startswith('['):
|
|||
|
|
j += 1
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
# 尝试解析为结果
|
|||
|
|
r, p, u = parse_value(next_line)
|
|||
|
|
if r:
|
|||
|
|
result = r
|
|||
|
|
point = p
|
|||
|
|
unit = u
|
|||
|
|
j += 1
|
|||
|
|
|
|||
|
|
# 继续查找单位和参考范围
|
|||
|
|
while j < len(lines) and j < i + 5:
|
|||
|
|
next2 = lines[j].strip()
|
|||
|
|
|
|||
|
|
# 单位行
|
|||
|
|
if re.match(r'^[a-zA-Z/%\^][a-zA-Z0-9/%\^\*]+$', next2) and not unit:
|
|||
|
|
unit = next2
|
|||
|
|
j += 1
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
# 参考范围
|
|||
|
|
if next2.startswith('(') and ')' in next2:
|
|||
|
|
reference = next2
|
|||
|
|
j += 1
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
break
|
|||
|
|
break
|
|||
|
|
|
|||
|
|
j += 1
|
|||
|
|
|
|||
|
|
i = j
|
|||
|
|
else:
|
|||
|
|
i += 1
|
|||
|
|
|
|||
|
|
# 保存结果
|
|||
|
|
if result and abb:
|
|||
|
|
# 过滤噪音
|
|||
|
|
if project.lower() in ['age', 'high', 'low', 'a', 'h', 'l', 'report by']:
|
|||
|
|
continue
|
|||
|
|
if len(project) > 50:
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
# 白细胞分类项目特殊处理:根据参考范围判断是数量还是百分比
|
|||
|
|
wbc_diff_abbs = {'NEUT', 'LYMPH', 'MONO', 'EOS', 'BAS'}
|
|||
|
|
if abb.upper() in wbc_diff_abbs:
|
|||
|
|
is_percentage = False
|
|||
|
|
# 检查单位是否是百分比
|
|||
|
|
if unit and '%' in unit:
|
|||
|
|
is_percentage = True
|
|||
|
|
# 检查参考范围是否是百分比形式(0-100之间的数值)
|
|||
|
|
elif reference:
|
|||
|
|
ref_match = re.search(r'\(?([\d\.]+)\s*[-–]\s*([\d\.]+)\)?', reference)
|
|||
|
|
if ref_match:
|
|||
|
|
try:
|
|||
|
|
low = float(ref_match.group(1))
|
|||
|
|
high = float(ref_match.group(2))
|
|||
|
|
# 如果参考范围在0-100之间,且没有10^3等单位标识,认为是百分比
|
|||
|
|
if 0 <= low <= 100 and 0 <= high <= 100 and '10^' not in reference and '*10' not in reference:
|
|||
|
|
is_percentage = True
|
|||
|
|
except:
|
|||
|
|
pass
|
|||
|
|
|
|||
|
|
if is_percentage:
|
|||
|
|
abb = abb.upper() + '%'
|
|||
|
|
if not unit:
|
|||
|
|
unit = '%'
|
|||
|
|
|
|||
|
|
items.append({
|
|||
|
|
'abb': abb,
|
|||
|
|
'project': project,
|
|||
|
|
'result': result,
|
|||
|
|
'point': point,
|
|||
|
|
'unit': unit,
|
|||
|
|
'reference': reference,
|
|||
|
|
'source': source_file
|
|||
|
|
})
|
|||
|
|
|
|||
|
|
# 白细胞分类项目特殊处理:检查下一行是否是绝对值数据
|
|||
|
|
# PDF格式如:
|
|||
|
|
# Neutrophils............... 54.4 % (46.5-75.0)
|
|||
|
|
# 3237 /mm3 (2000-7500)
|
|||
|
|
wbc_diff_names = {'neutrophil', 'lymphocyte', 'monocyte', 'eosinophil', 'basophil',
|
|||
|
|
'neutrophils', 'lymphocytes', 'monocytes', 'eosinophils', 'basophils'}
|
|||
|
|
if project.lower() in wbc_diff_names or any(n in project.lower() for n in wbc_diff_names):
|
|||
|
|
# 查找下一行的绝对值数据
|
|||
|
|
# 注意:此时 i 已经指向下一行,所以从 i 开始查找
|
|||
|
|
next_idx = i
|
|||
|
|
search_limit = min(next_idx + 5, len(lines)) # 最多查找5行
|
|||
|
|
while next_idx < search_limit:
|
|||
|
|
next_line = lines[next_idx].strip()
|
|||
|
|
# 跳过空行
|
|||
|
|
if not next_line:
|
|||
|
|
next_idx += 1
|
|||
|
|
continue
|
|||
|
|
# 如果是新的项目名行,停止
|
|||
|
|
if is_project_line(next_line):
|
|||
|
|
break
|
|||
|
|
# 检查是否是绝对值数据行(数值 + /mm3 或 10^3/mm3 等单位)
|
|||
|
|
# 格式如:3237 /mm3 (2000-7500) 或 3237 10^3/mm3 (2000-7500)
|
|||
|
|
abs_match = re.match(r'^\s*([<>]?[\d\.]+)\s*([HL])?\s*(/mm3|\*?10\^[\d]+[/a-zA-Z0-9\^]+|[/a-zA-Z0-9\^]+mm3)\s*(\([^\)]+\))?', next_line, re.IGNORECASE)
|
|||
|
|
if abs_match:
|
|||
|
|
abs_result = abs_match.group(1)
|
|||
|
|
abs_point = ''
|
|||
|
|
if abs_match.group(2):
|
|||
|
|
abs_point = '↑' if abs_match.group(2).upper() == 'H' else '↓'
|
|||
|
|
abs_unit = abs_match.group(3) if abs_match.group(3) else ''
|
|||
|
|
abs_reference = abs_match.group(4) if abs_match.group(4) else ''
|
|||
|
|
|
|||
|
|
# 生成绝对值的ABB(去掉%后缀,或使用原始ABB)
|
|||
|
|
base_abb = abb.replace('%', '').upper()
|
|||
|
|
|
|||
|
|
items.append({
|
|||
|
|
'abb': base_abb,
|
|||
|
|
'project': project,
|
|||
|
|
'result': abs_result,
|
|||
|
|
'point': abs_point,
|
|||
|
|
'unit': abs_unit,
|
|||
|
|
'reference': abs_reference,
|
|||
|
|
'source': source_file
|
|||
|
|
})
|
|||
|
|
next_idx += 1
|
|||
|
|
break
|
|||
|
|
next_idx += 1
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
# 检查是否是 "项目名 结果" 格式(无点号无冒号)
|
|||
|
|
# 如 "Color Yellow" 或 "pH 6.0"
|
|||
|
|
parts = line.split()
|
|||
|
|
if len(parts) >= 2:
|
|||
|
|
potential_project = parts[0]
|
|||
|
|
potential_value = ' '.join(parts[1:])
|
|||
|
|
|
|||
|
|
# 检查是否是已知项目
|
|||
|
|
pl = potential_project.lower()
|
|||
|
|
for key in sorted_keys:
|
|||
|
|
if pl == key or pl.startswith(key):
|
|||
|
|
abb = name_to_abb[key]
|
|||
|
|
result, point, unit = parse_value(potential_value)
|
|||
|
|
if result:
|
|||
|
|
items.append({
|
|||
|
|
'abb': abb,
|
|||
|
|
'project': potential_project,
|
|||
|
|
'result': result,
|
|||
|
|
'point': point,
|
|||
|
|
'unit': unit,
|
|||
|
|
'reference': '',
|
|||
|
|
'source': source_file
|
|||
|
|
})
|
|||
|
|
break
|
|||
|
|
|
|||
|
|
i += 1
|
|||
|
|
|
|||
|
|
return items
|
|||
|
|
|
|||
|
|
|
|||
|
|
def clean_extracted_data_v2(items: list) -> list:
    """Drop noise entries and duplicates from parsed lab items.

    Each item is a dict read through the keys ``abb``, ``result``,
    ``project`` and ``reference``. An item is kept only when it

    * has a non-empty ``abb`` and ``result`` (missing or ``None`` fields are
      treated as empty instead of raising),
    * does not carry a result made only of dots, unless a
      ``[Normal: xxx]`` reference supplies a qualitative value to use
      instead,
    * is not a known noise abbreviation or noise project name,
    * has a project name of at least three characters, or one of the known
      short names such as ``pH``,
    * is the first item seen with its ``ABB:result`` combination.

    Args:
        items: item dicts produced by the parser.

    Returns:
        A new list with the surviving items in their original order. The
        dict objects are the input objects; an item whose dotted result was
        repaired has its ``'result'`` entry overwritten in place.
    """
    # Abbreviations that are OCR residue rather than real tests.
    noise_abbs = {
        'A', 'H', 'L', 'R', 'AGE', 'NHY', 'D', 'O', 'RB', 'N', 'Q', 'C', 'J', 'Y', 'FY', 'OEP',
        'F', 'M', 'MY', 'S', 'AC', 'AH', 'AR', 'AS', 'WCC',
    }

    # Noise project fragments. These are matched as plain substrings of the
    # lower-cased project name (hence the trailing space in 'sep ').
    noise_projects = [
        'received', 'collected', 'report by', 'reported by',
        'health', 'name', 'dob', 'oct', 'patient', 'doctor', 'lab no', 'mrn',
        'sex', 'address', 'ref.no', 'requested', 'printed', 'page',
        'female', 'male', 'adult', 'sep ', 'anti-n rnp', 'anti smith',
        'absolute count', 'white cell count',
    ]

    # Words filtered only on an exact match, so names such as "High Sens"
    # or "Average" are not dropped.
    noise_exact = {'high', 'low', 'age'}

    # Legitimate project names shorter than three characters.
    known_short_projects = {'ph', 'k', 'p', 'na', 'cl', 'mg', 'ca', 'fe', 'zn', 'cu', 'pb', 'hg', 'cd', 'cr', 'mn', 'ni'}

    qualitative_vals = {'negative', 'positive', 'normal', 'yellow', 'clear', 'straw', 'amber', 'not found'}

    dots_only = re.compile(r'^\.{3,}$')
    # Tolerates whitespace before the colon and the full-width colon
    # (U+FF1A), e.g. "[Normal : Yellow]".
    normal_reference = re.compile(r'\[Normal\s*[:\uff1a]\s*([^\]]+)\]', re.IGNORECASE)

    cleaned = []
    seen = set()  # "ABB:result" keys already emitted

    for item in items:
        abb = (item.get('abb') or '').upper()
        result = item.get('result') or ''
        project = item.get('project') or ''
        project_lower = project.lower()

        # Nothing usable without both an abbreviation and a value.
        if not abb or not result:
            continue

        # A result made only of dots is a leader line mistaken for a value;
        # recover the value from "[Normal: xxx]" when it is qualitative,
        # otherwise discard the item.
        if isinstance(result, str) and dots_only.match(result):
            found = normal_reference.search(item.get('reference') or '')
            expected_val = found.group(1).strip() if found else ''
            if expected_val.lower() not in qualitative_vals:
                continue
            result = expected_val
            item['result'] = result

        if abb in noise_abbs:
            continue

        if any(fragment in project_lower for fragment in noise_projects):
            continue

        if project_lower in noise_exact:
            continue

        if len(project) < 3 and project_lower not in known_short_projects:
            continue

        # De-duplicate on abbreviation plus value.
        key = f"{abb}:{result}"
        if key in seen:
            continue
        seen.add(key)

        cleaned.append(item)

    return cleaned
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == '__main__':
    # Manual smoke test: parse a small sample report, clean the parsed
    # items and print one line per surviving item.
    test_text = """
ABO Group.................:
B
Rh Group...................:
Positive
ESR 1 Hour ...................:
20 H mm/hr
(0-15)
Thrombin Time(TT)............:
18.4
Secs.
(15.8-19.0)
Cholesterol...................:
230H
mg/dL
(<200)
Color........................
Yellow
[Normal : Yellow]
pH.........
6.0
(4.5-8.0)
Immunoglobulin M(IgM)......:
158.00mg/dL
(40.00-230.00)
Sodium........................:
141
mmol/L
(136-145)
Potassium.....................:
4.77
mmol/L
(3.50-5.10)
Hemoglobin(Hb) : 13.8 g/dL (13.0-18.0)
Mean Cell Volume : 88.1 fL (80.0-100.0)
HDL-Cholesterol : 41 mg/dL (>40)
LDL Direct : 140H mg/dL (<130)
"""

    # Parse first, then run the cleaning pass over the parsed items.
    items = clean_extracted_data_v2(parse_medical_data_v2(test_text, 'test.pdf'))

    print(f"提取了 {len(items)} 个项目:")
    for item in items:
        # The project name is cut to 30 characters to keep each row short.
        row = f" {item['abb']}: {item['project'][:30]} = {item['result']} {item['point']} {item['unit']} {item['reference']}"
        print(row)
|