# Source: yiliao/backend/test_extraction_logic.py (552 lines, 22 KiB, Python)
# -*- coding: utf-8 -*-
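"""
Tests for the extraction logic - pure local tests that never call the OCR / DeepSeek APIs.

Covered:
1. parse_medical_data_v2: OCR text -> parsed test items
2. classify_abb_module: ABB / project name -> module classification (including Chinese keywords)
3. match_with_template: extracted data -> template matching
"""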
import sys
import os
import io
import json
# Force UTF-8 output so the Chinese report text prints on Windows terminals.
# The streams are reconfigured in place where possible (Python 3.7+): wrapping
# sys.stdout.buffer in a second TextIOWrapper leaves two wrappers sharing one
# buffer, and whichever wrapper is garbage-collected first closes that buffer.
for _stream_name in ("stdout", "stderr"):
    _stream = getattr(sys, _stream_name)
    if hasattr(_stream, "reconfigure"):
        _stream.reconfigure(encoding="utf-8", errors="replace")
    elif hasattr(_stream, "buffer"):
        # Older interpreters: fall back to re-wrapping the binary buffer.
        setattr(
            sys,
            _stream_name,
            io.TextIOWrapper(_stream.buffer, encoding="utf-8", errors="replace"),
        )
    # Streams that are None or expose neither attribute are left untouched.
# Make sure the backend directory (the directory of this file) is on sys.path.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from parse_medical_v2 import parse_medical_data_v2, clean_extracted_data_v2
from extract_and_fill_report import classify_abb_module, match_with_template
# ============================================================
# 测试1: classify_abb_module - ABB硬编码映射
# ============================================================
def test_abb_mapping():
    """Check that known ABB codes are classified into the expected module.

    Each (abb, project name, expected module) triple is passed to
    ``classify_abb_module`` with ``api_key=None``; every mismatch is printed.

    Returns:
        bool: True when no case was misclassified.
    """
    print("\n" + "=" * 70)
    print("[测试1] ABB硬编码映射")
    print("=" * 70)
    cases = [
        # (abb, project_name, expected_module)
        # Urinalysis
        ("COLOR", "Color", "Urine Detection"),
        ("PH", "pH", "Urine Detection"),
        ("PRO", "Protein", "Urine Detection"),
        ("SG", "Specific Gravity", "Urine Detection"),
        # Complete blood count
        ("WBC", "White Blood Cell", "Complete Blood Count"),
        ("RBC", "Red Blood Cell", "Complete Blood Count"),
        ("HGB", "Hemoglobin", "Complete Blood Count"),
        ("PLT", "Platelet Count", "Complete Blood Count"),
        ("ESR", "ESR 1 Hour", "Complete Blood Count"),
        # Liver function
        ("ALT", "Alanine Aminotransferase", "Liver Function"),
        ("AST", "Aspartate Aminotransferase", "Liver Function"),
        ("GGT", "Gamma GT", "Liver Function"),
        ("TBIL", "Total Bilirubin", "Liver Function"),
        ("ALB", "Albumin", "Liver Function"),
        # Kidney function
        ("BUN", "Blood Urea Nitrogen", "Kidney Function"),
        ("CREA", "Creatinine", "Kidney Function"),
        ("UA", "Uric Acid", "Kidney Function"),
        # Lipids
        ("TC", "Total Cholesterol", "Lipid Panel"),
        ("TG", "Triglyceride", "Lipid Panel"),
        ("HDL", "HDL Cholesterol", "Lipid Panel"),
        ("LDL", "LDL Cholesterol", "Lipid Panel"),
        # Electrolytes
        ("NA", "Sodium", "Electrolytes"),
        ("K", "Potassium", "Electrolytes"),
        ("CL", "Chloride", "Electrolytes"),
        ("CA", "Calcium", "Electrolytes"),
        # Glucose
        ("FPG", "Fasting Glucose", "Glucose"),
        ("HBA1C", "HbA1c", "Glucose"),
        # Thyroid
        ("TSH", "TSH", "Thyroid"),
        ("FT3", "Free T3", "Thyroid"),
        ("FT4", "Free T4", "Thyroid"),
        # Hormones
        ("E2", "Estradiol", "Hormone"),
        ("FSH", "FSH", "Hormone"),
        ("LH", "LH", "Hormone"),
        ("CORTISOL", "Cortisol", "Hormone"),
        # Tumor markers
        ("AFP", "Alpha Fetoprotein", "Tumor Markers"),
        ("CEA", "CEA", "Tumor Markers"),
        ("CA125", "CA125", "Tumor Markers"),
        ("PSA", "PSA", "Tumor Markers"),
        # Coagulation
        ("PT", "Prothrombin Time", "Coagulation"),
        ("APTT", "APTT", "Coagulation"),
        ("FIB", "Fibrinogen", "Coagulation"),
        # Infectious disease
        ("HBSAG", "HBsAg", "Infectious Disease"),
        ("HIV", "HIV", "Infectious Disease"),
        # Immune function
        ("IGG", "IgG", "Immune Function"),
        ("C3", "Complement C3", "Immune Function"),
        ("CRP", "CRP", "Immune Function"),
        # Bone metabolism
        ("OSTE", "Osteocalcin", "Bone Metabolism"),
        ("PTH", "PTH", "Bone Metabolism"),
        # Heavy metals
        ("PB", "Lead", "Heavy Metals"),
        ("HG", "Mercury", "Heavy Metals"),
        # Vitamins
        ("VITB12", "Vitamin B12", "Vitamin"),
        ("FOLATE", "Folate", "Vitamin"),
        # Homocysteine
        ("HCY", "Homocysteine", "Homocysteine"),
        # Blood type
        ("ABO", "ABO Blood Group", "Blood Type"),
    ]
    failures = 0
    for code, name, wanted in cases:
        actual = classify_abb_module(code, name, api_key=None)
        if actual == wanted:
            continue
        # Only mismatches are reported individually.
        failures += 1
        print(f" [FAIL] ABB={code}, project={name}")
        print(f" 期望: {wanted}, 实际: {actual}")
    successes = len(cases) - failures
    print(f"\n 结果: {successes} 通过, {failures} 失败 / 共 {len(cases)}")
    return failures == 0
# ============================================================
# 测试2: classify_abb_module - 中文关键词匹配
# ============================================================
def test_chinese_keyword_matching():
    """Check that Chinese project names are classified into the expected module.

    Each (abb, Chinese project name, expected module) triple is passed to
    ``classify_abb_module`` with ``api_key=None``; every mismatch is printed.

    Returns:
        bool: True when no case was misclassified.
    """
    print("\n" + "=" * 70)
    print("[测试2] 中文关键词匹配")
    print("=" * 70)
    # The ABB codes are fakes that are meant to be absent from the ABB
    # mapping, so classification has to fall through to keyword matching.
    cases = [
        # (abb, project_name_cn, expected_module)
        # Urine
        ("X001", "尿液分析", "Urine Detection"),
        ("X002", "尿检常规", "Urine Detection"),
        ("X003", "隐血试验", "Urine Detection"),
        ("X004", "酮体检测", "Urine Detection"),
        # Complete blood count
        ("X010", "红细胞计数", "Complete Blood Count"),
        ("X011", "白细胞分类", "Complete Blood Count"),
        ("X012", "血红蛋白测定", "Complete Blood Count"),
        ("X013", "血小板计数", "Complete Blood Count"),
        ("X014", "中性粒细胞百分比", "Complete Blood Count"),
        ("X015", "嗜酸性粒细胞", "Complete Blood Count"),
        ("X016", "单核细胞计数", "Complete Blood Count"),
        # Liver function
        ("X020", "肝功能全套", "Liver Function"),
        ("X021", "总蛋白测定", "Liver Function"),
        ("X022", "白蛋白测定", "Liver Function"),
        ("X023", "胆红素测定", "Liver Function"),
        ("X024", "转氨酶检测", "Liver Function"),
        ("X025", "谷氨酰转肽酶", "Liver Function"),
        # Kidney function
        ("X030", "肾功能检测", "Kidney Function"),
        ("X031", "血清肌酐", "Kidney Function"),
        ("X032", "尿素氮测定", "Kidney Function"),
        ("X033", "尿酸检测", "Kidney Function"),
        # Lipids
        ("X040", "总胆固醇", "Lipid Panel"),
        ("X041", "甘油三酯测定", "Lipid Panel"),
        ("X042", "高密度脂蛋白", "Lipid Panel"),
        ("X043", "血脂四项", "Lipid Panel"),
        # Glucose
        ("X050", "空腹血糖测定", "Glucose"),
        ("X051", "糖化血红蛋白检测", "Glucose"),
        ("X052", "随机血糖", "Glucose"),
        # Thyroid
        ("X060", "甲状腺功能", "Thyroid"),
        ("X061", "促甲状腺激素", "Thyroid"),
        # Hormones
        ("X070", "雌二醇测定", "Hormone"),
        ("X071", "孕酮检测", "Hormone"),
        ("X072", "睾酮水平", "Hormone"),
        ("X073", "皮质醇测定", "Hormone"),
        ("X074", "催乳素检测", "Hormone"),
        ("X075", "荷尔蒙全套", "Hormone"),
        ("X076", "促卵泡生成素", "Hormone"),
        ("X077", "促黄体生成素", "Hormone"),
        ("X078", "脱氢表雄酮硫酸盐", "Hormone"),
        ("X079", "胰岛素样生长因子", "Hormone"),
        ("X080", "抗缪勒管激素", "Hormone"),
        # Tumor markers
        ("X090", "肿瘤标志物全套", "Tumor Markers"),
        ("X091", "甲胎蛋白检测", "Tumor Markers"),
        ("X092", "癌胚抗原测定", "Tumor Markers"),
        ("X093", "铁蛋白检测", "Tumor Markers"),
        ("X094", "糖类抗原125", "Tumor Markers"),
        ("X095", "前列腺特异性抗原", "Tumor Markers"),
        ("X096", "鳞状细胞癌抗原", "Tumor Markers"),
        ("X097", "神经元特异性烯醇化酶", "Tumor Markers"),
        # Coagulation
        ("X100", "凝血功能检测", "Coagulation"),
        ("X101", "纤维蛋白原测定", "Coagulation"),
        # Infectious disease
        ("X110", "乙肝五项", "Infectious Disease"),
        ("X111", "丙肝抗体", "Infectious Disease"),
        ("X112", "梅毒筛查", "Infectious Disease"),
        ("X113", "传染病四项", "Infectious Disease"),
        # Immune function
        ("X120", "免疫球蛋白测定", "Immune Function"),
        ("X121", "补体C3检测", "Immune Function"),
        ("X122", "c反应蛋白测定", "Immune Function"),
        ("X123", "抗核抗体检测", "Immune Function"),
        ("X124", "类风湿因子测定", "Immune Function"),
        ("X125", "红细胞沉降速率", "Immune Function"),
        # Bone metabolism
        ("X130", "骨代谢标志物", "Bone Metabolism"),
        ("X131", "骨钙素检测", "Bone Metabolism"),
        ("X132", "甲状旁腺激素", "Bone Metabolism"),
        ("X133", "25-羟维生素d检测", "Bone Metabolism"),
        # Heavy metals
        ("X140", "微量元素检测", "Heavy Metals"),
        ("X141", "重金属筛查", "Heavy Metals"),
        # Homocysteine
        ("X150", "同型半胱氨酸检测", "Homocysteine"),
        # Blood type
        ("X160", "ABO血型鉴定", "Blood Type"),
        # Electrolytes
        ("X170", "电解质全套", "Electrolytes"),
        ("X171", "血清钾测定", "Electrolytes"),
        ("X172", "血清钠检测", "Electrolytes"),
        ("X173", "血清钙测定", "Electrolytes"),
    ]
    failures = 0
    for code, name, wanted in cases:
        actual = classify_abb_module(code, name, api_key=None)
        if actual == wanted:
            continue
        # Only mismatches are reported individually.
        failures += 1
        print(f" [FAIL] project={name}")
        print(f" 期望: {wanted}, 实际: {actual}")
    successes = len(cases) - failures
    print(f"\n 结果: {successes} 通过, {failures} 失败 / 共 {len(cases)}")
    return failures == 0
# ============================================================
# 测试3: parse_medical_data_v2 - OCR文本解析
# ============================================================
def test_parse_ocr_text():
    """Parse a canned OCR report and measure how many expected ABB codes are found.

    The sample text is run through ``parse_medical_data_v2`` and then
    ``clean_extracted_data_v2``. The set of ``abb`` values in the result is
    compared with a fixed set of expected codes, and a table of all parsed
    items is printed.

    Returns:
        bool: True when at least 70% of the expected ABB codes were found.
    """
    print("\n" + "=" * 70)
    print("[测试3] OCR文本解析 (parse_medical_data_v2)")
    print("=" * 70)
    # Simulated OCR output for a typical English-language lab report.
    # The text lines stay at column 0 so the literal is reproduced unchanged.
    sample_ocr_text = """Page 1
Patient Name: MR. TEST PATIENT
Sex : Male Age : 45Y
Collected Date/Time: 20 Jan 2025
Complete Blood Count
Total WBC............... 6.50 *10^3/mm3 (4.0-10.0)
Red Blood Cell.......... 4.69 *10^6/mm3 (4.5-5.5)
Hemoglobin(Hb)......... 14.2 g/dL (13.0-17.0)
Hematocrit(HCT)........ 41.3 % (40-54)
MCV.................... 88.1 fL (80-100)
MCH.................... 30.3 pg (27-34)
MCHC................... 34.4 g/dL (32-36)
Platelet Count......... 230 *10^3/mm3 (150-400)
Neutrophil............. 62.3 % (40-70)
Lymphocyte............. 28.5 % (20-40)
Monocyte............... 6.2 % (2-8)
Eosinophil............. 2.5 % (1-6)
Basophil............... 0.5 % (0-1)
ESR 1 Hour............. 8 mm/hr (0-15)
Liver Function
ALT(Alanine Transaminase)...... 25 U/L (0-41)
AST(Aspartate Transaminase).... 22 U/L (0-40)
GGT( Gamma GT)................. 30 U/L (8-61)
ALP(Alkaline Phosphatase)...... 70 U/L (40-130)
Total Bilirubin................ 0.8 mg/dL (0.1-1.2)
Direct Bilirubin............... 0.2 mg/dL (0-0.3)
Total Protein.................. 7.2 g/dL (6.6-8.3)
Albumin........................ 4.5 g/dL (3.5-5.2)
Globulin....................... 2.7 g/dL (2.0-3.5)
Kidney Function
BUN............................ 15 mg/dL (6-20)
Creatinine..................... 0.95 mg/dL (0.67-1.17)
Uric Acid...................... 5.8 mg/dL (3.4-7.0)
eGFR........................... 92 mL/min (>90)
Lipid Profile
Total Cholesterol.............. 195 mg/dL (<200)
Triglyceride................... 120 mg/dL (<150)
HDL-Cholesterol................ 55 mg/dL (>40)
LDL-Cholesterol(Direct)........ 118 mg/dL (<100)
Glucose(Fasting)............... 95 mg/dL (74-100)
HbA1c.......................... 5.7 % (4.0-5.6)
Thyroid Function
TSH............................ 2.15 mIU/L (0.27-4.2)
Free T3........................ 3.2 pg/mL (2.0-4.4)
Free T4........................ 1.25 ng/dL (0.93-1.7)
Hormones
Estradiol(E2).................. 28.5 pg/mL (11.3-43.2)
Testosterone................... 450 ng/dL (249-836)
Cortisol....................... 12.5 ug/dL (6.2-19.4)
FSH............................ 5.85 mIU/mL (1.5-12.4)
LH(Luteinizing Hormone)....... 4.2 mIU/mL (1.7-8.6)
Prolactin...................... 8.5 ng/mL (4.0-15.2)
DHEA-Sulphate.................. 280 ug/dL (88.9-427)
IGF-1.......................... 165 ng/mL (101-267)
Tumor Markers
AFP(Alpha Fetoprotein)......... 3.2 ng/mL (0-7)
CEA(Carcinoembryonic Antigen).. 2.1 ng/mL (0-5)
Total PSA...................... 0.8 ng/mL (0-4)
CA125.......................... 12.5 U/mL (0-35)
Coagulation
Prothrombin Time(PT)........... 12.5 sec (10-14)
APTT........................... 28.3 sec (25-35)
Thrombin Time(TT).............. 16.2 sec (14-21)
Fibrinogen..................... 2.8 g/L (2.0-4.0)
INR............................ 0.93 (0.8-1.2)
Infectious Disease
HBsAg(Hepatitis B Surface Antigen)... Negative
HBsAb(Hepatitis B Surface Antibody).. Positive
HCV Ab (Hepatitis C Antibody)........ Non Reactive
HIV-1/HIV-2 Antibody................. Non Reactive
RPR (Rapid Plasma Reagin)............ Non Reactive
Electrolytes
Sodium......................... 140 mmol/L (136-145)
Potassium...................... 4.2 mmol/L (3.5-5.1)
Chloride....................... 103 mmol/L (98-107)
Calcium........................ 9.5 mg/dL (8.6-10.2)
Immune Function
Immunoglobulin G(IgG).......... 1050 mg/dL (700-1600)
Immunoglobulin A(IgA).......... 220 mg/dL (70-400)
Immunoglobulin M(IgM).......... 95 mg/dL (40-230)
Complement C3(B1C)............. 110 mg/dL (90-180)
Complement C4.................. 28 mg/dL (10-40)
C-Reactive Protein(High Sens).. 0.5 mg/L (<3)
Bone Metabolism
N-mid Osteocalcin.............. 15.2 ng/mL (14-46)
PTH(Intact).................... 35 pg/mL (15-65)
Vitamin D(25-OH Vitamin D Total) 32 ng/mL (30-100)
Blood Type
ABO Group...................... A
Rh Group....................... Positive
Homocysteine................... 10.5 umol/L (5-15)
Vitamin B12.................... 450 pg/mL (197-771)
Folate......................... 12.3 ng/mL (>3.0)
"""
    items = parse_medical_data_v2(sample_ocr_text, "test_sample.pdf")
    items = clean_extracted_data_v2(items)
    print(f" 解析出 {len(items)} 个检测项")
    # Key ABB codes the parser is expected to recover from the sample.
    expected_abbs = {
        'WBC', 'RBC', 'Hb', 'HCT', 'MCV', 'MCH', 'MCHC', 'PLT',
        'NEUT', 'LYMPH', 'MONO', 'EOS', 'BAS', 'ESR',
        'ALT', 'AST', 'GGT', 'ALP', 'TBil', 'DBil', 'TP', 'ALB', 'GLB',
        'BUN', 'Scr', 'UA', 'eGFR',
        'TC', 'TG', 'HDL', 'LDL',
        'FBS', 'HbA1C',
        'TSH', 'FT3', 'FT4',
        'E2', 'T', 'COR', 'FSH', 'LH', 'PRL', 'DHEAS', 'IGF-1',
        'AFP', 'CEA', 'TPSA', 'CA125',
        'PT', 'APTT', 'TT', 'FIB', 'INR',
        'HBsAg', 'HBsAb', 'HCV', 'HIV', 'TRUST',
        'Na', 'K', 'Cl', 'Ca',
        'IgG', 'IgA', 'IgM', 'C3', 'C4', 'hs-CRP',
        'OST', 'PTH', '25-OH-VD2+D3',
        'ABO', 'Rh',
        'Hcy',
        'VitB12', 'Folate',
    }
    # Each parsed item is indexed by its 'abb' key here and in the sort below.
    found_abbs = {item['abb'] for item in items}
    matched = expected_abbs & found_abbs
    missing = expected_abbs - found_abbs
    extra = found_abbs - expected_abbs
    print(f" 期望 {len(expected_abbs)} 个ABB")
    print(f" 匹配 {len(matched)}")
    if missing:
        print(f" [WARN] 未匹配 {len(missing)} 个: {sorted(missing)}")
    if extra:
        print(f" [INFO] 额外识别 {len(extra)} 个: {sorted(extra)}")

    def _cell(record, key, width=None):
        """Return record[key] as text ('' if missing/None), cut to *width* chars."""
        value = record.get(key)
        text = "" if value is None else str(value)
        return text if width is None else text[:width]

    # Detail table of every parsed item.
    print(f"\n {'ABB':<15} {'结果':<12} {'标记':<4} {'单位':<20} {'参考范围'}")
    print(" " + "-" * 70)
    for item in sorted(items, key=lambda x: x['abb']):
        abb = item['abb']
        result = _cell(item, 'result', 10)
        point = _cell(item, 'point')
        unit = _cell(item, 'unit', 18)
        ref = _cell(item, 'reference', 25)
        # The marker is always one character wide, so marker + space + the
        # 13-wide ABB field lines up with the 15-wide 'ABB' header column.
        marker = "✓" if abb in expected_abbs else " "
        print(f" {marker} {abb:<13} {result:<12} {point:<4} {unit:<20} {ref}")
    coverage = len(matched) / len(expected_abbs) * 100 if expected_abbs else 0
    print(f"\n 覆盖率: {coverage:.1f}% ({len(matched)}/{len(expected_abbs)})")
    return coverage >= 70  # at least 70% coverage counts as a pass
# ============================================================
# 测试4: 分类 + 模板匹配联合测试
# ============================================================
def test_classify_with_template():
    """Run mock extracted items through template matching and print their modules.

    Loads ``abb_mapping_config.json`` from this file's directory, passes ten
    mock items to ``match_with_template``, then prints the result and the
    module returned by ``classify_abb_module`` for each ABB.

    Returns:
        bool: True when the config file is missing (test skipped) or when
        the matched collection holds at least 8 entries.
    """
    print("\n" + "=" * 70)
    print("[测试4] 分类 → 模板匹配联合测试")
    print("=" * 70)
    # Load the real mapping configuration; skip when it is not available.
    config_path = os.path.join(os.path.dirname(__file__), "abb_mapping_config.json")
    if not os.path.exists(config_path):
        print(" [SKIP] 配置文件不存在")
        return True
    with open(config_path, 'r', encoding='utf-8') as handle:
        config = json.load(handle)
    # Mock extracted data: (abb, project, result, unit, reference).
    # "point" is empty and "source" is "test.pdf" for every row.
    rows = [
        ("WBC", "White Blood Cell", "6.5", "*10^3/mm3", "(4.0-10.0)"),
        ("ALT", "Alanine Aminotransferase", "25", "U/L", "(0-41)"),
        ("TC", "Total Cholesterol", "195", "mg/dL", "(<200)"),
        ("TSH", "TSH", "2.15", "mIU/L", "(0.27-4.2)"),
        ("AFP", "Alpha Fetoprotein", "3.2", "ng/mL", "(0-7)"),
        ("E2", "Estradiol", "28.5", "pg/mL", ""),
        ("PT", "Prothrombin Time", "12.5", "sec", "(10-14)"),
        ("HBsAg", "HBsAg", "Negative", "", ""),
        ("Na", "Sodium", "140", "mmol/L", "(136-145)"),
        ("Hcy", "Homocysteine", "10.5", "umol/L", "(5-15)"),
    ]
    mock_items = [
        {
            "abb": code,
            "project": name,
            "result": value,
            "point": "",
            "unit": unit,
            "reference": reference,
            "source": "test.pdf",
        }
        for code, name, value, unit, reference in rows
    ]
    # Captured before the call so the report order never depends on whatever
    # match_with_template does with the item dicts.
    abb_order = [row[0] for row in rows]
    matched = match_with_template(mock_items, config)
    print(f"\n 模板匹配结果: {len(matched)} 个项目")
    # Show the classification of every mock item.
    for abb in abb_order:
        data = matched.get(abb, {})
        project = data.get('project', '?')
        result = data.get('result', '?')
        module = classify_abb_module(abb, project, api_key=None)
        print(f" {abb:<8} result={result:<10} → [{module}]")
    return len(matched) >= 8
# ============================================================
# 测试5: 边界情况 - 关键词冲突
# ============================================================
def test_keyword_conflicts():
    """Check project names whose keywords could match more than one module.

    Each (abb, project name, expected module) triple is passed to
    ``classify_abb_module`` with ``api_key=None``. One line is printed per
    case, prefixed with a pass/fail glyph.

    Returns:
        bool: True when no case was misclassified.
    """
    print("\n" + "=" * 70)
    print("[测试5] 关键词冲突/边界测试")
    print("=" * 70)
    test_cases = [
        # Longer keywords should win over shorter ones.
        ("X200", "红细胞沉降速率测定", "Immune Function"),  # must not match CBC's '红细胞'
        ("X201", "红细胞计数", "Complete Blood Count"),  # should match '红细胞' normally
        # 白蛋白 (albumin) vs 白细胞 (white blood cell)
        ("X202", "血清白蛋白", "Liver Function"),  # '白蛋白' -> Liver
        ("X203", "白细胞分类计数", "Complete Blood Count"),  # '白细胞' -> CBC
        # 甲状腺 (thyroid) vs 甲状旁腺 (parathyroid)
        ("X204", "甲状旁腺激素检测", "Bone Metabolism"),  # '甲状旁腺' -> Bone
        ("X205", "甲状腺功能五项", "Thyroid"),  # '甲状腺' -> Thyroid
        # Where vitamin D belongs
        ("X206", "25-羟维生素d总量", "Bone Metabolism"),  # '维生素d' -> Bone (not Vitamin)
        # 尿酸 (uric acid) must not match 尿液 (urine)
        ("X207", "血清尿酸", "Kidney Function"),  # '尿酸' -> Kidney
        # 胆固醇 (cholesterol) must not match 胆红素 (bilirubin)
        ("X208", "总胆固醇", "Lipid Panel"),  # '胆固醇' -> Lipid
        ("X209", "总胆红素", "Liver Function"),  # '胆红素' -> Liver
        # Immunodeficiency virus
        ("X210", "人类免疫缺陷病毒抗体", "Infectious Disease"),  # must not match '免疫球蛋白'
    ]
    passed = 0
    failed = 0
    for abb, project, expected in test_cases:
        result = classify_abb_module(abb, project, api_key=None)
        ok = result == expected
        if ok:
            passed += 1
        else:
            failed += 1
        # Both branches of the original expression were the empty string, so a
        # failing line looked like a passing one. These are the glyphs main()
        # uses in its summary.
        icon = "✓" if ok else "✗"
        print(f" {icon} {project:<25} 期望: {expected:<20} 实际: {result}")
    print(f"\n 结果: {passed} 通过, {failed} 失败 / 共 {len(test_cases)}")
    return failed == 0
# ============================================================
# 主函数
# ============================================================
def main():
    """Run every test group in order, print a summary, and report the outcome.

    Returns:
        int: 0 when every group returned a truthy value, otherwise 1.
    """
    rule = "=" * 70
    print(rule)
    print(" 医疗数据提取逻辑测试")
    print(" (不调用OCR/DeepSeek API纯本地离线测试)")
    print(rule)
    # (summary label, test function) pairs, run in this order.
    suites = (
        ("ABB硬编码映射", test_abb_mapping),
        ("中文关键词匹配", test_chinese_keyword_matching),
        ("OCR文本解析", test_parse_ocr_text),
        ("分类+模板匹配", test_classify_with_template),
        ("关键词冲突检测", test_keyword_conflicts),
    )
    results = {label: run() for label, run in suites}
    # Summary
    print("\n" + rule)
    print(" 测试汇总")
    print(rule)
    for label, ok in results.items():
        verdict = "✓ PASS" if ok else "✗ FAIL"
        print(f" {verdict} {label}")
    all_pass = all(results.values())
    print(rule)
    if all_pass:
        print(" 所有测试通过!")
    else:
        print(" 存在失败项,请检查上方详情")
    print(rule)
    return 0 if all_pass else 1
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())