"""Dump paragraph/run formatting of selected headings in a generated report.

The script opens one .docx file, locates three headings among the direct
children of the document body by keyword, and prints the alignment, style,
fonts, size, bold flags and colour found in each heading's XML.
"""
from docx import Document
from lxml import etree

# WordprocessingML main namespace, as a prefix map and as a bare URI.
ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
w = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'

# Clark-notation name of the ``w:val`` attribute, read by most checks below.
_W_VAL = f'{{{w}}}val'

# ``w:val`` values that switch a toggle property (w:b, w:bCs) off.
_TOGGLE_OFF = ('0', 'false', 'off')


def _toggle_is_on(prop):
    """Return True unless the toggle element carries an explicit "off" value."""
    val = prop.get(_W_VAL)
    return val is None or val.strip().lower() not in _TOGGLE_OFF


def _format_points(half_points):
    """Convert a ``w:sz`` half-point string to points, or '?' if not numeric."""
    try:
        # True division keeps half sizes (21 -> 10.5); ':g' prints 24 -> 12.
        return f'{int(half_points) / 2:g}'
    except (TypeError, ValueError):
        return '?'


def _show_run_properties(rPr):
    """Print fonts, sizes, bold flags and colour found in one ``w:rPr``."""
    rFonts = rPr.find('w:rFonts', ns)
    sz = rPr.find('w:sz', ns)
    szCs = rPr.find('w:szCs', ns)
    b = rPr.find('w:b', ns)
    bCs = rPr.find('w:bCs', ns)
    color = rPr.find('w:color', ns)

    if rFonts is not None:
        fonts = {}
        for attr in ['ascii', 'hAnsi', 'eastAsia', 'cs']:
            v = rFonts.get(f'{{{w}}}{attr}')
            if v:
                fonts[attr] = v
        print(f' fonts: {fonts}')
    if sz is not None:
        raw_size = sz.get(_W_VAL)
        print(f' sz: {raw_size} (={_format_points(raw_size)}pt)')
    if szCs is not None:
        print(f' szCs: {szCs.get(_W_VAL)}')
    if b is not None:
        # The element alone does not mean bold: w:val="0" turns it off.
        print(' bold: yes' if _toggle_is_on(b) else ' bold: no')
    if bCs is not None:
        print(' boldCs: yes' if _toggle_is_on(bCs) else ' boldCs: no')
    if color is not None:
        print(f' color: {color.get(_W_VAL)}')


def show_para_format(elem, label):
    """Print the paragraph-level and run-level formatting of ``elem``.

    Args:
        elem: XML element (normally a ``w:p``) to inspect.
        label: Heading printed above the dump.

    Only runs that are direct children of ``elem`` are listed, and runs
    whose text is empty after stripping are skipped.
    """
    text = ''.join(elem.itertext()).strip()
    print(f'=== {label} ===')
    print(f'Text: {text[:80]}')

    # Paragraph properties: alignment and style id.
    pPr = elem.find('w:pPr', ns)
    if pPr is not None:
        jc = pPr.find('w:jc', ns)
        if jc is not None:
            print(f' jc: {jc.get(_W_VAL)}')
        pStyle = pPr.find('w:pStyle', ns)
        if pStyle is not None:
            print(f' pStyle: {pStyle.get(_W_VAL)}')

    # Run properties, one section per non-empty run.
    for r in elem.findall('w:r', ns):
        rt = ''.join(r.itertext()).strip()
        if not rt:
            continue
        print(f' Run: "{rt[:50]}"')
        rPr = r.find('w:rPr', ns)
        if rPr is None:
            print(' (no rPr)')
        else:
            _show_run_properties(rPr)


def _show_first_match(elements, label, keywords, max_len):
    """Dump the first element whose text has every keyword and is short.

    Args:
        elements: Sequence of XML elements to scan in order.
        label: Label prefix; the element's index is appended in brackets.
        keywords: Substrings that must all occur in the element's text.
        max_len: Exclusive upper bound on the stripped text length.

    Returns:
        True if an element matched and was printed, False otherwise.
    """
    for index, elem in enumerate(elements):
        text = ''.join(elem.itertext()).strip()
        if len(text) < max_len and all(key in text for key in keywords):
            show_para_format(elem, f'{label} [{index}]')
            return True
    return False


doc = Document(r'C:\Users\UI\Desktop\医疗报告\backend\reports\filled_report_20260212_165326.docx')
body = doc.element.body
children = list(body)

# Overall Health Assessment
_show_first_match(
    children, 'Overall Health Assessment', ('Overall Health', 'Assessment'), 200)
print()

# Medical Intervention
_show_first_match(
    children, 'Medical Intervention', ('Medical Intervention', '医学干预'), 200)
print()

# FHA Title
_show_first_match(
    children, 'FHA Title',
    ('Functional Medical Health Advice', '功能医学健康建议'), 300)