Files
yiliao/backend/analyze_output.py

81 lines
2.7 KiB
Python

"""分析生成文件的结构问题"""
from docx import Document
from lxml import etree
import zipfile
import os
def _element_annotation(elem, tag, text, w_ns):
    """Return the bracketed suffix printed after a body element's tag.

    Tables get their row count, plus a header marker when the table has a
    single row and its text contains 'Abb' or 'Project'. Paragraphs that
    contain a ``w:br`` whose ``w:type`` is ``page`` get a page-break marker.
    Any other element gets an empty string.
    """
    if tag == 'tbl':
        # './/' matches every descendant w:tr, so rows of nested tables are
        # included in the count.
        rows = elem.findall(f'.//{{{w_ns}}}tr')
        extra = f" [行数:{len(rows)}]"
        if len(rows) == 1 and ('Abb' in text or 'Project' in text):
            extra += " [表头]"
        return extra
    if tag == 'p':
        breaks = elem.findall(f'.//{{{w_ns}}}br')
        if any(br.get(f'{{{w_ns}}}type') == 'page' for br in breaks):
            return " [分页符]"
    return ""


def analyze_file(filepath, name, window=40):
    """Print the body structure of a .docx around its first 'Urine Detection' element.

    The function reads ``word/document.xml`` from the archive, lists every
    direct child of ``w:body`` whose text contains both 'Urine' and
    'Detection', then lists up to ``window`` consecutive body children
    starting at the first match, annotating tables and page breaks.

    Args:
        filepath: Path to the .docx file to inspect.
        name: Label printed in the report header.
        window: Maximum number of body children to list from the first
            match onward. Defaults to 40.

    Returns:
        None. All results are written to stdout.

    Raises:
        Whatever ``zipfile.ZipFile`` / ``ZipFile.read`` raise for an
        unreadable archive or a missing ``word/document.xml`` member, and
        whatever the XML parser raises for malformed XML. These are not
        caught here.
    """
    w_ns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    rule = "-" * 70

    print(f"\n{'='*70}")
    print(f"分析: {name}")
    print(f"文件: {filepath}")
    print(f"{'='*70}")

    # Read the main document part straight out of the archive.
    with zipfile.ZipFile(filepath, 'r') as z:
        xml_content = z.read('word/document.xml')
    tree = etree.fromstring(xml_content)

    body = tree.find('.//w:body', {'w': w_ns})
    if body is None:
        # find() returns None when nothing matches; iterating None below
        # would raise TypeError, so report it and stop instead.
        print(" 未找到 w:body 元素")
        return

    # Locate the body children that mention 'Urine Detection'.
    print("\n搜索 'Urine Detection' 相关元素:")
    print(rule)
    urine_positions = []
    for i, elem in enumerate(body):
        text = ''.join(elem.itertext()).strip()
        if 'Urine' in text and 'Detection' in text:
            tag = elem.tag.split('}')[-1]  # drop the '{namespace}' prefix
            text_preview = text[:100].replace('\n', '\\n')
            print(f" [{i}] <{tag}>: {text_preview}...")
            urine_positions.append(i)
    if not urine_positions:
        print(" 未找到")
        return

    # Dump the elements that follow the first match.
    first_pos = urine_positions[0]
    print(f"\n从第一个 Urine Detection (位置 {first_pos}) 开始的{window}个元素:")
    print(rule)
    for i in range(first_pos, min(first_pos + window, len(body))):
        elem = body[i]
        tag = elem.tag.split('}')[-1]
        text = ''.join(elem.itertext()).strip()
        text_preview = text[:80].replace('\n', '\\n') if text else '[空]'
        extra = _element_annotation(elem, tag, text, w_ns)
        print(f" [{i}] <{tag}>{extra}: {text_preview}")
def main():
    """Analyze the template document and the latest generated report.

    Both paths are hard-coded and relative, so they resolve against the
    current working directory. A path that is not an existing file is
    reported and skipped, so that one missing file does not abort the
    analysis of the other.
    """
    targets = [
        # Template document
        (r"../Be.U Wellness Center功能医学健康报告&定制化方案-案例.docx", "模板"),
        # Most recently generated report
        ("reports/filled_report_20260115_204528.docx", "生成文件"),
    ]
    for path, label in targets:
        if not os.path.isfile(path):
            print(f"\n文件不存在，已跳过: {path}")
            continue
        analyze_file(path, label)


if __name__ == "__main__":
    main()