初始化医疗报告生成项目,添加核心代码文件
This commit is contained in:
177
backend/xml_safe_save.py
Normal file
177
backend/xml_safe_save.py
Normal file
@@ -0,0 +1,177 @@
|
||||
"""
|
||||
安全保存模块 - 使用 lxml 精确处理 XML 元素
|
||||
"""
|
||||
import zipfile
|
||||
import shutil
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from lxml import etree
|
||||
|
||||
|
||||
# Headings that mark the end of the protected template section.
_SECTION_MARKERS = ('Client Health Program', '客户健康方案')

# Lower-case fallback headings used to locate the start of the generated data
# when no section marker is present in the processed document.
_FALLBACK_START_KEYWORDS = (
    'health report analysis', '健康报告分析',
    'abnormal index', '异常指标',
    'functional medical health advice', '功能医学健康建议',
    'urine detection', '尿液检测',
)

# Number of leading template elements kept when the template has no marker.
_DEFAULT_PROTECTED_ELEMENTS = 80


def _read_document_xml(docx_path):
    """Parse ``word/document.xml`` from the archive at *docx_path*; return its root."""
    with zipfile.ZipFile(docx_path, 'r') as archive:
        return etree.fromstring(archive.read('word/document.xml'))


def _is_sect_pr(elem):
    """Return True when *elem* is a ``sectPr`` element (in any namespace)."""
    return elem.tag.endswith('}sectPr')


def _clone_element(elem):
    """Return a detached copy of *elem* without its tail text."""
    # with_tail=False keeps text that follows the element out of the
    # serialisation; a non-whitespace tail would otherwise make the
    # re-parse fail.
    return etree.fromstring(etree.tostring(elem, with_tail=False))


def _first_index_containing(children, needles, lowercase=False):
    """Return the index of the first child whose text contains any needle.

    The text of each child is the stripped concatenation of ``itertext()``;
    it is lower-cased before matching when *lowercase* is true.  Returns
    ``None`` when no child matches.
    """
    for index, elem in enumerate(children):
        text = ''.join(elem.itertext()).strip()
        if lowercase:
            text = text.lower()
        if any(needle in text for needle in needles):
            return index
    return None


def safe_save(doc, output_path, template_path):
    """Save *doc* while restoring the leading section of the body from the template.

    Strategy:
        1. Save ``doc`` to a temporary .docx file.
        2. Parse ``word/document.xml`` of both the template and that file.
        3. Keep the template's body children up to and including the first
           one containing "Client Health Program" / "客户健康方案" (the
           first 80 children when there is no such element).
        4. Append copies of the processed document's body children that
           follow its own marker element; if it has none, start at the first
           element matching a fallback keyword, else at the template boundary.
        5. Re-append the template's ``sectPr`` as the last body child and
           write the result into a copy of the template archive in which
           only ``word/document.xml`` is replaced.

    Args:
        doc: Object exposing ``save(path)`` that writes a .docx file
            (presumably a python-docx ``Document`` -- confirm with callers).
        output_path: Destination path of the final .docx file.
        template_path: Path of the template .docx file.

    Returns:
        None.  If either body cannot be located, the temporary save is copied
        to ``output_path`` unchanged.  If any step raises, the error is
        printed and ``doc.save(output_path)`` is used as a fallback; an
        exception raised by that fallback propagates to the caller.
    """
    import tempfile

    output_path = Path(output_path)
    template_path = Path(template_path)
    # Staging file next to the destination so the final rename stays on the
    # same filesystem.
    temp_result = str(output_path) + '.temp.docx'

    ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

    temp_fd, temp_path = tempfile.mkstemp(suffix='.docx')
    os.close(temp_fd)

    try:
        # 1. Save the in-memory document to the temporary file.
        doc.save(temp_path)

        # 2. Parse the template XML.
        template_tree = _read_document_xml(template_path)
        template_body = template_tree.find('.//w:body', ns)

        # 3. Parse the processed XML.
        modified_tree = _read_document_xml(temp_path)
        modified_body = modified_tree.find('.//w:body', ns)

        if template_body is None or modified_body is None:
            print(" [安全保存] 无法解析 XML body")
            shutil.copy(temp_path, output_path)
            return

        template_children = list(template_body)
        modified_children = list(modified_body)

        # 4. Protected boundary in the template: everything up to and
        #    including the marker element.
        marker_index = _first_index_containing(template_children, _SECTION_MARKERS)
        if marker_index is None:
            boundary_pos = min(_DEFAULT_PROTECTED_ELEMENTS, len(template_children))
        else:
            boundary_pos = marker_index + 1

        # 5. Start of the data in the processed file.  Starting right after
        #    the marker (rather than at "health report analysis") keeps
        #    sections such as "Functional Medical Health Advice".
        marker_index = _first_index_containing(modified_children, _SECTION_MARKERS)
        if marker_index is not None:
            data_start_pos = marker_index + 1
            print(f" [安全保存] 找到 Client Health Program 位置: {marker_index}")
        else:
            fallback_index = _first_index_containing(
                modified_children, _FALLBACK_START_KEYWORDS, lowercase=True)
            data_start_pos = boundary_pos if fallback_index is None else fallback_index

        print(f" [安全保存] 边界位置:{boundary_pos}, 数据起始:{data_start_pos}")

        # 6. Work out the new body content.  The template's sectPr (the
        #    original notes say it carries the footer references) is moved
        #    to the end.
        sect_pr = next((e for e in template_children if _is_sect_pr(e)), None)
        protected = [e for e in template_children[:boundary_pos]
                     if not _is_sect_pr(e)]
        # Elements of the processed document belong to another tree, so they
        # are copied rather than moved.
        data = [_clone_element(e) for e in modified_children[data_start_pos:]
                if not _is_sect_pr(e)]

        # 7-9. Empty the body, then rebuild it: protected template part,
        #      generated data, sectPr.  The template elements are re-attached
        #      directly, so the template does not need to be read again.
        for elem in template_children:
            template_body.remove(elem)
        template_body.extend(protected)
        template_body.extend(data)
        if sect_pr is not None:
            template_body.append(sect_pr)

        print(f" [安全保存] 保护部分:{len(protected)}, 数据部分:{len(data)}")

        # 10. Serialise the merged document.
        new_xml = etree.tostring(template_tree, xml_declaration=True,
                                 encoding='UTF-8', standalone=True)

        # 11. Build the output archive from the template, replacing only
        #     word/document.xml.
        with zipfile.ZipFile(template_path, 'r') as zin:
            with zipfile.ZipFile(temp_result, 'w', zipfile.ZIP_DEFLATED) as zout:
                for item in zin.infolist():
                    if item.filename == 'word/document.xml':
                        zout.writestr(item, new_xml)
                    else:
                        zout.writestr(item, zin.read(item))

        # 12. Replace the destination in a single step, so there is no
        #     window in which the output file is missing.
        os.replace(temp_result, output_path)

        print(f" [安全保存] ✓ 完成")

    except Exception as e:
        print(f" [安全保存] 错误: {e}")
        import traceback
        traceback.print_exc()
        # Fall back to a plain save of the processed document.
        doc.save(output_path)
    finally:
        # Best-effort cleanup of the temporary and staging files.
        for leftover in (temp_path, temp_result):
            try:
                os.remove(leftover)
            except OSError:
                pass
|
||||
Reference in New Issue
Block a user