Files
yiliao/backend/xml_safe_save.py

178 lines
6.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
安全保存模块 - 使用 lxml 精确处理 XML 元素
"""
import zipfile
import shutil
import os
import re
from pathlib import Path
from lxml import etree
def _first_index_containing(children, keywords, lowercase=False):
    """Return the index of the first element whose text contains a keyword.

    The text of each element is the stripped concatenation of all its text
    nodes. When *lowercase* is true the text is lower-cased before matching,
    so *keywords* should then be given in lower case. Returns -1 when no
    element matches.
    """
    for index, elem in enumerate(children):
        text = ''.join(elem.itertext()).strip()
        if lowercase:
            text = text.lower()
        if any(keyword in text for keyword in keywords):
            return index
    return -1


def _read_document_root(package_path):
    """Parse ``word/document.xml`` from the .docx at *package_path* with lxml."""
    with zipfile.ZipFile(package_path, 'r') as package:
        return etree.fromstring(package.read('word/document.xml'))


def _clone_element(elem):
    """Return an independent copy of *elem* via a serialize/parse round trip."""
    return etree.fromstring(etree.tostring(elem))


def safe_save(doc, output_path, template_path):
    """Save *doc* while taking the leading part of the body from the template.

    Strategy:
      1. Save ``doc`` to a temporary .docx file.
      2. Parse ``word/document.xml`` of the template and of that temporary
         file with lxml.
      3. Take the template's body children up to and including the first one
         whose text contains "Client Health Program" / "客户健康方案" (the
         first 80 children when there is no such element). The original
         comments describe this region as the first four pages.
      4. Take the saved document's body children that follow its own
         "Client Health Program" element; if it has none, start at the first
         element matching one of the section keywords, and failing that at
         the template boundary index.
      5. Rebuild the template body from (3) + (4) + the template's ``sectPr``
         and write it into a copy of the template package at *output_path*.

    If the XML body of either document cannot be located, the temporary save
    is copied to *output_path* unchanged. If any step raises, the error and
    traceback are printed and ``doc.save(output_path)`` is used instead.

    Args:
        doc: Object exposing ``save(path)`` (presumably a python-docx
            ``Document`` -- confirm against callers).
        output_path: Destination path of the resulting .docx.
        template_path: Path of the template .docx whose leading elements and
            non-document package parts are reused.

    Returns:
        None.
    """
    import tempfile

    output_path = Path(output_path)
    template_path = Path(template_path)
    ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    boundary_keywords = ('Client Health Program', '客户健康方案')
    # Staging file next to the destination, so the final rename stays on one
    # filesystem. Computed once and shared with the cleanup code below.
    temp_result = str(output_path) + '.temp.docx'

    temp_fd, temp_path = tempfile.mkstemp(suffix='.docx')
    os.close(temp_fd)
    try:
        # 1. Save the in-memory document to the temporary file.
        doc.save(temp_path)

        # 2. Parse the template XML.
        template_tree = _read_document_root(template_path)
        template_body = template_tree.find('.//w:body', ns)

        # 3. Parse the XML of the freshly saved document.
        modified_tree = _read_document_root(temp_path)
        modified_body = modified_tree.find('.//w:body', ns)

        if template_body is None or modified_body is None:
            print(" [安全保存] 无法解析 XML body")
            shutil.copy(temp_path, output_path)
            return

        template_children = list(template_body)
        modified_children = list(modified_body)

        # 4. Protected boundary in the template: everything up to and
        #    including the "Client Health Program" element.
        marker_index = _first_index_containing(template_children,
                                               boundary_keywords)
        if marker_index >= 0:
            boundary_pos = marker_index + 1
        else:
            # No marker found: default to the first 80 elements.
            boundary_pos = min(80, len(template_children))

        # 5. Start of the data section in the saved document. Starting right
        #    after "Client Health Program" (rather than at "health report
        #    analysis") keeps sections such as "Functional Medical Health
        #    Advice".
        marker_index = _first_index_containing(modified_children,
                                               boundary_keywords)
        if marker_index >= 0:
            data_start_pos = marker_index + 1
            print(f" [安全保存] 找到 Client Health Program 位置: {marker_index}")
        else:
            # Fallback keywords; matched against lower-cased element text.
            start_keywords = ['health report analysis', '健康报告分析',
                              'abnormal index', '异常指标',
                              'functional medical health advice', '功能医学健康建议',
                              'urine detection', '尿液检测']
            data_start_pos = _first_index_containing(
                modified_children, start_keywords, lowercase=True)
            if data_start_pos < 0:
                data_start_pos = boundary_pos

        print(f" [安全保存] 边界位置:{boundary_pos}, 数据起始:{data_start_pos}")

        # 6. Copy the template's sectPr (the original comments note it holds
        #    the footer references) so it can be re-attached at the end.
        sect_pr_source = next(
            (elem for elem in template_children
             if elem.tag.endswith('}sectPr')),
            None)
        sectPr = None if sect_pr_source is None else _clone_element(sect_pr_source)

        # 7. Clone the protected template elements. The clones are taken from
        #    the tree parsed in step 2, before the body is cleared, so the
        #    template package does not have to be opened and parsed again.
        protected_elems = [
            _clone_element(elem)
            for elem in template_children[:boundary_pos]
            if not elem.tag.endswith('}sectPr')
        ]

        # 8. Clone the data elements of the saved document.
        data_elems = [
            _clone_element(elem)
            for elem in modified_children[data_start_pos:]
            if not elem.tag.endswith('}sectPr')
        ]

        # Empty the template body, then rebuild it in order:
        # protected part, data part, section properties.
        for elem in template_children:
            template_body.remove(elem)
        template_body.extend(protected_elems)
        template_body.extend(data_elems)

        # 9. Re-attach sectPr as the last body child.
        if sectPr is not None:
            template_body.append(sectPr)

        print(f" [安全保存] 保护部分:{len(protected_elems)}, 数据部分:{len(data_elems)}")

        # 10. Serialize the merged document XML.
        new_xml = etree.tostring(template_tree, xml_declaration=True,
                                 encoding='UTF-8', standalone='yes')

        # 11. Build the output package from the template, swapping in the
        #     merged document.xml.
        # NOTE(review): every other package part comes from the template, so
        # parts that exist only in ``doc`` are not carried over -- confirm the
        # copied data elements never reference such parts.
        with zipfile.ZipFile(template_path, 'r') as zin, \
                zipfile.ZipFile(temp_result, 'w', zipfile.ZIP_DEFLATED) as zout:
            for item in zin.infolist():
                if item.filename == 'word/document.xml':
                    zout.writestr(item, new_xml)
                else:
                    zout.writestr(item, zin.read(item.filename))

        # 12. Move into place. os.replace overwrites an existing file in one
        #     step, so there is no window in which the output is missing.
        os.replace(temp_result, output_path)
        print(f" [安全保存] ✓ 完成")
    except Exception as e:
        print(f" [安全保存] 错误: {e}")
        import traceback
        traceback.print_exc()
        # Fall back to a plain save of the document.
        doc.save(output_path)
    finally:
        for leftover in (temp_path, temp_result):
            try:
                os.remove(leftover)
            except OSError:
                # Best-effort cleanup: the file may not exist or may be
                # locked; neither should mask the outcome of the save.
                pass