178 lines
6.8 KiB
Python
178 lines
6.8 KiB
Python
|
|
"""
|
|||
|
|
安全保存模块 - 使用 lxml 精确处理 XML 元素
|
|||
|
|
"""
|
|||
|
|
import os
import re
import shutil
import tempfile
import traceback
import zipfile
from pathlib import Path

from lxml import etree
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _read_document_root(docx_path):
    """Return the parsed root element of ``word/document.xml`` inside *docx_path*."""
    with zipfile.ZipFile(docx_path, 'r') as archive:
        return etree.fromstring(archive.read('word/document.xml'))


def _is_sect_pr(elem):
    """Return True when *elem* is a ``sectPr`` element (matched on the tag suffix)."""
    return elem.tag.endswith('}sectPr')


def _clone_element(elem):
    """Return a detached copy of *elem* made by a serialize/parse round-trip."""
    # with_tail=False keeps any text that follows the element out of the
    # serialized fragment, so the fragment always re-parses as a single root.
    return etree.fromstring(etree.tostring(elem, with_tail=False))


def _index_of_marker(children, markers, lowercase=False):
    """Return the index of the first child whose text contains a marker, or -1.

    The text of each child is the concatenation of all its descendant text,
    stripped; when *lowercase* is true it is lower-cased before matching.
    """
    for index, elem in enumerate(children):
        text = ''.join(elem.itertext()).strip()
        if lowercase:
            text = text.lower()
        if any(marker in text for marker in markers):
            return index
    return -1


def safe_save(doc, output_path, template_path):
    """Save *doc*, keeping the template's leading section untouched.

    Strategy:
      1. Save ``doc`` to a temporary .docx file.
      2. Parse ``word/document.xml`` of both the template and the saved file.
      3. Take the template's body elements up to and including the first one
         whose text contains "Client Health Program" / "客户健康方案"
         (or the first 80 elements when no such element exists).
      4. Take the saved file's body elements that follow its own
         "Client Health Program" element (or, failing that, start at the
         first element matching one of the fallback section keywords, or at
         the template boundary index).
      5. Rebuild the body as: protected template part + data part + the
         template's first body-level ``sectPr``, and write the result into a
         copy of the template archive; every other archive member comes from
         the template unchanged.

    NOTE(review): because all other package parts come from the template,
    relationship ids used by the copied data presumably must already exist in
    the template package - confirm against the documents in use.

    Args:
        doc: Object exposing ``save(path)`` that writes a .docx file.
        output_path: Destination path (``str`` or ``Path``).
        template_path: Path of the template .docx (``str`` or ``Path``).

    Returns:
        None. On any error the problem is printed and ``doc.save(output_path)``
        is used as a plain fallback; an exception raised by that fallback
        propagates to the caller.
    """
    output_path = Path(output_path)
    template_path = Path(template_path)

    ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    heading_markers = ('Client Health Program', '客户健康方案')
    fallback_markers = (
        'health report analysis', '健康报告分析',
        'abnormal index', '异常指标',
        'functional medical health advice', '功能医学健康建议',
        'urine detection', '尿液检测',
    )

    temp_fd, temp_path = tempfile.mkstemp(suffix='.docx')
    os.close(temp_fd)
    # Staging file next to the destination so the final rename stays on one
    # filesystem.
    temp_result = str(output_path) + '.temp.docx'

    try:
        # 1. Save the processed document to the temporary file.
        doc.save(temp_path)

        # 2. Parse the template XML.
        template_tree = _read_document_root(template_path)
        template_body = template_tree.find('.//w:body', ns)

        # 3. Parse the processed XML.
        modified_tree = _read_document_root(temp_path)
        modified_body = modified_tree.find('.//w:body', ns)

        if template_body is None or modified_body is None:
            print(" [安全保存] 无法解析 XML body")
            shutil.copy(temp_path, output_path)
            return

        template_children = list(template_body)
        modified_children = list(modified_body)

        # 4. Protected boundary in the template: everything up to and
        #    including the heading element, or the first 80 elements.
        heading_idx = _index_of_marker(template_children, heading_markers)
        if heading_idx >= 0:
            boundary_pos = heading_idx + 1
        else:
            boundary_pos = min(80, len(template_children))

        # 5. Data start in the processed file: right after the heading so
        #    that sections such as "Functional Medical Health Advice" are
        #    kept; otherwise at the first fallback keyword; otherwise at the
        #    template boundary index.
        heading_idx = _index_of_marker(modified_children, heading_markers)
        if heading_idx >= 0:
            data_start_pos = heading_idx + 1
            print(f" [安全保存] 找到 Client Health Program 位置: {heading_idx}")
        else:
            data_start_pos = _index_of_marker(
                modified_children, fallback_markers, lowercase=True)
            if data_start_pos < 0:
                data_start_pos = boundary_pos

        print(f" [安全保存] 边界位置:{boundary_pos}, 数据起始:{data_start_pos}")

        # 6. Clone everything that must survive BEFORE clearing the body, so
        #    the template only has to be parsed once.
        sect_pr = next(
            (_clone_element(child) for child in template_children
             if _is_sect_pr(child)),
            None,
        )
        protected = [
            _clone_element(child)
            for child in template_children[:boundary_pos]
            if not _is_sect_pr(child)
        ]
        data = [
            _clone_element(child)
            for child in modified_children[data_start_pos:]
            if not _is_sect_pr(child)
        ]

        # 7-9. Rebuild the body: protected part, data part, then sectPr last.
        for child in template_children:
            template_body.remove(child)
        template_body.extend(protected)
        template_body.extend(data)
        if sect_pr is not None:
            template_body.append(sect_pr)

        print(f" [安全保存] 保护部分:{len(protected)}, 数据部分:{len(data)}")

        # 10. Serialize the merged document.
        new_xml = etree.tostring(
            template_tree, xml_declaration=True, encoding='UTF-8',
            standalone=True)

        # 11. Copy the template archive, swapping in the merged document.xml.
        with zipfile.ZipFile(template_path, 'r') as zin:
            with zipfile.ZipFile(temp_result, 'w', zipfile.ZIP_DEFLATED) as zout:
                for item in zin.infolist():
                    if item.filename == 'word/document.xml':
                        zout.writestr(item, new_xml)
                    else:
                        zout.writestr(item, zin.read(item))

        # 12. Move into place in one step; os.replace overwrites an existing
        #     destination, so the output never goes missing in between.
        os.replace(temp_result, output_path)

        print(" [安全保存] ✓ 完成")

    except Exception as e:
        print(f" [安全保存] 错误: {e}")
        traceback.print_exc()
        # Fall back to a plain save of the processed document.
        doc.save(output_path)
    finally:
        # Best-effort cleanup of the temporary files; a file that is already
        # gone or cannot be removed must not mask the real outcome.
        for leftover in (temp_path, temp_result):
            try:
                os.remove(leftover)
            except OSError:
                pass
|