Files
ztb/utils/attachment.py
2026-02-13 18:15:20 +08:00

196 lines
5.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""
附件下载和解析模块
支持PDF和Word文档
"""
import os
import re
import requests
import pdfplumber
from docx import Document
from typing import Optional, Dict, List
class AttachmentHandler:
    """Downloads attachments and extracts text from PDF and Word documents."""

    def __init__(self, download_dir: str = "attachments"):
        # Directory for saved attachments; created up front so download() can write.
        self.download_dir = download_dir
        os.makedirs(download_dir, exist_ok=True)

    def download(self, url: str, filename: Optional[str] = None) -> Optional[str]:
        """
        Download an attachment into ``self.download_dir``.

        Args:
            url: Attachment URL; only absolute http(s) URLs are accepted.
            filename: File name to save as (optional; derived from the URL if omitted).

        Returns:
            Path of the saved file, or None on any failure.
        """
        try:
            # Reject relative or non-http(s) URLs instead of guessing a base.
            if not url.startswith('http'):
                return None
            if not filename:
                # Last path segment of the URL, with any query string dropped.
                filename = url.split('/')[-1].split('?')[0]
            # Replace characters that are illegal in file names.
            filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
            if not filename:
                # URL ended with '/'; fall back to a generic name.
                filename = 'attachment'
            filepath = os.path.join(self.download_dir, filename)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0'
            }
            # Stream to disk so large files are never fully buffered in memory;
            # the context manager guarantees the connection is released.
            with requests.get(url, headers=headers, timeout=60, stream=True) as response:
                response.raise_for_status()
                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            print(f" 下载成功: {filename}")
            return filepath
        except Exception as e:
            # Best-effort: log and signal failure to the caller instead of raising.
            print(f" 下载失败: {e}")
            return None

    def extract_pdf_text(self, filepath: str) -> str:
        """
        Extract text content from a PDF file.

        Args:
            filepath: Path to the PDF file.

        Returns:
            Extracted text; '' when nothing could be extracted. Pages already
            read before a parse error are still returned.
        """
        pages: List[str] = []
        try:
            with pdfplumber.open(filepath) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        pages.append(page_text)
        except Exception as e:
            print(f" PDF解析失败: {e}")
        return "\n\n".join(pages).strip()

    def extract_docx_text(self, filepath: str) -> str:
        """
        Extract text content from a Word document, including table cells.

        Args:
            filepath: Path to the Word file.

        Returns:
            Extracted text; '' when parsing fails.
        """
        parts: List[str] = []
        try:
            doc = Document(filepath)
            for para in doc.paragraphs:
                parts.append(para.text)
            # Tables: one line per row, cells joined by ' | '.
            for table in doc.tables:
                for row in table.rows:
                    parts.append(" | ".join(cell.text.strip() for cell in row.cells))
        except Exception as e:
            print(f" Word解析失败: {e}")
        return "\n".join(parts).strip()

    def extract_text(self, filepath: str) -> str:
        """
        Dispatch text extraction based on file extension.

        Args:
            filepath: Path to the downloaded file.

        Returns:
            Extracted text, or '' for missing files / unsupported types.
        """
        if not filepath or not os.path.exists(filepath):
            return ""
        ext = os.path.splitext(filepath)[1].lower()
        if ext == '.pdf':
            return self.extract_pdf_text(filepath)
        # NOTE(review): python-docx handles .docx only; a legacy .doc file will
        # raise inside extract_docx_text and come back as '' — confirm callers
        # don't rely on .doc support.
        elif ext in ('.doc', '.docx'):
            return self.extract_docx_text(filepath)
        else:
            return ""

    def download_and_extract(self, url: str, filename: Optional[str] = None) -> Dict:
        """
        Download an attachment and extract its text in one step.

        Args:
            url: Attachment URL.
            filename: File name to save as (optional).

        Returns:
            Dict with keys "url", "filepath", "text", "success".
            "success" reflects the download only; "text" may still be ''.
        """
        result: Dict = {
            "url": url,
            "filepath": None,
            "text": "",
            "success": False,
        }
        filepath = self.download(url, filename)
        if filepath:
            result["filepath"] = filepath
            result["text"] = self.extract_text(filepath)
            result["success"] = True
        return result
def find_attachments(page) -> List[Dict]:
    """
    Find attachment links (PDF / Word) on a page.

    Args:
        page: DrissionPage page object; must provide ``eles(selector)`` and ``url``.

    Returns:
        Attachment info list: [{"name": ..., "url": ..., "type": ...}, ...]
        Each distinct href appears at most once.
    """
    attachments: List[Dict] = []
    # The selector a[href*=".doc"] also matches every ".docx" link, so without
    # dedup each .docx attachment would be appended twice. Track raw hrefs.
    seen = set()

    def _collect(selector: str, ftype: str) -> None:
        # Append one entry per new link matching `selector`, tagged `ftype`.
        for link in page.eles(selector):
            href = link.attr('href') or ''
            if not href or href in seen:
                continue
            seen.add(href)
            name = link.text.strip() or href.split('/')[-1]
            if href.startswith('/'):
                # Resolve a site-relative path against scheme://host.
                base_url = '/'.join(page.url.split('/')[:3])
                href = base_url + href
            attachments.append({"name": name, "url": href, "type": ftype})

    _collect('css:a[href*=".pdf"]', "pdf")
    for sel in ('css:a[href*=".doc"]', 'css:a[href*=".docx"]'):
        _collect(sel, "docx")
    return attachments