# -*- coding: utf-8 -*-
"""
附件下载和解析模块

支持PDF和Word文档
"""
import os
import re
import requests
import pdfplumber
from docx import Document
from typing import Optional, Dict, List


class AttachmentHandler:
    """Downloads attachments and extracts text from PDF / Word documents."""

    def __init__(self, download_dir: str = "attachments"):
        # Directory where downloaded files are stored; created eagerly so
        # download() can write without checking.
        self.download_dir = download_dir
        os.makedirs(download_dir, exist_ok=True)

    def download(self, url: str, filename: Optional[str] = None) -> Optional[str]:
        """
        Download an attachment to the download directory.

        Args:
            url: Attachment URL; only absolute http(s) URLs are accepted.
            filename: Name to save the file under (optional; derived from
                the last URL path segment when omitted).

        Returns:
            Path of the saved file, or None on failure.
        """
        try:
            # Reject anything that is not an absolute http(s) URL.
            if not url.startswith('http'):
                return None

            # Derive a file name from the URL when none was given.
            if not filename:
                filename = url.split('/')[-1]
                # Strip characters that are invalid in file names.
                filename = re.sub(r'[<>:"/\\|?*]', '_', filename)

            filepath = os.path.join(self.download_dir, filename)

            # Stream the response to disk in chunks to keep memory flat.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0'
            }
            response = requests.get(url, headers=headers, timeout=60, stream=True)
            response.raise_for_status()

            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

            # BUG FIX: the success message previously printed a placeholder
            # instead of the saved file name.
            print(f" 下载成功: {filename}")
            return filepath

        except Exception as e:
            # Best-effort download: log the error, signal failure via None.
            print(f" 下载失败: {e}")
            return None

    def extract_pdf_text(self, filepath: str) -> str:
        """
        Extract text content from a PDF file.

        Args:
            filepath: Path to the PDF file.

        Returns:
            Extracted text, pages separated by blank lines; "" when nothing
            could be read.
        """
        pages = []
        try:
            with pdfplumber.open(filepath) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        pages.append(page_text)
        except Exception as e:
            print(f" PDF解析失败: {e}")
        # join is O(n); repeated `text +=` is quadratic on large documents.
        return "\n\n".join(pages).strip()

    def extract_docx_text(self, filepath: str) -> str:
        """
        Extract text content from a Word document.

        NOTE(review): python-docx only reads .docx; legacy .doc files raise
        inside the try block and yield "".

        Args:
            filepath: Path to the Word file.

        Returns:
            Extracted text: one line per paragraph, then one line per table
            row with cells joined by " | "; "" on failure.
        """
        parts = []
        try:
            doc = Document(filepath)
            for para in doc.paragraphs:
                parts.append(para.text)

            # Tables: one line per row, cells separated by " | ".
            for table in doc.tables:
                for row in table.rows:
                    parts.append(" | ".join(cell.text.strip() for cell in row.cells))
        except Exception as e:
            print(f" Word解析失败: {e}")
        return "\n".join(parts).strip()

    def extract_text(self, filepath: str) -> str:
        """
        Dispatch text extraction based on the file extension.

        Args:
            filepath: Path of the downloaded file.

        Returns:
            Extracted text; "" for missing files or unsupported types.
        """
        if not filepath or not os.path.exists(filepath):
            return ""

        ext = os.path.splitext(filepath)[1].lower()

        if ext == '.pdf':
            return self.extract_pdf_text(filepath)
        if ext in ('.doc', '.docx'):
            # .doc is routed here for best effort, but python-docx cannot
            # parse legacy .doc — extract_docx_text then returns "".
            return self.extract_docx_text(filepath)
        return ""

    def download_and_extract(self, url: str, filename: Optional[str] = None) -> Dict:
        """
        Download an attachment and extract its text content in one step.

        Args:
            url: Attachment URL.
            filename: Name to save the file under (optional).

        Returns:
            Dict with keys "url", "filepath", "text", "success".
        """
        result = {
            "url": url,
            "filepath": None,
            "text": "",
            "success": False,
        }

        filepath = self.download(url, filename)
        if filepath:
            result["filepath"] = filepath
            result["text"] = self.extract_text(filepath)
            result["success"] = True

        return result


def find_attachments(page) -> List[Dict]:
    """
    Find attachment links (PDF / Word) on a page.

    Args:
        page: DrissionPage page object — only `.eles(selector)`, `.url`, and
            element `.attr('href')` / `.text` are used.

    Returns:
        Attachment info list: [{"name": ..., "url": ..., "type": ...}, ...]
    """
    attachments: List[Dict] = []
    seen_urls = set()

    def _absolutize(href: str) -> str:
        # Only root-relative paths ("/x/y.pdf") are resolved against the
        # page origin, matching the original behavior; other relative forms
        # are left untouched.
        if href.startswith('/'):
            base_url = '/'.join(page.url.split('/')[:3])
            href = base_url + href
        return href

    def _collect(selector: str, file_type: str) -> None:
        # Gather links for one CSS selector, skipping blanks and duplicates.
        for link in page.eles(selector):
            href = link.attr('href') or ''
            if not href:
                continue
            href = _absolutize(href)
            # BUG FIX: 'a[href*=".doc"]' also matches .docx links, so every
            # .docx link used to be appended twice — dedupe by resolved URL.
            if href in seen_urls:
                continue
            seen_urls.add(href)
            name = link.text.strip() or href.split('/')[-1]
            attachments.append({"name": name, "url": href, "type": file_type})

    _collect('css:a[href*=".pdf"]', "pdf")
    for sel in ('css:a[href*=".doc"]', 'css:a[href*=".docx"]'):
        _collect(sel, "docx")

    return attachments
|