196 lines
5.6 KiB
Python
196 lines
5.6 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
附件下载和解析模块
|
|||
|
|
支持PDF和Word文档
|
|||
|
|
"""
|
|||
|
|
import os
|
|||
|
|
import re
|
|||
|
|
import requests
|
|||
|
|
import pdfplumber
|
|||
|
|
from docx import Document
|
|||
|
|
from typing import Optional, Dict, List
|
|||
|
|
|
|||
|
|
|
|||
|
|
class AttachmentHandler:
    """Download attachments and extract their text (PDF and Word documents).

    Files are stored under ``download_dir``. Failures are reported with
    ``print`` and signalled through ``None`` / empty-string return values
    instead of raised exceptions, so callers can treat every step as
    best-effort.
    """

    def __init__(self, download_dir: str = "attachments"):
        """Remember the download directory and create it if it is missing.

        Args:
            download_dir: Directory in which downloaded files are saved.
        """
        self.download_dir = download_dir
        os.makedirs(download_dir, exist_ok=True)

    @staticmethod
    def _resolve_filename(url: str, filename: Optional[str]) -> str:
        """Return a filesystem-safe file name for the attachment at ``url``."""
        from urllib.parse import urlsplit

        if not filename:
            # Take the last segment of the URL *path* only, so a query string
            # or fragment ("a.pdf?token=1") cannot end up inside the file
            # extension and defeat the type detection in extract_text().
            filename = urlsplit(url).path.split('/')[-1]
        # Replace characters that are illegal in file names or that could
        # act as path separators.
        return re.sub(r'[<>:"/\\|?*]', '_', filename)

    def download(self, url: str, filename: Optional[str] = None) -> Optional[str]:
        """Download an attachment into ``self.download_dir``.

        Args:
            url: Attachment URL; only values starting with ``http`` are
                attempted.
            filename: File name to save as (optional). When omitted, the
                last path segment of ``url`` is used.

        Returns:
            Path of the saved file, or ``None`` if the download failed.
        """
        try:
            if not url.startswith('http'):
                return None

            filename = self._resolve_filename(url, filename)
            filepath = os.path.join(self.download_dir, filename)
            # Stream into a sibling temporary file so an interrupted transfer
            # never leaves a truncated document at the final path.
            partial_path = filepath + ".part"

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0'
            }
            # The context manager releases the streamed connection even when
            # raise_for_status() or the file write fails.
            with requests.get(url, headers=headers, timeout=60, stream=True) as response:
                response.raise_for_status()
                try:
                    with open(partial_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:  # skip keep-alive chunks
                                f.write(chunk)
                    os.replace(partial_path, filepath)
                except BaseException:
                    if os.path.isfile(partial_path):
                        try:
                            os.remove(partial_path)
                        except OSError:
                            pass  # cleanup is best-effort; report the real error
                    raise

            print(f" 下载成功: {filename}")
            return filepath

        except Exception as e:
            print(f" 下载失败: {e}")
            return None

    def extract_pdf_text(self, filepath: str) -> str:
        """Extract the text content of a PDF file.

        Args:
            filepath: Path of the PDF file.

        Returns:
            Text of all pages that yielded text, separated by blank lines.
            If parsing fails part-way, the text collected so far is returned.
        """
        page_texts = []
        try:
            with pdfplumber.open(filepath) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        page_texts.append(page_text)
        except Exception as e:
            print(f" PDF解析失败: {e}")
        return "\n\n".join(page_texts).strip()

    def extract_docx_text(self, filepath: str) -> str:
        """Extract the text content of a Word document.

        Args:
            filepath: Path of the Word file.

        Returns:
            Paragraph text followed by table rows (cells joined with
            ``" | "``), one entry per line. If parsing fails part-way, the
            text collected so far is returned.
        """
        lines = []
        try:
            doc = Document(filepath)
            lines.extend(para.text for para in doc.paragraphs)

            # Table content is appended after the paragraphs, one row per line.
            for table in doc.tables:
                for row in table.rows:
                    lines.append(" | ".join(cell.text.strip() for cell in row.cells))
        except Exception as e:
            print(f" Word解析失败: {e}")
        return "\n".join(lines).strip()

    def extract_text(self, filepath: str) -> str:
        """Extract text from a file, choosing the parser by file extension.

        Args:
            filepath: Path of the file.

        Returns:
            Extracted text, or ``""`` when the path is empty, the file does
            not exist, or the extension is not ``.pdf`` / ``.doc`` / ``.docx``.
        """
        if not filepath or not os.path.exists(filepath):
            return ""

        ext = os.path.splitext(filepath)[1].lower()
        if ext == '.pdf':
            return self.extract_pdf_text(filepath)
        if ext in ('.doc', '.docx'):
            # NOTE(review): python-docx targets .docx; a legacy binary .doc
            # will most likely fail to parse and yield "" - confirm.
            return self.extract_docx_text(filepath)
        return ""

    def download_and_extract(self, url: str, filename: Optional[str] = None) -> Dict:
        """Download an attachment and extract its text.

        Args:
            url: Attachment URL.
            filename: File name to save as (optional).

        Returns:
            Dict with keys ``url``, ``filepath``, ``text`` and ``success``.
            ``success`` reflects the download only; ``text`` may still be
            empty when nothing could be extracted.
        """
        result = {
            "url": url,
            "filepath": None,
            "text": "",
            "success": False
        }

        filepath = self.download(url, filename)
        if filepath:
            result["filepath"] = filepath
            result["text"] = self.extract_text(filepath)
            result["success"] = True

        return result
|
|||
|
|
|
|||
|
|
|
|||
|
|
def find_attachments(page) -> List[Dict]:
    """Find PDF and Word attachment links on a page.

    Args:
        page: DrissionPage page object; only ``page.eles(selector)`` and
            ``page.url`` are used, plus ``attr('href')`` and ``text`` on
            each returned element.

    Returns:
        List of attachment records
        ``[{"name": <file name>, "url": <download link>, "type": "pdf" | "docx"}, ...]``.
        PDF links come first, then Word links. Each URL appears once.
    """
    from urllib.parse import urljoin

    # (selector, type label). The ".doc" substring also matches ".docx"
    # hrefs, so one selector covers both Word formats; querying ".docx"
    # separately would report every such link twice.
    link_queries = (
        ('css:a[href*=".pdf"]', "pdf"),
        ('css:a[href*=".doc"]', "docx"),
    )

    attachments = []
    seen_urls = set()
    for selector, kind in link_queries:
        for link in page.eles(selector):
            href = link.attr('href') or ''
            if not href:
                continue
            # Prefer the visible link text; fall back to the last URL segment.
            name = link.text.strip() or href.split('/')[-1]
            # Resolve root-relative, document-relative and protocol-relative
            # hrefs against the page URL; absolute URLs are kept.
            url = urljoin(page.url, href)
            if url in seen_urls:
                continue
            seen_urls.add(url)
            attachments.append({"name": name, "url": url, "type": kind})

    return attachments
|