# -*- coding: utf-8 -*-
"""Attachment download and parsing module.

Supports downloading attachments over HTTP and extracting text from PDF and
Word documents.  The third-party dependencies (``requests``, ``pdfplumber``,
``python-docx``) are import-guarded so the module can still be imported when
one of them is missing; the corresponding operation then fails gracefully
(``None`` / empty text), matching the best-effort error handling the methods
already use.
"""
import os
import re
from typing import Optional, Dict, List
from urllib.parse import urljoin

# Optional third-party dependencies: degrade gracefully if unavailable.
try:
    import requests
except ImportError:  # pragma: no cover
    requests = None
try:
    import pdfplumber
except ImportError:  # pragma: no cover
    pdfplumber = None
try:
    from docx import Document
except ImportError:  # pragma: no cover
    Document = None


class AttachmentHandler:
    """Downloads attachments and extracts their text content."""

    def __init__(self, download_dir: str = "attachments"):
        # Directory where downloaded files are stored; created eagerly.
        self.download_dir = download_dir
        os.makedirs(download_dir, exist_ok=True)

    def download(self, url: str, filename: Optional[str] = None) -> Optional[str]:
        """Download an attachment.

        Args:
            url: Attachment URL; only absolute http(s) URLs are accepted.
            filename: File name to save as (optional; derived from the URL
                if omitted).

        Returns:
            Path of the saved file, or ``None`` on any failure.
        """
        try:
            # Reject relative / non-HTTP URLs up front.
            if not url.startswith('http'):
                return None

            if not filename:
                filename = url.split('/')[-1]
            # Strip characters that are invalid in (Windows) file names.
            filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
            filepath = os.path.join(self.download_dir, filename)

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0'
            }
            response = requests.get(url, headers=headers, timeout=60, stream=True)
            response.raise_for_status()

            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)

            # BUG FIX: original printed a literal "(unknown)" placeholder
            # instead of the saved file name.
            print(f" 下载成功: {filename}")
            return filepath
        except Exception as e:
            # Best-effort download: report the error and signal failure.
            print(f" 下载失败: {e}")
            return None

    def extract_pdf_text(self, filepath: str) -> str:
        """Extract the text of every page of a PDF.

        Args:
            filepath: PDF file path.

        Returns:
            Pages joined by blank lines; ``""`` (or partial text) on failure.
        """
        pages: List[str] = []
        try:
            with pdfplumber.open(filepath) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        pages.append(page_text)
        except Exception as e:
            # Keep whatever pages were extracted before the error.
            print(f" PDF解析失败: {e}")
        return "\n\n".join(pages).strip()

    def extract_docx_text(self, filepath: str) -> str:
        """Extract paragraph and table text from a Word document.

        Args:
            filepath: Word file path.  NOTE(review): python-docx cannot read
                legacy ``.doc`` files — those fall into the except branch
                and yield "".

        Returns:
            Extracted text; ``""`` on failure.
        """
        parts: List[str] = []
        try:
            doc = Document(filepath)
            for para in doc.paragraphs:
                parts.append(para.text)
            # Tables: one line per row, cells separated by " | ".
            for table in doc.tables:
                for row in table.rows:
                    parts.append(" | ".join(cell.text.strip() for cell in row.cells))
        except Exception as e:
            print(f" Word解析失败: {e}")
        return "\n".join(parts).strip()

    def extract_text(self, filepath: str) -> str:
        """Dispatch text extraction based on the file extension.

        Args:
            filepath: File path (may be ``None``/empty).

        Returns:
            Extracted text; ``""`` for missing files or unsupported types.
        """
        if not filepath or not os.path.exists(filepath):
            return ""
        ext = os.path.splitext(filepath)[1].lower()
        if ext == '.pdf':
            return self.extract_pdf_text(filepath)
        if ext in ('.doc', '.docx'):
            return self.extract_docx_text(filepath)
        return ""

    def download_and_extract(self, url: str, filename: Optional[str] = None) -> Dict:
        """Download an attachment and extract its text content.

        Args:
            url: Attachment URL.
            filename: File name to save as (optional).

        Returns:
            Dict with keys ``url``, ``filepath``, ``text`` and ``success``.
        """
        result = {
            "url": url,
            "filepath": None,
            "text": "",
            "success": False,
        }
        filepath = self.download(url, filename)
        if filepath:
            result["filepath"] = filepath
            result["text"] = self.extract_text(filepath)
            result["success"] = True
        return result


def find_attachments(page) -> List[Dict]:
    """Find PDF/Word attachment links on a page.

    Args:
        page: DrissionPage page object (duck-typed: needs ``.eles``, ``.url``
            and link elements with ``.attr`` / ``.text``).

    Returns:
        Attachment info list: ``[{"name": ..., "url": ..., "type": ...}, ...]``.
    """
    attachments: List[Dict] = []
    # BUG FIX: the substring selector 'a[href*=".doc"]' also matches every
    # ".docx" link, so Word attachments used to be appended twice; dedup by
    # resolved URL.
    seen = set()

    def _collect(selector: str, ftype: str) -> None:
        # Gather links matching *selector*, resolving relative URLs against
        # the page URL (urljoin handles both "/abs" and "rel" forms, which
        # the original only partially did).
        for link in page.eles(selector):
            href = link.attr('href') or ''
            if not href:
                continue
            href = urljoin(page.url, href)
            if href in seen:
                continue
            seen.add(href)
            name = link.text.strip() or href.split('/')[-1]
            attachments.append({"name": name, "url": href, "type": ftype})

    _collect('css:a[href*=".pdf"]', "pdf")
    # Matches both .doc and .docx (substring match).
    _collect('css:a[href*=".doc"]', "docx")
    return attachments