Initial commit: 招标信息爬虫与分析系统
This commit is contained in:
195
utils/attachment.py
Normal file
195
utils/attachment.py
Normal file
@@ -0,0 +1,195 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
附件下载和解析模块
|
||||
支持PDF和Word文档
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
import requests
|
||||
import pdfplumber
|
||||
from docx import Document
|
||||
from typing import Optional, Dict, List
|
||||
|
||||
|
||||
class AttachmentHandler:
    """Downloads tender attachments and extracts their text content.

    Supports PDF files (via pdfplumber) and Word documents (via
    python-docx).  Downloaded files are saved under ``download_dir``.
    All failures are best-effort: errors are reported to stdout and an
    empty/None result is returned instead of raising, so one bad
    attachment cannot abort a crawl.
    """

    def __init__(self, download_dir: str = "attachments"):
        # Directory that receives downloaded files; created eagerly so
        # download() can write without checking.
        self.download_dir = download_dir
        os.makedirs(download_dir, exist_ok=True)

    def download(self, url: str, filename: Optional[str] = None) -> Optional[str]:
        """Download an attachment into ``download_dir``.

        Args:
            url: Absolute http(s) URL of the attachment.
            filename: Name to save the file as; derived from the URL's
                last path segment when omitted.

        Returns:
            Path of the saved file, or None on failure (non-http URL,
            network/HTTP error, I/O error).
        """
        try:
            # Only absolute http(s) URLs are supported; relative links
            # must be resolved by the caller (see find_attachments).
            if not url.startswith('http'):
                return None

            if not filename:
                filename = url.split('/')[-1]
            # Replace characters that are illegal in Windows filenames.
            filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
            if not filename:
                # URL ended with '/' -> empty name; use a safe fallback
                # so we don't try to write to the directory path itself.
                filename = 'attachment'

            filepath = os.path.join(self.download_dir, filename)

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0'
            }
            response = requests.get(url, headers=headers, timeout=60, stream=True)
            response.raise_for_status()

            # Stream to disk in 8 KiB chunks to bound memory usage on
            # large attachments.
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

            print(f" 下载成功: {filename}")
            return filepath

        except Exception as e:
            # Best-effort: report and signal failure rather than crash.
            print(f" 下载失败: {e}")
            return None

    def extract_pdf_text(self, filepath: str) -> str:
        """Extract text from a PDF file.

        Args:
            filepath: Path to the PDF file.

        Returns:
            Page texts joined by blank lines; "" if nothing could be
            extracted (image-only PDF, corrupt file, ...).
        """
        pages = []
        try:
            with pdfplumber.open(filepath) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    # extract_text() returns None for image-only pages.
                    if page_text:
                        pages.append(page_text)
        except Exception as e:
            print(f" PDF解析失败: {e}")
        return "\n\n".join(pages).strip()

    def extract_docx_text(self, filepath: str) -> str:
        """Extract text from a Word document.

        NOTE(review): python-docx only understands .docx; legacy binary
        .doc files raise inside Document() and fall through to the
        error branch, yielding "".

        Args:
            filepath: Path to the Word file.

        Returns:
            Paragraph texts followed by table rows (cells joined with
            " | "), one per line; "" on failure.
        """
        lines = []
        try:
            doc = Document(filepath)
            for para in doc.paragraphs:
                lines.append(para.text)

            # Tables are stored separately from paragraphs; flatten each
            # row into a single pipe-separated line.
            for table in doc.tables:
                for row in table.rows:
                    lines.append(" | ".join(cell.text.strip() for cell in row.cells))
        except Exception as e:
            print(f" Word解析失败: {e}")
        return "\n".join(lines).strip()

    def extract_text(self, filepath: str) -> str:
        """Extract text from a file, dispatching on its extension.

        Args:
            filepath: Path to the file (may be None/empty).

        Returns:
            Extracted text, or "" for missing files and unsupported
            extensions.
        """
        if not filepath or not os.path.exists(filepath):
            return ""

        ext = os.path.splitext(filepath)[1].lower()

        if ext == '.pdf':
            return self.extract_pdf_text(filepath)
        elif ext in ['.doc', '.docx']:
            return self.extract_docx_text(filepath)
        else:
            return ""

    def download_and_extract(self, url: str, filename: Optional[str] = None) -> Dict:
        """Download an attachment and extract its text in one step.

        Args:
            url: Attachment URL.
            filename: Optional name to save the file as.

        Returns:
            Dict with keys "url", "filepath" (None on failure), "text"
            ("" on failure) and "success" (True only if the download
            itself succeeded — text may still be empty).
        """
        result = {
            "url": url,
            "filepath": None,
            "text": "",
            "success": False
        }

        filepath = self.download(url, filename)
        if filepath:
            result["filepath"] = filepath
            result["text"] = self.extract_text(filepath)
            result["success"] = True

        return result
|
||||
|
||||
|
||||
def find_attachments(page) -> List[Dict]:
    """Collect attachment links (PDF / Word) from a listing/detail page.

    Args:
        page: DrissionPage page object — needs ``eles()``, ``url`` and
            per-element ``attr()`` / ``text``.

    Returns:
        Attachment info list: [{"name": ..., "url": ..., "type": ...}, ...].
        Root-relative hrefs are resolved against the page's scheme+host;
        results are de-duplicated by resolved URL.
    """
    attachments = []
    seen = set()

    # NOTE: the substring selector 'a[href*=".doc"]' also matches every
    # ".docx" link, so the selectors below can yield the same element
    # more than once — hence the de-duplication by URL.
    selectors = [
        ('css:a[href*=".pdf"]', "pdf"),
        ('css:a[href*=".doc"]', "docx"),
        ('css:a[href*=".docx"]', "docx"),
    ]

    for sel, ftype in selectors:
        for link in page.eles(sel):
            href = link.attr('href') or ''
            if not href:
                continue
            # Resolve root-relative paths against scheme://host.
            if href.startswith('/'):
                base_url = '/'.join(page.url.split('/')[:3])
                href = base_url + href
            if href in seen:
                continue
            seen.add(href)
            # Fall back to the last URL segment when the link has no text.
            name = link.text.strip() or href.split('/')[-1]
            attachments.append({"name": name, "url": href, "type": ftype})

    return attachments
|
||||
Reference in New Issue
Block a user