Initial commit: 招标信息爬虫与分析系统

This commit is contained in:
ztb-system
2026-02-13 18:15:20 +08:00
commit d2fa06801f
38 changed files with 5415 additions and 0 deletions

195
utils/attachment.py Normal file
View File

@@ -0,0 +1,195 @@
# -*- coding: utf-8 -*-
"""
Attachment download and parsing module.
Supports PDF and Word documents.
"""
import os
import re
import requests
import pdfplumber
from docx import Document
from typing import Optional, Dict, List
class AttachmentHandler:
    """Download attachments and extract their text content (PDF / Word)."""

    def __init__(self, download_dir: str = "attachments"):
        # Directory downloads are saved into; created eagerly so download()
        # can assume it exists.
        self.download_dir = download_dir
        os.makedirs(download_dir, exist_ok=True)

    def download(self, url: str, filename: Optional[str] = None) -> Optional[str]:
        """Download an attachment to ``self.download_dir``.

        Args:
            url: Attachment URL; must be absolute (start with "http").
            filename: Optional name to save as; derived from the last URL
                segment when omitted.

        Returns:
            Path of the saved file, or None on any failure.
        """
        try:
            # Only absolute http(s) URLs are supported; relative links must
            # be resolved by the caller (see find_attachments).
            if not url.startswith('http'):
                return None
            if not filename:
                filename = url.split('/')[-1]
            # Strip characters that are illegal in (Windows) file names.
            filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
            filepath = os.path.join(self.download_dir, filename)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0'
            }
            response = requests.get(url, headers=headers, timeout=60, stream=True)
            response.raise_for_status()
            # Stream to disk in 8 KiB chunks so large attachments are not
            # held in memory.
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            # Fix: the f-string previously interpolated nothing; log the
            # actual saved file name.
            print(f" 下载成功: {filename}")
            return filepath
        except Exception as e:
            # Best-effort: log and return None so callers can skip this
            # attachment instead of aborting the crawl.
            print(f" 下载失败: {e}")
            return None

    def extract_pdf_text(self, filepath: str) -> str:
        """Extract text from a PDF file.

        Args:
            filepath: Path to the PDF file.

        Returns:
            Extracted text ("" if nothing could be extracted).
        """
        pages: List[str] = []
        try:
            with pdfplumber.open(filepath) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    # extract_text() returns None for image-only pages.
                    if page_text:
                        pages.append(page_text)
        except Exception as e:
            print(f" PDF解析失败: {e}")
        # Join once instead of repeated string concatenation.
        return "\n\n".join(pages).strip()

    def extract_docx_text(self, filepath: str) -> str:
        """Extract text from a Word document (paragraphs and tables).

        NOTE(review): python-docx only reads .docx (OOXML); a legacy binary
        .doc file will raise here and yield "" via the except branch.

        Args:
            filepath: Path to the Word file.

        Returns:
            Extracted text ("" on parse failure).
        """
        parts: List[str] = []
        try:
            doc = Document(filepath)
            for para in doc.paragraphs:
                parts.append(para.text)
            # Tables are flattened row by row, cells joined with " | ".
            for table in doc.tables:
                for row in table.rows:
                    parts.append(" | ".join(cell.text.strip() for cell in row.cells))
        except Exception as e:
            print(f" Word解析失败: {e}")
        return "\n".join(parts).strip()

    def extract_text(self, filepath: str) -> str:
        """Dispatch text extraction by file extension.

        Args:
            filepath: Path to the file (may be None/empty).

        Returns:
            Extracted text; "" for missing files or unsupported types.
        """
        if not filepath or not os.path.exists(filepath):
            return ""
        ext = os.path.splitext(filepath)[1].lower()
        if ext == '.pdf':
            return self.extract_pdf_text(filepath)
        elif ext in ('.doc', '.docx'):
            return self.extract_docx_text(filepath)
        else:
            return ""

    def download_and_extract(self, url: str, filename: Optional[str] = None) -> Dict:
        """Download an attachment and extract its text in one step.

        Args:
            url: Attachment URL.
            filename: Optional name to save as.

        Returns:
            Dict with keys "url", "filepath", "text", "success".
            "success" is True only when the download itself succeeded
            (text may still be "" for unparseable files).
        """
        result = {
            "url": url,
            "filepath": None,
            "text": "",
            "success": False,
        }
        filepath = self.download(url, filename)
        if filepath:
            result["filepath"] = filepath
            result["text"] = self.extract_text(filepath)
            result["success"] = True
        return result
def find_attachments(page) -> List[Dict]:
    """Collect attachment links (PDF / Word) from a page.

    Args:
        page: DrissionPage page object; needs ``eles()``, ``url``, and
            elements exposing ``attr()`` / ``text``.

    Returns:
        Attachment list: [{"name": ..., "url": ..., "type": ...}, ...]

    Fix: the substring selector ``a[href*=".doc"]`` also matches ``.docx``
    links, so every .docx attachment used to be appended twice. Results are
    now de-duplicated by resolved URL; the duplicated pdf/doc loops are
    merged into one table-driven loop.
    """
    attachments: List[Dict] = []
    seen = set()  # resolved URLs already collected
    # (selector, type label) pairs; iteration order fixes result order.
    selector_types = [
        ('css:a[href*=".pdf"]', "pdf"),
        ('css:a[href*=".doc"]', "docx"),
        ('css:a[href*=".docx"]', "docx"),
    ]
    for selector, file_type in selector_types:
        for link in page.eles(selector):
            href = link.attr('href') or ''
            if not href:
                continue
            # Resolve root-relative paths against the page's scheme + host.
            if href.startswith('/'):
                base_url = '/'.join(page.url.split('/')[:3])
                href = base_url + href
            if href in seen:
                continue
            seen.add(href)
            # Fall back to the last URL segment when the anchor has no text.
            name = link.text.strip() or href.split('/')[-1]
            attachments.append({"name": name, "url": href, "type": file_type})
    return attachments