清理已删除的测试文件,准备云端部署

This commit is contained in:
ztb-system
2026-02-25 18:17:00 +08:00
parent 5f93dbe5e4
commit 305f6b342c
29 changed files with 143 additions and 2536 deletions

View File

@@ -6,6 +6,7 @@ import csv
import logging
import os
import random
import re
import signal
import sys
import time
@@ -181,6 +182,37 @@ class BaseSpider(ABC):
logger.info(f"[统计] 总请求: {self._total_requests}, "
f"耗时: {elapsed:.0f}s, 速率: {rpm:.1f}次/分钟")
# ---------- 标题解析(统一规则) ----------
@staticmethod
def _parse_title(title: str) -> dict:
"""从标题中提取项目名称和批准文号(统一规则)"""
result = {}
# 统一正则:前缀可选,贪婪匹配项目名称,提取尾部批准文号
title_pattern = r"(?:\[(?:招标文件|招标公告)\])?\s*(.*)\s*\[([A-Z0-9]+)\]\s*$"
match = re.search(title_pattern, title)
if match:
project_name = match.group(1).strip()
result["项目批准文号"] = match.group(2).strip()
else:
project_name = title
# 尝试从标题尾部提取批准文号
number_pattern = r"\[([A-Z0-9]+)\]\s*$"
match = re.search(number_pattern, project_name)
if match:
result["项目批准文号"] = match.group(1).strip()
project_name = project_name[:match.start()].strip()
# 清理项目名称后缀
suffixes = ["招标文件公示", "招标文件预公示", "招标公告", "招标预公告"]
for suffix in suffixes:
if project_name.endswith(suffix):
project_name = project_name[:-len(suffix)].strip()
result["项目名称"] = project_name
return result
# ---------- 去重 ----------
def is_duplicate(self, url: str) -> bool: