清理已删除的测试文件,准备云端部署
This commit is contained in:
@@ -6,6 +6,7 @@ import csv
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
@@ -181,6 +182,37 @@ class BaseSpider(ABC):
|
||||
logger.info(f"[统计] 总请求: {self._total_requests}, "
|
||||
f"耗时: {elapsed:.0f}s, 速率: {rpm:.1f}次/分钟")
|
||||
|
||||
# ---------- 标题解析(统一规则) ----------
|
||||
|
||||
@staticmethod
def _parse_title(title: str) -> dict:
    """Extract the project name and approval number from a listing title.

    The same rules are applied to every title, in this order:

    1. Surrounding whitespace is dropped.
    2. A trailing ``[XXXX]`` token made only of upper-case letters and
       digits is taken as the approval number and removed from the name.
    3. A leading ``[招标文件]`` / ``[招标公告]`` tag is removed.
    4. Known announcement suffixes are removed from the end of the name.

    Args:
        title: Raw title text. ``None`` or an empty string is treated as
            an empty title.

    Returns:
        A dict that always contains ``"项目名称"`` (possibly an empty
        string) and contains ``"项目批准文号"`` only when a trailing
        approval number was found.
    """
    result = {}
    # Normalise once up front so every later step sees the same text; the
    # previous version only stripped when an approval number was present,
    # so trailing whitespace blocked suffix cleanup for plain titles.
    name = (title or "").strip()

    # The approval number is the bracketed token anchored at the very end.
    number_match = re.search(r"\[([A-Z0-9]+)\]\s*$", name)
    if number_match:
        result["项目批准文号"] = number_match.group(1)
        name = name[:number_match.start()]

    # Remove the leading document-type tag whether or not an approval number
    # was present, so both kinds of title yield the same project name.
    name = re.sub(r"^\[(?:招标文件|招标公告)\]\s*", "", name).strip()

    # Strip announcement suffixes. Each suffix is tested in list order
    # against the current remainder, so stacked suffixes are peeled off too.
    suffixes = ("招标文件公示", "招标文件预公示", "招标公告", "招标预公告")
    for suffix in suffixes:
        if name.endswith(suffix):
            name = name[:-len(suffix)].strip()

    result["项目名称"] = name
    return result
|
||||
|
||||
# ---------- 去重 ----------
|
||||
|
||||
def is_duplicate(self, url: str) -> bool:
|
||||
|
||||
Reference in New Issue
Block a user