fix: 发布时间从详情页内容提取，避免pipeline二次请求失败

2026-03-03 17:08:01 +08:00
parent bf9be95ae8
commit e8c901b273
4 changed files with 42 additions and 4 deletions
--- a/processors/pipeline.py
+++ b/processors/pipeline.py
@@ -149,10 +149,10 @@ class ProcessingPipeline:

        # 基础字段映射：优先使用项目名称（已处理掉批准文号的名称）
        record["名称"] = item.get("项目名称", item.get("标题", ""))
-        pub_date = item.get("发布日期", item.get("项目发布时间", ""))
-        record["发布时间"] = pub_date
-        # 项目发布时间修复：使用与发布时间相同的值，确保格式一致
-        record["项目发布时间"] = pub_date  # 台州招标计划 JDY 使用此字段名
+        # Prefer the detail-page publish time (has h:m:s); fall back to the list-page date fields.
+        pub_time = item.get("详情页发布时间") or item.get("发布日期") or item.get("项目发布时间") or ""
+        record["发布时间"] = pub_time
+        record["项目发布时间"] = pub_time
        record["地区"] = item.get("地区", "")
        record["招标阶段"] = item.get("公告类型", notice_type)
        record["来源"] = item.get("来源", "")
--- a/spiders/base.py
+++ b/spiders/base.py
@@ -213,6 +213,32 @@ class BaseSpider(ABC):
        result["项目名称"] = project_name
        return result

+    # ---------- 发布时间提取 ----------
+
+    @staticmethod
+    def _extract_publish_time(soup, page_text: str) -> str:
+        """Return the publish time found on a detail page, or "" when absent.
+
+        Labelled values in ``page_text`` are tried first ("信息发布时间", then
+        "发布时间", "发布日期"), full ``date time`` before bare date; then tags in
+        ``soup`` (may be None) whose class mentions time/date/publish. Shapes are
+        strict so "-" or digits on the next line are never returned as a time.
+        """
+        day = r'(?<!\d)\d{4}-\d{1,2}-\d{1,2}'
+        stamp = day + r'\s+\d{1,2}:\d{2}(?::\d{2})?'
+        labelled = [('信息发布时间', stamp), ('发布时间', stamp), ('发布日期', stamp),
+                    ('发布时间', day), ('发布日期', day)]
+        for label, shape in labelled:
+            match = re.search(label + r'[:：]\s*(' + shape + r')(?!\d)', page_text or "")
+            if match:
+                return " ".join(match.group(1).split())
+        tags = [] if soup is None else soup.find_all(['time', 'span', 'div'], class_=re.compile(r'time|date|publish', re.I))
+        for tag in tags:
+            match = re.search(r'(' + stamp + r')(?!\d)', tag.get_text(strip=True))
+            if match:
+                return " ".join(match.group(1).split())
+        return ""
+
    # ---------- 去重 ----------

    def is_duplicate(self, url: str) -> bool:
--- a/spiders/taizhou.py
+++ b/spiders/taizhou.py
@@ -135,6 +135,12 @@ class TaizhouSpider(BaseSpider):
        detail = {}
        soup = BeautifulSoup(resp.text, "html.parser")

+        # 提取发布时间（从页面内容中获取，含时分秒）
+        page_text = soup.get_text(separator="\n", strip=True)
+        publish_time = self._extract_publish_time(soup, page_text)
+        if publish_time:
+            detail["详情页发布时间"] = publish_time
+
        # 解析表格字段
        field_map = {
            "项目名称": "项目名称",
--- a/spiders/zhejiang.py
+++ b/spiders/zhejiang.py
@@ -159,6 +159,12 @@ class ZhejiangSpider(BaseSpider):
        detail = {}
        soup = BeautifulSoup(resp.text, "html.parser")

+        # 提取发布时间（从页面内容中获取，含时分秒）
+        page_text = soup.get_text(separator="\n", strip=True)
+        publish_time = self._extract_publish_time(soup, page_text)
+        if publish_time:
+            detail["详情页发布时间"] = publish_time
+
        # 解析表格字段
        field_map = {
            "项目名称": "项目名称",