diff --git a/processors/pipeline.py b/processors/pipeline.py
index fcf390e..fa7eec9 100644
--- a/processors/pipeline.py
+++ b/processors/pipeline.py
@@ -149,10 +149,18 @@ class ProcessingPipeline:
 
         # 基础字段映射:优先使用项目名称(已处理掉批准文号的名称)
         record["名称"] = item.get("项目名称", item.get("标题", ""))
-        pub_date = item.get("发布日期", item.get("项目发布时间", ""))
-        record["发布时间"] = pub_date
-        # 项目发布时间修复:使用与发布时间相同的值,确保格式一致
-        record["项目发布时间"] = pub_date  # 台州招标计划 JDY 使用此字段名
+        # Prefer the publish time the spider extracted from the detail page
+        # (includes h:m:s); fall back to the previously used date fields so
+        # the value is not blanked when detail extraction finds nothing.
+        pub_time = (
+            item.get("详情页发布时间")
+            or item.get("发布日期")
+            or item.get("项目发布时间")
+            or ""
+        )
+        record["发布时间"] = pub_time
+        # The Taizhou tender-plan JDY uses this field name; keep it identical.
+        record["项目发布时间"] = pub_time
         record["地区"] = item.get("地区", "")
         record["招标阶段"] = item.get("公告类型", notice_type)
         record["来源"] = item.get("来源", "")
diff --git a/spiders/base.py b/spiders/base.py
index 4fc04b4..fbd1046 100644
--- a/spiders/base.py
+++ b/spiders/base.py
@@ -213,6 +213,53 @@ class BaseSpider(ABC):
         result["项目名称"] = project_name
         return result
 
+    # ---------- Publish-time extraction ----------
+
+    @staticmethod
+    def _extract_publish_time(soup, page_text: str) -> str:
+        """Extract the publish time from a detail page.
+
+        Labelled values in ``page_text`` are tried first, with every full
+        date-time pattern ahead of the bare-date ones; if none matches,
+        time/span/div elements whose class contains time, date or publish
+        are scanned for a date-time.
+
+        Args:
+            soup: Parsed detail page; only ``find_all`` is called on it.
+            page_text: Plain text of the same page.
+
+        Returns:
+            The matched text with date and time separated by one space, or
+            "" when nothing is found.
+        """
+        date = r'\d{4}-\d{1,2}-\d{1,2}'
+        # Requiring an H:MM[:SS] shape keeps a stray number on the next line
+        # from being captured as the time part.
+        date_time = date + r'\s+\d{1,2}:\d{2}(?::\d{2})?'
+        candidates = [
+            ('信息发布时间', date_time),
+            ('发布时间', date_time),
+            ('发布日期', date_time),
+            ('发布时间', date),
+            ('发布日期', date),
+        ]
+        for label, value in candidates:
+            # Labels may be followed by a full-width or an ASCII colon.
+            match = re.search(label + r'[::]\s*(' + value + r')', page_text)
+            if match:
+                return ' '.join(match.group(1).split())
+
+        time_tags = soup.find_all(
+            ['time', 'span', 'div'],
+            class_=re.compile(r'time|date|publish', re.I),
+        )
+        for tag in time_tags:
+            match = re.search(date_time, tag.get_text(strip=True))
+            if match:
+                return ' '.join(match.group(0).split())
+
+        return ""
+
     # ---------- 去重 ----------
 
     def is_duplicate(self, url: str) -> bool:
diff --git a/spiders/taizhou.py b/spiders/taizhou.py
index 88ade63..a7a287f 100644
--- a/spiders/taizhou.py
+++ b/spiders/taizhou.py
@@ -135,6 +135,12 @@ class TaizhouSpider(BaseSpider):
         detail = {}
         soup = BeautifulSoup(resp.text, "html.parser")
 
+        # Extract the publish time from the page content (with h:m:s when present).
+        page_text = soup.get_text(separator="\n", strip=True)
+        publish_time = self._extract_publish_time(soup, page_text)
+        if publish_time:
+            detail["详情页发布时间"] = publish_time
+
         # 解析表格字段
         field_map = {
             "项目名称": "项目名称",
diff --git a/spiders/zhejiang.py b/spiders/zhejiang.py
index 5793089..b636399 100644
--- a/spiders/zhejiang.py
+++ b/spiders/zhejiang.py
@@ -159,6 +159,12 @@ class ZhejiangSpider(BaseSpider):
         detail = {}
         soup = BeautifulSoup(resp.text, "html.parser")
 
+        # Extract the publish time from the page content (with h:m:s when present).
+        page_text = soup.get_text(separator="\n", strip=True)
+        publish_time = self._extract_publish_time(soup, page_text)
+        if publish_time:
+            detail["详情页发布时间"] = publish_time
+
         # 解析表格字段
         field_map = {
             "项目名称": "项目名称",