fix: 发布时间从详情页内容提取,避免pipeline二次请求失败
This commit is contained in:
@@ -149,10 +149,10 @@ class ProcessingPipeline:
|
|||||||
|
|
||||||
# 基础字段映射:优先使用项目名称(已处理掉批准文号的名称)
|
# 基础字段映射:优先使用项目名称(已处理掉批准文号的名称)
|
||||||
record["名称"] = item.get("项目名称", item.get("标题", ""))
|
record["名称"] = item.get("项目名称", item.get("标题", ""))
|
||||||
pub_date = item.get("发布日期", item.get("项目发布时间", ""))
|
# 优先使用爬虫从详情页内容提取的发布时间(含时分秒)
|
||||||
record["发布时间"] = pub_date
|
pub_time = item.get("详情页发布时间", "")
|
||||||
# 项目发布时间修复:使用与发布时间相同的值,确保格式一致
|
record["发布时间"] = pub_time
|
||||||
record["项目发布时间"] = pub_date # 台州招标计划 JDY 使用此字段名
|
record["项目发布时间"] = pub_time
|
||||||
record["地区"] = item.get("地区", "")
|
record["地区"] = item.get("地区", "")
|
||||||
record["招标阶段"] = item.get("公告类型", notice_type)
|
record["招标阶段"] = item.get("公告类型", notice_type)
|
||||||
record["来源"] = item.get("来源", "")
|
record["来源"] = item.get("来源", "")
|
||||||
|
|||||||
@@ -213,6 +213,32 @@ class BaseSpider(ABC):
|
|||||||
result["项目名称"] = project_name
|
result["项目名称"] = project_name
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
# ---------- 发布时间提取 ----------
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _extract_publish_time(soup, page_text: str) -> str:
|
||||||
|
"""从详情页中提取发布时间(含时分秒)"""
|
||||||
|
patterns = [
|
||||||
|
r'信息发布时间[::]\s*([\d-]+\s[\d:]+)',
|
||||||
|
r'发布时间[::]\s*([\d-]+\s[\d:]+)',
|
||||||
|
r'发布日期[::]\s*([\d-]+\s[\d:]+)',
|
||||||
|
r'发布时间[::]\s*([\d-]+)',
|
||||||
|
r'发布日期[::]\s*([\d-]+)',
|
||||||
|
]
|
||||||
|
for pattern in patterns:
|
||||||
|
match = re.search(pattern, page_text)
|
||||||
|
if match:
|
||||||
|
return match.group(1).strip()
|
||||||
|
|
||||||
|
time_tags = soup.find_all(['time', 'span', 'div'], class_=re.compile(r'time|date|publish', re.I))
|
||||||
|
for tag in time_tags:
|
||||||
|
text = tag.get_text(strip=True)
|
||||||
|
match = re.search(r'([\d-]+\s[\d:]+)', text)
|
||||||
|
if match:
|
||||||
|
return match.group(1).strip()
|
||||||
|
|
||||||
|
return ""
|
||||||
|
|
||||||
# ---------- 去重 ----------
|
# ---------- 去重 ----------
|
||||||
|
|
||||||
def is_duplicate(self, url: str) -> bool:
|
def is_duplicate(self, url: str) -> bool:
|
||||||
|
|||||||
@@ -135,6 +135,12 @@ class TaizhouSpider(BaseSpider):
|
|||||||
detail = {}
|
detail = {}
|
||||||
soup = BeautifulSoup(resp.text, "html.parser")
|
soup = BeautifulSoup(resp.text, "html.parser")
|
||||||
|
|
||||||
|
# 提取发布时间(从页面内容中获取,含时分秒)
|
||||||
|
page_text = soup.get_text(separator="\n", strip=True)
|
||||||
|
publish_time = self._extract_publish_time(soup, page_text)
|
||||||
|
if publish_time:
|
||||||
|
detail["详情页发布时间"] = publish_time
|
||||||
|
|
||||||
# 解析表格字段
|
# 解析表格字段
|
||||||
field_map = {
|
field_map = {
|
||||||
"项目名称": "项目名称",
|
"项目名称": "项目名称",
|
||||||
|
|||||||
@@ -159,6 +159,12 @@ class ZhejiangSpider(BaseSpider):
|
|||||||
detail = {}
|
detail = {}
|
||||||
soup = BeautifulSoup(resp.text, "html.parser")
|
soup = BeautifulSoup(resp.text, "html.parser")
|
||||||
|
|
||||||
|
# 提取发布时间(从页面内容中获取,含时分秒)
|
||||||
|
page_text = soup.get_text(separator="\n", strip=True)
|
||||||
|
publish_time = self._extract_publish_time(soup, page_text)
|
||||||
|
if publish_time:
|
||||||
|
detail["详情页发布时间"] = publish_time
|
||||||
|
|
||||||
# 解析表格字段
|
# 解析表格字段
|
||||||
field_map = {
|
field_map = {
|
||||||
"项目名称": "项目名称",
|
"项目名称": "项目名称",
|
||||||
|
|||||||
Reference in New Issue
Block a user