fix: 发布时间从详情页内容提取,避免pipeline二次请求失败
This commit is contained in:
@@ -213,6 +213,32 @@ class BaseSpider(ABC):
|
||||
result["项目名称"] = project_name
|
||||
return result
|
||||
|
||||
# ---------- 发布时间提取 ----------
|
||||
|
||||
@staticmethod
|
||||
def _extract_publish_time(soup, page_text: str) -> str:
|
||||
"""从详情页中提取发布时间(含时分秒)"""
|
||||
patterns = [
|
||||
r'信息发布时间[::]\s*([\d-]+\s[\d:]+)',
|
||||
r'发布时间[::]\s*([\d-]+\s[\d:]+)',
|
||||
r'发布日期[::]\s*([\d-]+\s[\d:]+)',
|
||||
r'发布时间[::]\s*([\d-]+)',
|
||||
r'发布日期[::]\s*([\d-]+)',
|
||||
]
|
||||
for pattern in patterns:
|
||||
match = re.search(pattern, page_text)
|
||||
if match:
|
||||
return match.group(1).strip()
|
||||
|
||||
time_tags = soup.find_all(['time', 'span', 'div'], class_=re.compile(r'time|date|publish', re.I))
|
||||
for tag in time_tags:
|
||||
text = tag.get_text(strip=True)
|
||||
match = re.search(r'([\d-]+\s[\d:]+)', text)
|
||||
if match:
|
||||
return match.group(1).strip()
|
||||
|
||||
return ""
|
||||
|
||||
# ---------- 去重 ----------
|
||||
|
||||
def is_duplicate(self, url: str) -> bool:
|
||||
|
||||
Reference in New Issue
Block a user