fix: 发布时间从详情页内容提取,避免pipeline二次请求失败
This commit is contained in:
@@ -159,6 +159,12 @@ class ZhejiangSpider(BaseSpider):
|
||||
detail = {}
|
||||
soup = BeautifulSoup(resp.text, "html.parser")
|
||||
|
||||
# 提取发布时间(从页面内容中获取,含时分秒)
|
||||
page_text = soup.get_text(separator="\n", strip=True)
|
||||
publish_time = self._extract_publish_time(soup, page_text)
|
||||
if publish_time:
|
||||
detail["详情页发布时间"] = publish_time
|
||||
|
||||
# 解析表格字段
|
||||
field_map = {
|
||||
"项目名称": "项目名称",
|
||||
|
||||
Reference in New Issue
Block a user