修复浙江爬虫日期过滤:客户端精确过滤+智能翻页逻辑;补充python-dotenv依赖

This commit is contained in:
ztb-system
2026-02-26 10:13:46 +08:00
parent c65f3d07f8
commit bf9be95ae8
2 changed files with 22 additions and 8 deletions

View File

@@ -3,3 +3,4 @@ beautifulsoup4>=4.12.0
 lxml>=4.9.0
 pdfplumber>=0.10.0
 python-docx>=1.0.0
+python-dotenv>=1.0.0

View File

@@ -251,13 +251,16 @@ class ZhejiangSpider(BaseSpider):
 page_size = self.spider_config.get("page_size", 20)
 # 日期范围
+target_date = None # 客户端精确过滤用
 if date_filter == "yesterday":
 d = datetime.now() - timedelta(days=1)
-start_date = end_date = d.strftime("%Y-%m-%d")
-logger.info(f"过滤日期: {start_date}(昨天)")
+target_date = d.strftime("%Y-%m-%d")
+start_date = end_date = target_date
+logger.info(f"过滤日期: {target_date}(昨天)")
 elif date_filter:
+target_date = date_filter
 start_date = end_date = date_filter
-logger.info(f"过滤日期: {start_date}")
+logger.info(f"过滤日期: {target_date}")
 else:
 # 默认近一个月
 end_date = datetime.now().strftime("%Y-%m-%d")
@@ -294,11 +297,15 @@ class ZhejiangSpider(BaseSpider):
 break
 count = 0
+has_older = False
 for rec in records:
-# 客户端日期二次过滤:跳过不在目标日期范围内的记录
+# 客户端日期精确过滤
 rec_date = rec.get("webdate", "").split(" ")[0]
-if date_filter and rec_date and rec_date != start_date:
-continue
+if target_date and rec_date:
+if rec_date != target_date:
+if rec_date < target_date:
+has_older = True
+continue
 link = rec.get("linkurl", "")
 if link and not link.startswith("http"):
@@ -338,8 +345,14 @@ class ZhejiangSpider(BaseSpider):
 logger.info(f" 获取 {count} 条数据")
 if count == 0:
-logger.info("当前页无新数据,停止翻页")
-break
+if not target_date or has_older:
+logger.info("当前页无新数据,停止翻页")
+break
+else:
+# 当前页全是比目标日期更新的数据,继续翻页
+logger.info(" 当前页均为更新日期的数据,继续翻页")
+self.delay()
+continue
 self.delay()