修复浙江爬虫日期过滤：客户端精确过滤+智能翻页逻辑；补充python-dotenv依赖

2026-02-26 10:13:46 +08:00
parent c65f3d07f8
commit bf9be95ae8
2 changed files with 22 additions and 8 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ beautifulsoup4>=4.12.0
 lxml>=4.9.0
 pdfplumber>=0.10.0
 python-docx>=1.0.0
+python-dotenv>=1.0.0
--- a/spiders/zhejiang.py
+++ b/spiders/zhejiang.py
@@ -251,13 +251,16 @@ class ZhejiangSpider(BaseSpider):
        page_size = self.spider_config.get("page_size", 20)

        # 日期范围
+        target_date = None  # 客户端精确过滤用
        if date_filter == "yesterday":
            d = datetime.now() - timedelta(days=1)
-            start_date = end_date = d.strftime("%Y-%m-%d")
-            logger.info(f"过滤日期: {start_date}（昨天）")
+            target_date = d.strftime("%Y-%m-%d")
+            start_date = end_date = target_date
+            logger.info(f"过滤日期: {target_date}（昨天）")
        elif date_filter:
+            target_date = date_filter
            start_date = end_date = date_filter
-            logger.info(f"过滤日期: {start_date}")
+            logger.info(f"过滤日期: {target_date}")
        else:
            # 默认近一个月
            end_date = datetime.now().strftime("%Y-%m-%d")
@@ -294,11 +297,15 @@ class ZhejiangSpider(BaseSpider):
                break

            count = 0
+            has_older = False
            for rec in records:
-                # 客户端日期二次过滤：跳过不在目标日期范围内的记录
+                # 客户端日期精确过滤
                rec_date = rec.get("webdate", "").split(" ")[0]
-                if date_filter and rec_date and rec_date != start_date:
-                    continue
+                if target_date and rec_date:
+                    if rec_date != target_date:
+                        if rec_date < target_date:
+                            has_older = True
+                        continue

                link = rec.get("linkurl", "")
                if link and not link.startswith("http"):
@@ -338,8 +345,14 @@ class ZhejiangSpider(BaseSpider):
            logger.info(f"  获取 {count} 条数据")

            if count == 0:
-                logger.info("当前页无新数据，停止翻页")
-                break
+                if not target_date or has_older:
+                    logger.info("当前页无新数据，停止翻页")
+                    break
+                else:
+                    # 当前页全是比目标日期更新的数据，继续翻页
+                    logger.info("  当前页均为更新日期的数据，继续翻页")
+                    self.delay()
+                    continue

            self.delay()