From bf9be95ae883143623ee2d60de464b1e72764181 Mon Sep 17 00:00:00 2001 From: ztb-system Date: Thu, 26 Feb 2026 10:13:46 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=B5=99=E6=B1=9F=E7=88=AC?= =?UTF-8?q?=E8=99=AB=E6=97=A5=E6=9C=9F=E8=BF=87=E6=BB=A4=EF=BC=9A=E5=AE=A2?= =?UTF-8?q?=E6=88=B7=E7=AB=AF=E7=B2=BE=E7=A1=AE=E8=BF=87=E6=BB=A4+?= =?UTF-8?q?=E6=99=BA=E8=83=BD=E7=BF=BB=E9=A1=B5=E9=80=BB=E8=BE=91=EF=BC=9B?= =?UTF-8?q?=E8=A1=A5=E5=85=85python-dotenv=E4=BE=9D=E8=B5=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 1 + spiders/zhejiang.py | 29 +++++++++++++++++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index c0c9600..23d0768 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ beautifulsoup4>=4.12.0 lxml>=4.9.0 pdfplumber>=0.10.0 python-docx>=1.0.0 +python-dotenv>=1.0.0 diff --git a/spiders/zhejiang.py b/spiders/zhejiang.py index 61966cf..5793089 100644 --- a/spiders/zhejiang.py +++ b/spiders/zhejiang.py @@ -251,13 +251,16 @@ class ZhejiangSpider(BaseSpider): page_size = self.spider_config.get("page_size", 20) # 日期范围 + target_date = None # 客户端精确过滤用 if date_filter == "yesterday": d = datetime.now() - timedelta(days=1) - start_date = end_date = d.strftime("%Y-%m-%d") - logger.info(f"过滤日期: {start_date}(昨天)") + target_date = d.strftime("%Y-%m-%d") + start_date = end_date = target_date + logger.info(f"过滤日期: {target_date}(昨天)") elif date_filter: + target_date = date_filter start_date = end_date = date_filter - logger.info(f"过滤日期: {start_date}") + logger.info(f"过滤日期: {target_date}") else: # 默认近一个月 end_date = datetime.now().strftime("%Y-%m-%d") @@ -294,11 +297,15 @@ class ZhejiangSpider(BaseSpider): break count = 0 + has_older = False for rec in records: - # 客户端日期二次过滤:跳过不在目标日期范围内的记录 + # 客户端日期精确过滤 rec_date = rec.get("webdate", "").split(" ")[0] - if date_filter and rec_date and rec_date != start_date: - continue + if target_date and rec_date: + if rec_date != target_date: + if rec_date < target_date: + has_older = True + continue link = rec.get("linkurl", "") if link and not link.startswith("http"): @@ -338,8 +345,14 @@ class ZhejiangSpider(BaseSpider): logger.info(f" 获取 {count} 条数据") if count == 0: - logger.info("当前页无新数据,停止翻页") - break + if not target_date or has_older: + logger.info("当前页无新数据,停止翻页") + break + else: + # 当前页全是比目标日期更新的数据,继续翻页 + logger.info(" 当前页均为更新日期的数据,继续翻页") + self.delay() + continue self.delay()