diff --git a/requirements.txt b/requirements.txt index c0c9600..23d0768 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ beautifulsoup4>=4.12.0 lxml>=4.9.0 pdfplumber>=0.10.0 python-docx>=1.0.0 +python-dotenv>=1.0.0 diff --git a/spiders/zhejiang.py b/spiders/zhejiang.py index 61966cf..5793089 100644 --- a/spiders/zhejiang.py +++ b/spiders/zhejiang.py @@ -251,13 +251,16 @@ class ZhejiangSpider(BaseSpider): page_size = self.spider_config.get("page_size", 20) # 日期范围 + target_date = None # 客户端精确过滤用 if date_filter == "yesterday": d = datetime.now() - timedelta(days=1) - start_date = end_date = d.strftime("%Y-%m-%d") - logger.info(f"过滤日期: {start_date}(昨天)") + target_date = d.strftime("%Y-%m-%d") + start_date = end_date = target_date + logger.info(f"过滤日期: {target_date}(昨天)") elif date_filter: + target_date = date_filter start_date = end_date = date_filter - logger.info(f"过滤日期: {start_date}") + logger.info(f"过滤日期: {target_date}") else: # 默认近一个月 end_date = datetime.now().strftime("%Y-%m-%d") @@ -294,11 +297,15 @@ class ZhejiangSpider(BaseSpider): break count = 0 + has_older = False for rec in records: - # 客户端日期二次过滤:跳过不在目标日期范围内的记录 + # 客户端日期精确过滤 rec_date = rec.get("webdate", "").split(" ")[0] - if date_filter and rec_date and rec_date != start_date: - continue + if target_date and rec_date: + if rec_date != target_date: + if rec_date < target_date: + has_older = True + continue link = rec.get("linkurl", "") if link and not link.startswith("http"): @@ -338,8 +345,14 @@ class ZhejiangSpider(BaseSpider): logger.info(f" 获取 {count} 条数据") if count == 0: - logger.info("当前页无新数据,停止翻页") - break + if not target_date or has_older: + logger.info("当前页无新数据,停止翻页") + break + else: + # 当前页全是比目标日期更新的数据,继续翻页 + logger.info(" 当前页均为更新日期的数据,继续翻页") + self.delay() + continue self.delay()