修复浙江爬虫日期过滤:客户端精确过滤+智能翻页逻辑;补充python-dotenv依赖
This commit is contained in:
@@ -3,3 +3,4 @@ beautifulsoup4>=4.12.0
|
||||
lxml>=4.9.0
|
||||
pdfplumber>=0.10.0
|
||||
python-docx>=1.0.0
|
||||
python-dotenv>=1.0.0
|
||||
|
||||
@@ -251,13 +251,16 @@ class ZhejiangSpider(BaseSpider):
|
||||
page_size = self.spider_config.get("page_size", 20)
|
||||
|
||||
# 日期范围
|
||||
target_date = None # 客户端精确过滤用
|
||||
if date_filter == "yesterday":
|
||||
d = datetime.now() - timedelta(days=1)
|
||||
start_date = end_date = d.strftime("%Y-%m-%d")
|
||||
logger.info(f"过滤日期: {start_date}(昨天)")
|
||||
target_date = d.strftime("%Y-%m-%d")
|
||||
start_date = end_date = target_date
|
||||
logger.info(f"过滤日期: {target_date}(昨天)")
|
||||
elif date_filter:
|
||||
target_date = date_filter
|
||||
start_date = end_date = date_filter
|
||||
logger.info(f"过滤日期: {start_date}")
|
||||
logger.info(f"过滤日期: {target_date}")
|
||||
else:
|
||||
# 默认近一个月
|
||||
end_date = datetime.now().strftime("%Y-%m-%d")
|
||||
@@ -294,11 +297,15 @@ class ZhejiangSpider(BaseSpider):
|
||||
break
|
||||
|
||||
count = 0
|
||||
has_older = False
|
||||
for rec in records:
|
||||
# 客户端日期二次过滤:跳过不在目标日期范围内的记录
|
||||
# 客户端日期精确过滤
|
||||
rec_date = rec.get("webdate", "").split(" ")[0]
|
||||
if date_filter and rec_date and rec_date != start_date:
|
||||
continue
|
||||
if target_date and rec_date:
|
||||
if rec_date != target_date:
|
||||
if rec_date < target_date:
|
||||
has_older = True
|
||||
continue
|
||||
|
||||
link = rec.get("linkurl", "")
|
||||
if link and not link.startswith("http"):
|
||||
@@ -338,8 +345,14 @@ class ZhejiangSpider(BaseSpider):
|
||||
logger.info(f" 获取 {count} 条数据")
|
||||
|
||||
if count == 0:
|
||||
logger.info("当前页无新数据,停止翻页")
|
||||
break
|
||||
if not target_date or has_older:
|
||||
logger.info("当前页无新数据,停止翻页")
|
||||
break
|
||||
else:
|
||||
# 当前页全是比目标日期更新的数据,继续翻页
|
||||
logger.info(" 当前页均为更新日期的数据,继续翻页")
|
||||
self.delay()
|
||||
continue
|
||||
|
||||
self.delay()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user